gh-104400: Add more tests to pygettext (GH-108173)

2024-11-21 21:09:37 +01:00 · 2024-11-03 15:01:09 +01:00 · 2024-11-03 15:01:09 +01:00 · dcae5cd6ab
commit dcae5cd6ab
parent 556dc9b8a7
8 changed files with 363 additions and 21 deletions
--- a/Lib/test/test_tools/i18n_data/docstrings.pot
+++ b/Lib/test/test_tools/i18n_data/docstrings.pot
@ -0,0 +1,40 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR ORGANIZATION
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2000-01-01 00:00+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: pygettext.py 1.5\n"
+
+
+#: docstrings.py:7
+#, docstring
+msgid ""
+msgstr ""
+
+#: docstrings.py:18
+#, docstring
+msgid ""
+"multiline\n"
+"    docstring\n"
+"    "
+msgstr ""
+
+#: docstrings.py:25
+#, docstring
+msgid "docstring1"
+msgstr ""
+
+#: docstrings.py:30
+#, docstring
+msgid "Hello, {}!"
+msgstr ""
+
--- a/Lib/test/test_tools/i18n_data/docstrings.py
+++ b/Lib/test/test_tools/i18n_data/docstrings.py
@ -0,0 +1,41 @@
+# Test docstring extraction
+from gettext import gettext as _
+
+
+# Empty docstring
+def test(x):
+    """"""
+
+
+# Leading empty line
+def test2(x):
+
+    """docstring"""  # XXX This should be extracted but isn't.
+
+
+# XXX Multiline docstrings should be cleaned with `inspect.cleandoc`.
+def test3(x):
+    """multiline
+    docstring
+    """
+
+
+# Multiple docstrings - only the first should be extracted
+def test4(x):
+    """docstring1"""
+    """docstring2"""
+
+
+def test5(x):
+    """Hello, {}!""".format("world!")  # XXX This should not be extracted.
+
+
+# Nested docstrings
+def test6(x):
+    def inner(y):
+        """nested docstring"""  # XXX This should be extracted but isn't.
+
+
+class Outer:
+    class Inner:
+        "nested class docstring"  # XXX This should be extracted but isn't.
--- a/Lib/test/test_tools/i18n_data/fileloc.pot
+++ b/Lib/test/test_tools/i18n_data/fileloc.pot
@ -0,0 +1,35 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR ORGANIZATION
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2000-01-01 00:00+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: pygettext.py 1.5\n"
+
+
+#: fileloc.py:5 fileloc.py:6
+msgid "foo"
+msgstr ""
+
+#: fileloc.py:9
+msgid "bar"
+msgstr ""
+
+#: fileloc.py:14 fileloc.py:18
+#, docstring
+msgid "docstring"
+msgstr ""
+
+#: fileloc.py:22 fileloc.py:26
+#, docstring
+msgid "baz"
+msgstr ""
+
--- a/Lib/test/test_tools/i18n_data/fileloc.py
+++ b/Lib/test/test_tools/i18n_data/fileloc.py
@ -0,0 +1,26 @@
+# Test file locations
+from gettext import gettext as _
+
+# Duplicate strings
+_('foo')
+_('foo')
+
+# Duplicate strings on the same line should only add one location to the output
+_('bar'), _('bar')
+
+
+# Duplicate docstrings
+class A:
+    """docstring"""
+
+
+def f():
+    """docstring"""
+
+
+# Duplicate message and docstring
+_('baz')
+
+
+def g():
+    """baz"""
--- a/Lib/test/test_tools/i18n_data/messages.pot
+++ b/Lib/test/test_tools/i18n_data/messages.pot
@ -0,0 +1,67 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR ORGANIZATION
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2000-01-01 00:00+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: pygettext.py 1.5\n"
+
+
+#: messages.py:5
+msgid ""
+msgstr ""
+
+#: messages.py:8 messages.py:9
+msgid "parentheses"
+msgstr ""
+
+#: messages.py:12
+msgid "Hello, world!"
+msgstr ""
+
+#: messages.py:15
+msgid ""
+"Hello,\n"
+"    multiline!\n"
+msgstr ""
+
+#: messages.py:29
+msgid "Hello, {}!"
+msgstr ""
+
+#: messages.py:33
+msgid "1"
+msgstr ""
+
+#: messages.py:33
+msgid "2"
+msgstr ""
+
+#: messages.py:34 messages.py:35
+msgid "A"
+msgstr ""
+
+#: messages.py:34 messages.py:35
+msgid "B"
+msgstr ""
+
+#: messages.py:36
+msgid "set"
+msgstr ""
+
+#: messages.py:42
+msgid "nested string"
+msgstr ""
+
+#: messages.py:47
+msgid "baz"
+msgstr ""
+
--- a/Lib/test/test_tools/i18n_data/messages.py
+++ b/Lib/test/test_tools/i18n_data/messages.py
@ -0,0 +1,64 @@
+# Test message extraction
+from gettext import gettext as _
+
+# Empty string
+_("")
+
+# Extra parentheses
+(_("parentheses"))
+((_("parentheses")))
+
+# Multiline strings
+_("Hello, "
+  "world!")
+
+_("""Hello,
+    multiline!
+""")
+
+# Invalid arguments
+_()
+_(None)
+_(1)
+_(False)
+_(x="kwargs are not allowed")
+_("foo", "bar")
+_("something", x="something else")
+
+# .format()
+_("Hello, {}!").format("world")  # valid
+_("Hello, {}!".format("world"))  # invalid
+
+# Nested structures
+_("1"), _("2")
+arr = [_("A"), _("B")]
+obj = {'a': _("A"), 'b': _("B")}
+{{{_('set')}}}
+
+
+# Nested functions and classes
+def test():
+    _("nested string")  # XXX This should be extracted but isn't.
+    [_("nested string")]
+
+
+class Foo:
+    def bar(self):
+        return _("baz")
+
+
+def bar(x=_('default value')):  # XXX This should be extracted but isn't.
+    pass
+
+
+def baz(x=[_('default value')]):  # XXX This should be extracted but isn't.
+    pass
+
+
+# Shadowing _()
+def _(x):
+    pass
+
+
+def _(x="don't extract me"):
+    pass
--- a/Lib/test/test_tools/test_i18n.py
+++ b/Lib/test/test_tools/test_i18n.py
@ -1,9 +1,11 @@
 """Tests to cover the Tools/i18n package"""

 import os
+import re
 import sys
 import unittest
 from textwrap import dedent
+from pathlib import Path

 from test.support.script_helper import assert_python_ok
 from test.test_tools import skip_if_missing, toolsdir
@ -12,20 +14,47 @@ from test.support.os_helper import temp_cwd, temp_dir

 skip_if_missing()

+DATA_DIR = Path(__file__).resolve().parent / 'i18n_data'
+
+
+def normalize_POT_file(pot):
+    """Return *pot* with its environment-dependent parts normalized.
+
+    The POT creation date and the charset are replaced by fixed values
+    and the path separators in file locations are converted to '/',
+    which makes two POT files easier to compare.
+    """
+    # Pin the creation date to a fixed timestamp.
+    pot = re.sub(
+        r'"POT-Creation-Date: .+?\\n"',
+        r'"POT-Creation-Date: 2000-01-01 00:00+0000\\n"',
+        pot,
+    )
+
+    # Pin the charset to UTF-8 (currently there's no way to specify the output charset).
+    pot = re.sub(
+        r'"Content-Type: text/plain; charset=.+?\\n"',
+        r'"Content-Type: text/plain; charset=UTF-8\\n"',
+        pot,
+    )
+
+    # Use '/' in file locations in case this test runs on Windows (which uses '\').
+    return re.sub(r'#:.+', lambda m: m[0].replace(os.sep, "/"), pot)
+

 class Test_pygettext(unittest.TestCase):
    """Tests for the pygettext.py tool"""

-    script = os.path.join(toolsdir,'i18n', 'pygettext.py')
+    script = Path(toolsdir, 'i18n', 'pygettext.py')

    def get_header(self, data):
        """ utility: return the header of a .po file as a dictionary """
        headers = {}
        for line in data.split('\n'):
-            if not line or line.startswith(('#', 'msgid','msgstr')):
+            if not line or line.startswith(('#', 'msgid', 'msgstr')):
                continue
            line = line.strip('"')
-            key, val = line.split(':',1)
+            key, val = line.split(':', 1)
            headers[key] = val.strip()
        return headers

@ -53,13 +82,18 @@ class Test_pygettext(unittest.TestCase):

        return msgids

+    def assert_POT_equal(self, expected, actual):
+        """Assert that two POT files are equal once both are normalized."""
+        # Show the complete diff on failure instead of a truncated one.
+        self.maxDiff = None
+        normalized_expected = normalize_POT_file(expected)
+        normalized_actual = normalize_POT_file(actual)
+        self.assertEqual(normalized_expected, normalized_actual)
+
    def extract_docstrings_from_str(self, module_content):
        """ utility: return all msgids extracted from module_content """
        filename = 'test_docstrings.py'
        with temp_cwd(None) as cwd:
            with open(filename, 'w', encoding='utf-8') as fp:
                fp.write(module_content)
-            assert_python_ok(self.script, '-D', filename)
+            assert_python_ok('-Xutf8', self.script, '-D', filename)
            with open('messages.pot', encoding='utf-8') as fp:
                data = fp.read()
        return self.get_msgids(data)
@ -69,7 +103,7 @@ class Test_pygettext(unittest.TestCase):
           http://www.gnu.org/software/gettext/manual/gettext.html#Header-Entry
        """
        with temp_cwd(None) as cwd:
-            assert_python_ok(self.script)
+            assert_python_ok('-Xutf8', self.script)
            with open('messages.pot', encoding='utf-8') as fp:
                data = fp.read()
            header = self.get_header(data)
@ -96,7 +130,7 @@ class Test_pygettext(unittest.TestCase):
        """ Match the date format from xgettext for POT-Creation-Date """
        from datetime import datetime
        with temp_cwd(None) as cwd:
-            assert_python_ok(self.script)
+            assert_python_ok('-Xutf8', self.script)
            with open('messages.pot', encoding='utf-8') as fp:
                data = fp.read()
            header = self.get_header(data)
@ -310,6 +344,20 @@ class Test_pygettext(unittest.TestCase):
        self.assertNotIn('foo', msgids)
        self.assertIn('bar', msgids)

+    def test_pygettext_output(self):
+        """Test that the pygettext output exactly matches snapshots.
+
+        Every '*.py' file in DATA_DIR is copied into a temporary working
+        directory, pygettext is run on the copy with '--docstrings', and
+        the resulting 'messages.pot' is compared (after normalization)
+        with the '.pot' file of the same name stored next to the input.
+        """
+        for input_file in DATA_DIR.glob('*.py'):
+            output_file = input_file.with_suffix('.pot')
+            # Label the subtest with the file name only; input_file itself
+            # is an absolute path.
+            with self.subTest(input_file=f'i18n_data/{input_file.name}'):
+                # Copy the input byte-for-byte, exactly like
+                # update_POT_snapshots() does, so the result does not
+                # depend on the locale encoding of the test process.
+                contents = input_file.read_bytes()
+                with temp_cwd(None):
+                    Path(input_file.name).write_bytes(contents)
+                    assert_python_ok('-Xutf8', self.script, '--docstrings', input_file.name)
+                    output = Path('messages.pot').read_text(encoding='utf-8')
+
+                expected = output_file.read_text(encoding='utf-8')
+                self.assert_POT_equal(expected, output)
+
    def test_files_list(self):
        """Make sure the directories are inspected for source files
           bpo-31920
@ -318,21 +366,41 @@ class Test_pygettext(unittest.TestCase):
        text2 = 'Text to translate2'
        text3 = 'Text to ignore'
        with temp_cwd(None), temp_dir(None) as sdir:
-            os.mkdir(os.path.join(sdir, 'pypkg'))
-            with open(os.path.join(sdir, 'pypkg', 'pymod.py'), 'w',
-                      encoding='utf-8') as sfile:
-                sfile.write(f'_({text1!r})')
-            os.mkdir(os.path.join(sdir, 'pkg.py'))
-            with open(os.path.join(sdir, 'pkg.py', 'pymod2.py'), 'w',
-                      encoding='utf-8') as sfile:
-                sfile.write(f'_({text2!r})')
-            os.mkdir(os.path.join(sdir, 'CVS'))
-            with open(os.path.join(sdir, 'CVS', 'pymod3.py'), 'w',
-                      encoding='utf-8') as sfile:
-                sfile.write(f'_({text3!r})')
-            assert_python_ok(self.script, sdir)
-            with open('messages.pot', encoding='utf-8') as fp:
-                data = fp.read()
+            pymod = Path(sdir, 'pypkg', 'pymod.py')
+            pymod.parent.mkdir()
+            pymod.write_text(f'_({text1!r})', encoding='utf-8')
+
+            pymod2 = Path(sdir, 'pkg.py', 'pymod2.py')
+            pymod2.parent.mkdir()
+            pymod2.write_text(f'_({text2!r})', encoding='utf-8')
+
+            pymod3 = Path(sdir, 'CVS', 'pymod3.py')
+            pymod3.parent.mkdir()
+            pymod3.write_text(f'_({text3!r})', encoding='utf-8')
+
+            assert_python_ok('-Xutf8', self.script, sdir)
+            data = Path('messages.pot').read_text(encoding='utf-8')
            self.assertIn(f'msgid "{text1}"', data)
            self.assertIn(f'msgid "{text2}"', data)
            self.assertNotIn(text3, data)
+
+
+def update_POT_snapshots():
+    """Regenerate the '.pot' snapshot next to every '*.py' file in DATA_DIR."""
+    for source in DATA_DIR.glob('*.py'):
+        snapshot = source.with_suffix('.pot')
+        payload = source.read_bytes()
+        with temp_cwd(None):
+            # Run pygettext on a byte-for-byte copy of the input file.
+            Path(source.name).write_bytes(payload)
+            assert_python_ok('-Xutf8', Test_pygettext.script, '--docstrings', source.name)
+            raw = Path('messages.pot').read_text(encoding='utf-8')
+
+        # Store the normalized output so the snapshot is reproducible.
+        snapshot.write_text(normalize_POT_file(raw), encoding='utf-8')
+
+
+if __name__ == '__main__':
+    # Pass --snapshot-update as the first argument to regenerate the
+    # POT snapshot files and exit instead of running the tests.
+    if len(sys.argv) > 1 and sys.argv[1] == '--snapshot-update':
+        update_POT_snapshots()
+        sys.exit(0)
+    unittest.main()
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@ -2545,6 +2545,7 @@ TESTSUBDIRS=	idlelib/idle_test \
 		test/test_tomllib/data/valid/dates-and-times \
 		test/test_tomllib/data/valid/multiline-basic-str \
 		test/test_tools \
+		test/test_tools/i18n_data \
 		test/test_ttk \
 		test/test_unittest \
 		test/test_unittest/namespace_test_pkg \