From e2f710792b0418b8ca1ca3b8cdf39588c7268495 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 24 Sep 2024 11:01:37 +0300 Subject: [PATCH] gh-124188: Fix PyErr_ProgramTextObject() (GH-124189) * Detect source file encoding. * Use the "replace" error handler even for UTF-8 (default) encoding. * Remove the BOM. * Fix detection of too long lines if they contain NUL. * Return the head rather than the tail for truncated long lines. --- Lib/test/support/script_helper.py | 10 +- Lib/test/test_compiler_codegen.py | 5 +- Lib/test/test_eof.py | 164 +++++++++++++---- Lib/test/test_exceptions.py | 171 +++++++++++++----- ...-09-17-22-06-01.gh-issue-124188.aFqNAB.rst | 2 + Python/errors.c | 93 ++++++---- 6 files changed, 328 insertions(+), 117 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2024-09-17-22-06-01.gh-issue-124188.aFqNAB.rst diff --git a/Lib/test/support/script_helper.py b/Lib/test/support/script_helper.py index d0be3179b0e..46ce950433d 100644 --- a/Lib/test/support/script_helper.py +++ b/Lib/test/support/script_helper.py @@ -234,9 +234,13 @@ def make_script(script_dir, script_basename, source, omit_suffix=False): if not omit_suffix: script_filename += os.extsep + 'py' script_name = os.path.join(script_dir, script_filename) - # The script should be encoded to UTF-8, the default string encoding - with open(script_name, 'w', encoding='utf-8') as script_file: - script_file.write(source) + if isinstance(source, str): + # The script should be encoded to UTF-8, the default string encoding + with open(script_name, 'w', encoding='utf-8') as script_file: + script_file.write(source) + else: + with open(script_name, 'wb') as script_file: + script_file.write(source) importlib.invalidate_caches() return script_name diff --git a/Lib/test/test_compiler_codegen.py b/Lib/test/test_compiler_codegen.py index d82fb85ed25..8a15c400a44 100644 --- a/Lib/test/test_compiler_codegen.py +++ b/Lib/test/test_compiler_codegen.py @@ -152,5 +152,8 @@ class IsolatedCodeGenTests(CodegenTestCase): def test_syntax_error__return_not_in_function(self): snippet = "return 42" - with self.assertRaisesRegex(SyntaxError, "'return' outside function"): + with self.assertRaisesRegex(SyntaxError, "'return' outside function") as cm: self.codegen_test(snippet, None) + self.assertIsNone(cm.exception.text) + self.assertEqual(cm.exception.offset, 1) + self.assertEqual(cm.exception.end_offset, 10) diff --git a/Lib/test/test_eof.py b/Lib/test/test_eof.py index be4fd73bfdc..e377383450e 100644 --- a/Lib/test/test_eof.py +++ b/Lib/test/test_eof.py @@ -1,6 +1,7 @@ """test script for a few new invalid token catches""" import sys +from codecs import BOM_UTF8 from test import support from test.support import os_helper from test.support import script_helper @@ -11,67 +12,158 @@ class EOFTestCase(unittest.TestCase): def test_EOF_single_quote(self): expect = "unterminated string literal (detected at line 1) (, line 1)" for quote in ("'", "\""): - try: + with self.assertRaises(SyntaxError) as cm: eval(f"""{quote}this is a test\ """) - except SyntaxError as msg: - self.assertEqual(str(msg), expect) - self.assertEqual(msg.offset, 1) - else: - raise support.TestFailed + self.assertEqual(str(cm.exception), expect) + self.assertEqual(cm.exception.offset, 1) def test_EOFS(self): - expect = ("unterminated triple-quoted string literal (detected at line 1) (, line 1)") - try: - eval("""'''this is a test""") - except SyntaxError as msg: - self.assertEqual(str(msg), expect) - self.assertEqual(msg.offset, 1) - else: - raise support.TestFailed + expect = ("unterminated triple-quoted string literal (detected at line 3) (, line 1)") + with self.assertRaises(SyntaxError) as cm: + eval("""ä = '''thîs is \na \ntest""") + self.assertEqual(str(cm.exception), expect) + self.assertEqual(cm.exception.text, "ä = '''thîs is ") + self.assertEqual(cm.exception.offset, 5) + + with self.assertRaises(SyntaxError) as cm: + eval("""ä = '''thîs is \na \ntest""".encode()) + self.assertEqual(str(cm.exception), expect) + self.assertEqual(cm.exception.text, "ä = '''thîs is ") + self.assertEqual(cm.exception.offset, 5) + + with self.assertRaises(SyntaxError) as cm: + eval(BOM_UTF8 + """ä = '''thîs is \na \ntest""".encode()) + self.assertEqual(str(cm.exception), expect) + self.assertEqual(cm.exception.text, "ä = '''thîs is ") + self.assertEqual(cm.exception.offset, 5) + + with self.assertRaises(SyntaxError) as cm: + eval("""# coding: latin1\nä = '''thîs is \na \ntest""".encode('latin1')) + self.assertEqual(str(cm.exception), "unterminated triple-quoted string literal (detected at line 4) (, line 2)") + self.assertEqual(cm.exception.text, "ä = '''thîs is ") + self.assertEqual(cm.exception.offset, 5) def test_EOFS_with_file(self): expect = ("(, line 1)") with os_helper.temp_dir() as temp_dir: - file_name = script_helper.make_script(temp_dir, 'foo', """'''this is \na \ntest""") - rc, out, err = script_helper.assert_python_failure(file_name) - self.assertIn(b'unterminated triple-quoted string literal (detected at line 3)', err) + file_name = script_helper.make_script(temp_dir, 'foo', + """ä = '''thîs is \na \ntest""") + rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name) + err = err.decode().splitlines() + self.assertEqual(err[-3:], [ + " ä = '''thîs is ", + ' ^', + 'SyntaxError: unterminated triple-quoted string literal (detected at line 3)']) + + file_name = script_helper.make_script(temp_dir, 'foo', + """ä = '''thîs is \na \ntest""".encode()) + rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name) + err = err.decode().splitlines() + self.assertEqual(err[-3:], [ + " ä = '''thîs is ", + ' ^', + 'SyntaxError: unterminated triple-quoted string literal (detected at line 3)']) + + file_name = script_helper.make_script(temp_dir, 'foo', + BOM_UTF8 + """ä = '''thîs is \na \ntest""".encode()) + rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name) + err = err.decode().splitlines() + self.assertEqual(err[-3:], [ + " ä = '''thîs is ", + ' ^', + 'SyntaxError: unterminated triple-quoted string literal (detected at line 3)']) + + file_name = script_helper.make_script(temp_dir, 'foo', + """# coding: latin1\nä = '''thîs is \na \ntest""".encode('latin1')) + rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name) + err = err.decode().splitlines() + self.assertEqual(err[-3:], [ + " ä = '''thîs is ", + ' ^', + 'SyntaxError: unterminated triple-quoted string literal (detected at line 4)']) @warnings_helper.ignore_warnings(category=SyntaxWarning) def test_eof_with_line_continuation(self): expect = "unexpected EOF while parsing (, line 1)" - try: + with self.assertRaises(SyntaxError) as cm: compile('"\\Xhh" \\', '', 'exec') - except SyntaxError as msg: - self.assertEqual(str(msg), expect) - else: - raise support.TestFailed + self.assertEqual(str(cm.exception), expect) def test_line_continuation_EOF(self): """A continuation at the end of input must be an error; bpo2180.""" expect = 'unexpected EOF while parsing (, line 1)' - with self.assertRaises(SyntaxError) as excinfo: - exec('x = 5\\') - self.assertEqual(str(excinfo.exception), expect) - with self.assertRaises(SyntaxError) as excinfo: + with self.assertRaises(SyntaxError) as cm: + exec('ä = 5\\') + self.assertEqual(str(cm.exception), expect) + self.assertEqual(cm.exception.text, 'ä = 5\\\n') + self.assertEqual(cm.exception.offset, 7) + + with self.assertRaises(SyntaxError) as cm: + exec('ä = 5\\'.encode()) + self.assertEqual(str(cm.exception), expect) + self.assertEqual(cm.exception.text, 'ä = 5\\\n') + self.assertEqual(cm.exception.offset, 7) + + with self.assertRaises(SyntaxError) as cm: + exec('# coding:latin1\nä = 5\\'.encode('latin1')) + self.assertEqual(str(cm.exception), + 'unexpected EOF while parsing (, line 2)') + self.assertEqual(cm.exception.text, 'ä = 5\\\n') + self.assertEqual(cm.exception.offset, 7) + + with self.assertRaises(SyntaxError) as cm: + exec(BOM_UTF8 + 'ä = 5\\'.encode()) + self.assertEqual(str(cm.exception), expect) + self.assertEqual(cm.exception.text, 'ä = 5\\\n') + self.assertEqual(cm.exception.offset, 7) + + with self.assertRaises(SyntaxError) as cm: exec('\\') - self.assertEqual(str(excinfo.exception), expect) + self.assertEqual(str(cm.exception), expect) @unittest.skipIf(not sys.executable, "sys.executable required") def test_line_continuation_EOF_from_file_bpo2180(self): """Ensure tok_nextc() does not add too many ending newlines.""" with os_helper.temp_dir() as temp_dir: file_name = script_helper.make_script(temp_dir, 'foo', '\\') - rc, out, err = script_helper.assert_python_failure(file_name) - self.assertIn(b'unexpected EOF while parsing', err) - self.assertIn(b'line 1', err) - self.assertIn(b'\\', err) + rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name) + err = err.decode().splitlines() + self.assertEqual(err[-2:], [ + ' \\', + 'SyntaxError: unexpected EOF while parsing']) + self.assertEqual(err[-3][-8:], ', line 1', err) + + file_name = script_helper.make_script(temp_dir, 'foo', 'ä = 6\\') + rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name) + err = err.decode().splitlines() + self.assertEqual(err[-3:], [ + ' ä = 6\\', + ' ^', + 'SyntaxError: unexpected EOF while parsing']) + self.assertEqual(err[-4][-8:], ', line 1', err) + + file_name = script_helper.make_script(temp_dir, 'foo', + '# coding:latin1\n' + 'ä = 7\\'.encode('latin1')) + rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name) + err = err.decode().splitlines() + self.assertEqual(err[-3:], [ + ' ä = 7\\', + ' ^', + 'SyntaxError: unexpected EOF while parsing']) + self.assertEqual(err[-4][-8:], ', line 2', err) + + file_name = script_helper.make_script(temp_dir, 'foo', + BOM_UTF8 + 'ä = 8\\'.encode()) + rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name) + err = err.decode().splitlines() + self.assertEqual(err[-3:], [ + ' ä = 8\\', + ' ^', + 'SyntaxError: unexpected EOF while parsing']) + self.assertEqual(err[-4][-8:], ', line 1', err) - file_name = script_helper.make_script(temp_dir, 'foo', 'y = 6\\') - rc, out, err = script_helper.assert_python_failure(file_name) - self.assertIn(b'unexpected EOF while parsing', err) - self.assertIn(b'line 1', err) - self.assertIn(b'y = 6\\', err) if __name__ == "__main__": unittest.main() diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py index e4f2e3a97b8..ba858c49400 100644 --- a/Lib/test/test_exceptions.py +++ b/Lib/test/test_exceptions.py @@ -7,6 +7,7 @@ import unittest import pickle import weakref import errno +from codecs import BOM_UTF8 from textwrap import dedent from test.support import (captured_stderr, check_impl_detail, @@ -2011,16 +2012,20 @@ class ImportErrorTests(unittest.TestCase): self.assertEqual(exc.path, orig.path) +def run_script(source): + if isinstance(source, str): + with open(TESTFN, 'w', encoding='utf-8') as testfile: + testfile.write(dedent(source)) + else: + with open(TESTFN, 'wb') as testfile: + testfile.write(source) + _rc, _out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN) + return err.decode('utf-8').splitlines() + class AssertionErrorTests(unittest.TestCase): def tearDown(self): unlink(TESTFN) - def write_source(self, source): - with open(TESTFN, 'w') as testfile: - testfile.write(dedent(source)) - _rc, _out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN) - return err.decode('utf-8').splitlines() - @force_not_colorized def test_assertion_error_location(self): cases = [ @@ -2052,11 +2057,32 @@ class AssertionErrorTests(unittest.TestCase): 'AssertionError', ], ), - ('assert 1 > 2, "message"', + ('assert 1 > 2, "messäge"', [ - ' assert 1 > 2, "message"', + ' assert 1 > 2, "messäge"', ' ^^^^^', - 'AssertionError: message', + 'AssertionError: messäge', + ], + ), + ('assert 1 > 2, "messäge"'.encode(), + [ + ' assert 1 > 2, "messäge"', + ' ^^^^^', + 'AssertionError: messäge', + ], + ), + ('# coding: latin1\nassert 1 > 2, "messäge"'.encode('latin1'), + [ + ' assert 1 > 2, "messäge"', + ' ^^^^^', + 'AssertionError: messäge', + ], + ), + (BOM_UTF8 + 'assert 1 > 2, "messäge"'.encode(), + [ + ' assert 1 > 2, "messäge"', + ' ^^^^^', + 'AssertionError: messäge', ], ), @@ -2094,8 +2120,8 @@ class AssertionErrorTests(unittest.TestCase): ), ] for source, expected in cases: - with self.subTest(source): - result = self.write_source(source) + with self.subTest(source=source): + result = run_script(source) self.assertEqual(result[-3:], expected) @force_not_colorized @@ -2125,12 +2151,14 @@ class AssertionErrorTests(unittest.TestCase): ), ] for source, expected in cases: - with self.subTest(source): - result = self.write_source(source) + with self.subTest(source=source): + result = run_script(source) self.assertEqual(result[-len(expected):], expected) class SyntaxErrorTests(unittest.TestCase): + maxDiff = None + @force_not_colorized def test_range_of_offsets(self): cases = [ @@ -2223,45 +2251,106 @@ class SyntaxErrorTests(unittest.TestCase): the_exception = exc def test_encodings(self): + self.addCleanup(unlink, TESTFN) source = ( '# -*- coding: cp437 -*-\n' '"┬ó┬ó┬ó┬ó┬ó┬ó" + f(4, x for x in range(1))\n' ) - try: - with open(TESTFN, 'w', encoding='cp437') as testfile: - testfile.write(source) - rc, out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN) - err = err.decode('utf-8').splitlines() - - self.assertEqual(err[-3], ' "┬ó┬ó┬ó┬ó┬ó┬ó" + f(4, x for x in range(1))') - self.assertEqual(err[-2], ' ^^^^^^^^^^^^^^^^^^^') - finally: - unlink(TESTFN) + err = run_script(source.encode('cp437')) + self.assertEqual(err[-3], ' "┬ó┬ó┬ó┬ó┬ó┬ó" + f(4, x for x in range(1))') + self.assertEqual(err[-2], ' ^^^^^^^^^^^^^^^^^^^') # Check backwards tokenizer errors source = '# -*- coding: ascii -*-\n\n(\n' - try: - with open(TESTFN, 'w', encoding='ascii') as testfile: - testfile.write(source) - rc, out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN) - err = err.decode('utf-8').splitlines() - - self.assertEqual(err[-3], ' (') - self.assertEqual(err[-2], ' ^') - finally: - unlink(TESTFN) + err = run_script(source) + self.assertEqual(err[-3], ' (') + self.assertEqual(err[-2], ' ^') def test_non_utf8(self): # Check non utf-8 characters - try: - with open(TESTFN, 'bw') as testfile: - testfile.write(b"\x89") - rc, out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN) - err = err.decode('utf-8').splitlines() + self.addCleanup(unlink, TESTFN) + err = run_script(b"\x89") + self.assertIn("SyntaxError: Non-UTF-8 code starting with '\\x89' in file", err[-1]) - self.assertIn("SyntaxError: Non-UTF-8 code starting with '\\x89' in file", err[-1]) - finally: - unlink(TESTFN) + def test_string_source(self): + def try_compile(source): + with self.assertRaises(SyntaxError) as cm: + compile(source, '', 'exec') + return cm.exception + + exc = try_compile('return "ä"') + self.assertEqual(str(exc), "'return' outside function (, line 1)") + self.assertIsNone(exc.text) + self.assertEqual(exc.offset, 1) + self.assertEqual(exc.end_offset, 12) + + exc = try_compile('return "ä"'.encode()) + self.assertEqual(str(exc), "'return' outside function (, line 1)") + self.assertIsNone(exc.text) + self.assertEqual(exc.offset, 1) + self.assertEqual(exc.end_offset, 12) + + exc = try_compile(BOM_UTF8 + 'return "ä"'.encode()) + self.assertEqual(str(exc), "'return' outside function (, line 1)") + self.assertIsNone(exc.text) + self.assertEqual(exc.offset, 1) + self.assertEqual(exc.end_offset, 12) + + exc = try_compile('# coding: latin1\nreturn "ä"'.encode('latin1')) + self.assertEqual(str(exc), "'return' outside function (, line 2)") + self.assertIsNone(exc.text) + self.assertEqual(exc.offset, 1) + self.assertEqual(exc.end_offset, 12) + + exc = try_compile('return "ä" #' + 'ä'*1000) + self.assertEqual(str(exc), "'return' outside function (, line 1)") + self.assertIsNone(exc.text) + self.assertEqual(exc.offset, 1) + self.assertEqual(exc.end_offset, 12) + + exc = try_compile('return "ä" # ' + 'ä'*1000) + self.assertEqual(str(exc), "'return' outside function (, line 1)") + self.assertIsNone(exc.text) + self.assertEqual(exc.offset, 1) + self.assertEqual(exc.end_offset, 12) + + def test_file_source(self): + self.addCleanup(unlink, TESTFN) + err = run_script('return "ä"') + self.assertEqual(err[-3:], [ + ' return "ä"', + ' ^^^^^^^^^^', + "SyntaxError: 'return' outside function"]) + + err = run_script('return "ä"'.encode()) + self.assertEqual(err[-3:], [ + ' return "ä"', + ' ^^^^^^^^^^', + "SyntaxError: 'return' outside function"]) + + err = run_script(BOM_UTF8 + 'return "ä"'.encode()) + self.assertEqual(err[-3:], [ + ' return "ä"', + ' ^^^^^^^^^^', + "SyntaxError: 'return' outside function"]) + + err = run_script('# coding: latin1\nreturn "ä"'.encode('latin1')) + self.assertEqual(err[-3:], [ + ' return "ä"', + ' ^^^^^^^^^^', + "SyntaxError: 'return' outside function"]) + + err = run_script('return "ä" #' + 'ä'*1000) + self.assertEqual(err[-2:], [ + ' ^^^^^^^^^^^', + "SyntaxError: 'return' outside function"]) + self.assertEqual(err[-3][:100], ' return "ä" #' + 'ä'*84) + + err = run_script('return "ä" # ' + 'ä'*1000) + self.assertEqual(err[-2:], [ + ' ^^^^^^^^^^^', + "SyntaxError: 'return' outside function"]) + self.assertEqual(err[-3][:100], ' return "ä" # ' + 'ä'*83) def test_attributes_new_constructor(self): args = ("bad.py", 1, 2, "abcdefg", 1, 100) diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-09-17-22-06-01.gh-issue-124188.aFqNAB.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-09-17-22-06-01.gh-issue-124188.aFqNAB.rst new file mode 100644 index 00000000000..0c2935fbe00 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-09-17-22-06-01.gh-issue-124188.aFqNAB.rst @@ -0,0 +1,2 @@ +Fix reading and decoding a line from the source file witn non-UTF-8 encoding +for syntax errors raised in the compiler. diff --git a/Python/errors.c b/Python/errors.c index 29249ac41c6..9e2a3ce062a 100644 --- a/Python/errors.c +++ b/Python/errors.c @@ -1903,44 +1903,44 @@ _PyErr_EmitSyntaxWarning(PyObject *msg, PyObject *filename, int lineno, int col_ functionality in tb_displayline() in traceback.c. */ static PyObject * -err_programtext(PyThreadState *tstate, FILE *fp, int lineno, const char* encoding) +err_programtext(FILE *fp, int lineno, const char* encoding) { - int i; char linebuf[1000]; - if (fp == NULL) { - return NULL; - } + size_t line_size = 0; - for (i = 0; i < lineno; i++) { - char *pLastChar = &linebuf[sizeof(linebuf) - 2]; - do { - *pLastChar = '\0'; - if (Py_UniversalNewlineFgets(linebuf, sizeof linebuf, - fp, NULL) == NULL) { - goto after_loop; - } - /* fgets read *something*; if it didn't get as - far as pLastChar, it must have found a newline - or hit the end of the file; if pLastChar is \n, - it obviously found a newline; else we haven't - yet seen a newline, so must continue */ - } while (*pLastChar != '\0' && *pLastChar != '\n'); - } - -after_loop: - fclose(fp); - if (i == lineno) { - PyObject *res; - if (encoding != NULL) { - res = PyUnicode_Decode(linebuf, strlen(linebuf), encoding, "replace"); - } else { - res = PyUnicode_FromString(linebuf); + for (int i = 0; i < lineno; ) { + line_size = 0; + if (_Py_UniversalNewlineFgetsWithSize(linebuf, sizeof(linebuf), + fp, NULL, &line_size) == NULL) + { + /* Error or EOF. */ + return NULL; } - if (res == NULL) - _PyErr_Clear(tstate); - return res; + /* fgets read *something*; if it didn't fill the + whole buffer, it must have found a newline + or hit the end of the file; if the last character is \n, + it obviously found a newline; else we haven't + yet seen a newline, so must continue */ + if (i + 1 < lineno + && line_size == sizeof(linebuf) - 1 + && linebuf[sizeof(linebuf) - 2] != '\n') + { + continue; + } + i++; } - return NULL; + + const char *line = linebuf; + /* Skip BOM. */ + if (lineno == 1 && line_size >= 3 && memcmp(line, "\xef\xbb\xbf", 3) == 0) { + line += 3; + line_size -= 3; + } + PyObject *res = PyUnicode_Decode(line, line_size, encoding, "replace"); + if (res == NULL) { + PyErr_Clear(); + } + return res; } PyObject * @@ -1960,20 +1960,41 @@ PyErr_ProgramText(const char *filename, int lineno) return res; } +/* Function from Parser/tokenizer/file_tokenizer.c */ +extern char* _PyTokenizer_FindEncodingFilename(int, PyObject *); + PyObject * _PyErr_ProgramDecodedTextObject(PyObject *filename, int lineno, const char* encoding) { + char *found_encoding = NULL; if (filename == NULL || lineno <= 0) { return NULL; } - PyThreadState *tstate = _PyThreadState_GET(); FILE *fp = _Py_fopen_obj(filename, "r" PY_STDIOTEXTMODE); if (fp == NULL) { - _PyErr_Clear(tstate); + PyErr_Clear(); return NULL; } - return err_programtext(tstate, fp, lineno, encoding); + if (encoding == NULL) { + int fd = fileno(fp); + found_encoding = _PyTokenizer_FindEncodingFilename(fd, filename); + encoding = found_encoding; + if (encoding == NULL) { + PyErr_Clear(); + encoding = "utf-8"; + } + /* Reset position */ + if (lseek(fd, 0, SEEK_SET) == (off_t)-1) { + fclose(fp); + PyMem_Free(found_encoding); + return NULL; + } + } + PyObject *res = err_programtext(fp, lineno, encoding); + fclose(fp); + PyMem_Free(found_encoding); + return res; } PyObject *