From e2f710792b0418b8ca1ca3b8cdf39588c7268495 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Tue, 24 Sep 2024 11:01:37 +0300
Subject: [PATCH] gh-124188: Fix PyErr_ProgramTextObject() (GH-124189)

* Detect source file encoding.
* Use the "replace" error handler even for UTF-8 (default) encoding.
* Remove the BOM.
* Fix detection of too long lines if they contain NUL.
* Return the head rather than the tail for truncated long lines.
---
 Lib/test/support/script_helper.py             |  10 +-
 Lib/test/test_compiler_codegen.py             |   5 +-
 Lib/test/test_eof.py                          | 164 +++++++++++++----
 Lib/test/test_exceptions.py                   | 171 +++++++++++++-----
 ...-09-17-22-06-01.gh-issue-124188.aFqNAB.rst |   2 +
 Python/errors.c                               |  93 ++++++----
 6 files changed, 328 insertions(+), 117 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2024-09-17-22-06-01.gh-issue-124188.aFqNAB.rst
diff --git a/Lib/test/support/script_helper.py b/Lib/test/support/script_helper.py
index d0be3179b0e..46ce950433d 100644
--- a/Lib/test/support/script_helper.py
+++ b/Lib/test/support/script_helper.py
@@ -234,9 +234,13 @@ def make_script(script_dir, script_basename, source, omit_suffix=False):
     if not omit_suffix:
         script_filename += os.extsep + 'py'
     script_name = os.path.join(script_dir, script_filename)
-    # The script should be encoded to UTF-8, the default string encoding
-    with open(script_name, 'w', encoding='utf-8') as script_file:
-        script_file.write(source)
+    if isinstance(source, str):
+        # The script should be encoded to UTF-8, the default string encoding
+        with open(script_name, 'w', encoding='utf-8') as script_file:
+            script_file.write(source)
+    else:
+        with open(script_name, 'wb') as script_file:
+            script_file.write(source)
     importlib.invalidate_caches()
     return script_name
 
diff --git a/Lib/test/test_compiler_codegen.py b/Lib/test/test_compiler_codegen.py
index d82fb85ed25..8a15c400a44 100644
--- a/Lib/test/test_compiler_codegen.py
+++ b/Lib/test/test_compiler_codegen.py
@@ -152,5 +152,8 @@ class IsolatedCodeGenTests(CodegenTestCase):
 
     def test_syntax_error__return_not_in_function(self):
         snippet = "return 42"
-        with self.assertRaisesRegex(SyntaxError, "'return' outside function"):
+        with self.assertRaisesRegex(SyntaxError, "'return' outside function") as cm:
             self.codegen_test(snippet, None)
+        self.assertIsNone(cm.exception.text)
+        self.assertEqual(cm.exception.offset, 1)
+        self.assertEqual(cm.exception.end_offset, 10)
diff --git a/Lib/test/test_eof.py b/Lib/test/test_eof.py
index be4fd73bfdc..e377383450e 100644
--- a/Lib/test/test_eof.py
+++ b/Lib/test/test_eof.py
@@ -1,6 +1,7 @@
 """test script for a few new invalid token catches"""
 
 import sys
+from codecs import BOM_UTF8
 from test import support
 from test.support import os_helper
 from test.support import script_helper
@@ -11,67 +12,158 @@ class EOFTestCase(unittest.TestCase):
     def test_EOF_single_quote(self):
         expect = "unterminated string literal (detected at line 1) (<string>, line 1)"
         for quote in ("'", "\""):
-            try:
+            with self.assertRaises(SyntaxError) as cm:
                 eval(f"""{quote}this is a test\
                 """)
-            except SyntaxError as msg:
-                self.assertEqual(str(msg), expect)
-                self.assertEqual(msg.offset, 1)
-            else:
-                raise support.TestFailed
+            self.assertEqual(str(cm.exception), expect)
+            self.assertEqual(cm.exception.offset, 1)
 
     def test_EOFS(self):
-        expect = ("unterminated triple-quoted string literal (detected at line 1) (<string>, line 1)")
-        try:
-            eval("""'''this is a test""")
-        except SyntaxError as msg:
-            self.assertEqual(str(msg), expect)
-            self.assertEqual(msg.offset, 1)
-        else:
-            raise support.TestFailed
+        expect = ("unterminated triple-quoted string literal (detected at line 3) (<string>, line 1)")
+        with self.assertRaises(SyntaxError) as cm:
+            eval("""ä = '''thîs is \na \ntest""")
+        self.assertEqual(str(cm.exception), expect)
+        self.assertEqual(cm.exception.text, "ä = '''thîs is ")
+        self.assertEqual(cm.exception.offset, 5)
+
+        with self.assertRaises(SyntaxError) as cm:
+            eval("""ä = '''thîs is \na \ntest""".encode())
+        self.assertEqual(str(cm.exception), expect)
+        self.assertEqual(cm.exception.text, "ä = '''thîs is ")
+        self.assertEqual(cm.exception.offset, 5)
+
+        with self.assertRaises(SyntaxError) as cm:
+            eval(BOM_UTF8 + """ä = '''thîs is \na \ntest""".encode())
+        self.assertEqual(str(cm.exception), expect)
+        self.assertEqual(cm.exception.text, "ä = '''thîs is ")
+        self.assertEqual(cm.exception.offset, 5)
+
+        with self.assertRaises(SyntaxError) as cm:
+            eval("""# coding: latin1\nä = '''thîs is \na \ntest""".encode('latin1'))
+        self.assertEqual(str(cm.exception), "unterminated triple-quoted string literal (detected at line 4) (<string>, line 2)")
+        self.assertEqual(cm.exception.text, "ä = '''thîs is ")
+        self.assertEqual(cm.exception.offset, 5)
 
     def test_EOFS_with_file(self):
         expect = ("(<string>, line 1)")
         with os_helper.temp_dir() as temp_dir:
-            file_name = script_helper.make_script(temp_dir, 'foo', """'''this is \na \ntest""")
-            rc, out, err = script_helper.assert_python_failure(file_name)
-        self.assertIn(b'unterminated triple-quoted string literal (detected at line 3)', err)
+            file_name = script_helper.make_script(temp_dir, 'foo',
+                                                  """ä = '''thîs is \na \ntest""")
+            rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name)
+            err = err.decode().splitlines()
+            self.assertEqual(err[-3:], [
+                "    ä = '''thîs is ",
+                '        ^',
+                'SyntaxError: unterminated triple-quoted string literal (detected at line 3)'])
+
+            file_name = script_helper.make_script(temp_dir, 'foo',
+                                                  """ä = '''thîs is \na \ntest""".encode())
+            rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name)
+            err = err.decode().splitlines()
+            self.assertEqual(err[-3:], [
+                "    ä = '''thîs is ",
+                '        ^',
+                'SyntaxError: unterminated triple-quoted string literal (detected at line 3)'])
+
+            file_name = script_helper.make_script(temp_dir, 'foo',
+                                                  BOM_UTF8 + """ä = '''thîs is \na \ntest""".encode())
+            rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name)
+            err = err.decode().splitlines()
+            self.assertEqual(err[-3:], [
+                "    ä = '''thîs is ",
+                '        ^',
+                'SyntaxError: unterminated triple-quoted string literal (detected at line 3)'])
+
+            file_name = script_helper.make_script(temp_dir, 'foo',
+                                                  """# coding: latin1\nä = '''thîs is \na \ntest""".encode('latin1'))
+            rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name)
+            err = err.decode().splitlines()
+            self.assertEqual(err[-3:], [
+                "    ä = '''thîs is ",
+                '        ^',
+                'SyntaxError: unterminated triple-quoted string literal (detected at line 4)'])
 
     @warnings_helper.ignore_warnings(category=SyntaxWarning)
     def test_eof_with_line_continuation(self):
         expect = "unexpected EOF while parsing (<string>, line 1)"
-        try:
+        with self.assertRaises(SyntaxError) as cm:
             compile('"\\Xhh" \\', '<string>', 'exec')
-        except SyntaxError as msg:
-            self.assertEqual(str(msg), expect)
-        else:
-            raise support.TestFailed
+        self.assertEqual(str(cm.exception), expect)
 
     def test_line_continuation_EOF(self):
         """A continuation at the end of input must be an error; bpo2180."""
         expect = 'unexpected EOF while parsing (<string>, line 1)'
-        with self.assertRaises(SyntaxError) as excinfo:
-            exec('x = 5\\')
-        self.assertEqual(str(excinfo.exception), expect)
-        with self.assertRaises(SyntaxError) as excinfo:
+        with self.assertRaises(SyntaxError) as cm:
+            exec('ä = 5\\')
+        self.assertEqual(str(cm.exception), expect)
+        self.assertEqual(cm.exception.text, 'ä = 5\\\n')
+        self.assertEqual(cm.exception.offset, 7)
+
+        with self.assertRaises(SyntaxError) as cm:
+            exec('ä = 5\\'.encode())
+        self.assertEqual(str(cm.exception), expect)
+        self.assertEqual(cm.exception.text, 'ä = 5\\\n')
+        self.assertEqual(cm.exception.offset, 7)
+
+        with self.assertRaises(SyntaxError) as cm:
+            exec('# coding:latin1\nä = 5\\'.encode('latin1'))
+        self.assertEqual(str(cm.exception),
+                         'unexpected EOF while parsing (<string>, line 2)')
+        self.assertEqual(cm.exception.text, 'ä = 5\\\n')
+        self.assertEqual(cm.exception.offset, 7)
+
+        with self.assertRaises(SyntaxError) as cm:
+            exec(BOM_UTF8 + 'ä = 5\\'.encode())
+        self.assertEqual(str(cm.exception), expect)
+        self.assertEqual(cm.exception.text, 'ä = 5\\\n')
+        self.assertEqual(cm.exception.offset, 7)
+
+        with self.assertRaises(SyntaxError) as cm:
             exec('\\')
-        self.assertEqual(str(excinfo.exception), expect)
+        self.assertEqual(str(cm.exception), expect)
 
     @unittest.skipIf(not sys.executable, "sys.executable required")
     def test_line_continuation_EOF_from_file_bpo2180(self):
         """Ensure tok_nextc() does not add too many ending newlines."""
         with os_helper.temp_dir() as temp_dir:
             file_name = script_helper.make_script(temp_dir, 'foo', '\\')
-            rc, out, err = script_helper.assert_python_failure(file_name)
-            self.assertIn(b'unexpected EOF while parsing', err)
-            self.assertIn(b'line 1', err)
-            self.assertIn(b'\\', err)
+            rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name)
+            err = err.decode().splitlines()
+            self.assertEqual(err[-2:], [
+                '    \\',
+                'SyntaxError: unexpected EOF while parsing'])
+            self.assertEqual(err[-3][-8:], ', line 1', err)
+
+            file_name = script_helper.make_script(temp_dir, 'foo', 'ä = 6\\')
+            rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name)
+            err = err.decode().splitlines()
+            self.assertEqual(err[-3:], [
+                '    ä = 6\\',
+                '          ^',
+                'SyntaxError: unexpected EOF while parsing'])
+            self.assertEqual(err[-4][-8:], ', line 1', err)
+
+            file_name = script_helper.make_script(temp_dir, 'foo',
+                                                  '# coding:latin1\n'
+                                                  'ä = 7\\'.encode('latin1'))
+            rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name)
+            err = err.decode().splitlines()
+            self.assertEqual(err[-3:], [
+                '    ä = 7\\',
+                '          ^',
+                'SyntaxError: unexpected EOF while parsing'])
+            self.assertEqual(err[-4][-8:], ', line 2', err)
+
+            file_name = script_helper.make_script(temp_dir, 'foo',
+                                                  BOM_UTF8 + 'ä = 8\\'.encode())
+            rc, out, err = script_helper.assert_python_failure('-X', 'utf8', file_name)
+            err = err.decode().splitlines()
+            self.assertEqual(err[-3:], [
+                '    ä = 8\\',
+                '          ^',
+                'SyntaxError: unexpected EOF while parsing'])
+            self.assertEqual(err[-4][-8:], ', line 1', err)
 
-            file_name = script_helper.make_script(temp_dir, 'foo', 'y = 6\\')
-            rc, out, err = script_helper.assert_python_failure(file_name)
-            self.assertIn(b'unexpected EOF while parsing', err)
-            self.assertIn(b'line 1', err)
-            self.assertIn(b'y = 6\\', err)
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py
index e4f2e3a97b8..ba858c49400 100644
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -7,6 +7,7 @@ import unittest
 import pickle
 import weakref
 import errno
+from codecs import BOM_UTF8
 from textwrap import dedent
 
 from test.support import (captured_stderr, check_impl_detail,
@@ -2011,16 +2012,20 @@ class ImportErrorTests(unittest.TestCase):
                 self.assertEqual(exc.path, orig.path)
 
 
+def run_script(source):
+    if isinstance(source, str):
+        with open(TESTFN, 'w', encoding='utf-8') as testfile:
+            testfile.write(dedent(source))
+    else:
+        with open(TESTFN, 'wb') as testfile:
+            testfile.write(source)
+    _rc, _out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN)
+    return err.decode('utf-8').splitlines()
+
 class AssertionErrorTests(unittest.TestCase):
     def tearDown(self):
         unlink(TESTFN)
 
-    def write_source(self, source):
-        with open(TESTFN, 'w') as testfile:
-            testfile.write(dedent(source))
-        _rc, _out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN)
-        return err.decode('utf-8').splitlines()
-
     @force_not_colorized
     def test_assertion_error_location(self):
         cases = [
@@ -2052,11 +2057,32 @@ class AssertionErrorTests(unittest.TestCase):
                     'AssertionError',
                 ],
             ),
-            ('assert 1 > 2, "message"',
+            ('assert 1 > 2, "messäge"',
                 [
-                    '    assert 1 > 2, "message"',
+                    '    assert 1 > 2, "messäge"',
                     '           ^^^^^',
-                    'AssertionError: message',
+                    'AssertionError: messäge',
+                ],
+            ),
+            ('assert 1 > 2, "messäge"'.encode(),
+                [
+                    '    assert 1 > 2, "messäge"',
+                    '           ^^^^^',
+                    'AssertionError: messäge',
+                ],
+            ),
+            ('# coding: latin1\nassert 1 > 2, "messäge"'.encode('latin1'),
+                [
+                    '    assert 1 > 2, "messäge"',
+                    '           ^^^^^',
+                    'AssertionError: messäge',
+                ],
+            ),
+            (BOM_UTF8 + 'assert 1 > 2, "messäge"'.encode(),
+                [
+                    '    assert 1 > 2, "messäge"',
+                    '           ^^^^^',
+                    'AssertionError: messäge',
                 ],
             ),
 
@@ -2094,8 +2120,8 @@ class AssertionErrorTests(unittest.TestCase):
             ),
         ]
         for source, expected in cases:
-            with self.subTest(source):
-                result = self.write_source(source)
+            with self.subTest(source=source):
+                result = run_script(source)
                 self.assertEqual(result[-3:], expected)
 
     @force_not_colorized
@@ -2125,12 +2151,14 @@ class AssertionErrorTests(unittest.TestCase):
             ),
         ]
         for source, expected in cases:
-            with self.subTest(source):
-                result = self.write_source(source)
+            with self.subTest(source=source):
+                result = run_script(source)
                 self.assertEqual(result[-len(expected):], expected)
 
 
 class SyntaxErrorTests(unittest.TestCase):
+    maxDiff = None
+
     @force_not_colorized
     def test_range_of_offsets(self):
         cases = [
@@ -2223,45 +2251,106 @@ class SyntaxErrorTests(unittest.TestCase):
                     the_exception = exc
 
     def test_encodings(self):
+        self.addCleanup(unlink, TESTFN)
         source = (
             '# -*- coding: cp437 -*-\n'
             '"┬ó┬ó┬ó┬ó┬ó┬ó" + f(4, x for x in range(1))\n'
         )
-        try:
-            with open(TESTFN, 'w', encoding='cp437') as testfile:
-                testfile.write(source)
-            rc, out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN)
-            err = err.decode('utf-8').splitlines()
-
-            self.assertEqual(err[-3], '    "┬ó┬ó┬ó┬ó┬ó┬ó" + f(4, x for x in range(1))')
-            self.assertEqual(err[-2], '                          ^^^^^^^^^^^^^^^^^^^')
-        finally:
-            unlink(TESTFN)
+        err = run_script(source.encode('cp437'))
+        self.assertEqual(err[-3], '    "┬ó┬ó┬ó┬ó┬ó┬ó" + f(4, x for x in range(1))')
+        self.assertEqual(err[-2], '                          ^^^^^^^^^^^^^^^^^^^')
 
         # Check backwards tokenizer errors
         source = '# -*- coding: ascii -*-\n\n(\n'
-        try:
-            with open(TESTFN, 'w', encoding='ascii') as testfile:
-                testfile.write(source)
-            rc, out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN)
-            err = err.decode('utf-8').splitlines()
-
-            self.assertEqual(err[-3], '    (')
-            self.assertEqual(err[-2], '    ^')
-        finally:
-            unlink(TESTFN)
+        err = run_script(source)
+        self.assertEqual(err[-3], '    (')
+        self.assertEqual(err[-2], '    ^')
 
     def test_non_utf8(self):
         # Check non utf-8 characters
-        try:
-            with open(TESTFN, 'bw') as testfile:
-                testfile.write(b"\x89")
-            rc, out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN)
-            err = err.decode('utf-8').splitlines()
+        self.addCleanup(unlink, TESTFN)
+        err = run_script(b"\x89")
+        self.assertIn("SyntaxError: Non-UTF-8 code starting with '\\x89' in file", err[-1])
 
-            self.assertIn("SyntaxError: Non-UTF-8 code starting with '\\x89' in file", err[-1])
-        finally:
-            unlink(TESTFN)
+    def test_string_source(self):
+        def try_compile(source):
+            with self.assertRaises(SyntaxError) as cm:
+                compile(source, '<string>', 'exec')
+            return cm.exception
+
+        exc = try_compile('return "ä"')
+        self.assertEqual(str(exc), "'return' outside function (<string>, line 1)")
+        self.assertIsNone(exc.text)
+        self.assertEqual(exc.offset, 1)
+        self.assertEqual(exc.end_offset, 12)
+
+        exc = try_compile('return "ä"'.encode())
+        self.assertEqual(str(exc), "'return' outside function (<string>, line 1)")
+        self.assertIsNone(exc.text)
+        self.assertEqual(exc.offset, 1)
+        self.assertEqual(exc.end_offset, 12)
+
+        exc = try_compile(BOM_UTF8 + 'return "ä"'.encode())
+        self.assertEqual(str(exc), "'return' outside function (<string>, line 1)")
+        self.assertIsNone(exc.text)
+        self.assertEqual(exc.offset, 1)
+        self.assertEqual(exc.end_offset, 12)
+
+        exc = try_compile('# coding: latin1\nreturn "ä"'.encode('latin1'))
+        self.assertEqual(str(exc), "'return' outside function (<string>, line 2)")
+        self.assertIsNone(exc.text)
+        self.assertEqual(exc.offset, 1)
+        self.assertEqual(exc.end_offset, 12)
+
+        exc = try_compile('return "ä" #' + 'ä'*1000)
+        self.assertEqual(str(exc), "'return' outside function (<string>, line 1)")
+        self.assertIsNone(exc.text)
+        self.assertEqual(exc.offset, 1)
+        self.assertEqual(exc.end_offset, 12)
+
+        exc = try_compile('return "ä" # ' + 'ä'*1000)
+        self.assertEqual(str(exc), "'return' outside function (<string>, line 1)")
+        self.assertIsNone(exc.text)
+        self.assertEqual(exc.offset, 1)
+        self.assertEqual(exc.end_offset, 12)
+
+    def test_file_source(self):
+        self.addCleanup(unlink, TESTFN)
+        err = run_script('return "ä"')
+        self.assertEqual(err[-3:], [
+                         '    return "ä"',
+                         '    ^^^^^^^^^^',
+                         "SyntaxError: 'return' outside function"])
+
+        err = run_script('return "ä"'.encode())
+        self.assertEqual(err[-3:], [
+                         '    return "ä"',
+                         '    ^^^^^^^^^^',
+                         "SyntaxError: 'return' outside function"])
+
+        err = run_script(BOM_UTF8 + 'return "ä"'.encode())
+        self.assertEqual(err[-3:], [
+                         '    return "ä"',
+                         '    ^^^^^^^^^^',
+                         "SyntaxError: 'return' outside function"])
+
+        err = run_script('# coding: latin1\nreturn "ä"'.encode('latin1'))
+        self.assertEqual(err[-3:], [
+                         '    return "ä"',
+                         '    ^^^^^^^^^^',
+                         "SyntaxError: 'return' outside function"])
+
+        err = run_script('return "ä" #' + 'ä'*1000)
+        self.assertEqual(err[-2:], [
+                         '    ^^^^^^^^^^^',
+                         "SyntaxError: 'return' outside function"])
+        self.assertEqual(err[-3][:100], '    return "ä" #' + 'ä'*84)
+
+        err = run_script('return "ä" # ' + 'ä'*1000)
+        self.assertEqual(err[-2:], [
+                         '    ^^^^^^^^^^^',
+                         "SyntaxError: 'return' outside function"])
+        self.assertEqual(err[-3][:100], '    return "ä" # ' + 'ä'*83)
 
     def test_attributes_new_constructor(self):
         args = ("bad.py", 1, 2, "abcdefg", 1, 100)
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-09-17-22-06-01.gh-issue-124188.aFqNAB.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-09-17-22-06-01.gh-issue-124188.aFqNAB.rst
new file mode 100644
index 00000000000..0c2935fbe00
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-09-17-22-06-01.gh-issue-124188.aFqNAB.rst
@@ -0,0 +1,2 @@
+Fix reading and decoding a line from the source file witn non-UTF-8 encoding
+for syntax errors raised in the compiler.
diff --git a/Python/errors.c b/Python/errors.c
index 29249ac41c6..9e2a3ce062a 100644
--- a/Python/errors.c
+++ b/Python/errors.c
@@ -1903,44 +1903,44 @@ _PyErr_EmitSyntaxWarning(PyObject *msg, PyObject *filename, int lineno, int col_
    functionality in tb_displayline() in traceback.c. */
 
 static PyObject *
-err_programtext(PyThreadState *tstate, FILE *fp, int lineno, const char* encoding)
+err_programtext(FILE *fp, int lineno, const char* encoding)
 {
-    int i;
     char linebuf[1000];
-    if (fp == NULL) {
-        return NULL;
-    }
+    size_t line_size = 0;
 
-    for (i = 0; i < lineno; i++) {
-        char *pLastChar = &linebuf[sizeof(linebuf) - 2];
-        do {
-            *pLastChar = '\0';
-            if (Py_UniversalNewlineFgets(linebuf, sizeof linebuf,
-                                         fp, NULL) == NULL) {
-                goto after_loop;
-            }
-            /* fgets read *something*; if it didn't get as
-               far as pLastChar, it must have found a newline
-               or hit the end of the file; if pLastChar is \n,
-               it obviously found a newline; else we haven't
-               yet seen a newline, so must continue */
-        } while (*pLastChar != '\0' && *pLastChar != '\n');
-    }
-
-after_loop:
-    fclose(fp);
-    if (i == lineno) {
-        PyObject *res;
-        if (encoding != NULL) {
-            res = PyUnicode_Decode(linebuf, strlen(linebuf), encoding, "replace");
-        } else {
-            res = PyUnicode_FromString(linebuf);
+    for (int i = 0; i < lineno; ) {
+        line_size = 0;
+        if (_Py_UniversalNewlineFgetsWithSize(linebuf, sizeof(linebuf),
+                                              fp, NULL, &line_size) == NULL)
+        {
+            /* Error or EOF. */
+            return NULL;
         }
-        if (res == NULL)
-            _PyErr_Clear(tstate);
-        return res;
+        /* fgets read *something*; if it didn't fill the
+           whole buffer, it must have found a newline
+           or hit the end of the file; if the last character is \n,
+           it obviously found a newline; else we haven't
+           yet seen a newline, so must continue */
+        if (i + 1 < lineno
+            && line_size == sizeof(linebuf) - 1
+            && linebuf[sizeof(linebuf) - 2] != '\n')
+        {
+            continue;
+        }
+        i++;
     }
-    return NULL;
+
+    const char *line = linebuf;
+    /* Skip BOM. */
+    if (lineno == 1 && line_size >= 3 && memcmp(line, "\xef\xbb\xbf", 3) == 0) {
+        line += 3;
+        line_size -= 3;
+    }
+    PyObject *res = PyUnicode_Decode(line, line_size, encoding, "replace");
+    if (res == NULL) {
+        PyErr_Clear();
+    }
+    return res;
 }
 
 PyObject *
@@ -1960,20 +1960,41 @@ PyErr_ProgramText(const char *filename, int lineno)
     return res;
 }
 
+/* Function from Parser/tokenizer/file_tokenizer.c */
+extern char* _PyTokenizer_FindEncodingFilename(int, PyObject *);
+
 PyObject *
 _PyErr_ProgramDecodedTextObject(PyObject *filename, int lineno, const char* encoding)
 {
+    char *found_encoding = NULL;
     if (filename == NULL || lineno <= 0) {
         return NULL;
     }
 
-    PyThreadState *tstate = _PyThreadState_GET();
     FILE *fp = _Py_fopen_obj(filename, "r" PY_STDIOTEXTMODE);
     if (fp == NULL) {
-        _PyErr_Clear(tstate);
+        PyErr_Clear();
         return NULL;
     }
-    return err_programtext(tstate, fp, lineno, encoding);
+    if (encoding == NULL) {
+        int fd = fileno(fp);
+        found_encoding = _PyTokenizer_FindEncodingFilename(fd, filename);
+        encoding = found_encoding;
+        if (encoding == NULL) {
+            PyErr_Clear();
+            encoding = "utf-8";
+        }
+        /* Reset position */
+        if (lseek(fd, 0, SEEK_SET) == (off_t)-1) {
+            fclose(fp);
+            PyMem_Free(found_encoding);
+            return NULL;
+        }
+    }
+    PyObject *res = err_programtext(fp, lineno, encoding);
+    fclose(fp);
+    PyMem_Free(found_encoding);
+    return res;
 }
 
 PyObject *