bpo-43950: handle wide unicode characters in tracebacks (#28150)

2024-11-24 00:38:00 +01:00 · 2023-10-26 00:05:29 -07:00 · 2023-10-26 00:05:29 -07:00 · 78e6d72e38
commit 78e6d72e38
parent 90a1b2859f
2 changed files with 98 additions and 14 deletions
--- a/Lib/test/test_traceback.py
+++ b/Lib/test/test_traceback.py
@@ -924,8 +924,63 @@ class TracebackErrorLocationCaretTestBase:
            f"  File \"{__file__}\", line {self.callable_line}, in get_exception",
            "    callable()",
            f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 4}, in f",
-            "    print(1, ｗｗｗ(",
-            "             ^^^^",
+            f"    print(1, ｗｗｗ(",
+            f"             ^^^^^^^",
+        ]
+        self.assertEqual(actual, expected)
+
+    def test_byte_offset_with_wide_characters_term_highlight(self):
+        def f():  # raises ZeroDivisionError on its return line, since şçöğıĤellö is 0
+            说明说明 = 1
+            şçöğıĤellö = 0 # not wide but still non-ascii
+            return 说明说明 / şçöğıĤellö
+
+        actual = self.get_exception(f)
+        expected = [  # each wide 说/明 character is covered by two marker columns
+            f"Traceback (most recent call last):",
+            f"  File \"{__file__}\", line {self.callable_line}, in get_exception",
+            f"    callable()",
+            f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 3}, in f",
+            f"    return 说明说明 / şçöğıĤellö",
+            f"           ~~~~~~~~~^~~~~~~~~~~~",
+        ]
+        self.assertEqual(actual, expected)
+
+    def test_byte_offset_with_emojis_term_highlight(self):
+        def f():  # func_说明说明 is presumably undefined, so the call fails with NameError
+            return "✨🐍" + func_说明说明("📗🚛",
+                "📗🚛") + "🐍"
+
+        actual = self.get_exception(f)
+        expected = [
+            f"Traceback (most recent call last):",
+            f"  File \"{__file__}\", line {self.callable_line}, in get_exception",
+            f"    callable()",
+            f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 1}, in f",
+            f'    return "✨🐍" + func_说明说明("📗🚛",',
+            f"                    ^^^^^^^^^^^^^",
+        ]
+        self.assertEqual(actual, expected)
+
+    def test_byte_offset_wide_chars_subscript(self):
+        def f():  # the ["🐍"] lookup fails: the innermost dict only has the key "🐍🐍🐍"
+            my_dct = {
+                "✨🚛✨": {
+                    "说明": {
+                        "🐍🐍🐍": None
+                    }
+                }
+            }
+            return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]
+
+        actual = self.get_exception(f)
+        expected = [
+            f"Traceback (most recent call last):",
+            f"  File \"{__file__}\", line {self.callable_line}, in get_exception",
+            f"    callable()",
+            f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 8}, in f",
+            f'    return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]',
+            f"           ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^",
        ]
        self.assertEqual(actual, expected)

--- a/Lib/traceback.py
+++ b/Lib/traceback.py
@@ -485,7 +485,8 @@ class StackSummary(list):
            stripped_line = frame_summary.line.strip()
            row.append('    {}\n'.format(stripped_line))

-            orig_line_len = len(frame_summary._original_line)
+            line = frame_summary._original_line
+            orig_line_len = len(line)
            frame_line_len = len(frame_summary.line.lstrip())
            stripped_characters = orig_line_len - frame_line_len
            if (
@@ -493,31 +494,40 @@ class StackSummary(list):
                and frame_summary.end_colno is not None
            ):
                start_offset = _byte_offset_to_character_offset(
-                    frame_summary._original_line, frame_summary.colno) + 1
+                    line, frame_summary.colno)
                end_offset = _byte_offset_to_character_offset(
-                    frame_summary._original_line, frame_summary.end_colno) + 1
+                    line, frame_summary.end_colno)
+                code_segment = line[start_offset:end_offset]

                anchors = None
                if frame_summary.lineno == frame_summary.end_lineno:
                    with suppress(Exception):
-                        anchors = _extract_caret_anchors_from_line_segment(
-                            frame_summary._original_line[start_offset - 1:end_offset - 1]
-                        )
+                        anchors = _extract_caret_anchors_from_line_segment(code_segment)
                else:
-                    end_offset = stripped_characters + len(stripped_line)
+                    # Don't count the newline since the anchors only need to
+                    # go up until the last character of the line.
+                    end_offset = len(line.rstrip())

                # show indicators if primary char doesn't span the frame line
                if end_offset - start_offset < len(stripped_line) or (
                        anchors and anchors.right_start_offset - anchors.left_end_offset > 0):
+                    # When showing this on a terminal, some of the non-ASCII characters
+                    # might be rendered as double-width characters, so we need to take
+                    # that into account when calculating the length of the line.
+                    dp_start_offset = _display_width(line, start_offset) + 1
+                    dp_end_offset = _display_width(line, end_offset) + 1
+
                    row.append('    ')
-                    row.append(' ' * (start_offset - stripped_characters))
+                    row.append(' ' * (dp_start_offset - stripped_characters))

                    if anchors:
-                        row.append(anchors.primary_char * (anchors.left_end_offset))
-                        row.append(anchors.secondary_char * (anchors.right_start_offset - anchors.left_end_offset))
-                        row.append(anchors.primary_char * (end_offset - start_offset - anchors.right_start_offset))
+                        dp_left_end_offset = _display_width(code_segment, anchors.left_end_offset)
+                        dp_right_start_offset = _display_width(code_segment, anchors.right_start_offset)
+                        row.append(anchors.primary_char * dp_left_end_offset)
+                        row.append(anchors.secondary_char * (dp_right_start_offset - dp_left_end_offset))
+                        row.append(anchors.primary_char * (dp_end_offset - dp_start_offset - dp_right_start_offset))
                    else:
-                        row.append('^' * (end_offset - start_offset))
+                        row.append('^' * (dp_end_offset - dp_start_offset))

                    row.append('\n')

@@ -638,6 +648,25 @@ def _extract_caret_anchors_from_line_segment(segment):

    return None

+_WIDE_CHAR_SPECIFIERS = "WF"  # east_asian_width() classes counted as two columns: Wide, Fullwidth
+
+def _display_width(line, offset):
+    """Return the number of fixed-width display columns occupied by
+    line[:offset]. Characters whose East Asian Width is listed in
+    _WIDE_CHAR_SPECIFIERS count as two columns, all others as one."""
+
+    # Fast track: every character of an ASCII-only line counts as one column
+    if line.isascii():
+        return offset
+
+    import unicodedata  # imported here so the ASCII fast track never needs it
+
+    return sum(
+        2 if unicodedata.east_asian_width(char) in _WIDE_CHAR_SPECIFIERS else 1
+        for char in line[:offset]
+    )
+
+

 class _ExceptionPrintContext:
    def __init__(self):