From 78e6d72e38ef4b490f0098b644454031f20ae361 Mon Sep 17 00:00:00 2001
From: Batuhan Taskaya
Date: Thu, 26 Oct 2023 00:05:29 -0700
Subject: [PATCH] bpo-43950: handle wide unicode characters in tracebacks (#28150)

---
Review notes (text between the three-dash marker and the diffstat is not
part of the commit and is ignored when the patch is applied):

* Layout: this copy had been collapsed onto four physical lines. The line
  breaks, Python indentation, blank context lines and the leading spaces
  inside the expected-output string literals below were re-derived from
  the hunk headers and from the code itself; verify against the upstream
  commit before applying.
* Truncation: the last hunk announces 6 old lines, but its trailing
  context appears to be cut off in this copy.
* test_traceback.py, first hunk: `www(` is 4 columns wide in ASCII, yet
  the new expectation has 7 carets. Presumably the identifier is written
  with double-width characters upstream and was normalized in this copy.
  TODO confirm.
* _display_width(): the docstring speaks of the "extra" width, but the
  function returns the total column width of line[:offset]; the callers
  use the result as an absolute column.
* _display_width(): the ASCII fast path returns `offset` unchanged,
  whereas the general path counts the characters of line[:offset]; the
  two agree only for 0 <= offset <= len(line).
* _display_width(): every character counts as 1 or 2 columns, so
  zero-width characters (e.g. combining marks) are counted as 1.

 Lib/test/test_traceback.py | 59 ++++++++++++++++++++++++++++++++++++--
 Lib/traceback.py           | 53 ++++++++++++++++++++++++++--------
 2 files changed, 98 insertions(+), 14 deletions(-)

diff --git a/Lib/test/test_traceback.py b/Lib/test/test_traceback.py
index 43f15ab6179..0c5d7c9c8c5 100644
--- a/Lib/test/test_traceback.py
+++ b/Lib/test/test_traceback.py
@@ -924,8 +924,63 @@ class TracebackErrorLocationCaretTestBase:
             f"  File \"{__file__}\", line {self.callable_line}, in get_exception",
             "    callable()",
             f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 4}, in f",
-            "    print(1, www(",
-            "             ^^^^",
+            f"    print(1, www(",
+            f"             ^^^^^^^",
+        ]
+        self.assertEqual(actual, expected)
+
+    def test_byte_offset_with_wide_characters_term_highlight(self):
+        def f():
+            说明说明 = 1
+            şçöğıĤellö = 0 # not wide but still non-ascii
+            return 说明说明 / şçöğıĤellö
+
+        actual = self.get_exception(f)
+        expected = [
+            f"Traceback (most recent call last):",
+            f"  File \"{__file__}\", line {self.callable_line}, in get_exception",
+            f"    callable()",
+            f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 3}, in f",
+            f"    return 说明说明 / şçöğıĤellö",
+            f"           ~~~~~~~~~^~~~~~~~~~~~",
+        ]
+        self.assertEqual(actual, expected)
+
+    def test_byte_offset_with_emojis_term_highlight(self):
+        def f():
+            return "✨🐍" + func_说明说明("📗🚛",
+                "📗🚛") + "🐍"
+
+        actual = self.get_exception(f)
+        expected = [
+            f"Traceback (most recent call last):",
+            f"  File \"{__file__}\", line {self.callable_line}, in get_exception",
+            f"    callable()",
+            f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 1}, in f",
+            f'    return "✨🐍" + func_说明说明("📗🚛",',
+            f"                    ^^^^^^^^^^^^^",
+        ]
+        self.assertEqual(actual, expected)
+
+    def test_byte_offset_wide_chars_subscript(self):
+        def f():
+            my_dct = {
+                "✨🚛✨": {
+                    "说明": {
+                        "🐍🐍🐍": None
+                    }
+                }
+            }
+            return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]
+
+        actual = self.get_exception(f)
+        expected = [
+            f"Traceback (most recent call last):",
+            f"  File \"{__file__}\", line {self.callable_line}, in get_exception",
+            f"    callable()",
+            f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 8}, in f",
+            f'    return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]',
+            f"           ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^",
         ]
         self.assertEqual(actual, expected)
 
diff --git a/Lib/traceback.py b/Lib/traceback.py
index 4f0dff9bed0..0d41c3432ed 100644
--- a/Lib/traceback.py
+++ b/Lib/traceback.py
@@ -485,7 +485,8 @@ class StackSummary(list):
             stripped_line = frame_summary.line.strip()
             row.append('    {}\n'.format(stripped_line))
 
-            orig_line_len = len(frame_summary._original_line)
+            line = frame_summary._original_line
+            orig_line_len = len(line)
             frame_line_len = len(frame_summary.line.lstrip())
             stripped_characters = orig_line_len - frame_line_len
             if (
@@ -493,31 +494,40 @@ class StackSummary(list):
                 and frame_summary.end_colno is not None
             ):
                 start_offset = _byte_offset_to_character_offset(
-                    frame_summary._original_line, frame_summary.colno) + 1
+                    line, frame_summary.colno)
                 end_offset = _byte_offset_to_character_offset(
-                    frame_summary._original_line, frame_summary.end_colno) + 1
+                    line, frame_summary.end_colno)
+                code_segment = line[start_offset:end_offset]
 
                 anchors = None
                 if frame_summary.lineno == frame_summary.end_lineno:
                     with suppress(Exception):
-                        anchors = _extract_caret_anchors_from_line_segment(
-                            frame_summary._original_line[start_offset - 1:end_offset - 1]
-                        )
+                        anchors = _extract_caret_anchors_from_line_segment(code_segment)
                 else:
-                    end_offset = stripped_characters + len(stripped_line)
+                    # Don't count the newline since the anchors only need to
+                    # go up until the last character of the line.
+                    end_offset = len(line.rstrip())
 
                 # show indicators if primary char doesn't span the frame line
                 if end_offset - start_offset < len(stripped_line) or (
                         anchors and anchors.right_start_offset - anchors.left_end_offset > 0):
+                    # When showing this on a terminal, some of the non-ASCII characters
+                    # might be rendered as double-width characters, so we need to take
+                    # that into account when calculating the length of the line.
+                    dp_start_offset = _display_width(line, start_offset) + 1
+                    dp_end_offset = _display_width(line, end_offset) + 1
+
                     row.append('    ')
-                    row.append(' ' * (start_offset - stripped_characters))
+                    row.append(' ' * (dp_start_offset - stripped_characters))
 
                     if anchors:
-                        row.append(anchors.primary_char * (anchors.left_end_offset))
-                        row.append(anchors.secondary_char * (anchors.right_start_offset - anchors.left_end_offset))
-                        row.append(anchors.primary_char * (end_offset - start_offset - anchors.right_start_offset))
+                        dp_left_end_offset = _display_width(code_segment, anchors.left_end_offset)
+                        dp_right_start_offset = _display_width(code_segment, anchors.right_start_offset)
+                        row.append(anchors.primary_char * dp_left_end_offset)
+                        row.append(anchors.secondary_char * (dp_right_start_offset - dp_left_end_offset))
+                        row.append(anchors.primary_char * (dp_end_offset - dp_start_offset - dp_right_start_offset))
                     else:
-                        row.append('^' * (end_offset - start_offset))
+                        row.append('^' * (dp_end_offset - dp_start_offset))
 
                     row.append('\n')
 
@@ -638,6 +648,25 @@ def _extract_caret_anchors_from_line_segment(segment):
     return None
 
 
+_WIDE_CHAR_SPECIFIERS = "WF"
+
+def _display_width(line, offset):
+    """Calculate the extra amount of width space the given source
+    code segment might take if it were to be displayed on a fixed
+    width output device. Supports wide unicode characters and emojis."""
+
+    # Fast track for ASCII-only strings
+    if line.isascii():
+        return offset
+
+    import unicodedata
+
+    return sum(
+        2 if unicodedata.east_asian_width(char) in _WIDE_CHAR_SPECIFIERS else 1
+        for char in line[:offset]
+    )
+
+
 class _ExceptionPrintContext:
     def __init__(self):