From 819830f34a11ecaa3aada174ca8eedeb3f260630 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 11 Nov 2024 18:27:26 +0200 Subject: [PATCH] gh-126505: Fix bugs in compiling case-insensitive character classes (GH-126557) * upper-case non-BMP character was ignored * the ASCII flag was ignored when matching a character range whose upper bound is beyond the BMP region --- Lib/re/_compiler.py | 23 +++++--- Lib/test/test_re.py | 55 +++++++++++++++++++ ...-11-07-22-41-47.gh-issue-126505.iztYE1.rst | 4 ++ 3 files changed, 73 insertions(+), 9 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-11-07-22-41-47.gh-issue-126505.iztYE1.rst diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py index 29109f8812e..20dd561d1c1 100644 --- a/Lib/re/_compiler.py +++ b/Lib/re/_compiler.py @@ -255,11 +255,11 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): while True: try: if op is LITERAL: - if fixup: - lo = fixup(av) - charmap[lo] = 1 - if fixes and lo in fixes: - for k in fixes[lo]: + if fixup: # IGNORECASE and not LOCALE + av = fixup(av) + charmap[av] = 1 + if fixes and av in fixes: + for k in fixes[av]: charmap[k] = 1 if not hascased and iscased(av): hascased = True @@ -267,7 +267,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): charmap[av] = 1 elif op is RANGE: r = range(av[0], av[1]+1) - if fixup: + if fixup: # IGNORECASE and not LOCALE if fixes: for i in map(fixup, r): charmap[i] = 1 @@ -298,8 +298,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): # Character set contains non-BMP character codes. # For range, all BMP characters in the range are already # proceeded. - if fixup: - hascased = True + if fixup: # IGNORECASE and not LOCALE # For now, IN_UNI_IGNORE+LITERAL and # IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP # characters, because two characters (at least one of @@ -310,7 +309,13 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): # Also, both c.lower() and c.lower().upper() are single # characters for every non-BMP character. if op is RANGE: - op = RANGE_UNI_IGNORE + if fixes: # not ASCII + op = RANGE_UNI_IGNORE + hascased = True + else: + assert op is LITERAL + if not hascased and iscased(av): + hascased = True tail.append((op, av)) break diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index ff95f54026e..7bc702ec89a 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1136,6 +1136,39 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match(br'[19a]', b'a', re.I)) self.assertTrue(re.match(br'[19a]', b'A', re.I)) self.assertTrue(re.match(br'[19A]', b'a', re.I)) + self.assertTrue(re.match(r'[19\xc7]', '\xc7', re.I)) + self.assertTrue(re.match(r'[19\xc7]', '\xe7', re.I)) + self.assertTrue(re.match(r'[19\xe7]', '\xc7', re.I)) + self.assertTrue(re.match(r'[19\xe7]', '\xe7', re.I)) + self.assertTrue(re.match(r'[19\u0400]', '\u0400', re.I)) + self.assertTrue(re.match(r'[19\u0400]', '\u0450', re.I)) + self.assertTrue(re.match(r'[19\u0450]', '\u0400', re.I)) + self.assertTrue(re.match(r'[19\u0450]', '\u0450', re.I)) + self.assertTrue(re.match(r'[19\U00010400]', '\U00010400', re.I)) + self.assertTrue(re.match(r'[19\U00010400]', '\U00010428', re.I)) + self.assertTrue(re.match(r'[19\U00010428]', '\U00010400', re.I)) + self.assertTrue(re.match(r'[19\U00010428]', '\U00010428', re.I)) + + self.assertTrue(re.match(br'[19A]', b'A', re.I)) + self.assertTrue(re.match(br'[19a]', b'a', re.I)) + self.assertTrue(re.match(br'[19a]', b'A', re.I)) + self.assertTrue(re.match(br'[19A]', b'a', re.I)) + self.assertTrue(re.match(r'[19A]', 'A', re.I|re.A)) + self.assertTrue(re.match(r'[19a]', 'a', re.I|re.A)) + self.assertTrue(re.match(r'[19a]', 'A', re.I|re.A)) + self.assertTrue(re.match(r'[19A]', 'a', re.I|re.A)) + self.assertTrue(re.match(r'[19\xc7]', '\xc7', re.I|re.A)) + self.assertIsNone(re.match(r'[19\xc7]', '\xe7', re.I|re.A)) + self.assertIsNone(re.match(r'[19\xe7]', '\xc7', re.I|re.A)) + self.assertTrue(re.match(r'[19\xe7]', '\xe7', re.I|re.A)) + self.assertTrue(re.match(r'[19\u0400]', '\u0400', re.I|re.A)) + self.assertIsNone(re.match(r'[19\u0400]', '\u0450', re.I|re.A)) + self.assertIsNone(re.match(r'[19\u0450]', '\u0400', re.I|re.A)) + self.assertTrue(re.match(r'[19\u0450]', '\u0450', re.I|re.A)) + self.assertTrue(re.match(r'[19\U00010400]', '\U00010400', re.I|re.A)) + self.assertIsNone(re.match(r'[19\U00010400]', '\U00010428', re.I|re.A)) + self.assertIsNone(re.match(r'[19\U00010428]', '\U00010400', re.I|re.A)) + self.assertTrue(re.match(r'[19\U00010428]', '\U00010428', re.I|re.A)) # Two different characters have the same lowercase. assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K' @@ -1172,8 +1205,10 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match(br'[9-a]', b'_', re.I)) self.assertIsNone(re.match(br'[9-A]', b'_', re.I)) self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I)) + self.assertTrue(re.match(r'[\xc0-\xde]', '\xe7', re.I)) self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I)) self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I)) + self.assertTrue(re.match(r'[\xe0-\xfe]', '\xc7', re.I)) self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I)) self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I)) self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I)) @@ -1184,6 +1219,26 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I)) self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I)) + self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I|re.A)) + self.assertIsNone(re.match(r'[\xc0-\xde]', '\xe7', re.I|re.A)) + self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I|re.A)) + self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xc7', re.I|re.A)) + self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I|re.A)) + self.assertIsNone(re.match(r'[\u0430-\u045f]', '\u0400', re.I|re.A)) + self.assertIsNone(re.match(r'[\u0400-\u042f]', '\u0450', re.I|re.A)) + self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I|re.A)) + self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I|re.A)) + self.assertIsNone(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I|re.A)) + self.assertIsNone(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I|re.A)) + self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I|re.A)) + + self.assertTrue(re.match(r'[N-\x7f]', 'A', re.I|re.A)) + self.assertTrue(re.match(r'[n-\x7f]', 'Z', re.I|re.A)) + self.assertTrue(re.match(r'[N-\uffff]', 'A', re.I|re.A)) + self.assertTrue(re.match(r'[n-\uffff]', 'Z', re.I|re.A)) + self.assertTrue(re.match(r'[N-\U00010000]', 'A', re.I|re.A)) + self.assertTrue(re.match(r'[n-\U00010000]', 'Z', re.I|re.A)) + # Two different characters have the same lowercase. assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K' self.assertTrue(re.match(r'[J-M]', '\u212a', re.I)) diff --git a/Misc/NEWS.d/next/Library/2024-11-07-22-41-47.gh-issue-126505.iztYE1.rst b/Misc/NEWS.d/next/Library/2024-11-07-22-41-47.gh-issue-126505.iztYE1.rst new file mode 100644 index 00000000000..0a0f893a268 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-11-07-22-41-47.gh-issue-126505.iztYE1.rst @@ -0,0 +1,4 @@ +Fix bugs in compiling case-insensitive :mod:`regular expressions ` with +character classes containing non-BMP characters: upper-case non-BMP +character did was ignored and the ASCII flag was ignored when +matching a character range whose upper bound is beyond the BMP region.