From 64d684016b160a52650416d5276d24302127f8e6 Mon Sep 17 00:00:00 2001 From: Lee Dogeon Date: Sun, 18 Jan 2026 22:34:19 +0900 Subject: [PATCH 1/2] Bump re to 3.14.2 --- Lib/re/__init__.py | 2 +- Lib/re/_compiler.py | 40 ++++++++++++++++++++---------- Lib/re/_constants.py | 4 ++- Lib/re/_parser.py | 19 ++------------ Lib/test/test_re.py | 59 ++++++++++++++++++++++---------------------- 5 files changed, 63 insertions(+), 61 deletions(-) diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py index 7e8abbf6ffe..af2808a77da 100644 --- a/Lib/re/__init__.py +++ b/Lib/re/__init__.py @@ -61,7 +61,7 @@ resulting RE will match the second character. \number Matches the contents of the group of the same number. \A Matches only at the start of the string. - \Z Matches only at the end of the string. + \z Matches only at the end of the string. \b Matches the empty string, but only at the start or end of a word. \B Matches the empty string, but not at the start or end of a word. \d Matches any decimal digit; equivalent to the set [0-9] in diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py index 1b1aaa7714b..20dd561d1c1 100644 --- a/Lib/re/_compiler.py +++ b/Lib/re/_compiler.py @@ -28,6 +28,8 @@ POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), } +_CHARSET_ALL = [(NEGATE, None)] + def _combine_flags(flags, add_flags, del_flags, TYPE_FLAGS=_parser.TYPE_FLAGS): if add_flags & TYPE_FLAGS: @@ -84,17 +86,22 @@ def _compile(code, pattern, flags): code[skip] = _len(code) - skip elif op is IN: charset, hascased = _optimize_charset(av, iscased, tolower, fixes) - if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: - emit(IN_LOC_IGNORE) - elif not hascased: - emit(IN) - elif not fixes: # ascii - emit(IN_IGNORE) + if not charset: + emit(FAILURE) + elif charset == _CHARSET_ALL: + emit(ANY_ALL) else: - emit(IN_UNI_IGNORE) - skip = _len(code); emit(0) - _compile_charset(charset, flags, code) - code[skip] = _len(code) - skip + if flags & SRE_FLAG_IGNORECASE and flags 
& SRE_FLAG_LOCALE: + emit(IN_LOC_IGNORE) + elif not hascased: + emit(IN) + elif not fixes: # ascii + emit(IN_IGNORE) + else: + emit(IN_UNI_IGNORE) + skip = _len(code); emit(0) + _compile_charset(charset, flags, code) + code[skip] = _len(code) - skip elif op is ANY: if flags & SRE_FLAG_DOTALL: emit(ANY_ALL) @@ -277,6 +284,10 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): charmap[i] = 1 elif op is NEGATE: out.append((op, av)) + elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail: + # Optimize [\s\S] etc. + out = [] if out else _CHARSET_ALL + return out, False else: tail.append((op, av)) except IndexError: @@ -524,13 +535,18 @@ def _compile_info(code, pattern, flags): # look for a literal prefix prefix = [] prefix_skip = 0 - charset = [] # not used + charset = None # not used if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): # look for literal prefix prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) # if no prefix, look for charset prefix if not prefix: charset = _get_charset_prefix(pattern, flags) + if charset: + charset, hascased = _optimize_charset(charset) + assert not hascased + if charset == _CHARSET_ALL: + charset = None ## if prefix: ## print("*** PREFIX", prefix, prefix_skip) ## if charset: @@ -565,8 +581,6 @@ def _compile_info(code, pattern, flags): # generate overlap table code.extend(_generate_overlap_table(prefix)) elif charset: - charset, hascased = _optimize_charset(charset) - assert not hascased _compile_charset(charset, flags, code) code[skip] = len(code) - skip diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py index 9c3c294ba44..d6f32302d37 100644 --- a/Lib/re/_constants.py +++ b/Lib/re/_constants.py @@ -15,7 +15,7 @@ MAGIC = 20230612 -from _sre import MAXREPEAT, MAXGROUPS +from _sre import MAXREPEAT, MAXGROUPS # noqa: F401 # SRE standard exception (access as sre.error) # should this really be here? 
@@ -206,6 +206,8 @@ def _makecodes(*names): CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK } +CH_NEGATE = dict(zip(CHCODES[::2] + CHCODES[1::2], CHCODES[1::2] + CHCODES[::2])) + # flags SRE_FLAG_IGNORECASE = 2 # case insensitive SRE_FLAG_LOCALE = 4 # honour system locale diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index f3c779340fe..35ab7ede2a7 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -49,7 +49,8 @@ r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), - r"\Z": (AT, AT_END_STRING), # end of string + r"\z": (AT, AT_END_STRING), # end of string + r"\Z": (AT, AT_END_STRING), # end of string (obsolete) } FLAGS = { @@ -807,14 +808,6 @@ def _parse(source, state, verbose, nested, first=False): state.grouprefpos[condgroup] = ( source.tell() - len(condname) - 1 ) - if not (condname.isdecimal() and condname.isascii()): - import warnings - warnings.warn( - "bad character in group name %s at position %d" % - (repr(condname) if source.istext else ascii(condname), - source.tell() - len(condname) - 1), - DeprecationWarning, stacklevel=nested + 6 - ) state.checklookbehindgroup(condgroup, source) item_yes = _parse(source, state, verbose, nested + 1) if source.match("|"): @@ -1038,14 +1031,6 @@ def addgroup(index, pos): if index >= MAXGROUPS: raise s.error("invalid group reference %d" % index, len(name) + 1) - if not (name.isdecimal() and name.isascii()): - import warnings - warnings.warn( - "bad character in group name %s at position %d" % - (repr(name) if s.istext else ascii(name), - s.tell() - len(name) - 1), - DeprecationWarning, stacklevel=5 - ) addgroup(index, len(name) + 1) elif c == "0": if s.next in OCTDIGITS: diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 04c8ee71a99..8b935dc7f12 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1,12 +1,11 @@ from test.support import (gc_collect, bigmemtest, _2G, cpython_only, captured_stdout, - 
check_disallow_instantiation, is_emscripten, is_wasi, + check_disallow_instantiation, linked_to_musl, warnings_helper, SHORT_TIMEOUT, Stopwatch, requires_resource) import locale import re import string import sys -import time import unittest import warnings from re import Scanner @@ -14,7 +13,7 @@ # some platforms lack working multiprocessing try: - import _multiprocessing + import _multiprocessing # noqa: F401 except ImportError: multiprocessing = None else: @@ -621,6 +620,7 @@ def test_re_fullmatch(self): self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4)) self.assertIsNone(re.fullmatch(r"a+", "ab")) self.assertIsNone(re.fullmatch(r"abc$", "abc\n")) + self.assertIsNone(re.fullmatch(r"abc\z", "abc\n")) self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n")) self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n")) self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4)) @@ -806,6 +806,8 @@ def test_special_escapes(self): self.assertEqual(re.search(r"\B(b.)\B", "abc bcd bc abxd", re.ASCII).group(1), "bx") self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") + self.assertEqual(re.search(r"^\Aabc\z$", "abc", re.M).group(0), "abc") + self.assertIsNone(re.search(r"^\Aabc\z$", "\nabc\n", re.M)) self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M)) self.assertEqual(re.search(br"\b(b.)\b", @@ -817,6 +819,8 @@ def test_special_escapes(self): self.assertEqual(re.search(br"\B(b.)\B", b"abc bcd bc abxd", re.LOCALE).group(1), b"bx") self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc") + self.assertEqual(re.search(br"^\Aabc\z$", b"abc", re.M).group(0), b"abc") + self.assertIsNone(re.search(br"^\Aabc\z$", b"\nabc\n", re.M)) self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc") self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M)) self.assertEqual(re.search(r"\d\D\w\W\s\S", @@ -840,7 +844,7 @@ def test_other_escapes(self): 
self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^') self.assertIsNone(re.match(r"[\^a]+", 'b')) re.purge() # for warnings - for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY': + for c in 'ceghijklmopqyCEFGHIJKLMNOPQRTVXY': with self.subTest(c): self.assertRaises(re.PatternError, re.compile, '\\%c' % c) for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ': @@ -888,6 +892,8 @@ def test_named_unicode_escapes(self): self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0) self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1) + # TODO: RUSTPYTHON; re.search(r"\B", "") now returns a match in CPython 3.14 + @unittest.expectedFailure def test_word_boundaries(self): # See http://bugs.python.org/issue10713 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), "abc") @@ -983,18 +989,15 @@ def test_word_boundaries(self): self.assertIsNone(re.fullmatch(br".+\B", b"abc", re.LOCALE)) self.assertIsNone(re.fullmatch(r".+\B", "ьюя")) self.assertTrue(re.fullmatch(r".+\B", "ьюя", re.ASCII)) - # However, an empty string contains no word boundaries, and also no - # non-boundaries. + # However, an empty string contains no word boundaries. self.assertIsNone(re.search(r"\b", "")) self.assertIsNone(re.search(r"\b", "", re.ASCII)) self.assertIsNone(re.search(br"\b", b"")) self.assertIsNone(re.search(br"\b", b"", re.LOCALE)) - # This one is questionable and different from the perlre behaviour, - # but describes current behavior. - self.assertIsNone(re.search(r"\B", "")) - self.assertIsNone(re.search(r"\B", "", re.ASCII)) - self.assertIsNone(re.search(br"\B", b"")) - self.assertIsNone(re.search(br"\B", b"", re.LOCALE)) + self.assertTrue(re.search(r"\B", "")) + self.assertTrue(re.search(r"\B", "", re.ASCII)) + self.assertTrue(re.search(br"\B", b"")) + self.assertTrue(re.search(br"\B", b"", re.LOCALE)) # A single word-character string has two boundaries, but no # non-boundary gaps. 
self.assertEqual(len(re.findall(r"\b", "a")), 2) @@ -1423,7 +1426,7 @@ def test_pickling(self): newpat = pickle.loads(pickled) self.assertEqual(newpat, oldpat) # current pickle expects the _compile() reconstructor in re module - from re import _compile + from re import _compile # noqa: F401 @unittest.expectedFailure # TODO: RUSTPYTHON def test_copying(self): @@ -1755,7 +1758,7 @@ def test_bug_6561(self): for x in not_decimal_digits: self.assertIsNone(re.match(r'^\d$', x)) - @unittest.expectedFailure # TODO: RUSTPYTHON a = array.array(typecode)\n ValueError: bad typecode (must be b, B, u, h, H, i, I, l, L, q, Q, f or d) + @unittest.expectedFailure # TODO: RUSTPYTHON; a = array.array(typecode)\n ValueError: bad typecode (must be b, B, u, h, H, i, I, l, L, q, Q, f or d) @warnings_helper.ignore_warnings(category=DeprecationWarning) # gh-80480 array('u') def test_empty_array(self): # SF buf 1647541 @@ -2185,10 +2188,9 @@ def test_bug_20998(self): self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3)) @unittest.expectedFailure # TODO: RUSTPYTHON; self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))\n AssertionError: None is not true - @unittest.skipIf( - is_emscripten or is_wasi, - "musl libc issue on Emscripten/WASI, bpo-46390" - ) + @unittest.skipIf(linked_to_musl(), "musl libc issue, bpo-46390") + @unittest.skipIf(sys.platform.startswith("sunos"), + "test doesn't work on Solaris, gh-91214") def test_locale_caching(self): # Issue #22410 oldlocale = locale.setlocale(locale.LC_CTYPE) @@ -2225,10 +2227,9 @@ def check_en_US_utf8(self): self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5')) self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5')) - @unittest.skipIf( - is_emscripten or is_wasi, - "musl libc issue on Emscripten/WASI, bpo-46390" - ) + @unittest.skipIf(linked_to_musl(), "musl libc issue, bpo-46390") + @unittest.skipIf(sys.platform.startswith("sunos"), + "test doesn't work on Solaris, gh-91214") def test_locale_compiled(self): oldlocale = 
locale.setlocale(locale.LC_CTYPE) + self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale) @@ -2632,8 +2633,8 @@ def test_findall_atomic_grouping(self): @unittest.expectedFailure # TODO: RUSTPYTHON def test_bug_gh91616(self): - self.assertTrue(re.fullmatch(r'(?s:(?>.*?\.).*)\Z', "a.txt")) # reproducer - self.assertTrue(re.fullmatch(r'(?s:(?=(?P<g0>.*?\.))(?P=g0).*)\Z', "a.txt")) + self.assertTrue(re.fullmatch(r'(?s:(?>.*?\.).*)\z', "a.txt")) # reproducer + self.assertTrue(re.fullmatch(r'(?s:(?=(?P<g0>.*?\.))(?P=g0).*)\z', "a.txt")) def test_bug_gh100061(self): # gh-100061 @@ -2655,7 +2656,7 @@ def test_bug_gh100061(self): self.assertEqual(re.match("(?>(?:ab?c){1,3})", "aca").span(), (0, 2)) self.assertEqual(re.match("(?:ab?c){1,3}+", "aca").span(), (0, 2)) - @unittest.expectedFailure # TODO: RUSTPYTHON; self.assertEqual(re.match('((x)|y|z){3}+', 'xyz').groups(), ('z', 'x'))\n AssertionError: Tuples differ: ('x', 'x') != ('z', 'x') + @unittest.expectedFailure # TODO: RUSTPYTHON; self.assertEqual(re.match('((x)|y|z){3}+', 'xyz').groups(), ('z', 'x'))\n AssertionError: Tuples differ: ('x', 'x') != ('z', 'x') def test_bug_gh101955(self): # Possessive quantifier with nested alternative with capture groups self.assertEqual(re.match('((x)|y|z)*+', 'xyz').groups(), ('z', 'x')) @@ -2893,11 +2894,11 @@ def test_long_pattern(self): pattern = 'Very %spattern' % ('long ' * 1000) r = repr(re.compile(pattern)) self.assertLess(len(r), 300) - self.assertEqual(r[:30], "re.compile('Very long long lon") + self.assertStartsWith(r, "re.compile('Very long long lon") r = repr(re.compile(pattern, re.I)) self.assertLess(len(r), 300) - self.assertEqual(r[:30], "re.compile('Very long long lon") - self.assertEqual(r[-16:], ", re.IGNORECASE)") + self.assertStartsWith(r, "re.compile('Very long long lon") + self.assertEndsWith(r, ", re.IGNORECASE)") def test_flags_repr(self): self.assertEqual(repr(re.I), "re.IGNORECASE") @@ -2977,7 +2978,7 @@ def test_deprecated_modules(self): 
self.assertEqual(mod.__name__, name) self.assertEqual(mod.__package__, '') for attr in deprecated[name]: - self.assertTrue(hasattr(mod, attr)) + self.assertHasAttr(mod, attr) del sys.modules[name] @cpython_only From fbf1f60cb008bb8860993e8eb1466133d0a8dc97 Mon Sep 17 00:00:00 2001 From: Lee Dogeon Date: Tue, 20 Jan 2026 14:14:04 +0900 Subject: [PATCH 2/2] Unmark resolved test --- Lib/test/test_warnings/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/test/test_warnings/__init__.py b/Lib/test/test_warnings/__init__.py index abdf7b32df2..87632821a8e 100644 --- a/Lib/test/test_warnings/__init__.py +++ b/Lib/test/test_warnings/__init__.py @@ -241,7 +241,6 @@ def test_once(self): 42) self.assertEqual(len(w), 0) - @unittest.expectedFailure # TODO: RUSTPYTHON re.PatternError: bad escape \z at position 15 def test_filter_module(self): MS_WINDOWS = (sys.platform == 'win32') with self.module.catch_warnings(record=True) as w: