Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Lib/re/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
resulting RE will match the second character.
\number Matches the contents of the group of the same number.
\A Matches only at the start of the string.
\Z Matches only at the end of the string.
\z Matches only at the end of the string.
\b Matches the empty string, but only at the start or end of a word.
\B Matches the empty string, but not at the start or end of a word.
\d Matches any decimal digit; equivalent to the set [0-9] in
Expand Down
40 changes: 27 additions & 13 deletions Lib/re/_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
}

_CHARSET_ALL = [(NEGATE, None)]

def _combine_flags(flags, add_flags, del_flags,
TYPE_FLAGS=_parser.TYPE_FLAGS):
if add_flags & TYPE_FLAGS:
Expand Down Expand Up @@ -84,17 +86,22 @@ def _compile(code, pattern, flags):
code[skip] = _len(code) - skip
elif op is IN:
charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
emit(IN_LOC_IGNORE)
elif not hascased:
emit(IN)
elif not fixes: # ascii
emit(IN_IGNORE)
if not charset:
emit(FAILURE)
elif charset == _CHARSET_ALL:
emit(ANY_ALL)
else:
emit(IN_UNI_IGNORE)
skip = _len(code); emit(0)
_compile_charset(charset, flags, code)
code[skip] = _len(code) - skip
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
emit(IN_LOC_IGNORE)
elif not hascased:
emit(IN)
elif not fixes: # ascii
emit(IN_IGNORE)
else:
emit(IN_UNI_IGNORE)
skip = _len(code); emit(0)
_compile_charset(charset, flags, code)
code[skip] = _len(code) - skip
elif op is ANY:
if flags & SRE_FLAG_DOTALL:
emit(ANY_ALL)
Expand Down Expand Up @@ -277,6 +284,10 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
charmap[i] = 1
elif op is NEGATE:
out.append((op, av))
elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail:
# Optimize [\s\S] etc.
out = [] if out else _CHARSET_ALL
return out, False
else:
tail.append((op, av))
except IndexError:
Expand Down Expand Up @@ -524,13 +535,18 @@ def _compile_info(code, pattern, flags):
# look for a literal prefix
prefix = []
prefix_skip = 0
charset = [] # not used
charset = None # not used
if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
# look for literal prefix
prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
# if no prefix, look for charset prefix
if not prefix:
charset = _get_charset_prefix(pattern, flags)
if charset:
charset, hascased = _optimize_charset(charset)
assert not hascased
if charset == _CHARSET_ALL:
charset = None
## if prefix:
## print("*** PREFIX", prefix, prefix_skip)
## if charset:
Expand Down Expand Up @@ -565,8 +581,6 @@ def _compile_info(code, pattern, flags):
# generate overlap table
code.extend(_generate_overlap_table(prefix))
elif charset:
charset, hascased = _optimize_charset(charset)
assert not hascased
_compile_charset(charset, flags, code)
code[skip] = len(code) - skip

Expand Down
4 changes: 3 additions & 1 deletion Lib/re/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

MAGIC = 20230612

from _sre import MAXREPEAT, MAXGROUPS
from _sre import MAXREPEAT, MAXGROUPS # noqa: F401

# SRE standard exception (access as sre.error)
# should this really be here?
Expand Down Expand Up @@ -206,6 +206,8 @@ def _makecodes(*names):
CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
}

CH_NEGATE = dict(zip(CHCODES[::2] + CHCODES[1::2], CHCODES[1::2] + CHCODES[::2]))

# flags
SRE_FLAG_IGNORECASE = 2 # case insensitive
SRE_FLAG_LOCALE = 4 # honour system locale
Expand Down
19 changes: 2 additions & 17 deletions Lib/re/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@
r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
r"\Z": (AT, AT_END_STRING), # end of string
r"\z": (AT, AT_END_STRING), # end of string
r"\Z": (AT, AT_END_STRING), # end of string (obsolete)
}

FLAGS = {
Expand Down Expand Up @@ -807,14 +808,6 @@ def _parse(source, state, verbose, nested, first=False):
state.grouprefpos[condgroup] = (
source.tell() - len(condname) - 1
)
if not (condname.isdecimal() and condname.isascii()):
import warnings
warnings.warn(
"bad character in group name %s at position %d" %
(repr(condname) if source.istext else ascii(condname),
source.tell() - len(condname) - 1),
DeprecationWarning, stacklevel=nested + 6
)
state.checklookbehindgroup(condgroup, source)
item_yes = _parse(source, state, verbose, nested + 1)
if source.match("|"):
Expand Down Expand Up @@ -1038,14 +1031,6 @@ def addgroup(index, pos):
if index >= MAXGROUPS:
raise s.error("invalid group reference %d" % index,
len(name) + 1)
if not (name.isdecimal() and name.isascii()):
import warnings
warnings.warn(
"bad character in group name %s at position %d" %
(repr(name) if s.istext else ascii(name),
s.tell() - len(name) - 1),
DeprecationWarning, stacklevel=5
)
addgroup(index, len(name) + 1)
elif c == "0":
if s.next in OCTDIGITS:
Expand Down
59 changes: 30 additions & 29 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
from test.support import (gc_collect, bigmemtest, _2G,
cpython_only, captured_stdout,
check_disallow_instantiation, is_emscripten, is_wasi,
check_disallow_instantiation, linked_to_musl,
warnings_helper, SHORT_TIMEOUT, Stopwatch, requires_resource)
import locale
import re
import string
import sys
import time
import unittest
import warnings
from re import Scanner
from weakref import proxy

# some platforms lack working multiprocessing
try:
import _multiprocessing
import _multiprocessing # noqa: F401
except ImportError:
multiprocessing = None
else:
Expand Down Expand Up @@ -621,6 +620,7 @@ def test_re_fullmatch(self):
self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
self.assertIsNone(re.fullmatch(r"a+", "ab"))
self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
self.assertIsNone(re.fullmatch(r"abc\z", "abc\n"))
self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
Expand Down Expand Up @@ -806,6 +806,8 @@ def test_special_escapes(self):
self.assertEqual(re.search(r"\B(b.)\B",
"abc bcd bc abxd", re.ASCII).group(1), "bx")
self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
self.assertEqual(re.search(r"^\Aabc\z$", "abc", re.M).group(0), "abc")
self.assertIsNone(re.search(r"^\Aabc\z$", "\nabc\n", re.M))
self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
self.assertEqual(re.search(br"\b(b.)\b",
Expand All @@ -817,6 +819,8 @@ def test_special_escapes(self):
self.assertEqual(re.search(br"\B(b.)\B",
b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
self.assertEqual(re.search(br"^\Aabc\z$", b"abc", re.M).group(0), b"abc")
self.assertIsNone(re.search(br"^\Aabc\z$", b"\nabc\n", re.M))
self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
self.assertEqual(re.search(r"\d\D\w\W\s\S",
Expand All @@ -840,7 +844,7 @@ def test_other_escapes(self):
self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
self.assertIsNone(re.match(r"[\^a]+", 'b'))
re.purge() # for warnings
for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
for c in 'ceghijklmopqyCEFGHIJKLMNOPQRTVXY':
with self.subTest(c):
self.assertRaises(re.PatternError, re.compile, '\\%c' % c)
for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
Expand Down Expand Up @@ -888,6 +892,8 @@ def test_named_unicode_escapes(self):
self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0)
self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1)

# TODO: RUSTPYTHON; re.search(r"\B", "") now returns a match in CPython 3.14
@unittest.expectedFailure
def test_word_boundaries(self):
# See http://bugs.python.org/issue10713
self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), "abc")
Expand Down Expand Up @@ -983,18 +989,15 @@ def test_word_boundaries(self):
self.assertIsNone(re.fullmatch(br".+\B", b"abc", re.LOCALE))
self.assertIsNone(re.fullmatch(r".+\B", "ьюя"))
self.assertTrue(re.fullmatch(r".+\B", "ьюя", re.ASCII))
# However, an empty string contains no word boundaries, and also no
# non-boundaries.
# However, an empty string contains no word boundaries.
self.assertIsNone(re.search(r"\b", ""))
self.assertIsNone(re.search(r"\b", "", re.ASCII))
self.assertIsNone(re.search(br"\b", b""))
self.assertIsNone(re.search(br"\b", b"", re.LOCALE))
# This one is questionable and different from the perlre behaviour,
# but describes current behavior.
self.assertIsNone(re.search(r"\B", ""))
self.assertIsNone(re.search(r"\B", "", re.ASCII))
self.assertIsNone(re.search(br"\B", b""))
self.assertIsNone(re.search(br"\B", b"", re.LOCALE))
self.assertTrue(re.search(r"\B", ""))
self.assertTrue(re.search(r"\B", "", re.ASCII))
self.assertTrue(re.search(br"\B", b""))
self.assertTrue(re.search(br"\B", b"", re.LOCALE))
# A single word-character string has two boundaries, but no
# non-boundary gaps.
self.assertEqual(len(re.findall(r"\b", "a")), 2)
Expand Down Expand Up @@ -1423,7 +1426,7 @@ def test_pickling(self):
newpat = pickle.loads(pickled)
self.assertEqual(newpat, oldpat)
# current pickle expects the _compile() reconstructor in the re module
from re import _compile
from re import _compile # noqa: F401

@unittest.expectedFailure # TODO: RUSTPYTHON
def test_copying(self):
Expand Down Expand Up @@ -1755,7 +1758,7 @@ def test_bug_6561(self):
for x in not_decimal_digits:
self.assertIsNone(re.match(r'^\d$', x))

@unittest.expectedFailure # TODO: RUSTPYTHON a = array.array(typecode)\n ValueError: bad typecode (must be b, B, u, h, H, i, I, l, L, q, Q, f or d)
@unittest.expectedFailure # TODO: RUSTPYTHON; a = array.array(typecode)\n ValueError: bad typecode (must be b, B, u, h, H, i, I, l, L, q, Q, f or d)
@warnings_helper.ignore_warnings(category=DeprecationWarning) # gh-80480 array('u')
def test_empty_array(self):
# SF bug 1647541
Expand Down Expand Up @@ -2185,10 +2188,9 @@ def test_bug_20998(self):
self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))

@unittest.expectedFailure # TODO: RUSTPYTHON; self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))\n AssertionError: None is not true
@unittest.skipIf(
is_emscripten or is_wasi,
"musl libc issue on Emscripten/WASI, bpo-46390"
)
@unittest.skipIf(linked_to_musl(), "musl libc issue, bpo-46390")
@unittest.skipIf(sys.platform.startswith("sunos"),
"test doesn't work on Solaris, gh-91214")
def test_locale_caching(self):
# Issue #22410
oldlocale = locale.setlocale(locale.LC_CTYPE)
Expand Down Expand Up @@ -2225,10 +2227,9 @@ def check_en_US_utf8(self):
self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))

@unittest.skipIf(
is_emscripten or is_wasi,
"musl libc issue on Emscripten/WASI, bpo-46390"
)
@unittest.skipIf(linked_to_musl(), "musl libc issue, bpo-46390")
@unittest.skipIf(sys.platform.startswith("sunos"),
"test doesn't work on Solaris, gh-91214")
def test_locale_compiled(self):
oldlocale = locale.setlocale(locale.LC_CTYPE)
self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
Expand Down Expand Up @@ -2632,8 +2633,8 @@ def test_findall_atomic_grouping(self):

@unittest.expectedFailure # TODO: RUSTPYTHON
def test_bug_gh91616(self):
self.assertTrue(re.fullmatch(r'(?s:(?>.*?\.).*)\Z', "a.txt")) # reproducer
self.assertTrue(re.fullmatch(r'(?s:(?=(?P<g0>.*?\.))(?P=g0).*)\Z', "a.txt"))
self.assertTrue(re.fullmatch(r'(?s:(?>.*?\.).*)\z', "a.txt")) # reproducer
self.assertTrue(re.fullmatch(r'(?s:(?=(?P<g0>.*?\.))(?P=g0).*)\z', "a.txt"))

def test_bug_gh100061(self):
# gh-100061
Expand All @@ -2655,7 +2656,7 @@ def test_bug_gh100061(self):
self.assertEqual(re.match("(?>(?:ab?c){1,3})", "aca").span(), (0, 2))
self.assertEqual(re.match("(?:ab?c){1,3}+", "aca").span(), (0, 2))

@unittest.expectedFailure # TODO: RUSTPYTHON; self.assertEqual(re.match('((x)|y|z){3}+', 'xyz').groups(), ('z', 'x'))\n AssertionError: Tuples differ: ('x', 'x') != ('z', 'x')
@unittest.expectedFailure # TODO: RUSTPYTHON; self.assertEqual(re.match('((x)|y|z){3}+', 'xyz').groups(), ('z', 'x'))\n AssertionError: Tuples differ: ('x', 'x') != ('z', 'x')
def test_bug_gh101955(self):
# Possessive quantifier with nested alternative with capture groups
self.assertEqual(re.match('((x)|y|z)*+', 'xyz').groups(), ('z', 'x'))
Expand Down Expand Up @@ -2893,11 +2894,11 @@ def test_long_pattern(self):
pattern = 'Very %spattern' % ('long ' * 1000)
r = repr(re.compile(pattern))
self.assertLess(len(r), 300)
self.assertEqual(r[:30], "re.compile('Very long long lon")
self.assertStartsWith(r, "re.compile('Very long long lon")
r = repr(re.compile(pattern, re.I))
self.assertLess(len(r), 300)
self.assertEqual(r[:30], "re.compile('Very long long lon")
self.assertEqual(r[-16:], ", re.IGNORECASE)")
self.assertStartsWith(r, "re.compile('Very long long lon")
self.assertEndsWith(r, ", re.IGNORECASE)")

def test_flags_repr(self):
self.assertEqual(repr(re.I), "re.IGNORECASE")
Expand Down Expand Up @@ -2977,7 +2978,7 @@ def test_deprecated_modules(self):
self.assertEqual(mod.__name__, name)
self.assertEqual(mod.__package__, '')
for attr in deprecated[name]:
self.assertTrue(hasattr(mod, attr))
self.assertHasAttr(mod, attr)
del sys.modules[name]

@cpython_only
Expand Down
1 change: 0 additions & 1 deletion Lib/test/test_warnings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,6 @@ def test_once(self):
42)
self.assertEqual(len(w), 0)

@unittest.expectedFailure # TODO: RUSTPYTHON re.PatternError: bad escape \z at position 15
def test_filter_module(self):
MS_WINDOWS = (sys.platform == 'win32')
with self.module.catch_warnings(record=True) as w:
Expand Down
Loading