mirror of
https://github.com/python/cpython.git
synced 2024-11-24 00:38:00 +01:00
898ff03e1e
Compiled regular expression objects with the re.LOCALE flag no longer depend on the locale at compile time. Only the locale at matching time affects the result of matching.
231 lines
6.5 KiB
Python
231 lines
6.5 KiB
Python
#
# Secret Labs' Regular Expression Engine
#
# various symbols used by the regular expression engine.
# run this script to update the _sre include files!
#
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#
|
|
|
|
"""Internal support module for sre"""
|
|
|
|
# update when constants are added or removed
# (written to the generated header as SRE_MAGIC when this file is run
# as a script)
MAGIC = 20170530
|
|
|
|
from _sre import MAXREPEAT, MAXGROUPS
|
|
|
|
# SRE standard exception (access as sre.error)
# should this really be here?

class error(Exception):
    """Exception carrying a message plus an optional pattern position.

    Attributes:
        msg: the message exactly as passed in (without position details).
        pattern: the pattern the error refers to, or None.
        pos: index into pattern the error refers to, or None.
        lineno: 1-based line of pos within pattern; None unless both
            pattern and pos were given.
        colno: 1-based column of pos within its line; None unless both
            pattern and pos were given.

    str(exc) is msg, extended with " at position N" when both pattern and
    pos are given, and additionally with " (line L, column C)" when the
    pattern contains a newline.
    """

    def __init__(self, msg, pattern=None, pos=None):
        self.msg = msg
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            msg = '%s at position %d' % (msg, pos)
            # str and bytes patterns each need a newline of their own type
            # for count()/rfind()/in to work.
            if isinstance(pattern, str):
                newline = '\n'
            else:
                newline = b'\n'
            self.lineno = pattern.count(newline, 0, pos) + 1
            # rfind() gives -1 when no newline precedes pos, so the column
            # on the first line comes out as pos + 1.
            self.colno = pos - pattern.rfind(newline, 0, pos)
            # Line/column only add information for multi-line patterns.
            if newline in pattern:
                msg = '%s (line %d, column %d)' % (msg, self.lineno, self.colno)
        else:
            self.lineno = self.colno = None
        super().__init__(msg)
|
|
|
|
|
|
class _NamedIntConstant(int):
|
|
def __new__(cls, value, name):
|
|
self = super(_NamedIntConstant, cls).__new__(cls, value)
|
|
self.name = name
|
|
return self
|
|
|
|
def __str__(self):
|
|
return self.name
|
|
|
|
__repr__ = __str__
|
|
|
|
# Rebind the MAXREPEAT imported from _sre as a named constant: same numeric
# value, but str() and repr() now give 'MAXREPEAT'.
MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT')
|
|
|
|
def _makecodes(names):
    """Create consecutively numbered named constants from a string of names.

    `names` is a whitespace-separated string. Each name is numbered by its
    position (starting at 0), wrapped in a _NamedIntConstant, and also
    published as a module-level global under that name.

    Returns the list of constants in the order they were given.
    """
    # split() with no argument already ignores leading/trailing whitespace
    # and blank lines, so the triple-quoted tables can be laid out freely.
    items = [_NamedIntConstant(i, name)
             for i, name in enumerate(names.split())]
    globals().update({item.name: item for item in items})
    return items
|
|
|
|
# operators
# failure=0 success=1 (just because it looks better that way :-)
# The numeric value of each opcode is its position in this list, so the
# order matters.
OPCODES = _makecodes("""
    FAILURE SUCCESS

    ANY ANY_ALL
    ASSERT ASSERT_NOT
    AT
    BRANCH
    CALL
    CATEGORY
    CHARSET BIGCHARSET
    GROUPREF GROUPREF_EXISTS GROUPREF_IGNORE
    IN IN_IGNORE
    INFO
    JUMP
    LITERAL LITERAL_IGNORE
    MARK
    MAX_UNTIL
    MIN_UNTIL
    NOT_LITERAL NOT_LITERAL_IGNORE
    NEGATE
    RANGE
    REPEAT
    REPEAT_ONE
    SUBPATTERN
    MIN_REPEAT_ONE
    RANGE_IGNORE
    LITERAL_LOC_IGNORE
    NOT_LITERAL_LOC_IGNORE
    IN_LOC_IGNORE

    MIN_REPEAT MAX_REPEAT
""")
# MIN_REPEAT and MAX_REPEAT remain defined as module globals (set by
# _makecodes) but are dropped from the list itself.
del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT
|
|
|
|
# positions
# Numbered from 0 in the order listed.
ATCODES = _makecodes("""
    AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING
    AT_BOUNDARY AT_NON_BOUNDARY
    AT_END AT_END_LINE AT_END_STRING
    AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY
    AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY
""")
|
|
|
|
# categories
# Numbered from 0 in the order listed.
CHCODES = _makecodes("""
    CATEGORY_DIGIT CATEGORY_NOT_DIGIT
    CATEGORY_SPACE CATEGORY_NOT_SPACE
    CATEGORY_WORD CATEGORY_NOT_WORD
    CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK
    CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD
    CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT
    CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE
    CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD
    CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK
""")
|
|
|
|
|
|
# replacement operations for "ignore case" mode
OP_IGNORE = {
    GROUPREF: GROUPREF_IGNORE,
    IN: IN_IGNORE,
    LITERAL: LITERAL_IGNORE,
    NOT_LITERAL: NOT_LITERAL_IGNORE,
    RANGE: RANGE_IGNORE,
}

# NOTE(review): presumably the replacements used when "ignore case" is
# combined with the locale flag -- the lookup happens outside this file.
OP_LOC_IGNORE = {
    LITERAL: LITERAL_LOC_IGNORE,
    NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
}
|
|
|
|
# Position-code substitution tables. Each maps a generic AT_* code to its
# per-line, locale or unicode variant; codes absent from a table have no
# such variant here.
AT_MULTILINE = {
    AT_BEGINNING: AT_BEGINNING_LINE,
    AT_END: AT_END_LINE,
}

AT_LOCALE = {
    AT_BOUNDARY: AT_LOC_BOUNDARY,
    AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY,
}

AT_UNICODE = {
    AT_BOUNDARY: AT_UNI_BOUNDARY,
    AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY,
}
|
|
|
|
# Category-code substitution tables, keyed by the generic CATEGORY_* codes.
# In CH_LOCALE only the WORD categories have a locale variant; every other
# category maps to itself.
CH_LOCALE = {
    CATEGORY_DIGIT: CATEGORY_DIGIT,
    CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,
    CATEGORY_SPACE: CATEGORY_SPACE,
    CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE,
    CATEGORY_WORD: CATEGORY_LOC_WORD,
    CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
    CATEGORY_LINEBREAK: CATEGORY_LINEBREAK,
    CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK,
}

# Every generic category has a dedicated unicode variant.
CH_UNICODE = {
    CATEGORY_DIGIT: CATEGORY_UNI_DIGIT,
    CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT,
    CATEGORY_SPACE: CATEGORY_UNI_SPACE,
    CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE,
    CATEGORY_WORD: CATEGORY_UNI_WORD,
    CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD,
    CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK,
    CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK,
}
|
|
|
|
# flags
# Each flag is a distinct power of two, so flags can be combined with "|".
SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
SRE_FLAG_IGNORECASE = 2 # case insensitive
SRE_FLAG_LOCALE = 4 # honour system locale
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
SRE_FLAG_DOTALL = 16 # treat target as a single string
SRE_FLAG_UNICODE = 32 # use unicode "locale"
SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
SRE_FLAG_DEBUG = 128 # debugging
SRE_FLAG_ASCII = 256 # use ascii "locale"
|
|
|
|
# flags for INFO primitive
SRE_INFO_PREFIX = 1 # has prefix
SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix)
SRE_INFO_CHARSET = 4 # pattern starts with character from given set
|
|
|
|
# When run as a script, regenerate sre_constants.h in the current directory
# from the constants defined in this module.
if __name__ == "__main__":
    def dump(f, d, prefix):
        """Write one '#define <prefix>_<name> <value>' line per item of d."""
        # sorted() orders the constants by integer value; in the format
        # string %s renders the symbolic name and %d the number.
        items = sorted(d)
        for item in items:
            f.write("#define %s_%s %d\n" % (prefix, item, item))

    with open("sre_constants.h", "w") as f:
        f.write("""\
/*
 * Secret Labs' Regular Expression Engine
 *
 * regular expression matching engine
 *
 * NOTE: This file is generated by sre_constants.py. If you need
 * to change anything in here, edit sre_constants.py and run it.
 *
 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
 *
 * See the _sre.c file for information on usage and redistribution.
 */

""")

        f.write("#define SRE_MAGIC %d\n" % MAGIC)

        dump(f, OPCODES, "SRE_OP")
        dump(f, ATCODES, "SRE")
        dump(f, CHCODES, "SRE")

        f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE)
        f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE)
        f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE)
        f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE)
        f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL)
        f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE)
        f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE)
        f.write("#define SRE_FLAG_DEBUG %d\n" % SRE_FLAG_DEBUG)
        f.write("#define SRE_FLAG_ASCII %d\n" % SRE_FLAG_ASCII)

        f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX)
        f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL)
        f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET)

    print("done")
|