diff options
| author | Serhiy Storchaka <storchaka@gmail.com> | 2017-10-24 23:31:42 +0300 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2017-10-24 23:31:42 +0300 | 
| commit | 3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132 (patch) | |
| tree | aa741f0d09293f6dfe9668a5b328658ce13c8279 | |
| parent | fdd9b217c60b454ac6a82f02c8b0b551caeac88b (diff) | |
| download | cpython-git-3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132.tar.gz | |
bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)
| -rw-r--r-- | Doc/library/re.rst | 58 | ||||
| -rw-r--r-- | Doc/whatsnew/3.7.rst | 7 | ||||
| -rw-r--r-- | Lib/sre_compile.py | 59 | ||||
| -rw-r--r-- | Lib/sre_constants.py | 40 | ||||
| -rw-r--r-- | Lib/sre_parse.py | 24 | ||||
| -rw-r--r-- | Lib/test/test_re.py | 22 | ||||
| -rw-r--r-- | Misc/NEWS.d/next/Library/2017-10-05-15-14-46.bpo-31690.f0XteV.rst | 2 | ||||
| -rw-r--r-- | Modules/_sre.c | 37 | ||||
| -rw-r--r-- | Modules/sre.h | 4 | ||||
| -rw-r--r-- | Modules/sre_constants.h | 51 | ||||
| -rw-r--r-- | Modules/sre_lib.h | 136 | 
11 files changed, 300 insertions, 140 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 3dd3a0f80d..e0cb626305 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -245,16 +245,32 @@ The special characters are:     *cannot* be retrieved after performing a match or referenced later in the     pattern. -``(?imsx-imsx:...)`` -   (Zero or more letters from the set ``'i'``, ``'m'``, ``'s'``, ``'x'``, -   optionally followed by ``'-'`` followed by one or more letters from the -   same set.)  The letters set or removes the corresponding flags: -   :const:`re.I` (ignore case), :const:`re.M` (multi-line), :const:`re.S` -   (dot matches all), and :const:`re.X` (verbose), for the part of the -   expression.  (The flags are described in :ref:`contents-of-module-re`.) +``(?aiLmsux-imsx:...)`` +   (Zero or more letters from the set ``'a'``, ``'i'``, ``'L'``, ``'m'``, +   ``'s'``, ``'u'``, ``'x'``, optionally followed by ``'-'`` followed by +   one or more letters from the ``'i'``, ``'m'``, ``'s'``, ``'x'``.) +   The letters set or remove the corresponding flags: +   :const:`re.A` (ASCII-only matching), :const:`re.I` (ignore case), +   :const:`re.L` (locale dependent), :const:`re.M` (multi-line), +   :const:`re.S` (dot matches all), :const:`re.U` (Unicode matching), +   and :const:`re.X` (verbose), for the part of the expression. +   (The flags are described in :ref:`contents-of-module-re`.) + +   The letters ``'a'``, ``'L'`` and ``'u'`` are mutually exclusive when used +   as inline flags, so they can't be combined or follow ``'-'``.  Instead, +   when one of them appears in an inline group, it overrides the matching mode +   in the enclosing group.  In Unicode patterns ``(?a:...)`` switches to +   ASCII-only matching, and ``(?u:...)`` switches to Unicode matching +   (default).  In byte pattern ``(?L:...)`` switches to locale depending +   matching, and ``(?a:...)`` switches to ASCII-only matching (default). +   This override is only in effect for the narrow inline group, and the +   original matching mode is restored outside of the group.     .. versionadded:: 3.6 +   .. versionchanged:: 3.7 +      The letters ``'a'``, ``'L'`` and ``'u'`` also can be used in a group. +  ``(?P<name>...)``     Similar to regular parentheses, but the substring matched by the group is     accessible via the symbolic group name *name*.  Group names must be valid @@ -384,9 +400,7 @@ character ``'$'``.        Matches any Unicode decimal digit (that is, any character in        Unicode character category [Nd]).  This includes ``[0-9]``, and        also many other digit characters.  If the :const:`ASCII` flag is -      used only ``[0-9]`` is matched (but the flag affects the entire -      regular expression, so in such cases using an explicit ``[0-9]`` -      may be a better choice). +      used only ``[0-9]`` is matched.     For 8-bit (bytes) patterns:        Matches any decimal digit; this is equivalent to ``[0-9]``. @@ -394,9 +408,7 @@ character ``'$'``.  ``\D``     Matches any character which is not a decimal digit. This is     the opposite of ``\d``. If the :const:`ASCII` flag is used this -   becomes the equivalent of ``[^0-9]`` (but the flag affects the entire -   regular expression, so in such cases using an explicit ``[^0-9]`` may -   be a better choice). +   becomes the equivalent of ``[^0-9]``.  ``\s``     For Unicode (str) patterns: @@ -404,9 +416,7 @@ character ``'$'``.        ``[ \t\n\r\f\v]``, and also many other characters, for example the        non-breaking spaces mandated by typography rules in many        languages). If the :const:`ASCII` flag is used, only -      ``[ \t\n\r\f\v]`` is matched (but the flag affects the entire -      regular expression, so in such cases using an explicit -      ``[ \t\n\r\f\v]`` may be a better choice). +      ``[ \t\n\r\f\v]`` is matched.     For 8-bit (bytes) patterns:        Matches characters considered whitespace in the ASCII character set; @@ -415,18 +425,14 @@ character ``'$'``.  ``\S``     Matches any character which is not a whitespace character. This is     the opposite of ``\s``. If the :const:`ASCII` flag is used this -   becomes the equivalent of ``[^ \t\n\r\f\v]`` (but the flag affects the entire -   regular expression, so in such cases using an explicit ``[^ \t\n\r\f\v]`` may -   be a better choice). +   becomes the equivalent of ``[^ \t\n\r\f\v]``.  ``\w``     For Unicode (str) patterns:        Matches Unicode word characters; this includes most characters        that can be part of a word in any language, as well as numbers and        the underscore. If the :const:`ASCII` flag is used, only -      ``[a-zA-Z0-9_]`` is matched (but the flag affects the entire -      regular expression, so in such cases using an explicit -      ``[a-zA-Z0-9_]`` may be a better choice). +      ``[a-zA-Z0-9_]`` is matched.     For 8-bit (bytes) patterns:        Matches characters considered alphanumeric in the ASCII character set; @@ -437,9 +443,7 @@ character ``'$'``.  ``\W``     Matches any character which is not a word character. This is     the opposite of ``\w``. If the :const:`ASCII` flag is used this -   becomes the equivalent of ``[^a-zA-Z0-9_]`` (but the flag affects the -   entire regular expression, so in such cases using an explicit -   ``[^a-zA-Z0-9_]`` may be a better choice).  If the :const:`LOCALE` flag is +   becomes the equivalent of ``[^a-zA-Z0-9_]``.  If the :const:`LOCALE` flag is     used, matches characters considered alphanumeric in the current locale     and the underscore. @@ -563,9 +567,7 @@ form.     letter I with dot above), 'ı' (U+0131, Latin small letter dotless i),     'ſ' (U+017F, Latin small letter long s) and 'K' (U+212A, Kelvin sign).     If the :const:`ASCII` flag is used, only letters 'a' to 'z' -   and 'A' to 'Z' are matched (but the flag affects the entire regular -   expression, so in such cases using an explicit ``(?-i:[a-zA-Z])`` may be -   a better choice). +   and 'A' to 'Z' are matched.  .. data:: L            LOCALE diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 46121dcf30..17e4e0a881 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -296,6 +296,13 @@ pdb  argument.  If given, this is printed to the console just before debugging  begins. +re +-- + +The flags :const:`re.ASCII`, :const:`re.LOCALE` and :const:`re.UNICODE` +can be set within the scope of a group. +(Contributed by Serhiy Storchaka in :issue:`31690`.) +  string  ------ diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 144620c6d1..e5216b792f 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -62,6 +62,12 @@ _equivalences = (  _ignorecase_fixes = {i: tuple(j for j in t if i != j)                       for t in _equivalences for i in t} +def _combine_flags(flags, add_flags, del_flags, +                   TYPE_FLAGS=sre_parse.TYPE_FLAGS): +    if add_flags & TYPE_FLAGS: +        flags &= ~TYPE_FLAGS +    return (flags | add_flags) & ~del_flags +  def _compile(code, pattern, flags):      # internal: compile a (sub)pattern      emit = code.append @@ -87,15 +93,21 @@ def _compile(code, pattern, flags):                  emit(op)                  emit(av)              elif flags & SRE_FLAG_LOCALE: -                emit(OP_LOC_IGNORE[op]) +                emit(OP_LOCALE_IGNORE[op])                  emit(av)              elif not iscased(av):                  emit(op)                  emit(av)              else:                  lo = tolower(av) -                if fixes and lo in fixes: -                    emit(IN_IGNORE) +                if not fixes:  # ascii +                    emit(OP_IGNORE[op]) +                    emit(lo) +                elif lo not in fixes: +                    emit(OP_UNICODE_IGNORE[op]) +                    emit(lo) +                else: +                    emit(IN_UNI_IGNORE)                      skip = _len(code); emit(0)                      if op is NOT_LITERAL:                          emit(NEGATE) @@ -104,17 +116,16 @@ def _compile(code, pattern, flags):                          emit(k)                      emit(FAILURE)                      code[skip] = _len(code) - skip -                else: -                    emit(OP_IGNORE[op]) -                    emit(lo)          elif op is IN:              charset, hascased = _optimize_charset(av, iscased, tolower, fixes)              if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:                  emit(IN_LOC_IGNORE) -            elif hascased: +            elif not hascased: +                emit(IN) +            elif not fixes:  # ascii                  emit(IN_IGNORE)              else: -                emit(IN) +                emit(IN_UNI_IGNORE)              skip = _len(code); emit(0)              _compile_charset(charset, flags, code)              code[skip] = _len(code) - skip @@ -153,8 +164,8 @@ def _compile(code, pattern, flags):              if group:                  emit(MARK)                  emit((group-1)*2) -            # _compile_info(code, p, (flags | add_flags) & ~del_flags) -            _compile(code, p, (flags | add_flags) & ~del_flags) +            # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags)) +            _compile(code, p, _combine_flags(flags, add_flags, del_flags))              if group:                  emit(MARK)                  emit((group-1)*2+1) @@ -210,10 +221,14 @@ def _compile(code, pattern, flags):                  av = CH_UNICODE[av]              emit(av)          elif op is GROUPREF: -            if flags & SRE_FLAG_IGNORECASE: -                emit(OP_IGNORE[op]) -            else: +            if not flags & SRE_FLAG_IGNORECASE:                  emit(op) +            elif flags & SRE_FLAG_LOCALE: +                emit(GROUPREF_LOC_IGNORE) +            elif not fixes:  # ascii +                emit(GROUPREF_IGNORE) +            else: +                emit(GROUPREF_UNI_IGNORE)              emit(av-1)          elif op is GROUPREF_EXISTS:              emit(op) @@ -240,7 +255,7 @@ def _compile_charset(charset, flags, code):              pass          elif op is LITERAL:              emit(av) -        elif op is RANGE or op is RANGE_IGNORE: +        elif op is RANGE or op is RANGE_UNI_IGNORE:              emit(av[0])              emit(av[1])          elif op is CHARSET: @@ -309,9 +324,9 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):                      hascased = True                      # There are only two ranges of cased non-BMP characters:                      # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), -                    # and for both ranges RANGE_IGNORE works. +                    # and for both ranges RANGE_UNI_IGNORE works.                      if op is RANGE: -                        op = RANGE_IGNORE +                        op = RANGE_UNI_IGNORE                  tail.append((op, av))              break @@ -456,7 +471,7 @@ def _get_literal_prefix(pattern, flags):              prefixappend(av)          elif op is SUBPATTERN:              group, add_flags, del_flags, p = av -            flags1 = (flags | add_flags) & ~del_flags +            flags1 = _combine_flags(flags, add_flags, del_flags)              if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:                  break              prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1) @@ -482,7 +497,7 @@ def _get_charset_prefix(pattern, flags):          if op is not SUBPATTERN:              break          group, add_flags, del_flags, pattern = av -        flags = (flags | add_flags) & ~del_flags +        flags = _combine_flags(flags, add_flags, del_flags)          if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:              return None @@ -631,6 +646,7 @@ def dis(code):                  print_(op)              elif op in (LITERAL, NOT_LITERAL,                          LITERAL_IGNORE, NOT_LITERAL_IGNORE, +                        LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,                          LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):                  arg = code[i]                  i += 1 @@ -647,12 +663,12 @@ def dis(code):                  arg = str(CHCODES[arg])                  assert arg[:9] == 'CATEGORY_'                  print_(op, arg[9:]) -            elif op in (IN, IN_IGNORE, IN_LOC_IGNORE): +            elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):                  skip = code[i]                  print_(op, skip, to=i+skip)                  dis_(i+1, i+skip)                  i += skip -            elif op in (RANGE, RANGE_IGNORE): +            elif op in (RANGE, RANGE_UNI_IGNORE):                  lo, hi = code[i: i+2]                  i += 2                  print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi))) @@ -671,7 +687,8 @@ def dis(code):                      print_2(_hex_code(code[i: i + 256//_CODEBITS]))                      i += 256//_CODEBITS                  level -= 1 -            elif op in (MARK, GROUPREF, GROUPREF_IGNORE): +            elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE, +                        GROUPREF_LOC_IGNORE):                  arg = code[i]                  i += 1                  print_(op, arg) diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 1daa7bd00f..13deb00bc8 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -13,7 +13,7 @@  # update when constants are added or removed -MAGIC = 20170530 +MAGIC = 20171005  from _sre import MAXREPEAT, MAXGROUPS @@ -84,25 +84,37 @@ OPCODES = _makecodes("""      CALL      CATEGORY      CHARSET BIGCHARSET -    GROUPREF GROUPREF_EXISTS GROUPREF_IGNORE -    IN IN_IGNORE +    GROUPREF GROUPREF_EXISTS +    IN      INFO      JUMP -    LITERAL LITERAL_IGNORE +    LITERAL      MARK      MAX_UNTIL      MIN_UNTIL -    NOT_LITERAL NOT_LITERAL_IGNORE +    NOT_LITERAL      NEGATE      RANGE      REPEAT      REPEAT_ONE      SUBPATTERN      MIN_REPEAT_ONE -    RANGE_IGNORE + +    GROUPREF_IGNORE +    IN_IGNORE +    LITERAL_IGNORE +    NOT_LITERAL_IGNORE + +    GROUPREF_LOC_IGNORE +    IN_LOC_IGNORE      LITERAL_LOC_IGNORE      NOT_LITERAL_LOC_IGNORE -    IN_LOC_IGNORE + +    GROUPREF_UNI_IGNORE +    IN_UNI_IGNORE +    LITERAL_UNI_IGNORE +    NOT_LITERAL_UNI_IGNORE +    RANGE_UNI_IGNORE      MIN_REPEAT MAX_REPEAT  """) @@ -113,7 +125,9 @@ ATCODES = _makecodes("""      AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING      AT_BOUNDARY AT_NON_BOUNDARY      AT_END AT_END_LINE AT_END_STRING +      AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY +      AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY  """) @@ -123,7 +137,9 @@ CHCODES = _makecodes("""      CATEGORY_SPACE CATEGORY_NOT_SPACE      CATEGORY_WORD CATEGORY_NOT_WORD      CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK +      CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD +      CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT      CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE      CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD @@ -133,18 +149,20 @@ CHCODES = _makecodes("""  # replacement operations for "ignore case" mode  OP_IGNORE = { -    GROUPREF: GROUPREF_IGNORE, -    IN: IN_IGNORE,      LITERAL: LITERAL_IGNORE,      NOT_LITERAL: NOT_LITERAL_IGNORE, -    RANGE: RANGE_IGNORE,  } -OP_LOC_IGNORE = { +OP_LOCALE_IGNORE = {      LITERAL: LITERAL_LOC_IGNORE,      NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,  } +OP_UNICODE_IGNORE = { +    LITERAL: LITERAL_UNI_IGNORE, +    NOT_LITERAL: NOT_LITERAL_UNI_IGNORE, +} +  AT_MULTILINE = {      AT_BEGINNING: AT_BEGINNING_LINE,      AT_END: AT_END_LINE diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 545252074f..8527412293 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -65,8 +65,8 @@ FLAGS = {      "u": SRE_FLAG_UNICODE,  } -GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE | -                SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE) +TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE +GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE  class Verbose(Exception):      pass @@ -822,7 +822,19 @@ def _parse_flags(source, state, char):      del_flags = 0      if char != "-":          while True: -            add_flags |= FLAGS[char] +            flag = FLAGS[char] +            if source.istext: +                if char == 'L': +                    msg = "bad inline flags: cannot use 'L' flag with a str pattern" +                    raise source.error(msg) +            else: +                if char == 'u': +                    msg = "bad inline flags: cannot use 'u' flag with a bytes pattern" +                    raise source.error(msg) +            add_flags |= flag +            if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag: +                msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible" +                raise source.error(msg)              char = sourceget()              if char is None:                  raise source.error("missing -, : or )") @@ -844,7 +856,11 @@ def _parse_flags(source, state, char):              msg = "unknown flag" if char.isalpha() else "missing flag"              raise source.error(msg, len(char))          while True: -            del_flags |= FLAGS[char] +            flag = FLAGS[char] +            if flag & TYPE_FLAGS: +                msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'" +                raise source.error(msg) +            del_flags |= flag              char = sourceget()              if char is None:                  raise source.error("missing :") diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 9cb426a04d..fc015e4ed9 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1470,11 +1470,11 @@ class ReTests(unittest.TestCase):              self.assertIsNone(pat.match(b'\xe0'))          # Incompatibilities          self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE) -        self.assertRaises(ValueError, re.compile, br'(?u)\w') +        self.assertRaises(re.error, re.compile, br'(?u)\w')          self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)          self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)          self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE) -        self.assertRaises(ValueError, re.compile, r'(?au)\w') +        self.assertRaises(re.error, re.compile, r'(?au)\w')      def test_locale_flag(self):          import locale @@ -1516,11 +1516,11 @@ class ReTests(unittest.TestCase):              self.assertIsNone(pat.match(bletter))          # Incompatibilities          self.assertRaises(ValueError, re.compile, '', re.LOCALE) -        self.assertRaises(ValueError, re.compile, '(?L)') +        self.assertRaises(re.error, re.compile, '(?L)')          self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)          self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)          self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE) -        self.assertRaises(ValueError, re.compile, b'(?aL)') +        self.assertRaises(re.error, re.compile, b'(?aL)')      def test_scoped_flags(self):          self.assertTrue(re.match(r'(?i:a)b', 'Ab')) @@ -1535,12 +1535,18 @@ class ReTests(unittest.TestCase):          self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE))          self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE)) -        self.checkPatternError(r'(?a:\w)', -                               'bad inline flags: cannot turn on global flag', 3) +        self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0')) +        self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0')) +        self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII)) +          self.checkPatternError(r'(?a)(?-a:\w)', -                               'bad inline flags: cannot turn off global flag', 8) +                "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8)          self.checkPatternError(r'(?i-i:a)', -                               'bad inline flags: flag turned on and off', 5) +                'bad inline flags: flag turned on and off', 5) +        self.checkPatternError(r'(?au:a)', +                "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4) +        self.checkPatternError(br'(?aL:a)', +                "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)          self.checkPatternError(r'(?-', 'missing flag', 3)          self.checkPatternError(r'(?-+', 'missing flag', 3) diff --git a/Misc/NEWS.d/next/Library/2017-10-05-15-14-46.bpo-31690.f0XteV.rst b/Misc/NEWS.d/next/Library/2017-10-05-15-14-46.bpo-31690.f0XteV.rst new file mode 100644 index 0000000000..1505615d27 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2017-10-05-15-14-46.bpo-31690.f0XteV.rst @@ -0,0 +1,2 @@ +Allow the flags re.ASCII, re.LOCALE, and re.UNICODE to be used as group flags +for regular expressions. diff --git a/Modules/_sre.c b/Modules/_sre.c index c42ab2668f..a9b6b50e84 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -97,12 +97,12 @@ static const char copyright[] =  #define SRE_IS_WORD(ch)\      ((ch) < 128 && (Py_ISALNUM(ch) || (ch) == '_')) -static unsigned int sre_lower(unsigned int ch) +static unsigned int sre_lower_ascii(unsigned int ch)  {      return ((ch) < 128 ? Py_TOLOWER(ch) : ch);  } -static unsigned int sre_upper(unsigned int ch) +static unsigned int sre_upper_ascii(unsigned int ch)  {      return ((ch) < 128 ? Py_TOUPPER(ch) : ch);  } @@ -188,6 +188,15 @@ sre_category(SRE_CODE category, unsigned int ch)      return 0;  } +LOCAL(int) +char_loc_ignore(SRE_CODE pattern, SRE_CODE ch) +{ +    return ch == pattern +        || (SRE_CODE) sre_lower_locale(ch) == pattern +        || (SRE_CODE) sre_upper_locale(ch) == pattern; +} + +  /* helpers */  static void @@ -286,7 +295,7 @@ _sre_ascii_iscased_impl(PyObject *module, int character)  /*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/  {      unsigned int ch = (unsigned int)character; -    return ch != sre_lower(ch) || ch != sre_upper(ch); +    return ch != sre_lower_ascii(ch) || ch != sre_upper_ascii(ch);  }  /*[clinic input] @@ -317,7 +326,7 @@ static int  _sre_ascii_tolower_impl(PyObject *module, int character)  /*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/  { -    return sre_lower(character); +    return sre_lower_ascii(character);  }  /*[clinic input] @@ -448,19 +457,6 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,      state->pos = start;      state->endpos = end; -    if (pattern->flags & SRE_FLAG_LOCALE) { -        state->lower = sre_lower_locale; -        state->upper = sre_upper_locale; -    } -    else if (pattern->flags & SRE_FLAG_UNICODE) { -        state->lower = sre_lower_unicode; -        state->upper = sre_upper_unicode; -    } -    else { -        state->lower = sre_lower; -        state->upper = sre_upper; -    } -      return string;    err:      PyMem_Del(state->mark); @@ -1533,7 +1529,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)              break;          case SRE_OP_RANGE: -        case SRE_OP_RANGE_IGNORE: +        case SRE_OP_RANGE_UNI_IGNORE:              GET_ARG;              GET_ARG;              break; @@ -1630,6 +1626,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)          case SRE_OP_NOT_LITERAL:          case SRE_OP_LITERAL_IGNORE:          case SRE_OP_NOT_LITERAL_IGNORE: +        case SRE_OP_LITERAL_UNI_IGNORE: +        case SRE_OP_NOT_LITERAL_UNI_IGNORE:          case SRE_OP_LITERAL_LOC_IGNORE:          case SRE_OP_NOT_LITERAL_LOC_IGNORE:              GET_ARG; @@ -1669,6 +1667,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)          case SRE_OP_IN:          case SRE_OP_IN_IGNORE: +        case SRE_OP_IN_UNI_IGNORE:          case SRE_OP_IN_LOC_IGNORE:              GET_SKIP;              /* Stop 1 before the end; we check the FAILURE below */ @@ -1805,6 +1804,8 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)          case SRE_OP_GROUPREF:          case SRE_OP_GROUPREF_IGNORE: +        case SRE_OP_GROUPREF_UNI_IGNORE: +        case SRE_OP_GROUPREF_LOC_IGNORE:              GET_ARG;              if (arg >= (size_t)groups)                  FAIL; diff --git a/Modules/sre.h b/Modules/sre.h index 9af5e40574..585d2841a6 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -52,8 +52,6 @@ typedef struct {      Py_ssize_t mark[1];  } MatchObject; -typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch); -  typedef struct SRE_REPEAT_T {      Py_ssize_t count;      SRE_CODE* pattern; /* points to REPEAT operator arguments */ @@ -83,8 +81,6 @@ typedef struct {      Py_buffer buffer;      /* current repeat context */      SRE_REPEAT *repeat; -    /* hooks */ -    SRE_TOLOWER_HOOK lower, upper;  } SRE_STATE;  typedef struct { diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 6d6d21efd0..c8ccb32d21 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -11,7 +11,7 @@   * See the _sre.c file for information on usage and redistribution.   */ -#define SRE_MAGIC 20170530 +#define SRE_MAGIC 20171005  #define SRE_OP_FAILURE 0  #define SRE_OP_SUCCESS 1  #define SRE_OP_ANY 2 @@ -26,28 +26,33 @@  #define SRE_OP_BIGCHARSET 11  #define SRE_OP_GROUPREF 12  #define SRE_OP_GROUPREF_EXISTS 13 -#define SRE_OP_GROUPREF_IGNORE 14 -#define SRE_OP_IN 15 -#define SRE_OP_IN_IGNORE 16 -#define SRE_OP_INFO 17 -#define SRE_OP_JUMP 18 -#define SRE_OP_LITERAL 19 -#define SRE_OP_LITERAL_IGNORE 20 -#define SRE_OP_MARK 21 -#define SRE_OP_MAX_UNTIL 22 -#define SRE_OP_MIN_UNTIL 23 -#define SRE_OP_NOT_LITERAL 24 -#define SRE_OP_NOT_LITERAL_IGNORE 25 -#define SRE_OP_NEGATE 26 -#define SRE_OP_RANGE 27 -#define SRE_OP_REPEAT 28 -#define SRE_OP_REPEAT_ONE 29 -#define SRE_OP_SUBPATTERN 30 -#define SRE_OP_MIN_REPEAT_ONE 31 -#define SRE_OP_RANGE_IGNORE 32 -#define SRE_OP_LITERAL_LOC_IGNORE 33 -#define SRE_OP_NOT_LITERAL_LOC_IGNORE 34 -#define SRE_OP_IN_LOC_IGNORE 35 +#define SRE_OP_IN 14 +#define SRE_OP_INFO 15 +#define SRE_OP_JUMP 16 +#define SRE_OP_LITERAL 17 +#define SRE_OP_MARK 18 +#define SRE_OP_MAX_UNTIL 19 +#define SRE_OP_MIN_UNTIL 20 +#define SRE_OP_NOT_LITERAL 21 +#define SRE_OP_NEGATE 22 +#define SRE_OP_RANGE 23 +#define SRE_OP_REPEAT 24 +#define SRE_OP_REPEAT_ONE 25 +#define SRE_OP_SUBPATTERN 26 +#define SRE_OP_MIN_REPEAT_ONE 27 +#define SRE_OP_GROUPREF_IGNORE 28 +#define SRE_OP_IN_IGNORE 29 +#define SRE_OP_LITERAL_IGNORE 30 +#define SRE_OP_NOT_LITERAL_IGNORE 31 +#define SRE_OP_GROUPREF_LOC_IGNORE 32 +#define SRE_OP_IN_LOC_IGNORE 33 +#define SRE_OP_LITERAL_LOC_IGNORE 34 +#define SRE_OP_NOT_LITERAL_LOC_IGNORE 35 +#define SRE_OP_GROUPREF_UNI_IGNORE 36 +#define SRE_OP_IN_UNI_IGNORE 37 +#define SRE_OP_LITERAL_UNI_IGNORE 38 +#define SRE_OP_NOT_LITERAL_UNI_IGNORE 39 +#define SRE_OP_RANGE_UNI_IGNORE 40  #define SRE_AT_BEGINNING 0  #define SRE_AT_BEGINNING_LINE 1  #define SRE_AT_BEGINNING_STRING 2 diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index b540d219dd..e13b90e8bc 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -101,14 +101,6 @@ SRE(at)(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)  }  LOCAL(int) -SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch) -{ -    return ch == pattern -        || (SRE_CODE) state->lower(ch) == pattern -        || (SRE_CODE) state->upper(ch) == pattern; -} - -LOCAL(int)  SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)  {      /* check if character is a member of the given set */ @@ -150,14 +142,14 @@ SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)              set += 2;              break; -        case SRE_OP_RANGE_IGNORE: -            /* <RANGE_IGNORE> <lower> <upper> */ +        case SRE_OP_RANGE_UNI_IGNORE: +            /* <RANGE_UNI_IGNORE> <lower> <upper> */          {              SRE_CODE uch;              /* ch is already lower cased */              if (set[0] <= ch && ch <= set[1])                  return ok; -            uch = state->upper(ch); +            uch = sre_upper_unicode(ch);              if (set[0] <= uch && uch <= set[1])                  return ok;              set += 2; @@ -199,11 +191,11 @@ LOCAL(int)  SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)  {      SRE_CODE lo, up; -    lo = state->lower(ch); +    lo = sre_lower_locale(ch);      if (SRE(charset)(state, set, lo))         return 1; -    up = state->upper(ch); +    up = sre_upper_locale(ch);      return up != lo && SRE(charset)(state, set, up);  } @@ -263,7 +255,15 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)          /* repeated literal */          chr = pattern[1];          TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr)); -        while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr) +        while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) == chr) +            ptr++; +        break; + +    case SRE_OP_LITERAL_UNI_IGNORE: +        /* repeated literal */ +        chr = pattern[1]; +        TRACE(("|%p|%p|COUNT LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr)); +        while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) == chr)              ptr++;          break; @@ -271,7 +271,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)          /* repeated literal */          chr = pattern[1];          TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr)); -        while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr)) +        while (ptr < end && char_loc_ignore(chr, *ptr))              ptr++;          break; @@ -293,7 +293,15 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)          /* repeated non-literal */          chr = pattern[1];          TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr)); -        while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr) +        while (ptr < end && (SRE_CODE) sre_lower_ascii(*ptr) != chr) +            ptr++; +        break; + +    case SRE_OP_NOT_LITERAL_UNI_IGNORE: +        /* repeated non-literal */ +        chr = pattern[1]; +        TRACE(("|%p|%p|COUNT NOT_LITERAL_UNI_IGNORE %d\n", pattern, ptr, chr)); +        while (ptr < end && (SRE_CODE) sre_lower_unicode(*ptr) != chr)              ptr++;          break; @@ -301,7 +309,7 @@ SRE(count)(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)          /* repeated non-literal */          chr = pattern[1];          TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr)); -        while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr)) +        while (ptr < end && !char_loc_ignore(chr, *ptr))              ptr++;          break; @@ -687,7 +695,17 @@ entrance:              TRACE(("|%p|%p|LITERAL_IGNORE %d\n",                     ctx->pattern, ctx->ptr, ctx->pattern[0]));              if (ctx->ptr >= end || -                state->lower(*ctx->ptr) != *ctx->pattern) +                sre_lower_ascii(*ctx->ptr) != *ctx->pattern) +                RETURN_FAILURE; +            ctx->pattern++; +            ctx->ptr++; +            break; + +        case SRE_OP_LITERAL_UNI_IGNORE: +            TRACE(("|%p|%p|LITERAL_UNI_IGNORE %d\n", +                   ctx->pattern, ctx->ptr, ctx->pattern[0])); +            if (ctx->ptr >= end || +                sre_lower_unicode(*ctx->ptr) != *ctx->pattern)                  RETURN_FAILURE;              ctx->pattern++;              ctx->ptr++; @@ -697,7 +715,7 @@ entrance:              TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",                     ctx->pattern, ctx->ptr, ctx->pattern[0]));              if (ctx->ptr >= end -                || !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr)) +                || !char_loc_ignore(*ctx->pattern, *ctx->ptr))                  RETURN_FAILURE;              ctx->pattern++;              ctx->ptr++; @@ -707,7 +725,17 @@ entrance:              TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",                     ctx->pattern, ctx->ptr, *ctx->pattern));              if (ctx->ptr >= end || -                state->lower(*ctx->ptr) == *ctx->pattern) +                sre_lower_ascii(*ctx->ptr) == *ctx->pattern) +                RETURN_FAILURE; +            ctx->pattern++; +            ctx->ptr++; +            break; + +        case SRE_OP_NOT_LITERAL_UNI_IGNORE: +            TRACE(("|%p|%p|NOT_LITERAL_UNI_IGNORE %d\n", +                   ctx->pattern, ctx->ptr, *ctx->pattern)); +            if (ctx->ptr >= end || +                sre_lower_unicode(*ctx->ptr) == *ctx->pattern)                  RETURN_FAILURE;              ctx->pattern++;              ctx->ptr++; @@ -717,7 +745,7 @@ entrance:              TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",                     ctx->pattern, ctx->ptr, *ctx->pattern));              if (ctx->ptr >= end -                || SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr)) +                || char_loc_ignore(*ctx->pattern, *ctx->ptr))                  RETURN_FAILURE;              ctx->pattern++;              ctx->ptr++; @@ -727,7 +755,17 @@ entrance:              TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));              if (ctx->ptr >= end                  || !SRE(charset)(state, ctx->pattern+1, -                                 (SRE_CODE)state->lower(*ctx->ptr))) +                                 (SRE_CODE)sre_lower_ascii(*ctx->ptr))) +                RETURN_FAILURE; +            ctx->pattern += ctx->pattern[0]; +            ctx->ptr++; +            break; + +        case SRE_OP_IN_UNI_IGNORE: +            TRACE(("|%p|%p|IN_UNI_IGNORE\n", ctx->pattern, ctx->ptr)); +            if (ctx->ptr >= end +                || !SRE(charset)(state, ctx->pattern+1, +                                 (SRE_CODE)sre_lower_unicode(*ctx->ptr)))                  RETURN_FAILURE;              ctx->pattern += ctx->pattern[0];              ctx->ptr++; @@ -1135,7 +1173,59 @@ entrance:                          RETURN_FAILURE;                      while (p < e) {                          if (ctx->ptr >= end || -                            state->lower(*ctx->ptr) != state->lower(*p)) +                            sre_lower_ascii(*ctx->ptr) != sre_lower_ascii(*p)) +                            RETURN_FAILURE; +                        p++; +                        ctx->ptr++; +                    } +                } +            } +            ctx->pattern++; +            break; + +        case SRE_OP_GROUPREF_UNI_IGNORE: +            /* match backreference */ +            TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n", ctx->pattern, +                   ctx->ptr, ctx->pattern[0])); +            i = ctx->pattern[0]; +            { +                Py_ssize_t groupref = i+i; +                if (groupref >= state->lastmark) { +                    RETURN_FAILURE; +                } else { +                    SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; +                    SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; +                    if (!p || !e || e < p) +                        RETURN_FAILURE; +                    while (p < e) { +                        if (ctx->ptr >= end || +                            sre_lower_unicode(*ctx->ptr) != sre_lower_unicode(*p)) +                            RETURN_FAILURE; +                        p++; +                        ctx->ptr++; +                    } +                } +            } +            ctx->pattern++; +            break; + +        case SRE_OP_GROUPREF_LOC_IGNORE: +            /* match backreference */ +            TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n", ctx->pattern, +                   ctx->ptr, ctx->pattern[0])); +            i = ctx->pattern[0]; +            { +                Py_ssize_t groupref = i+i; +                if (groupref >= state->lastmark) { +                    RETURN_FAILURE; +                } else { +                    SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref]; +                    SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1]; +                    if (!p || !e || e < p) +                        RETURN_FAILURE; +                    while (p < e) { +                        if (ctx->ptr >= end || +                            sre_lower_locale(*ctx->ptr) != sre_lower_locale(*p))                              RETURN_FAILURE;                          p++;                          ctx->ptr++;  | 
