diff options
author | Mark Dickinson <dickinsm@gmail.com> | 2009-07-28 17:22:36 +0000 |
---|---|---|
committer | Mark Dickinson <dickinsm@gmail.com> | 2009-07-28 17:22:36 +0000 |
commit | 1f268285ff810681612c8f7c91c1faeb70535f52 (patch) | |
tree | 8531e2c66af998125d53694c82626e8a1a204364 | |
parent | 6bd13fbbc8b4917023d124d6523685d456a8e92a (diff) | |
download | cpython-git-1f268285ff810681612c8f7c91c1faeb70535f52.tar.gz |
Issue #6561: '\d' in a regular expression should match only characters in
Unicode category [Nd] (Number, Decimal Digit), not those in [No] (Number, Other).
-rw-r--r-- | Doc/library/re.rst | 11 | ||||
-rw-r--r-- | Lib/test/test_re.py | 21 | ||||
-rw-r--r-- | Misc/NEWS | 4 | ||||
-rw-r--r-- | Modules/_sre.c | 2 |
4 files changed, 32 insertions, 6 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 3b2f70ba8a..cdb9951dc8 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -338,11 +338,12 @@ the second character. For example, ``\$`` matches the character ``'$'``. ``\d`` For Unicode (str) patterns: - Matches any Unicode digit (which includes ``[0-9]``, and also many - other digit characters). If the :const:`ASCII` flag is used only - ``[0-9]`` is matched (but the flag affects the entire regular - expression, so in such cases using an explicit ``[0-9]`` may be a - better choice). + Matches any Unicode decimal digit (that is, any character in + Unicode character category [Nd]). This includes ``[0-9]``, and + also many other digit characters. If the :const:`ASCII` flag is + used only ``[0-9]`` is matched (but the flag affects the entire + regular expression, so in such cases using an explicit ``[0-9]`` + may be a better choice). For 8-bit (bytes) patterns: Matches any decimal digit; this is equivalent to ``[0-9]``. diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 383b56ac7a..8b4d268993 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -605,6 +605,27 @@ class ReTests(unittest.TestCase): self.assertEqual(next(iter).span(), (4, 4)) self.assertRaises(StopIteration, next, iter) + def test_bug_6561(self): + # '\d' should match characters in Unicode category 'Nd' + # (Number, Decimal Digit), but not those in 'Nl' (Number, + # Letter) or 'No' (Number, Other). 
+ decimal_digits = [ + '\u0037', # '\N{DIGIT SEVEN}', category 'Nd' + '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd' + '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd' + ] + for x in decimal_digits: + self.assertEqual(re.match('^\d$', x).group(0), x) + + not_decimal_digits = [ + '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl' + '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl' + '\u2082', # '\N{SUBSCRIPT TWO}', category 'No' + '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No' + ] + for x in not_decimal_digits: + self.assertIsNone(re.match('^\d$', x)) + def test_empty_array(self): # SF buf 1647541 import array @@ -108,6 +108,10 @@ Library Extension Modules ----------------- +- Issue #6561: '\d' in a regex now matches only characters with + Unicode category 'Nd' (Number, Decimal Digit). Previously it also + matched characters with category 'No'. + - Issue #4509: Array objects are no longer modified after an operation failing due to the resize restriction in-place when the object has exported buffers. diff --git a/Modules/_sre.c b/Modules/_sre.c index 45b92f319d..596fd19dfd 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -168,7 +168,7 @@ static unsigned int sre_lower_locale(unsigned int ch) #if defined(HAVE_UNICODE) -#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch)) +#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch)) #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch)) #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch)) |