summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Mark Dickinson <dickinsm@gmail.com> 2009-07-28 20:35:03 +0000
committer Mark Dickinson <dickinsm@gmail.com> 2009-07-28 20:35:03 +0000
commit fe67bd91685f89fbf95ee9727ce03d20dea3e9b8 (patch)
tree a05a8cfba86d1994b8e8af49ec5cef9924760008
parent 8d31f5413ccbd1857bac21887272f06a84cca619 (diff)
download cpython-git-fe67bd91685f89fbf95ee9727ce03d20dea3e9b8.tar.gz
Issue #6561: '\d' regular expression should not match characters of
category [No]; only those of category [Nd]. (Backport of r74237 from py3k.)
-rw-r--r-- Doc/library/re.rst 3
-rw-r--r-- Lib/test/test_re.py 21
-rw-r--r-- Misc/NEWS 4
-rw-r--r-- Modules/_sre.c 2
4 files changed, 28 insertions, 2 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index 2d5e1956ce..df63f9bec2 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -332,7 +332,8 @@ the second character. For example, ``\$`` matches the character ``'$'``.
``\d``
When the :const:`UNICODE` flag is not specified, matches any decimal digit; this
is equivalent to the set ``[0-9]``. With :const:`UNICODE`, it will match
- whatever is classified as a digit in the Unicode character properties database.
+ whatever is classified as a decimal digit in the Unicode character properties
+ database.
``\D``
When the :const:`UNICODE` flag is not specified, matches any non-digit
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 4f543d93dd..c4cc8208ed 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -636,6 +636,27 @@ class ReTests(unittest.TestCase):
self.assertEqual(iter.next().span(), (4, 4))
self.assertRaises(StopIteration, iter.next)
+ def test_bug_6561(self):
+ # '\d' should match characters in Unicode category 'Nd'
+ # (Number, Decimal Digit), but not those in 'Nl' (Number,
+ # Letter) or 'No' (Number, Other).
+ decimal_digits = [
+ u'\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
+ u'\u0e58', # '\N{THAI DIGIT EIGHT}', category 'Nd'
+ u'\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
+ ]
+ for x in decimal_digits:
+ self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x)
+
+ not_decimal_digits = [
+ u'\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
+ u'\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
+ u'\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
+ u'\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
+ ]
+ for x in not_decimal_digits:
+ self.assertIsNone(re.match('^\d$', x, re.UNICODE))
+
def test_empty_array(self):
# SF bug 1647541
import array
diff --git a/Misc/NEWS b/Misc/NEWS
index 25bbabb72c..6731fb93e1 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -1205,6 +1205,10 @@ C-API
Extension Modules
-----------------
+- Issue #6561: '\d' in a regex now matches only characters with
+ Unicode category 'Nd' (Number, Decimal Digit). Previously it also
+ matched characters with category 'No'.
+
- Issue #1523: Remove deprecated overflow wrapping for struct.pack
with an integer format code ('bBhHiIlLqQ'). Packing an out-of-range
integer now consistently raises struct.error.
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 1aea53bf94..0d9ee24eae 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -172,7 +172,7 @@ static unsigned int sre_lower_locale(unsigned int ch)
#if defined(HAVE_UNICODE)
-#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
+#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))