summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWonsup Yoon <pusnow@me.com>2018-06-15 21:03:14 +0900
committerXiang Zhang <angwerzx@126.com>2018-06-15 20:03:14 +0800
commitd134809cd3764c6a634eab7bb8995e3e2eff14d5 (patch)
tree6bcc3ec615c093c71b96ce1ce52594bacdc75466
parentceeef10cdbc08561f9954e13bbed1cb2299a8c72 (diff)
downloadcpython-git-d134809cd3764c6a634eab7bb8995e3e2eff14d5.tar.gz
bpo-29456: Fix bugs in unicodedata.normalize: u1176, u11a7 and u11c3 (GH-1958)
The Hangul composition range checks used the wrong boundaries: the second character (vowel) must lie in [0x1161, 0x1176) rather than [0x1161, 0x1176], and the third character (trailing consonant) must lie in (0x11A7, 0x11C3) rather than [0x11A7, 0x11C3].
-rw-r--r--Lib/test/test_unicodedata.py13
-rw-r--r--Misc/ACKS1
-rw-r--r--Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst1
-rw-r--r--Modules/unicodedata.c10
4 files changed, 22 insertions, 3 deletions
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
index 99dd0dec9d..170778fa97 100644
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -208,6 +208,19 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
b = 'C\u0338' * 20 + '\xC7'
self.assertEqual(self.db.normalize('NFC', a), b)
+ def test_issue29456(self):
+ # bpo-29456: jamo just outside the composable ranges (U+1176, U+11A7, U+11C3) must not be composed by NFC.
+ u1176_str_a = '\u1100\u1176\u11a8'
+ u1176_str_b = '\u1100\u1176\u11a8'
+ u11a7_str_a = '\u1100\u1175\u11a7'
+ u11a7_str_b = '\uae30\u11a7'
+ u11c3_str_a = '\u1100\u1175\u11c3'
+ u11c3_str_b = '\uae30\u11c3'
+ self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
+ self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
+ self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
+
+
def test_east_asian_width(self):
eaw = self.db.east_asian_width
self.assertRaises(TypeError, eaw, b'a')
diff --git a/Misc/ACKS b/Misc/ACKS
index 96aad5073a..25d1db0781 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -1800,6 +1800,7 @@ Jason Yeo
EungJun Yi
Bob Yodlowski
Danny Yoo
+Wonsup Yoon
Rory Yorke
George Yoshida
Kazuhiro Yoshida
diff --git a/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst b/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst
new file mode 100644
index 0000000000..9b30bf654b
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2017-08-24-17-55-39.bpo-29456.XaB3MP.rst
@@ -0,0 +1 @@
+Fix bugs in hangul normalization: u1176, u11a7 and u11c3
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 7a9a964a0f..e8788f5036 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -681,15 +681,19 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
if (LBase <= code && code < (LBase+LCount) &&
i + 1 < len &&
VBase <= PyUnicode_READ(kind, data, i+1) &&
- PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
+ PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
+ /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
+ and V character is a modern vowel (0x1161 ~ 0x1175). */
int LIndex, VIndex;
LIndex = code - LBase;
VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
code = SBase + (LIndex*VCount+VIndex)*TCount;
i+=2;
if (i < len &&
- TBase <= PyUnicode_READ(kind, data, i) &&
- PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
+ TBase < PyUnicode_READ(kind, data, i) &&
+ PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
+ /* check T character is a modern trailing consonant
+ (0x11A8 ~ 0x11C2). */
code += PyUnicode_READ(kind, data, i)-TBase;
i++;
}