diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2013-01-15 14:42:59 +0200 |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2013-01-15 14:42:59 +0200 |
commit | 9599745e2cad406d6e8e38b3bb5727a3d2335a3e (patch) | |
tree | a31d32a4da73498e3f28e5dd40cc877ea8b67c29 | |
parent | 90b5d9288d2a25372471cbee9b49edc2e2193178 (diff) | |
download | cpython-git-9599745e2cad406d6e8e38b3bb5727a3d2335a3e.tar.gz |
Issue #14850: Now a charmap decoder treats U+FFFE as "undefined mapping"
in any mapping, not only in a unicode string.
-rw-r--r-- | Lib/test/test_codecs.py | 56 | ||||
-rw-r--r-- | Misc/NEWS | 3 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 46 |
3 files changed, 79 insertions, 26 deletions
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 5baf225259..e6c39b7391 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1551,6 +1551,14 @@ class CharmapTest(unittest.TestCase): (u"abc", 3) ) + self.assertRaises(UnicodeDecodeError, + codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab" + ) + + self.assertRaises(UnicodeDecodeError, + codecs.charmap_decode, "\x00\x01\x02", "strict", u"ab\ufffe" + ) + self.assertEqual( codecs.charmap_decode("\x00\x01\x02", "replace", u"ab"), (u"ab\ufffd", 3) @@ -1566,10 +1574,6 @@ class CharmapTest(unittest.TestCase): (u"ab", 3) ) - self.assertRaises(UnicodeDecodeError, - codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab" - ) - self.assertEqual( codecs.charmap_decode("\x00\x01\x02", "ignore", u"ab\ufffe"), (u"ab", 3) @@ -1611,6 +1615,17 @@ class CharmapTest(unittest.TestCase): {0: u'a', 1: u'b'} ) + self.assertRaises(UnicodeDecodeError, + codecs.charmap_decode, "\x00\x01\x02", "strict", + {0: u'a', 1: u'b', 2: None} + ) + + # Issue #14850 + self.assertRaises(UnicodeDecodeError, + codecs.charmap_decode, "\x00\x01\x02", "strict", + {0: u'a', 1: u'b', 2: u'\ufffe'} + ) + self.assertEqual( codecs.charmap_decode("\x00\x01\x02", "replace", {0: u'a', 1: u'b'}), @@ -1623,6 +1638,13 @@ class CharmapTest(unittest.TestCase): (u"ab\ufffd", 3) ) + # Issue #14850 + self.assertEqual( + codecs.charmap_decode("\x00\x01\x02", "replace", + {0: u'a', 1: u'b', 2: u'\ufffe'}), + (u"ab\ufffd", 3) + ) + self.assertEqual( codecs.charmap_decode("\x00\x01\x02", "ignore", {0: u'a', 1: u'b'}), @@ -1635,7 +1657,14 @@ class CharmapTest(unittest.TestCase): (u"ab", 3) ) - allbytes = bytes(range(256)) + # Issue #14850 + self.assertEqual( + codecs.charmap_decode("\x00\x01\x02", "ignore", + {0: u'a', 1: u'b', 2: u'\ufffe'}), + (u"ab", 3) + ) + + allbytes = "".join(chr(i) for i in xrange(256)) self.assertEqual( codecs.charmap_decode(allbytes, "ignore", {}), (u"", len(allbytes)) @@ -1669,6 +1698,11 @@ class 
CharmapTest(unittest.TestCase): {0: a, 1: b}, ) + self.assertRaises(UnicodeDecodeError, + codecs.charmap_decode, "\x00\x01\x02", "strict", + {0: a, 1: b, 2: 0xFFFE}, + ) + self.assertEqual( codecs.charmap_decode("\x00\x01\x02", "replace", {0: a, 1: b}), @@ -1676,11 +1710,23 @@ class CharmapTest(unittest.TestCase): ) self.assertEqual( + codecs.charmap_decode("\x00\x01\x02", "replace", + {0: a, 1: b, 2: 0xFFFE}), + (u"ab\ufffd", 3) + ) + + self.assertEqual( codecs.charmap_decode("\x00\x01\x02", "ignore", {0: a, 1: b}), (u"ab", 3) ) + self.assertEqual( + codecs.charmap_decode("\x00\x01\x02", "ignore", + {0: a, 1: b, 2: 0xFFFE}), + (u"ab", 3) + ) + class WithStmtTest(unittest.TestCase): def test_encodedfile(self): @@ -9,6 +9,9 @@ What's New in Python 2.7.4 Core and Builtins ----------------- +- Issue #14850: Now a chamap decoder treates U+FFFE as "undefined mapping" + in any mapping, not only in an unicode string. + - Issue #11461: Fix the incremental UTF-16 decoder. Original patch by Amaury Forgeot d'Arc. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1c6e55d1bf..79e1b600e3 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4121,15 +4121,18 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, if (PyErr_ExceptionMatches(PyExc_LookupError)) { /* No mapping found means: mapping is undefined. 
*/ PyErr_Clear(); - x = Py_None; - Py_INCREF(x); + goto Undefined; } else goto onError; } /* Apply mapping */ + if (x == Py_None) + goto Undefined; if (PyInt_Check(x)) { long value = PyInt_AS_LONG(x); + if (value == 0xFFFE) + goto Undefined; if (value < 0 || value > 0x10FFFF) { PyErr_SetString(PyExc_TypeError, "character mapping must be in range(0x110000)"); @@ -4162,29 +4165,16 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, #endif *p++ = (Py_UNICODE)value; } - else if (x == Py_None) { - /* undefined mapping */ - outpos = p-PyUnicode_AS_UNICODE(v); - startinpos = s-starts; - endinpos = startinpos+1; - if (unicode_decode_call_errorhandler( - errors, &errorHandler, - "charmap", "character maps to <undefined>", - starts, size, &startinpos, &endinpos, &exc, &s, - &v, &outpos, &p)) { - Py_DECREF(x); - goto onError; - } - Py_DECREF(x); - continue; - } else if (PyUnicode_Check(x)) { Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); - if (targetsize == 1) + if (targetsize == 1) { /* 1-1 mapping */ - *p++ = *PyUnicode_AS_UNICODE(x); - + Py_UNICODE value = *PyUnicode_AS_UNICODE(x); + if (value == 0xFFFE) + goto Undefined; + *p++ = value; + } else if (targetsize > 1) { /* 1-n mapping */ if (targetsize > extrachars) { @@ -4218,6 +4208,20 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, } Py_DECREF(x); ++s; + continue; +Undefined: + /* undefined mapping */ + Py_XDECREF(x); + outpos = p-PyUnicode_AS_UNICODE(v); + startinpos = s-starts; + endinpos = startinpos+1; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "charmap", "character maps to <undefined>", + starts, size, &startinpos, &endinpos, &exc, &s, + &v, &outpos, &p)) { + goto onError; + } } } if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) |