diff options
author | Ezio Melotti <ezio.melotti@gmail.com> | 2012-11-04 23:21:38 +0200 |
---|---|---|
committer | Ezio Melotti <ezio.melotti@gmail.com> | 2012-11-04 23:21:38 +0200 |
commit | f7ed5d111bbe30b36b2629a87d9fcc291de4aafb (patch) | |
tree | a492034add126dd4216104511a035b4c2d7d8253 /Objects/unicodeobject.c | |
parent | 55b5d5c9190fe59da1551edc51b2e27a2a84790e (diff) | |
download | cpython-git-f7ed5d111bbe30b36b2629a87d9fcc291de4aafb.tar.gz |
#8271: the utf-8 decoder now outputs the correct number of U+FFFD characters when used with the "replace" error handler on invalid utf-8 sequences. Patch by Serhiy Storchaka, tests by Ezio Melotti.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 10 |
1 files changed, 4 insertions, 6 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index f61f9d0df1..665f03d884 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4759,9 +4759,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
                 goto End;
             errmsg = "unexpected end of data";
             startinpos = s - starts;
-            endinpos = startinpos + 1;
-            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
-                endinpos++;
+            endinpos = end - starts;
             break;
         case 1:
             errmsg = "invalid start byte";
@@ -4769,11 +4767,11 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
             endinpos = startinpos + 1;
             break;
         case 2:
+        case 3:
+        case 4:
             errmsg = "invalid continuation byte";
             startinpos = s - starts;
-            endinpos = startinpos + 1;
-            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
-                endinpos++;
+            endinpos = startinpos + ch - 1;
             break;
         default:
             if (unicode_putchar(&unicode, &outpos, ch) < 0)