summary | refs | log | tree | commit | diff
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
author: Ezio Melotti <ezio.melotti@gmail.com> 2012-11-04 23:21:38 +0200
committer: Ezio Melotti <ezio.melotti@gmail.com> 2012-11-04 23:21:38 +0200
commit: f7ed5d111bbe30b36b2629a87d9fcc291de4aafb (patch)
tree: a492034add126dd4216104511a035b4c2d7d8253 /Objects/unicodeobject.c
parent: 55b5d5c9190fe59da1551edc51b2e27a2a84790e (diff)
download: cpython-git-f7ed5d111bbe30b36b2629a87d9fcc291de4aafb.tar.gz
#8271: the utf-8 decoder now outputs the correct number of U+FFFD characters when used with the "replace" error handler on invalid utf-8 sequences. Patch by Serhiy Storchaka, tests by Ezio Melotti.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- Objects/unicodeobject.c 10
1 file changed, 4 insertions, 6 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index f61f9d0df1..665f03d884 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4759,9 +4759,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
goto End;
errmsg = "unexpected end of data";
startinpos = s - starts;
- endinpos = startinpos + 1;
- while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
- endinpos++;
+ endinpos = end - starts;
break;
case 1:
errmsg = "invalid start byte";
@@ -4769,11 +4767,11 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
endinpos = startinpos + 1;
break;
case 2:
+ case 3:
+ case 4:
errmsg = "invalid continuation byte";
startinpos = s - starts;
- endinpos = startinpos + 1;
- while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
- endinpos++;
+ endinpos = startinpos + ch - 1;
break;
default:
if (unicode_putchar(&unicode, &outpos, ch) < 0)