#8271: The UTF-8 decoder now outputs the correct number of U+FFFD characters when the "replace" error handler is used on invalid UTF-8 sequences. Patch by Serhiy Storchaka, tests by Ezio Melotti.

author: Ezio Melotti <ezio.melotti@gmail.com> 2012-11-04 23:21:38 +0200
committer: Ezio Melotti <ezio.melotti@gmail.com> 2012-11-04 23:21:38 +0200
commit: f7ed5d111bbe30b36b2629a87d9fcc291de4aafb (patch)
tree: a492034add126dd4216104511a035b4c2d7d8253 /Objects/unicodeobject.c
parent: 55b5d5c9190fe59da1551edc51b2e27a2a84790e (diff)
download: cpython-git-f7ed5d111bbe30b36b2629a87d9fcc291de4aafb.tar.gz
1 file changed, 4 insertions, 6 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index f61f9d0df1..665f03d884 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4759,9 +4759,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
                 goto End;
             errmsg = "unexpected end of data";
             startinpos = s - starts;
-            endinpos = startinpos + 1;
-            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
-                endinpos++;
+            endinpos = end - starts;
             break;
         case 1:
             errmsg = "invalid start byte";
@@ -4769,11 +4767,11 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
             endinpos = startinpos + 1;
             break;
         case 2:
+        case 3:
+        case 4:
             errmsg = "invalid continuation byte";
             startinpos = s - starts;
-            endinpos = startinpos + 1;
-            while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
-                endinpos++;
+            endinpos = startinpos + ch - 1;
             break;
         default:
             if (unicode_putchar(&unicode, &outpos, ch) < 0)
author	Ezio Melotti <ezio.melotti@gmail.com>	2012-11-04 23:21:38 +0200
committer	Ezio Melotti <ezio.melotti@gmail.com>	2012-11-04 23:21:38 +0200
commit	f7ed5d111bbe30b36b2629a87d9fcc291de4aafb (patch)
tree	a492034add126dd4216104511a035b4c2d7d8253 /Objects/unicodeobject.c
parent	55b5d5c9190fe59da1551edc51b2e27a2a84790e (diff)
download	cpython-git-f7ed5d111bbe30b36b2629a87d9fcc291de4aafb.tar.gz