Issue #24870: Optimize the ASCII decoder for error handlers: surrogateescape,

ignore and replace. Initial patch written by Naoki Inada. The decoder is now up to 60 times as fast for these error handlers. Add also unit tests for the ASCII decoder.
author: Victor Stinner <victor.stinner@gmail.com> 2015-09-21 23:06:27 +0200
committer: Victor Stinner <victor.stinner@gmail.com> 2015-09-21 23:06:27 +0200
commit: f96418de05fc8710f9dc1e3a19878f8e744956f2 (patch)
tree: 6882ce050257fcb75357680e20d92d58f4e9acc9 /Objects/unicodeobject.c
parent: ba4529593877f2016215149912496e030782a57e (diff)
download: cpython-git-f96418de05fc8710f9dc1e3a19878f8e744956f2.tar.gz
1 files changed, 61 insertions, 5 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 0709789991..b8840e1e36 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6644,6 +6644,28 @@ PyUnicode_AsLatin1String(PyObject *unicode)
 
 /* --- 7-bit ASCII Codec -------------------------------------------------- */
 
+typedef enum {
+    _Py_ERROR_UNKNOWN=0,
+    _Py_ERROR_SURROGATEESCAPE,
+    _Py_ERROR_REPLACE,
+    _Py_ERROR_IGNORE,
+    _Py_ERROR_OTHER
+} _Py_error_handler;
+
+static _Py_error_handler
+get_error_handler(const char *errors)
+{
+    if (errors == NULL)
+        return _Py_ERROR_OTHER;
+    if (strcmp(errors, "surrogateescape") == 0)
+        return _Py_ERROR_SURROGATEESCAPE;
+    if (strcmp(errors, "ignore") == 0)
+        return _Py_ERROR_IGNORE;
+    if (strcmp(errors, "replace") == 0)
+        return _Py_ERROR_REPLACE;
+    return _Py_ERROR_OTHER;
+}
+
 PyObject *
 PyUnicode_DecodeASCII(const char *s,
                       Py_ssize_t size,
@@ -6657,8 +6679,9 @@ PyUnicode_DecodeASCII(const char *s,
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
     const char *e;
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 
     if (size == 0)
         _Py_RETURN_UNICODE_EMPTY();
@@ -6687,12 +6710,45 @@ PyUnicode_DecodeASCII(const char *s,
             PyUnicode_WRITE(kind, data, writer.pos, c);
             writer.pos++;
             ++s;
+            continue;
         }
-        else {
+
+        /* byte outsize range 0x00..0x7f: call the error handler */
+
+        if (error_handler == _Py_ERROR_UNKNOWN)
+            error_handler = get_error_handler(errors);
+
+        switch (error_handler)
+        {
+        case _Py_ERROR_REPLACE:
+        case _Py_ERROR_SURROGATEESCAPE:
+            /* Fast-path: the error handler only writes one character,
+               but we must switch to UCS2 at the first write */
+            if (kind < PyUnicode_2BYTE_KIND) {
+                if (_PyUnicodeWriter_Prepare(&writer, size - writer.pos,
+                                             0xffff) < 0)
+                    return NULL;
+                kind = writer.kind;
+                data = writer.data;
+            }
+
+            if (error_handler == _Py_ERROR_REPLACE)
+                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
+            else
+                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+            writer.pos++;
+            ++s;
+            break;
+
+        case _Py_ERROR_IGNORE:
+            ++s;
+            break;
+
+        default:
             startinpos = s-starts;
             endinpos = startinpos + 1;
             if (unicode_decode_call_errorhandler_writer(
-                    errors, &errorHandler,
+                    errors, &error_handler_obj,
                     "ascii", "ordinal not in range(128)",
                     &starts, &e, &startinpos, &endinpos, &exc, &s,
                     &writer))
@@ -6701,13 +6757,13 @@ PyUnicode_DecodeASCII(const char *s,
             data = writer.data;
         }
     }
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return _PyUnicodeWriter_Finish(&writer);
 
   onError:
     _PyUnicodeWriter_Dealloc(&writer);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return NULL;
 }
author	Victor Stinner <victor.stinner@gmail.com>	2015-09-21 23:06:27 +0200
committer	Victor Stinner <victor.stinner@gmail.com>	2015-09-21 23:06:27 +0200
commit	f96418de05fc8710f9dc1e3a19878f8e744956f2 (patch)
tree	6882ce050257fcb75357680e20d92d58f4e9acc9 /Objects/unicodeobject.c
parent	ba4529593877f2016215149912496e030782a57e (diff)
download	cpython-git-f96418de05fc8710f9dc1e3a19878f8e744956f2.tar.gz