summaryrefslogtreecommitdiff
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@gmail.com>2015-09-21 23:06:27 +0200
committerVictor Stinner <victor.stinner@gmail.com>2015-09-21 23:06:27 +0200
commitf96418de05fc8710f9dc1e3a19878f8e744956f2 (patch)
tree6882ce050257fcb75357680e20d92d58f4e9acc9 /Objects/unicodeobject.c
parentba4529593877f2016215149912496e030782a57e (diff)
downloadcpython-git-f96418de05fc8710f9dc1e3a19878f8e744956f2.tar.gz
Issue #24870: Optimize the ASCII decoder for error handlers: surrogateescape,
ignore and replace. Initial patch written by Naoki Inada. The decoder is now up to 60 times as fast for these error handlers. Add also unit tests for the ASCII decoder.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r--Objects/unicodeobject.c66
1 files changed, 61 insertions, 5 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 0709789991..b8840e1e36 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6644,6 +6644,28 @@ PyUnicode_AsLatin1String(PyObject *unicode)
/* --- 7-bit ASCII Codec -------------------------------------------------- */
+typedef enum {
+ _Py_ERROR_UNKNOWN=0,
+ _Py_ERROR_SURROGATEESCAPE,
+ _Py_ERROR_REPLACE,
+ _Py_ERROR_IGNORE,
+ _Py_ERROR_OTHER
+} _Py_error_handler;
+
+static _Py_error_handler
+get_error_handler(const char *errors)
+{
+ if (errors == NULL)
+ return _Py_ERROR_OTHER;
+ if (strcmp(errors, "surrogateescape") == 0)
+ return _Py_ERROR_SURROGATEESCAPE;
+ if (strcmp(errors, "ignore") == 0)
+ return _Py_ERROR_IGNORE;
+ if (strcmp(errors, "replace") == 0)
+ return _Py_ERROR_REPLACE;
+ return _Py_ERROR_OTHER;
+}
+
PyObject *
PyUnicode_DecodeASCII(const char *s,
Py_ssize_t size,
@@ -6657,8 +6679,9 @@ PyUnicode_DecodeASCII(const char *s,
Py_ssize_t endinpos;
Py_ssize_t outpos;
const char *e;
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
if (size == 0)
_Py_RETURN_UNICODE_EMPTY();
@@ -6687,12 +6710,45 @@ PyUnicode_DecodeASCII(const char *s,
PyUnicode_WRITE(kind, data, writer.pos, c);
writer.pos++;
++s;
+ continue;
}
- else {
+
+ /* byte outsize range 0x00..0x7f: call the error handler */
+
+ if (error_handler == _Py_ERROR_UNKNOWN)
+ error_handler = get_error_handler(errors);
+
+ switch (error_handler)
+ {
+ case _Py_ERROR_REPLACE:
+ case _Py_ERROR_SURROGATEESCAPE:
+ /* Fast-path: the error handler only writes one character,
+ but we must switch to UCS2 at the first write */
+ if (kind < PyUnicode_2BYTE_KIND) {
+ if (_PyUnicodeWriter_Prepare(&writer, size - writer.pos,
+ 0xffff) < 0)
+ return NULL;
+ kind = writer.kind;
+ data = writer.data;
+ }
+
+ if (error_handler == _Py_ERROR_REPLACE)
+ PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
+ else
+ PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+ writer.pos++;
+ ++s;
+ break;
+
+ case _Py_ERROR_IGNORE:
+ ++s;
+ break;
+
+ default:
startinpos = s-starts;
endinpos = startinpos + 1;
if (unicode_decode_call_errorhandler_writer(
- errors, &errorHandler,
+ errors, &error_handler_obj,
"ascii", "ordinal not in range(128)",
&starts, &e, &startinpos, &endinpos, &exc, &s,
&writer))
@@ -6701,13 +6757,13 @@ PyUnicode_DecodeASCII(const char *s,
data = writer.data;
}
}
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return _PyUnicodeWriter_Finish(&writer);
onError:
_PyUnicodeWriter_Dealloc(&writer);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return NULL;
}