summaryrefslogtreecommitdiff
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@gmail.com>2014-04-05 11:44:04 +0200
committerVictor Stinner <victor.stinner@gmail.com>2014-04-05 11:44:04 +0200
commit89a76abf20889551ec1ed64dee1a4161a435db5b (patch)
treedf9f731767053c66cd84bb64ee8dce9b77f938f6 /Objects/unicodeobject.c
parent5a29f2584396fd96796a783564bfda3045f19729 (diff)
downloadcpython-git-89a76abf20889551ec1ed64dee1a4161a435db5b.tar.gz
Issue #21118: Optimize str.translate() for ASCII => ASCII translation
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r--Objects/unicodeobject.c121
1 files changed, 120 insertions, 1 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 91f6620824..0386a87125 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -8545,6 +8545,116 @@ charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
return 1;
}
+static int
+unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
+ Py_UCS1 *translate)
+{
+ PyObject *item;
+ int ret = 0;
+
+ item = NULL;
+ if (charmaptranslate_lookup(ch, mapping, &item)) {
+ return -1;
+ }
+
+ if (item == Py_None) {
+ /* deletion: skip fast translate */
+ goto exit;
+ }
+
+ if (item == NULL) {
+ /* not found => default to 1:1 mapping */
+ translate[ch] = ch;
+ return 1;
+ }
+
+ if (PyLong_Check(item)) {
+ long replace = (Py_UCS4)PyLong_AS_LONG(item);
+ if (replace == -1) {
+ Py_DECREF(item);
+ return -1;
+ }
+ if (replace < 0 || 127 < replace) {
+ /* invalid character or character outside ASCII:
+ skip the fast translate */
+ goto exit;
+ }
+ translate[ch] = (Py_UCS1)replace;
+ }
+ else if (PyUnicode_Check(item)) {
+ Py_UCS4 replace;
+
+ if (PyUnicode_READY(item) == -1) {
+ Py_DECREF(item);
+ return -1;
+ }
+ if (PyUnicode_GET_LENGTH(item) != 1)
+ goto exit;
+
+ replace = PyUnicode_READ_CHAR(item, 0);
+ if (replace > 127)
+ goto exit;
+ translate[ch] = (Py_UCS1)replace;
+ }
+ else {
+ /* not a long or unicode */
+ goto exit;
+ }
+ Py_DECREF(item);
+ item = NULL;
+ ret = 1;
+
+exit:
+ Py_XDECREF(item);
+ return ret;
+}
+
+/* Fast path for ascii => ascii translation. Return 1 if the whole string
+ was translated into writer, return 0 if the input string was partially
+ translated into writer, raise an exception and return -1 on error. */
+static int
+unicode_fast_translate(PyObject *input, PyObject *mapping,
+ _PyUnicodeWriter *writer)
+{
+ Py_UCS1 translate[128], ch, ch2;
+ Py_ssize_t len;
+ Py_UCS1 *in, *end, *out;
+ int res;
+
+ if (PyUnicode_READY(input) == -1)
+ return -1;
+ if (!PyUnicode_IS_ASCII(input))
+ return 0;
+ len = PyUnicode_GET_LENGTH(input);
+
+ memset(translate, 0xff, 128);
+
+ in = PyUnicode_1BYTE_DATA(input);
+ end = in + len;
+
+ assert(PyUnicode_IS_ASCII(writer->buffer));
+ assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
+ out = PyUnicode_1BYTE_DATA(writer->buffer);
+
+ for (; in < end; in++, out++) {
+ ch = *in;
+ ch2 = translate[ch];
+ if (ch2 == 0xff) {
+ res = unicode_fast_translate_lookup(mapping, ch, translate);
+ if (res < 0)
+ return -1;
+ if (res == 0) {
+ writer->pos = in - PyUnicode_1BYTE_DATA(input);
+ return 0;
+ }
+ ch2 = translate[ch];
+ }
+ *out = ch2;
+ }
+ writer->pos = len;
+ return 1;
+}
+
PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
PyObject *mapping,
@@ -8561,6 +8671,7 @@ _PyUnicode_TranslateCharmap(PyObject *input,
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
int ignore;
+ int res;
if (mapping == NULL) {
PyErr_BadArgument();
@@ -8584,9 +8695,17 @@ _PyUnicode_TranslateCharmap(PyObject *input,
if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
goto onError;
+ res = unicode_fast_translate(input, mapping, &writer);
+ if (res < 0) {
+ _PyUnicodeWriter_Dealloc(&writer);
+ return NULL;
+ }
+ if (res == 1)
+ return _PyUnicodeWriter_Finish(&writer);
+
ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
- i = 0;
+ i = writer.pos;
while (i<size) {
/* try to encode it */
int translate;