summaryrefslogtreecommitdiff
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r--Objects/unicodeobject.c266
1 files changed, 95 insertions, 171 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e38ded0fbc..067a945b05 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -8495,76 +8495,54 @@ charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
return -1;
}
}
-/* ensure that *outobj is at least requiredsize characters long,
- if not reallocate and adjust various state variables.
- Return 0 on success, -1 on error */
-static int
-charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
- Py_ssize_t requiredsize)
-{
- Py_ssize_t oldsize = *psize;
- Py_UCS4 *new_outobj;
- if (requiredsize > oldsize) {
- /* exponentially overallocate to minimize reallocations */
- if (requiredsize < 2 * oldsize)
- requiredsize = 2 * oldsize;
- new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
- if (new_outobj == 0)
- return -1;
- *outobj = new_outobj;
- *psize = requiredsize;
- }
- return 0;
-}
-/* lookup the character, put the result in the output string and adjust
- various state variables. Return a new reference to the object that
- was put in the output buffer in *result, or Py_None, if the mapping was
- undefined (in which case no character was written).
- The called must decref result.
- Return 0 on success, -1 on error. */
+
+/* lookup the character, write the result into the writer.
+ Return 1 if the result was written into the writer, return 0 if the mapping
+ was undefined, raise an exception return -1 on error. */
static int
-charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
- PyObject *mapping, Py_UCS4 **output,
- Py_ssize_t *osize, Py_ssize_t *opos,
- PyObject **res)
+charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
+ _PyUnicodeWriter *writer)
{
- Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
- if (charmaptranslate_lookup(curinp, mapping, res))
+ PyObject *item;
+
+ if (charmaptranslate_lookup(ch, mapping, &item))
return -1;
- if (*res==NULL) {
+
+ if (item == NULL) {
/* not found => default to 1:1 mapping */
- (*output)[(*opos)++] = curinp;
+ if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
+ return -1;
+ }
+ return 1;
}
- else if (*res==Py_None)
- ;
- else if (PyLong_Check(*res)) {
- /* no overflow check, because we know that the space is enough */
- (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
+
+ if (item == Py_None) {
+ Py_DECREF(item);
+ return 0;
}
- else if (PyUnicode_Check(*res)) {
- Py_ssize_t repsize;
- if (PyUnicode_READY(*res) == -1)
+
+ if (PyLong_Check(item)) {
+ Py_UCS4 ch = (Py_UCS4)PyLong_AS_LONG(item);
+ if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
+ Py_DECREF(item);
return -1;
- repsize = PyUnicode_GET_LENGTH(*res);
- if (repsize==1) {
- /* no overflow check, because we know that the space is enough */
- (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
- }
- else if (repsize!=0) {
- /* more than one character */
- Py_ssize_t requiredsize = *opos +
- (PyUnicode_GET_LENGTH(input) - ipos) +
- repsize - 1;
- Py_ssize_t i;
- if (charmaptranslate_makespace(output, osize, requiredsize))
- return -1;
- for(i = 0; i < repsize; i++)
- (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
}
+ Py_DECREF(item);
+ return 1;
}
- else
+
+ if (!PyUnicode_Check(item)) {
+ Py_DECREF(item);
return -1;
- return 0;
+ }
+
+ if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
+ Py_DECREF(item);
+ return -1;
+ }
+
+ Py_DECREF(item);
+ return 1;
}
PyObject *
@@ -8573,22 +8551,16 @@ _PyUnicode_TranslateCharmap(PyObject *input,
const char *errors)
{
/* input object */
- char *idata;
+ char *data;
Py_ssize_t size, i;
int kind;
/* output buffer */
- Py_UCS4 *output = NULL;
- Py_ssize_t osize;
- PyObject *res;
- /* current output position */
- Py_ssize_t opos;
+ _PyUnicodeWriter writer;
+ /* error handler */
char *reason = "character maps to <undefined>";
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
- /* the following variable is used for caching string comparisons
- * -1=not initialized, 0=unknown, 1=strict, 2=replace,
- * 3=ignore, 4=xmlcharrefreplace */
- int known_errorHandler = -1;
+ int ignore;
if (mapping == NULL) {
PyErr_BadArgument();
@@ -8597,10 +8569,9 @@ _PyUnicode_TranslateCharmap(PyObject *input,
if (PyUnicode_READY(input) == -1)
return NULL;
- idata = (char*)PyUnicode_DATA(input);
+ data = (char*)PyUnicode_DATA(input);
kind = PyUnicode_KIND(input);
size = PyUnicode_GET_LENGTH(input);
- i = 0;
if (size == 0) {
Py_INCREF(input);
@@ -8609,121 +8580,74 @@ _PyUnicode_TranslateCharmap(PyObject *input,
/* allocate enough for a simple 1:1 translation without
replacements, if we need more, we'll resize */
- osize = size;
- output = PyMem_Malloc(osize * sizeof(Py_UCS4));
- opos = 0;
- if (output == NULL) {
- PyErr_NoMemory();
+ _PyUnicodeWriter_Init(&writer);
+ if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
goto onError;
- }
+ ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
+
+ i = 0;
while (i<size) {
/* try to encode it */
- PyObject *x = NULL;
- if (charmaptranslate_output(input, i, mapping,
- &output, &osize, &opos, &x)) {
- Py_XDECREF(x);
+ int translate;
+ PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
+ Py_ssize_t newpos;
+ /* startpos for collecting untranslatable chars */
+ Py_ssize_t collstart;
+ Py_ssize_t collend;
+ Py_ssize_t coll;
+ Py_UCS4 ch;
+
+ ch = PyUnicode_READ(kind, data, i);
+ translate = charmaptranslate_output(ch, mapping, &writer);
+ if (translate < 0)
goto onError;
- }
- Py_XDECREF(x);
- if (x!=Py_None) /* it worked => adjust input pointer */
+
+ if (translate != 0) {
+ /* it worked => adjust input pointer */
++i;
- else { /* untranslatable character */
- PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
- Py_ssize_t repsize;
- Py_ssize_t newpos;
- Py_ssize_t uni2;
- /* startpos for collecting untranslatable chars */
- Py_ssize_t collstart = i;
- Py_ssize_t collend = i+1;
- Py_ssize_t coll;
-
- /* find all untranslatable characters */
- while (collend < size) {
- if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
- goto onError;
- Py_XDECREF(x);
- if (x!=Py_None)
- break;
- ++collend;
- }
- /* cache callback name lookup
- * (if not done yet, i.e. it's the first error) */
- if (known_errorHandler==-1) {
- if ((errors==NULL) || (!strcmp(errors, "strict")))
- known_errorHandler = 1;
- else if (!strcmp(errors, "replace"))
- known_errorHandler = 2;
- else if (!strcmp(errors, "ignore"))
- known_errorHandler = 3;
- else if (!strcmp(errors, "xmlcharrefreplace"))
- known_errorHandler = 4;
- else
- known_errorHandler = 0;
- }
- switch (known_errorHandler) {
- case 1: /* strict */
- make_translate_exception(&exc,
- input, collstart, collend, reason);
- if (exc != NULL)
- PyCodec_StrictErrors(exc);
+ continue;
+ }
+
+ /* untranslatable character */
+ collstart = i;
+ collend = i+1;
+
+ /* find all untranslatable characters */
+ while (collend < size) {
+ PyObject *x;
+ ch = PyUnicode_READ(kind, data, collend);
+ if (charmaptranslate_lookup(ch, mapping, &x))
goto onError;
- case 2: /* replace */
- /* No need to check for space, this is a 1:1 replacement */
- for (coll = collstart; coll<collend; coll++)
- output[opos++] = '?';
- /* fall through */
- case 3: /* ignore */
- i = collend;
- break;
- case 4: /* xmlcharrefreplace */
- /* generate replacement (temporarily (mis)uses i) */
- for (i = collstart; i < collend; ++i) {
- char buffer[2+29+1+1];
- char *cp;
- sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
- if (charmaptranslate_makespace(&output, &osize,
- opos+strlen(buffer)+(size-collend)))
- goto onError;
- for (cp = buffer; *cp; ++cp)
- output[opos++] = *cp;
- }
- i = collend;
+ Py_XDECREF(x);
+ if (x != Py_None)
break;
- default:
- repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
- reason, input, &exc,
- collstart, collend, &newpos);
- if (repunicode == NULL)
- goto onError;
- if (PyUnicode_READY(repunicode) == -1) {
- Py_DECREF(repunicode);
- goto onError;
- }
- /* generate replacement */
- repsize = PyUnicode_GET_LENGTH(repunicode);
- if (charmaptranslate_makespace(&output, &osize,
- opos+repsize+(size-collend))) {
- Py_DECREF(repunicode);
- goto onError;
- }
- for (uni2 = 0; repsize-->0; ++uni2)
- output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
- i = newpos;
+ ++collend;
+ }
+
+ if (ignore) {
+ i = collend;
+ }
+ else {
+ repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
+ reason, input, &exc,
+ collstart, collend, &newpos);
+ if (repunicode == NULL)
+ goto onError;
+ if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Py_DECREF(repunicode);
+ goto onError;
}
+ Py_DECREF(repunicode);
+ i = newpos;
}
}
- res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
- if (!res)
- goto onError;
- PyMem_Free(output);
Py_XDECREF(exc);
Py_XDECREF(errorHandler);
- return res;
+ return _PyUnicodeWriter_Finish(&writer);
onError:
- PyMem_Free(output);
+ _PyUnicodeWriter_Dealloc(&writer);
Py_XDECREF(exc);
Py_XDECREF(errorHandler);
return NULL;