summaryrefslogtreecommitdiff
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r--Objects/unicodeobject.c835
1 files changed, 544 insertions, 291 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 0b78301f23..895a4e88f3 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -42,6 +42,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#include "Python.h"
#include "ucnhash.h"
#include "bytes_methods.h"
+#include "stringlib/eq.h"
#ifdef MS_WINDOWS
#include <windows.h>
@@ -162,6 +163,14 @@ extern "C" {
*_to++ = (to_type) *_iter++; \
} while (0)
+#ifdef MS_WINDOWS
+ /* On Windows, overallocate by 50% is the best factor */
+# define OVERALLOCATE_FACTOR 2
+#else
+ /* On Linux, overallocate by 25% is the best factor */
+# define OVERALLOCATE_FACTOR 4
+#endif
+
/* This dictionary holds all interned unicode strings. Note that references
to strings in this dictionary are *not* counted in the string's ob_refcnt.
When the interned string reaches a refcnt of 0 the string deallocation
@@ -292,6 +301,38 @@ static unsigned char ascii_linebreak[] = {
#include "clinic/unicodeobject.c.h"
+typedef enum { /* identifiers for the error handler names recognised by get_error_handler() */
+ _Py_ERROR_UNKNOWN=0, /* name not looked up yet; codecs start with this value and resolve it on the first error */
+ _Py_ERROR_STRICT, /* "strict", also selected when the handler name is NULL */
+ _Py_ERROR_SURROGATEESCAPE, /* "surrogateescape" */
+ _Py_ERROR_REPLACE, /* "replace" */
+ _Py_ERROR_IGNORE, /* "ignore" */
+ _Py_ERROR_BACKSLASHREPLACE, /* "backslashreplace" */
+ _Py_ERROR_SURROGATEPASS, /* "surrogatepass" */
+ _Py_ERROR_XMLCHARREFREPLACE, /* "xmlcharrefreplace" */
+ _Py_ERROR_OTHER /* any other handler name */
+} _Py_error_handler;
+
+/* Map the name of a codec error handler to its _Py_error_handler value.
+
+   A NULL name is treated like "strict".  Names that are not listed in the
+   table below yield _Py_ERROR_OTHER.  The comparison is an exact,
+   case-sensitive strcmp(). */
+static _Py_error_handler
+get_error_handler(const char *errors)
+{
+    static const struct {
+        const char *name;
+        _Py_error_handler handler;
+    } known_handlers[] = {
+        {"strict", _Py_ERROR_STRICT},
+        {"surrogateescape", _Py_ERROR_SURROGATEESCAPE},
+        {"replace", _Py_ERROR_REPLACE},
+        {"ignore", _Py_ERROR_IGNORE},
+        {"backslashreplace", _Py_ERROR_BACKSLASHREPLACE},
+        {"surrogatepass", _Py_ERROR_SURROGATEPASS},
+        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
+    };
+    size_t idx;
+
+    /* no handler name given: default to "strict" */
+    if (errors == NULL)
+        return _Py_ERROR_STRICT;
+
+    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
+        if (strcmp(errors, known_handlers[idx].name) == 0)
+            return known_handlers[idx].handler;
+    }
+    return _Py_ERROR_OTHER;
+}
+
/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
This function is kept for backward compatibility with the old API. */
Py_UNICODE
@@ -521,6 +562,129 @@ unicode_result_unchanged(PyObject *unicode)
return _PyUnicode_Copy(unicode);
}
+/* "backslashreplace" error handler for 8-bit encodings (ASCII, Latin1,
+   UTF-8, etc.).
+
+   Writes an escape for every character of unicode[collstart:collend] at
+   str: \xhh below 0x100, \uhhhh below 0x10000 and \Uhhhhhhhh otherwise.
+   The needed room is requested from the writer before anything is written.
+
+   Returns the position just after the last byte written, or NULL with an
+   exception set if the size overflows or the writer cannot be prepared. */
+static char*
+backslashreplace(_PyBytesWriter *writer, char *str,
+                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
+{
+    Py_ssize_t total = 0;
+    Py_ssize_t idx;
+    enum PyUnicode_Kind kind;
+    void *data;
+
+    assert(PyUnicode_IS_READY(unicode));
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    /* first pass: add up the length of all the escapes */
+    for (idx = collstart; idx < collend; ++idx) {
+        Py_UCS4 ch = PyUnicode_READ(kind, data, idx);
+        Py_ssize_t needed;
+
+        if (ch >= 0x10000) {
+            assert(ch <= MAX_UNICODE);
+            needed = 2+8;
+        }
+        else if (ch >= 0x100)
+            needed = 2+4;
+        else
+            needed = 2+2;
+
+        if (total > PY_SSIZE_T_MAX - needed) {
+            PyErr_SetString(PyExc_OverflowError,
+                            "encoded result is too long for a Python string");
+            return NULL;
+        }
+        total += needed;
+    }
+
+    str = _PyBytesWriter_Prepare(writer, str, total);
+    if (str == NULL)
+        return NULL;
+
+    /* second pass: emit the escapes */
+    for (idx = collstart; idx < collend; ++idx) {
+        Py_UCS4 ch = PyUnicode_READ(kind, data, idx);
+        int shift;
+
+        *str++ = '\\';
+        if (ch >= 0x10000) {
+            *str++ = 'U';
+            shift = 28;     /* 8 hex digits */
+        }
+        else if (ch >= 0x100) {
+            *str++ = 'u';
+            shift = 12;     /* 4 hex digits */
+        }
+        else {
+            *str++ = 'x';
+            shift = 4;      /* 2 hex digits */
+        }
+        /* most significant nibble first */
+        for (; shift >= 0; shift -= 4)
+            *str++ = Py_hexdigits[(ch >> shift) & 0xf];
+    }
+    return str;
+}
+
+/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
+   ASCII, Latin1, UTF-8, etc.
+
+   Writes "&#NNN;" (NNN being the decimal code point) at str for every
+   character of unicode[collstart:collend].  The needed room is requested
+   from the writer before anything is written.
+
+   The digits are produced by hand instead of with sprintf(): sprintf()
+   also stores a terminating NUL byte, which is one byte more than what was
+   reserved with _PyBytesWriter_Prepare(), and "%d" does not match the
+   unsigned Py_UCS4 argument.
+
+   Returns the position just after the last byte written, or NULL with an
+   exception set if the size overflows or the writer cannot be prepared. */
+static char*
+xmlcharrefreplace(_PyBytesWriter *writer, char *str,
+                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
+{
+    Py_ssize_t size, i;
+    Py_UCS4 ch;
+    enum PyUnicode_Kind kind;
+    void *data;
+
+    assert(PyUnicode_IS_READY(unicode));
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    size = 0;
+    /* determine replacement size */
+    for (i = collstart; i < collend; ++i) {
+        Py_ssize_t incr;
+
+        ch = PyUnicode_READ(kind, data, i);
+        assert(ch <= MAX_UNICODE);
+
+        /* "&#" + first digit + ";", plus one byte per extra digit */
+        incr = 2+1+1;
+        while (ch >= 10) {
+            ch /= 10;
+            ++incr;
+        }
+        if (size > PY_SSIZE_T_MAX - incr) {
+            PyErr_SetString(PyExc_OverflowError,
+                            "encoded result is too long for a Python string");
+            return NULL;
+        }
+        size += incr;
+    }
+
+    str = _PyBytesWriter_Prepare(writer, str, size);
+    if (str == NULL)
+        return NULL;
+
+    /* generate replacement */
+    for (i = collstart; i < collend; ++i) {
+        /* 10 digits are enough for any 32-bit value */
+        char digits[10];
+        int ndigits = 0;
+
+        ch = PyUnicode_READ(kind, data, i);
+        /* collect the digits, least significant first */
+        do {
+            digits[ndigits++] = (char)('0' + (ch % 10));
+            ch /= 10;
+        } while (ch != 0);
+
+        *str++ = '&';
+        *str++ = '#';
+        while (ndigits > 0)
+            *str++ = digits[--ndigits];
+        *str++ = ';';
+    }
+    return str;
+}
+
/* --- Bloom Filters ----------------------------------------------------- */
/* stuff to implement simple "bloom filters" for Unicode characters.
@@ -647,27 +811,26 @@ Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Py_ssize_t size, Py_UCS4 ch,
int direction)
{
- int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
-
switch (kind) {
case PyUnicode_1BYTE_KIND:
- {
- Py_UCS1 ch1 = (Py_UCS1) ch;
- if (ch1 == ch)
- return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
- else
- return -1;
- }
+ if ((Py_UCS1) ch != ch)
+ return -1;
+ if (direction > 0)
+ return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
+ else
+ return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
case PyUnicode_2BYTE_KIND:
- {
- Py_UCS2 ch2 = (Py_UCS2) ch;
- if (ch2 == ch)
- return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
- else
- return -1;
- }
+ if ((Py_UCS2) ch != ch)
+ return -1;
+ if (direction > 0)
+ return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
+ else
+ return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
case PyUnicode_4BYTE_KIND:
- return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
+ if (direction > 0)
+ return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
+ else
+ return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
default:
assert(0);
return -1;
@@ -3167,24 +3330,22 @@ wcstombs_errorpos(const wchar_t *wstr)
static int
locale_error_handler(const char *errors, int *surrogateescape)
{
- if (errors == NULL) {
- *surrogateescape = 0;
- return 0;
- }
-
- if (strcmp(errors, "strict") == 0) {
+ _Py_error_handler error_handler = get_error_handler(errors); /* a NULL name maps to _Py_ERROR_STRICT */
+ switch (error_handler)
+ {
+ case _Py_ERROR_STRICT: /* success: *surrogateescape is set to 0 */
*surrogateescape = 0;
return 0;
- }
- if (strcmp(errors, "surrogateescape") == 0) {
+ case _Py_ERROR_SURROGATEESCAPE: /* success: *surrogateescape is set to 1 */
*surrogateescape = 1;
return 0;
+ default: /* any other handler: raise ValueError and return -1 */
+ PyErr_Format(PyExc_ValueError,
+ "only 'strict' and 'surrogateescape' error handlers "
+ "are supported, not '%s'",
+ errors);
+ return -1;
}
- PyErr_Format(PyExc_ValueError,
- "only 'strict' and 'surrogateescape' error handlers "
- "are supported, not '%s'",
- errors);
- return -1;
}
PyObject *
@@ -4690,8 +4851,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
Py_ssize_t startinpos;
Py_ssize_t endinpos;
const char *errmsg = "";
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
if (size == 0) {
if (consumed)
@@ -4716,6 +4878,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
while (s < end) {
Py_UCS4 ch;
int kind = writer.kind;
+
if (kind == PyUnicode_1BYTE_KIND) {
if (PyUnicode_IS_ASCII(writer.buffer))
ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
@@ -4754,24 +4917,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
continue;
}
- if (unicode_decode_call_errorhandler_writer(
- errors, &errorHandler,
- "utf-8", errmsg,
- &starts, &end, &startinpos, &endinpos, &exc, &s,
- &writer))
- goto onError;
+ if (error_handler == _Py_ERROR_UNKNOWN)
+ error_handler = get_error_handler(errors);
+
+ switch (error_handler) {
+ case _Py_ERROR_IGNORE:
+ s += (endinpos - startinpos);
+ break;
+
+ case _Py_ERROR_REPLACE:
+ if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
+ goto onError;
+ s += (endinpos - startinpos);
+ break;
+
+ case _Py_ERROR_SURROGATEESCAPE:
+ {
+ Py_ssize_t i;
+
+ if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+ goto onError;
+ for (i=startinpos; i<endinpos; i++) {
+ ch = (Py_UCS4)(unsigned char)(starts[i]);
+ PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
+ ch + 0xdc00);
+ writer.pos++;
+ }
+ s += (endinpos - startinpos);
+ break;
+ }
+
+ default:
+ if (unicode_decode_call_errorhandler_writer(
+ errors, &error_handler_obj,
+ "utf-8", errmsg,
+ &starts, &end, &startinpos, &endinpos, &exc, &s,
+ &writer))
+ goto onError;
+ }
}
End:
if (consumed)
*consumed = s - starts;
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return _PyUnicodeWriter_Finish(&writer);
onError:
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
_PyUnicodeWriter_Dealloc(&writer);
return NULL;
@@ -5862,11 +6057,10 @@ PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
Py_ssize_t i, len;
- PyObject *repr;
char *p;
int kind;
void *data;
- Py_ssize_t expandsize = 0;
+ _PyBytesWriter writer;
/* Initial allocation is based on the longest-possible character
escape.
@@ -5882,35 +6076,28 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
}
if (PyUnicode_READY(unicode) == -1)
return NULL;
+
+ _PyBytesWriter_Init(&writer);
+
len = PyUnicode_GET_LENGTH(unicode);
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
- switch (kind) {
- case PyUnicode_1BYTE_KIND: expandsize = 4; break;
- case PyUnicode_2BYTE_KIND: expandsize = 6; break;
- case PyUnicode_4BYTE_KIND: expandsize = 10; break;
- }
- if (len == 0)
- return PyBytes_FromStringAndSize(NULL, 0);
-
- if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
- return PyErr_NoMemory();
-
- repr = PyBytes_FromStringAndSize(NULL,
- 2
- + expandsize*len
- + 1);
- if (repr == NULL)
- return NULL;
-
- p = PyBytes_AS_STRING(repr);
+ p = _PyBytesWriter_Alloc(&writer, len);
+ if (p == NULL)
+ goto error;
+ writer.overallocate = 1;
for (i = 0; i < len; i++) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
/* Escape backslashes */
if (ch == '\\') {
+ /* -1: subtract 1 preallocated byte */
+ p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = (char) ch;
continue;
@@ -5919,6 +6106,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
/* Map 21-bit characters to '\U00xxxxxx' */
else if (ch >= 0x10000) {
assert(ch <= MAX_UNICODE);
+
+ p = _PyBytesWriter_Prepare(&writer, p, 10-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'U';
*p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
@@ -5934,6 +6126,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
/* Map 16-bit characters to '\uxxxx' */
if (ch >= 256) {
+ p = _PyBytesWriter_Prepare(&writer, p, 6-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'u';
*p++ = Py_hexdigits[(ch >> 12) & 0x000F];
@@ -5944,20 +6140,37 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
/* Map special whitespace to '\t', \n', '\r' */
else if (ch == '\t') {
+ p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 't';
}
else if (ch == '\n') {
+ p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'n';
}
else if (ch == '\r') {
+ p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'r';
}
/* Map non-printable US ASCII to '\xhh' */
else if (ch < ' ' || ch >= 0x7F) {
+ /* -1: subtract 1 preallocated byte */
+ p = _PyBytesWriter_Prepare(&writer, p, 4-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'x';
*p++ = Py_hexdigits[(ch >> 4) & 0x000F];
@@ -5969,10 +6182,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
*p++ = (char) ch;
}
- assert(p - PyBytes_AS_STRING(repr) > 0);
- if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
- return NULL;
- return repr;
+ return _PyBytesWriter_Finish(&writer, p);
+
+error:
+ _PyBytesWriter_Dealloc(&writer);
+ return NULL;
}
PyObject *
@@ -6101,13 +6315,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
- PyObject *repr;
char *p;
- char *q;
- Py_ssize_t expandsize, pos;
+ Py_ssize_t pos;
int kind;
void *data;
Py_ssize_t len;
+ _PyBytesWriter writer;
if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
@@ -6115,28 +6328,29 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
}
if (PyUnicode_READY(unicode) == -1)
return NULL;
+
+ _PyBytesWriter_Init(&writer);
+
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
len = PyUnicode_GET_LENGTH(unicode);
- /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
- bytes, and 1 byte characters 4. */
- expandsize = kind * 2 + 2;
- if (len > PY_SSIZE_T_MAX / expandsize)
- return PyErr_NoMemory();
-
- repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
- if (repr == NULL)
- return NULL;
- if (len == 0)
- return repr;
+ p = _PyBytesWriter_Alloc(&writer, len);
+ if (p == NULL)
+ goto error;
+ writer.overallocate = 1;
- p = q = PyBytes_AS_STRING(repr);
for (pos = 0; pos < len; pos++) {
Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
/* Map 32-bit characters to '\Uxxxxxxxx' */
if (ch >= 0x10000) {
assert(ch <= MAX_UNICODE);
+
+ /* -1: subtract 1 preallocated byte */
+ p = _PyBytesWriter_Prepare(&writer, p, 10-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'U';
*p++ = Py_hexdigits[(ch >> 28) & 0xf];
@@ -6150,6 +6364,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
}
/* Map 16-bit characters to '\uxxxx' */
else if (ch >= 256) {
+ /* -1: subtract 1 preallocated byte */
+ p = _PyBytesWriter_Prepare(&writer, p, 6-1);
+ if (p == NULL)
+ goto error;
+
*p++ = '\\';
*p++ = 'u';
*p++ = Py_hexdigits[(ch >> 12) & 0xf];
@@ -6162,10 +6381,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
*p++ = (char) ch;
}
- assert(p > q);
- if (_PyBytes_Resize(&repr, p - q) < 0)
- return NULL;
- return repr;
+ return _PyBytesWriter_Finish(&writer, p);
+
+error:
+ _PyBytesWriter_Dealloc(&writer);
+ return NULL;
}
PyObject *
@@ -6396,25 +6616,22 @@ unicode_encode_call_errorhandler(const char *errors,
static PyObject *
unicode_encode_ucs1(PyObject *unicode,
const char *errors,
- unsigned int limit)
+ const Py_UCS4 limit)
{
/* input state */
Py_ssize_t pos=0, size;
int kind;
void *data;
- /* output object */
- PyObject *res;
/* pointer into the output */
char *str;
- /* current output position */
- Py_ssize_t ressize;
const char *encoding = (limit == 256) ? "latin-1" : "ascii";
const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
- /* the following variable is used for caching string comparisons
- * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
- int known_errorHandler = -1;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
+ PyObject *rep = NULL;
+ /* output object */
+ _PyBytesWriter writer;
if (PyUnicode_READY(unicode) == -1)
return NULL;
@@ -6425,186 +6642,157 @@ unicode_encode_ucs1(PyObject *unicode,
replacements, if we need more, we'll resize */
if (size == 0)
return PyBytes_FromStringAndSize(NULL, 0);
- res = PyBytes_FromStringAndSize(NULL, size);
- if (res == NULL)
+
+ _PyBytesWriter_Init(&writer);
+ str = _PyBytesWriter_Alloc(&writer, size);
+ if (str == NULL)
return NULL;
- str = PyBytes_AS_STRING(res);
- ressize = size;
while (pos < size) {
- Py_UCS4 c = PyUnicode_READ(kind, data, pos);
+ Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
/* can we encode this? */
- if (c<limit) {
+ if (ch < limit) {
/* no overflow check, because we know that the space is enough */
- *str++ = (char)c;
+ *str++ = (char)ch;
++pos;
}
else {
- Py_ssize_t requiredsize;
- PyObject *repunicode;
- Py_ssize_t repsize, newpos, respos, i;
+ Py_ssize_t newpos, i;
/* startpos for collecting unencodable chars */
Py_ssize_t collstart = pos;
- Py_ssize_t collend = pos;
+ Py_ssize_t collend = collstart + 1;
/* find all unecodable characters */
+
while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
++collend;
+
+ /* Only overallocate the buffer if it's not the last write */
+ writer.overallocate = (collend < size);
+
/* cache callback name lookup (if not done yet, i.e. it's the first error) */
- if (known_errorHandler==-1) {
- if ((errors==NULL) || (!strcmp(errors, "strict")))
- known_errorHandler = 1;
- else if (!strcmp(errors, "replace"))
- known_errorHandler = 2;
- else if (!strcmp(errors, "ignore"))
- known_errorHandler = 3;
- else if (!strcmp(errors, "xmlcharrefreplace"))
- known_errorHandler = 4;
- else
- known_errorHandler = 0;
- }
- switch (known_errorHandler) {
- case 1: /* strict */
+ if (error_handler == _Py_ERROR_UNKNOWN)
+ error_handler = get_error_handler(errors);
+
+ switch (error_handler) {
+ case _Py_ERROR_STRICT:
raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
goto onError;
- case 2: /* replace */
- while (collstart++ < collend)
- *str++ = '?'; /* fall through */
- case 3: /* ignore */
+
+ case _Py_ERROR_REPLACE:
+ memset(str, '?', collend - collstart);
+ str += (collend - collstart);
+ /* fall through ignore error handler */
+ case _Py_ERROR_IGNORE:
pos = collend;
break;
- case 4: /* xmlcharrefreplace */
- respos = str - PyBytes_AS_STRING(res);
- requiredsize = respos;
- /* determine replacement size */
+
+ case _Py_ERROR_BACKSLASHREPLACE:
+ /* subtract preallocated bytes */
+ writer.min_size -= (collend - collstart);
+ str = backslashreplace(&writer, str,
+ unicode, collstart, collend);
+ if (str == NULL)
+ goto onError;
+ pos = collend;
+ break;
+
+ case _Py_ERROR_XMLCHARREFREPLACE:
+ /* subtract preallocated bytes */
+ writer.min_size -= (collend - collstart);
+ str = xmlcharrefreplace(&writer, str,
+ unicode, collstart, collend);
+ if (str == NULL)
+ goto onError;
+ pos = collend;
+ break;
+
+ case _Py_ERROR_SURROGATEESCAPE:
for (i = collstart; i < collend; ++i) {
- Py_UCS4 ch = PyUnicode_READ(kind, data, i);
- Py_ssize_t incr;
- if (ch < 10)
- incr = 2+1+1;
- else if (ch < 100)
- incr = 2+2+1;
- else if (ch < 1000)
- incr = 2+3+1;
- else if (ch < 10000)
- incr = 2+4+1;
- else if (ch < 100000)
- incr = 2+5+1;
- else if (ch < 1000000)
- incr = 2+6+1;
- else {
- assert(ch <= MAX_UNICODE);
- incr = 2+7+1;
+ ch = PyUnicode_READ(kind, data, i);
+ if (ch < 0xdc80 || 0xdcff < ch) {
+ /* Not a UTF-8b surrogate */
+ break;
}
- if (requiredsize > PY_SSIZE_T_MAX - incr)
- goto overflow;
- requiredsize += incr;
+ *str++ = (char)(ch - 0xdc00);
+ ++pos;
}
- if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
- goto overflow;
- requiredsize += size - collend;
- if (requiredsize > ressize) {
- if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
- requiredsize = 2*ressize;
- if (_PyBytes_Resize(&res, requiredsize))
- goto onError;
- str = PyBytes_AS_STRING(res) + respos;
- ressize = requiredsize;
- }
- /* generate replacement */
- for (i = collstart; i < collend; ++i) {
- str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
- }
- pos = collend;
- break;
+ if (i >= collend)
+ break;
+ collstart = pos;
+ assert(collstart != collend);
+ /* fallback to general error handling */
+
default:
- repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
- encoding, reason, unicode, &exc,
- collstart, collend, &newpos);
- if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
- PyUnicode_READY(repunicode) == -1))
+ rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
+ encoding, reason, unicode, &exc,
+ collstart, collend, &newpos);
+ if (rep == NULL)
goto onError;
- if (PyBytes_Check(repunicode)) {
+
+ /* subtract preallocated bytes */
+ writer.min_size -= 1;
+
+ if (PyBytes_Check(rep)) {
/* Directly copy bytes result to output. */
- repsize = PyBytes_Size(repunicode);
- if (repsize > 1) {
- /* Make room for all additional bytes. */
- respos = str - PyBytes_AS_STRING(res);
- if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
- Py_DECREF(repunicode);
- goto overflow;
- }
- if (_PyBytes_Resize(&res, ressize+repsize-1)) {
- Py_DECREF(repunicode);
- goto onError;
- }
- str = PyBytes_AS_STRING(res) + respos;
- ressize += repsize-1;
- }
- memcpy(str, PyBytes_AsString(repunicode), repsize);
- str += repsize;
- pos = newpos;
- Py_DECREF(repunicode);
- break;
- }
- /* need more space? (at least enough for what we
- have+the replacement+the rest of the string, so
- we won't have to check space for encodable characters) */
- respos = str - PyBytes_AS_STRING(res);
- repsize = PyUnicode_GET_LENGTH(repunicode);
- requiredsize = respos;
- if (requiredsize > PY_SSIZE_T_MAX - repsize)
- goto overflow;
- requiredsize += repsize;
- if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
- goto overflow;
- requiredsize += size - collend;
- if (requiredsize > ressize) {
- if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
- requiredsize = 2*ressize;
- if (_PyBytes_Resize(&res, requiredsize)) {
- Py_DECREF(repunicode);
+ str = _PyBytesWriter_WriteBytes(&writer, str,
+ PyBytes_AS_STRING(rep),
+ PyBytes_GET_SIZE(rep));
+ if (str == NULL)
goto onError;
- }
- str = PyBytes_AS_STRING(res) + respos;
- ressize = requiredsize;
}
- /* check if there is anything unencodable in the replacement
- and copy it to the output */
- for (i = 0; repsize-->0; ++i, ++str) {
- c = PyUnicode_READ_CHAR(repunicode, i);
- if (c >= limit) {
- raise_encode_exception(&exc, encoding, unicode,
- pos, pos+1, reason);
- Py_DECREF(repunicode);
+ else {
+ assert(PyUnicode_Check(rep));
+
+ if (PyUnicode_READY(rep) < 0)
goto onError;
+
+ if (PyUnicode_IS_ASCII(rep)) {
+ /* Fast path: all characters are smaller than limit */
+ assert(limit >= 128);
+ assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
+ str = _PyBytesWriter_WriteBytes(&writer, str,
+ PyUnicode_DATA(rep),
+ PyUnicode_GET_LENGTH(rep));
+ }
+ else {
+ Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
+
+ str = _PyBytesWriter_Prepare(&writer, str, repsize);
+ if (str == NULL)
+ goto onError;
+
+ /* check if there is anything unencodable in the
+ replacement and copy it to the output */
+ for (i = 0; repsize-->0; ++i, ++str) {
+ ch = PyUnicode_READ_CHAR(rep, i);
+ if (ch >= limit) {
+ raise_encode_exception(&exc, encoding, unicode,
+ pos, pos+1, reason);
+ goto onError;
+ }
+ *str = (char)ch;
+ }
}
- *str = (char)c;
}
pos = newpos;
- Py_DECREF(repunicode);
+ Py_CLEAR(rep);
}
+
+ /* If overallocation was disabled, ensure that it was the last
+ write. Otherwise, we missed an optimization */
+ assert(writer.overallocate || pos == size);
}
}
- /* Resize if we allocated to much */
- size = str - PyBytes_AS_STRING(res);
- if (size < ressize) { /* If this falls res will be NULL */
- assert(size >= 0);
- if (_PyBytes_Resize(&res, size) < 0)
- goto onError;
- }
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
- return res;
-
- overflow:
- PyErr_SetString(PyExc_OverflowError,
- "encoded result is too long for a Python string");
+ return _PyBytesWriter_Finish(&writer, str);
onError:
- Py_XDECREF(res);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(rep);
+ _PyBytesWriter_Dealloc(&writer);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return NULL;
}
@@ -6664,8 +6852,9 @@ PyUnicode_DecodeASCII(const char *s,
Py_ssize_t endinpos;
Py_ssize_t outpos;
const char *e;
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
if (size == 0)
_Py_RETURN_UNICODE_EMPTY();
@@ -6694,12 +6883,42 @@ PyUnicode_DecodeASCII(const char *s,
PyUnicode_WRITE(kind, data, writer.pos, c);
writer.pos++;
++s;
+ continue;
}
- else {
+
+ /* byte outside range 0x00..0x7f: call the error handler */
+
+ if (error_handler == _Py_ERROR_UNKNOWN)
+ error_handler = get_error_handler(errors);
+
+ switch (error_handler)
+ {
+ case _Py_ERROR_REPLACE:
+ case _Py_ERROR_SURROGATEESCAPE:
+ /* Fast-path: the error handler only writes one character,
+ but we may switch to UCS2 at the first write */
+ if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+ goto onError;
+ kind = writer.kind;
+ data = writer.data;
+
+ if (error_handler == _Py_ERROR_REPLACE)
+ PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
+ else
+ PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+ writer.pos++;
+ ++s;
+ break;
+
+ case _Py_ERROR_IGNORE:
+ ++s;
+ break;
+
+ default:
startinpos = s-starts;
endinpos = startinpos + 1;
if (unicode_decode_call_errorhandler_writer(
- errors, &errorHandler,
+ errors, &error_handler_obj,
"ascii", "ordinal not in range(128)",
&starts, &e, &startinpos, &endinpos, &exc, &s,
&writer))
@@ -6708,13 +6927,13 @@ PyUnicode_DecodeASCII(const char *s,
data = writer.data;
}
}
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return _PyUnicodeWriter_Finish(&writer);
onError:
_PyUnicodeWriter_Dealloc(&writer);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return NULL;
}
@@ -8080,7 +8299,7 @@ static int
charmap_encoding_error(
PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
PyObject **exceptionObject,
- int *known_errorHandler, PyObject **errorHandler, const char *errors,
+ _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
PyObject **res, Py_ssize_t *respos)
{
PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
@@ -8127,23 +8346,15 @@ charmap_encoding_error(
}
/* cache callback name lookup
* (if not done yet, i.e. it's the first error) */
- if (*known_errorHandler==-1) {
- if ((errors==NULL) || (!strcmp(errors, "strict")))
- *known_errorHandler = 1;
- else if (!strcmp(errors, "replace"))
- *known_errorHandler = 2;
- else if (!strcmp(errors, "ignore"))
- *known_errorHandler = 3;
- else if (!strcmp(errors, "xmlcharrefreplace"))
- *known_errorHandler = 4;
- else
- *known_errorHandler = 0;
- }
- switch (*known_errorHandler) {
- case 1: /* strict */
+ if (*error_handler == _Py_ERROR_UNKNOWN)
+ *error_handler = get_error_handler(errors);
+
+ switch (*error_handler) {
+ case _Py_ERROR_STRICT:
raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
return -1;
- case 2: /* replace */
+
+ case _Py_ERROR_REPLACE:
for (collpos = collstartpos; collpos<collendpos; ++collpos) {
x = charmapencode_output('?', mapping, res, respos);
if (x==enc_EXCEPTION) {
@@ -8155,10 +8366,11 @@ charmap_encoding_error(
}
}
/* fall through */
- case 3: /* ignore */
+ case _Py_ERROR_IGNORE:
*inpos = collendpos;
break;
- case 4: /* xmlcharrefreplace */
+
+ case _Py_ERROR_XMLCHARREFREPLACE:
/* generate replacement (temporarily (mis)uses p) */
for (collpos = collstartpos; collpos < collendpos; ++collpos) {
char buffer[2+29+1+1];
@@ -8176,8 +8388,9 @@ charmap_encoding_error(
}
*inpos = collendpos;
break;
+
default:
- repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
+ repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
encoding, reason, unicode, exceptionObject,
collstartpos, collendpos, &newpos);
if (repunicode == NULL)
@@ -8240,12 +8453,9 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
Py_ssize_t size;
/* current output position */
Py_ssize_t respos = 0;
- PyObject *errorHandler = NULL;
+ PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;
- /* the following variable is used for caching string comparisons
- * -1=not initialized, 0=unknown, 1=strict, 2=replace,
- * 3=ignore, 4=xmlcharrefreplace */
- int known_errorHandler = -1;
+ _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
void *data;
int kind;
@@ -8276,7 +8486,7 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
if (x==enc_FAILED) { /* unencodable character */
if (charmap_encoding_error(unicode, &inpos, mapping,
&exc,
- &known_errorHandler, &errorHandler, errors,
+ &error_handler, &error_handler_obj, errors,
&res, &respos)) {
goto onError;
}
@@ -8292,13 +8502,13 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
goto onError;
Py_XDECREF(exc);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
return res;
onError:
Py_XDECREF(res);
Py_XDECREF(exc);
- Py_XDECREF(errorHandler);
+ Py_XDECREF(error_handler_obj);
return NULL;
}
@@ -8624,7 +8834,7 @@ exit:
return res;
}
-PyObject *
+static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
PyObject *mapping,
const char *errors)
@@ -10895,6 +11105,12 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
}
int
+_PyUnicode_EQ(PyObject *aa, PyObject *bb)
+{
+ return unicode_eq(aa, bb); /* non-static wrapper forwarding to unicode_eq(); presumably provided by stringlib/eq.h -- TODO confirm */
+}
+
+int
PyUnicode_Contains(PyObject *container, PyObject *element)
{
PyObject *str, *sub;
@@ -13227,44 +13443,50 @@ unicode_endswith(PyObject *self,
Py_LOCAL_INLINE(void)
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
{
- if (!writer->readonly)
+ writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); /* maxchar and data are refreshed from the buffer in both modes */
+ writer->data = PyUnicode_DATA(writer->buffer);
+
+ if (!writer->readonly) { /* writable buffer: kind and size mirror the buffer */
+ writer->kind = PyUnicode_KIND(writer->buffer);
writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+ }
else {
+ /* Read-only buffer: use a kind smaller than PyUnicode_1BYTE_KIND so
+ _PyUnicodeWriter_PrepareKind() will copy the buffer. */
+ writer->kind = PyUnicode_WCHAR_KIND;
+ assert(writer->kind <= PyUnicode_1BYTE_KIND);
+
/* Copy-on-write mode: set buffer size to 0 so
* _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
* next write. */
writer->size = 0;
}
- writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
- writer->data = PyUnicode_DATA(writer->buffer);
- writer->kind = PyUnicode_KIND(writer->buffer);
}
void
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
{
memset(writer, 0, sizeof(*writer));
-#ifdef Py_DEBUG
- writer->kind = 5; /* invalid kind */
-#endif
+
+ /* ASCII is the bare minimum */
writer->min_char = 127;
+
+ /* No buffer yet: use a kind smaller than PyUnicode_1BYTE_KIND so
+ _PyUnicodeWriter_PrepareKind() will copy the buffer. */
+ writer->kind = PyUnicode_WCHAR_KIND;
+ assert(writer->kind <= PyUnicode_1BYTE_KIND); /* checks the ordering the comment above relies on */
}
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
Py_ssize_t length, Py_UCS4 maxchar)
{
-#ifdef MS_WINDOWS
- /* On Windows, overallocate by 50% is the best factor */
-# define OVERALLOCATE_FACTOR 2
-#else
- /* On Linux, overallocate by 25% is the best factor */
-# define OVERALLOCATE_FACTOR 4
-#endif
Py_ssize_t newlen;
PyObject *newbuffer;
- assert(length > 0);
+ /* ensure that the _PyUnicodeWriter_Prepare macro was used */
+ assert((maxchar > writer->maxchar && length >= 0)
+ || length > 0);
if (length > PY_SSIZE_T_MAX - writer->pos) {
PyErr_NoMemory();
@@ -13331,6 +13553,28 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
#undef OVERALLOCATE_FACTOR
}
+/* Out-of-line part of the _PyUnicodeWriter_PrepareKind macro: make the
+   writer able to store characters of the given kind.
+
+   The kind is translated to the largest character it can hold, then
+   _PyUnicodeWriter_PrepareInternal() is called with a length of 0 and that
+   maximum character.
+
+   Returns the result of _PyUnicodeWriter_PrepareInternal(), or -1 with
+   SystemError set if kind is not a 1, 2 or 4 byte kind. */
+int
+_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
+                                     enum PyUnicode_Kind kind)
+{
+    Py_UCS4 maxchar;
+
+    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
+    assert(writer->kind < kind);
+
+    switch (kind)
+    {
+    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
+    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
+    case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
+    default:
+        assert(0 && "invalid kind");
+        /* With assertions compiled out, do not report a failure without
+           an exception set. */
+        PyErr_SetString(PyExc_SystemError, "invalid kind");
+        return -1;
+    }
+
+    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
+}
+
Py_LOCAL_INLINE(int)
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
{
@@ -13501,17 +13745,26 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
assert(PyUnicode_GET_LENGTH(str) == writer->pos);
return str;
}
- if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
- PyObject *newbuffer;
- newbuffer = resize_compact(writer->buffer, writer->pos);
- if (newbuffer == NULL) {
- Py_CLEAR(writer->buffer);
- return NULL;
+ if (writer->pos == 0) {
+ Py_CLEAR(writer->buffer);
+
+ /* Get the empty Unicode string singleton ('') */
+ _Py_INCREF_UNICODE_EMPTY();
+ str = unicode_empty;
+ }
+ else {
+ str = writer->buffer;
+ writer->buffer = NULL;
+
+ if (PyUnicode_GET_LENGTH(str) != writer->pos) {
+ PyObject *str2;
+ str2 = resize_compact(str, writer->pos);
+ if (str2 == NULL)
+ return NULL;
+ str = str2;
}
- writer->buffer = newbuffer;
}
- str = writer->buffer;
- writer->buffer = NULL;
+
assert(_PyUnicode_CheckConsistency(str, 1));
return unicode_result_ready(str);
}