diff options
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 854 |
1 files changed, 553 insertions, 301 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index adc4615603..d0321baa24 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -42,6 +42,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #include "Python.h" #include "ucnhash.h" #include "bytes_methods.h" +#include "stringlib/eq.h" #ifdef MS_WINDOWS #include <windows.h> @@ -162,6 +163,14 @@ extern "C" { *_to++ = (to_type) *_iter++; \ } while (0) +#ifdef MS_WINDOWS + /* On Windows, overallocate by 50% is the best factor */ +# define OVERALLOCATE_FACTOR 2 +#else + /* On Linux, overallocate by 25% is the best factor */ +# define OVERALLOCATE_FACTOR 4 +#endif + /* This dictionary holds all interned unicode strings. Note that references to strings in this dictionary are *not* counted in the string's ob_refcnt. When the interned string reaches a refcnt of 0 the string deallocation @@ -263,7 +272,7 @@ raise_encode_exception(PyObject **exceptionObject, const char *reason); /* Same for linebreaks */ -static unsigned char ascii_linebreak[] = { +static const unsigned char ascii_linebreak[] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0x000A, * LINE FEED */ /* 0x000B, * LINE TABULATION */ @@ -292,6 +301,38 @@ static unsigned char ascii_linebreak[] = { #include "clinic/unicodeobject.c.h" +typedef enum { + _Py_ERROR_UNKNOWN=0, + _Py_ERROR_STRICT, + _Py_ERROR_SURROGATEESCAPE, + _Py_ERROR_REPLACE, + _Py_ERROR_IGNORE, + _Py_ERROR_BACKSLASHREPLACE, + _Py_ERROR_SURROGATEPASS, + _Py_ERROR_XMLCHARREFREPLACE, + _Py_ERROR_OTHER +} _Py_error_handler; + +static _Py_error_handler +get_error_handler(const char *errors) +{ + if (errors == NULL || strcmp(errors, "strict") == 0) + return _Py_ERROR_STRICT; + if (strcmp(errors, "surrogateescape") == 0) + return _Py_ERROR_SURROGATEESCAPE; + if (strcmp(errors, "replace") == 0) + return _Py_ERROR_REPLACE; + if (strcmp(errors, "ignore") == 0) + return _Py_ERROR_IGNORE; + if (strcmp(errors, "backslashreplace") == 0) + return _Py_ERROR_BACKSLASHREPLACE; + if (strcmp(errors, "surrogatepass") == 0) + return _Py_ERROR_SURROGATEPASS; + if (strcmp(errors, "xmlcharrefreplace") == 0) + return _Py_ERROR_XMLCHARREFREPLACE; + return _Py_ERROR_OTHER; +} + /* The max unicode value is always 0x10FFFF while using the PEP-393 API. This function is kept for backward compatibility with the old API. */ Py_UNICODE @@ -521,6 +562,129 @@ unicode_result_unchanged(PyObject *unicode) return _PyUnicode_Copy(unicode); } +/* Implementation of the "backslashreplace" error handler for 8-bit encodings: + ASCII, Latin1, UTF-8, etc. */ +static char* +backslashreplace(_PyBytesWriter *writer, char *str, + PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) +{ + Py_ssize_t size, i; + Py_UCS4 ch; + enum PyUnicode_Kind kind; + void *data; + + assert(PyUnicode_IS_READY(unicode)); + kind = PyUnicode_KIND(unicode); + data = PyUnicode_DATA(unicode); + + size = 0; + /* determine replacement size */ + for (i = collstart; i < collend; ++i) { + Py_ssize_t incr; + + ch = PyUnicode_READ(kind, data, i); + if (ch < 0x100) + incr = 2+2; + else if (ch < 0x10000) + incr = 2+4; + else { + assert(ch <= MAX_UNICODE); + incr = 2+8; + } + if (size > PY_SSIZE_T_MAX - incr) { + PyErr_SetString(PyExc_OverflowError, + "encoded result is too long for a Python string"); + return NULL; + } + size += incr; + } + + str = _PyBytesWriter_Prepare(writer, str, size); + if (str == NULL) + return NULL; + + /* generate replacement */ + for (i = collstart; i < collend; ++i) { + ch = PyUnicode_READ(kind, data, i); + *str++ = '\\'; + if (ch >= 0x00010000) { + *str++ = 'U'; + *str++ = Py_hexdigits[(ch>>28)&0xf]; + *str++ = Py_hexdigits[(ch>>24)&0xf]; + *str++ = Py_hexdigits[(ch>>20)&0xf]; + *str++ = Py_hexdigits[(ch>>16)&0xf]; + *str++ = Py_hexdigits[(ch>>12)&0xf]; + *str++ = Py_hexdigits[(ch>>8)&0xf]; + } + else if (ch >= 0x100) { + *str++ = 'u'; + *str++ = Py_hexdigits[(ch>>12)&0xf]; + *str++ = Py_hexdigits[(ch>>8)&0xf]; + } + else + *str++ = 'x'; + *str++ = Py_hexdigits[(ch>>4)&0xf]; + *str++ = Py_hexdigits[ch&0xf]; + } + return str; +} + +/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings: + ASCII, Latin1, UTF-8, etc. */ +static char* +xmlcharrefreplace(_PyBytesWriter *writer, char *str, + PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) +{ + Py_ssize_t size, i; + Py_UCS4 ch; + enum PyUnicode_Kind kind; + void *data; + + assert(PyUnicode_IS_READY(unicode)); + kind = PyUnicode_KIND(unicode); + data = PyUnicode_DATA(unicode); + + size = 0; + /* determine replacement size */ + for (i = collstart; i < collend; ++i) { + Py_ssize_t incr; + + ch = PyUnicode_READ(kind, data, i); + if (ch < 10) + incr = 2+1+1; + else if (ch < 100) + incr = 2+2+1; + else if (ch < 1000) + incr = 2+3+1; + else if (ch < 10000) + incr = 2+4+1; + else if (ch < 100000) + incr = 2+5+1; + else if (ch < 1000000) + incr = 2+6+1; + else { + assert(ch <= MAX_UNICODE); + incr = 2+7+1; + } + if (size > PY_SSIZE_T_MAX - incr) { + PyErr_SetString(PyExc_OverflowError, + "encoded result is too long for a Python string"); + return NULL; + } + size += incr; + } + + str = _PyBytesWriter_Prepare(writer, str, size); + if (str == NULL) + return NULL; + + /* generate replacement */ + for (i = collstart; i < collend; ++i) { + str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); + } + return str; +} + /* --- Bloom Filters ----------------------------------------------------- */ /* stuff to implement simple "bloom filters" for Unicode characters. @@ -647,27 +811,26 @@ Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind, Py_ssize_t size, Py_UCS4 ch, int direction) { - int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH; - switch (kind) { case PyUnicode_1BYTE_KIND: - { - Py_UCS1 ch1 = (Py_UCS1) ch; - if (ch1 == ch) - return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); - else - return -1; - } + if ((Py_UCS1) ch != ch) + return -1; + if (direction > 0) + return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch); + else + return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch); case PyUnicode_2BYTE_KIND: - { - Py_UCS2 ch2 = (Py_UCS2) ch; - if (ch2 == ch) - return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); - else - return -1; - } + if ((Py_UCS2) ch != ch) + return -1; + if (direction > 0) + return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch); + else + return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch); case PyUnicode_4BYTE_KIND: - return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); + if (direction > 0) + return ucs4lib_find_char((Py_UCS4 *) s, size, ch); + else + return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch); default: assert(0); return -1; @@ -3167,24 +3330,22 @@ wcstombs_errorpos(const wchar_t *wstr) static int locale_error_handler(const char *errors, int *surrogateescape) { - if (errors == NULL) { - *surrogateescape = 0; - return 0; - } - - if (strcmp(errors, "strict") == 0) { + _Py_error_handler error_handler = get_error_handler(errors); + switch (error_handler) + { + case _Py_ERROR_STRICT: *surrogateescape = 0; return 0; - } - if (strcmp(errors, "surrogateescape") == 0) { + case _Py_ERROR_SURROGATEESCAPE: *surrogateescape = 1; return 0; + default: + PyErr_Format(PyExc_ValueError, + "only 'strict' and 'surrogateescape' error handlers " + "are supported, not '%s'", + errors); + return -1; } - PyErr_Format(PyExc_ValueError, - "only 'strict' and 'surrogateescape' error handlers " - "are supported, not '%s'", - errors); - return -1; } PyObject * @@ -3976,7 +4137,7 @@ unicode_decode_call_errorhandler_wchar( Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, PyObject **output, Py_ssize_t *outpos) { - static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; + static const char *argparse = "O!n;decoding error handler must return (str, int) tuple"; PyObject *restuple = NULL; PyObject *repunicode = NULL; @@ -4084,7 +4245,7 @@ unicode_decode_call_errorhandler_writer( Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */) { - static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; + static const char *argparse = "O!n;decoding error handler must return (str, int) tuple"; PyObject *restuple = NULL; PyObject *repunicode = NULL; @@ -4690,8 +4851,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s, Py_ssize_t startinpos; Py_ssize_t endinpos; const char *errmsg = ""; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (size == 0) { if (consumed) @@ -4716,6 +4878,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, while (s < end) { Py_UCS4 ch; int kind = writer.kind; + if (kind == PyUnicode_1BYTE_KIND) { if (PyUnicode_IS_ASCII(writer.buffer)) ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); @@ -4754,24 +4917,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s, continue; } - if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, - "utf-8", errmsg, - &starts, &end, &startinpos, &endinpos, &exc, &s, - &writer)) - goto onError; + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) { + case _Py_ERROR_IGNORE: + s += (endinpos - startinpos); + break; + + case _Py_ERROR_REPLACE: + if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0) + goto onError; + s += (endinpos - startinpos); + break; + + case _Py_ERROR_SURROGATEESCAPE: + { + Py_ssize_t i; + + if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) + goto onError; + for (i=startinpos; i<endinpos; i++) { + ch = (Py_UCS4)(unsigned char)(starts[i]); + PyUnicode_WRITE(writer.kind, writer.data, writer.pos, + ch + 0xdc00); + writer.pos++; + } + s += (endinpos - startinpos); + break; + } + + default: + if (unicode_decode_call_errorhandler_writer( + errors, &error_handler_obj, + "utf-8", errmsg, + &starts, &end, &startinpos, &endinpos, &exc, &s, + &writer)) + goto onError; + } } End: if (consumed) *consumed = s - starts; - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return _PyUnicodeWriter_Finish(&writer); onError: - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); _PyUnicodeWriter_Dealloc(&writer); return NULL; @@ -5862,11 +6057,10 @@ PyObject * PyUnicode_AsUnicodeEscapeString(PyObject *unicode) { Py_ssize_t i, len; - PyObject *repr; char *p; int kind; void *data; - Py_ssize_t expandsize = 0; + _PyBytesWriter writer; /* Initial allocation is based on the longest-possible character escape. @@ -5882,35 +6076,28 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) } if (PyUnicode_READY(unicode) == -1) return NULL; + + _PyBytesWriter_Init(&writer); + len = PyUnicode_GET_LENGTH(unicode); kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); - switch (kind) { - case PyUnicode_1BYTE_KIND: expandsize = 4; break; - case PyUnicode_2BYTE_KIND: expandsize = 6; break; - case PyUnicode_4BYTE_KIND: expandsize = 10; break; - } - - if (len == 0) - return PyBytes_FromStringAndSize(NULL, 0); - - if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) - return PyErr_NoMemory(); - - repr = PyBytes_FromStringAndSize(NULL, - 2 - + expandsize*len - + 1); - if (repr == NULL) - return NULL; - p = PyBytes_AS_STRING(repr); + p = _PyBytesWriter_Alloc(&writer, len); + if (p == NULL) + goto error; + writer.overallocate = 1; for (i = 0; i < len; i++) { Py_UCS4 ch = PyUnicode_READ(kind, data, i); /* Escape backslashes */ if (ch == '\\') { + /* -1: substract 1 preallocated byte */ + p = _PyBytesWriter_Prepare(&writer, p, 2-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = (char) ch; continue; @@ -5919,6 +6106,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) /* Map 21-bit characters to '\U00xxxxxx' */ else if (ch >= 0x10000) { assert(ch <= MAX_UNICODE); + + p = _PyBytesWriter_Prepare(&writer, p, 10-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'U'; *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; @@ -5934,6 +6126,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) /* Map 16-bit characters to '\uxxxx' */ if (ch >= 256) { + p = _PyBytesWriter_Prepare(&writer, p, 6-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'u'; *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; @@ -5944,20 +6140,37 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) /* Map special whitespace to '\t', \n', '\r' */ else if (ch == '\t') { + p = _PyBytesWriter_Prepare(&writer, p, 2-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 't'; } else if (ch == '\n') { + p = _PyBytesWriter_Prepare(&writer, p, 2-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'n'; } else if (ch == '\r') { + p = _PyBytesWriter_Prepare(&writer, p, 2-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'r'; } /* Map non-printable US ASCII to '\xhh' */ else if (ch < ' ' || ch >= 0x7F) { + /* -1: substract 1 preallocated byte */ + p = _PyBytesWriter_Prepare(&writer, p, 4-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'x'; *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; @@ -5969,10 +6182,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) *p++ = (char) ch; } - assert(p - PyBytes_AS_STRING(repr) > 0); - if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) - return NULL; - return repr; + return _PyBytesWriter_Finish(&writer, p); + +error: + _PyBytesWriter_Dealloc(&writer); + return NULL; } PyObject * @@ -6101,13 +6315,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, PyObject * PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) { - PyObject *repr; char *p; - char *q; - Py_ssize_t expandsize, pos; + Py_ssize_t pos; int kind; void *data; Py_ssize_t len; + _PyBytesWriter writer; if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); @@ -6115,28 +6328,29 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) } if (PyUnicode_READY(unicode) == -1) return NULL; + + _PyBytesWriter_Init(&writer); + kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); len = PyUnicode_GET_LENGTH(unicode); - /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 - bytes, and 1 byte characters 4. */ - expandsize = kind * 2 + 2; - if (len > PY_SSIZE_T_MAX / expandsize) - return PyErr_NoMemory(); - - repr = PyBytes_FromStringAndSize(NULL, expandsize * len); - if (repr == NULL) - return NULL; - if (len == 0) - return repr; + p = _PyBytesWriter_Alloc(&writer, len); + if (p == NULL) + goto error; + writer.overallocate = 1; - p = q = PyBytes_AS_STRING(repr); for (pos = 0; pos < len; pos++) { Py_UCS4 ch = PyUnicode_READ(kind, data, pos); /* Map 32-bit characters to '\Uxxxxxxxx' */ if (ch >= 0x10000) { assert(ch <= MAX_UNICODE); + + /* -1: substract 1 preallocated byte */ + p = _PyBytesWriter_Prepare(&writer, p, 10-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'U'; *p++ = Py_hexdigits[(ch >> 28) & 0xf]; @@ -6150,6 +6364,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) } /* Map 16-bit characters to '\uxxxx' */ else if (ch >= 256) { + /* -1: substract 1 preallocated byte */ + p = _PyBytesWriter_Prepare(&writer, p, 6-1); + if (p == NULL) + goto error; + *p++ = '\\'; *p++ = 'u'; *p++ = Py_hexdigits[(ch >> 12) & 0xf]; @@ -6162,10 +6381,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) *p++ = (char) ch; } - assert(p > q); - if (_PyBytes_Resize(&repr, p - q) < 0) - return NULL; - return repr; + return _PyBytesWriter_Finish(&writer, p); + +error: + _PyBytesWriter_Dealloc(&writer); + return NULL; } PyObject * @@ -6342,7 +6562,7 @@ unicode_encode_call_errorhandler(const char *errors, Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos) { - static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; + static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; Py_ssize_t len; PyObject *restuple; PyObject *resunicode; @@ -6396,25 +6616,22 @@ unicode_encode_call_errorhandler(const char *errors, static PyObject * unicode_encode_ucs1(PyObject *unicode, const char *errors, - unsigned int limit) + const Py_UCS4 limit) { /* input state */ Py_ssize_t pos=0, size; int kind; void *data; - /* output object */ - PyObject *res; /* pointer into the output */ char *str; - /* current output position */ - Py_ssize_t ressize; const char *encoding = (limit == 256) ? "latin-1" : "ascii"; const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; - /* the following variable is used for caching string comparisons - * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ - int known_errorHandler = -1; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; + PyObject *rep = NULL; + /* output object */ + _PyBytesWriter writer; if (PyUnicode_READY(unicode) == -1) return NULL; @@ -6425,186 +6642,157 @@ unicode_encode_ucs1(PyObject *unicode, replacements, if we need more, we'll resize */ if (size == 0) return PyBytes_FromStringAndSize(NULL, 0); - res = PyBytes_FromStringAndSize(NULL, size); - if (res == NULL) + + _PyBytesWriter_Init(&writer); + str = _PyBytesWriter_Alloc(&writer, size); + if (str == NULL) return NULL; - str = PyBytes_AS_STRING(res); - ressize = size; while (pos < size) { - Py_UCS4 c = PyUnicode_READ(kind, data, pos); + Py_UCS4 ch = PyUnicode_READ(kind, data, pos); /* can we encode this? */ - if (c<limit) { + if (ch < limit) { /* no overflow check, because we know that the space is enough */ - *str++ = (char)c; + *str++ = (char)ch; ++pos; } else { - Py_ssize_t requiredsize; - PyObject *repunicode; - Py_ssize_t repsize, newpos, respos, i; + Py_ssize_t newpos, i; /* startpos for collecting unencodable chars */ Py_ssize_t collstart = pos; - Py_ssize_t collend = pos; + Py_ssize_t collend = collstart + 1; /* find all unecodable characters */ + while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit)) ++collend; + + /* Only overallocate the buffer if it's not the last write */ + writer.overallocate = (collend < size); + /* cache callback name lookup (if not done yet, i.e. it's the first error) */ - if (known_errorHandler==-1) { - if ((errors==NULL) || (!strcmp(errors, "strict"))) - known_errorHandler = 1; - else if (!strcmp(errors, "replace")) - known_errorHandler = 2; - else if (!strcmp(errors, "ignore")) - known_errorHandler = 3; - else if (!strcmp(errors, "xmlcharrefreplace")) - known_errorHandler = 4; - else - known_errorHandler = 0; - } - switch (known_errorHandler) { - case 1: /* strict */ + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) { + case _Py_ERROR_STRICT: raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason); goto onError; - case 2: /* replace */ - while (collstart++ < collend) - *str++ = '?'; /* fall through */ - case 3: /* ignore */ + + case _Py_ERROR_REPLACE: + memset(str, '?', collend - collstart); + str += (collend - collstart); + /* fall through ignore error handler */ + case _Py_ERROR_IGNORE: pos = collend; break; - case 4: /* xmlcharrefreplace */ - respos = str - PyBytes_AS_STRING(res); - requiredsize = respos; - /* determine replacement size */ + + case _Py_ERROR_BACKSLASHREPLACE: + /* substract preallocated bytes */ + writer.min_size -= (collend - collstart); + str = backslashreplace(&writer, str, + unicode, collstart, collend); + if (str == NULL) + goto onError; + pos = collend; + break; + + case _Py_ERROR_XMLCHARREFREPLACE: + /* substract preallocated bytes */ + writer.min_size -= (collend - collstart); + str = xmlcharrefreplace(&writer, str, + unicode, collstart, collend); + if (str == NULL) + goto onError; + pos = collend; + break; + + case _Py_ERROR_SURROGATEESCAPE: for (i = collstart; i < collend; ++i) { - Py_UCS4 ch = PyUnicode_READ(kind, data, i); - Py_ssize_t incr; - if (ch < 10) - incr = 2+1+1; - else if (ch < 100) - incr = 2+2+1; - else if (ch < 1000) - incr = 2+3+1; - else if (ch < 10000) - incr = 2+4+1; - else if (ch < 100000) - incr = 2+5+1; - else if (ch < 1000000) - incr = 2+6+1; - else { - assert(ch <= MAX_UNICODE); - incr = 2+7+1; + ch = PyUnicode_READ(kind, data, i); + if (ch < 0xdc80 || 0xdcff < ch) { + /* Not a UTF-8b surrogate */ + break; } - if (requiredsize > PY_SSIZE_T_MAX - incr) - goto overflow; - requiredsize += incr; - } - if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) - goto overflow; - requiredsize += size - collend; - if (requiredsize > ressize) { - if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) - requiredsize = 2*ressize; - if (_PyBytes_Resize(&res, requiredsize)) - goto onError; - str = PyBytes_AS_STRING(res) + respos; - ressize = requiredsize; - } - /* generate replacement */ - for (i = collstart; i < collend; ++i) { - str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); + *str++ = (char)(ch - 0xdc00); + ++pos; } - pos = collend; - break; + if (i >= collend) + break; + collstart = pos; + assert(collstart != collend); + /* fallback to general error handling */ + default: - repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, - encoding, reason, unicode, &exc, - collstart, collend, &newpos); - if (repunicode == NULL || (PyUnicode_Check(repunicode) && - PyUnicode_READY(repunicode) == -1)) + rep = unicode_encode_call_errorhandler(errors, &error_handler_obj, + encoding, reason, unicode, &exc, + collstart, collend, &newpos); + if (rep == NULL) goto onError; - if (PyBytes_Check(repunicode)) { + + /* substract preallocated bytes */ + writer.min_size -= 1; + + if (PyBytes_Check(rep)) { /* Directly copy bytes result to output. */ - repsize = PyBytes_Size(repunicode); - if (repsize > 1) { - /* Make room for all additional bytes. */ - respos = str - PyBytes_AS_STRING(res); - if (ressize > PY_SSIZE_T_MAX - repsize - 1) { - Py_DECREF(repunicode); - goto overflow; - } - if (_PyBytes_Resize(&res, ressize+repsize-1)) { - Py_DECREF(repunicode); - goto onError; - } - str = PyBytes_AS_STRING(res) + respos; - ressize += repsize-1; - } - memcpy(str, PyBytes_AsString(repunicode), repsize); - str += repsize; - pos = newpos; - Py_DECREF(repunicode); - break; - } - /* need more space? (at least enough for what we - have+the replacement+the rest of the string, so - we won't have to check space for encodable characters) */ - respos = str - PyBytes_AS_STRING(res); - repsize = PyUnicode_GET_LENGTH(repunicode); - requiredsize = respos; - if (requiredsize > PY_SSIZE_T_MAX - repsize) - goto overflow; - requiredsize += repsize; - if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) - goto overflow; - requiredsize += size - collend; - if (requiredsize > ressize) { - if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) - requiredsize = 2*ressize; - if (_PyBytes_Resize(&res, requiredsize)) { - Py_DECREF(repunicode); + str = _PyBytesWriter_WriteBytes(&writer, str, + PyBytes_AS_STRING(rep), + PyBytes_GET_SIZE(rep)); + if (str == NULL) goto onError; - } - str = PyBytes_AS_STRING(res) + respos; - ressize = requiredsize; } - /* check if there is anything unencodable in the replacement - and copy it to the output */ - for (i = 0; repsize-->0; ++i, ++str) { - c = PyUnicode_READ_CHAR(repunicode, i); - if (c >= limit) { - raise_encode_exception(&exc, encoding, unicode, - pos, pos+1, reason); - Py_DECREF(repunicode); + else { + assert(PyUnicode_Check(rep)); + + if (PyUnicode_READY(rep) < 0) goto onError; + + if (PyUnicode_IS_ASCII(rep)) { + /* Fast path: all characters are smaller than limit */ + assert(limit >= 128); + assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); + str = _PyBytesWriter_WriteBytes(&writer, str, + PyUnicode_DATA(rep), + PyUnicode_GET_LENGTH(rep)); + } + else { + Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep); + + str = _PyBytesWriter_Prepare(&writer, str, repsize); + if (str == NULL) + goto onError; + + /* check if there is anything unencodable in the + replacement and copy it to the output */ + for (i = 0; repsize-->0; ++i, ++str) { + ch = PyUnicode_READ_CHAR(rep, i); + if (ch >= limit) { + raise_encode_exception(&exc, encoding, unicode, + pos, pos+1, reason); + goto onError; + } + *str = (char)ch; + } } - *str = (char)c; } pos = newpos; - Py_DECREF(repunicode); + Py_CLEAR(rep); } + + /* If overallocation was disabled, ensure that it was the last + write. Otherwise, we missed an optimization */ + assert(writer.overallocate || pos == size); } } - /* Resize if we allocated to much */ - size = str - PyBytes_AS_STRING(res); - if (size < ressize) { /* If this falls res will be NULL */ - assert(size >= 0); - if (_PyBytes_Resize(&res, size) < 0) - goto onError; - } - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); - return res; - - overflow: - PyErr_SetString(PyExc_OverflowError, - "encoded result is too long for a Python string"); + return _PyBytesWriter_Finish(&writer, str); onError: - Py_XDECREF(res); - Py_XDECREF(errorHandler); + Py_XDECREF(rep); + _PyBytesWriter_Dealloc(&writer); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return NULL; } @@ -6664,8 +6852,9 @@ PyUnicode_DecodeASCII(const char *s, Py_ssize_t endinpos; Py_ssize_t outpos; const char *e; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; if (size == 0) _Py_RETURN_UNICODE_EMPTY(); @@ -6694,12 +6883,42 @@ PyUnicode_DecodeASCII(const char *s, PyUnicode_WRITE(kind, data, writer.pos, c); writer.pos++; ++s; + continue; } - else { + + /* byte outsize range 0x00..0x7f: call the error handler */ + + if (error_handler == _Py_ERROR_UNKNOWN) + error_handler = get_error_handler(errors); + + switch (error_handler) + { + case _Py_ERROR_REPLACE: + case _Py_ERROR_SURROGATEESCAPE: + /* Fast-path: the error handler only writes one character, + but we may switch to UCS2 at the first write */ + if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) + goto onError; + kind = writer.kind; + data = writer.data; + + if (error_handler == _Py_ERROR_REPLACE) + PyUnicode_WRITE(kind, data, writer.pos, 0xfffd); + else + PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00); + writer.pos++; + ++s; + break; + + case _Py_ERROR_IGNORE: + ++s; + break; + + default: startinpos = s-starts; endinpos = startinpos + 1; if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, + errors, &error_handler_obj, "ascii", "ordinal not in range(128)", &starts, &e, &startinpos, &endinpos, &exc, &s, &writer)) @@ -6708,13 +6927,13 @@ PyUnicode_DecodeASCII(const char *s, data = writer.data; } } - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return _PyUnicodeWriter_Finish(&writer); onError: _PyUnicodeWriter_Dealloc(&writer); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); Py_XDECREF(exc); return NULL; } @@ -6769,7 +6988,7 @@ PyUnicode_AsASCIIString(PyObject *unicode) # define WC_ERR_INVALID_CHARS 0x0080 #endif -static char* +static const char* code_page_name(UINT code_page, PyObject **obj) { *obj = NULL; @@ -6877,7 +7096,7 @@ decode_code_page_errors(UINT code_page, PyObject *errorHandler = NULL; PyObject *exc = NULL; PyObject *encoding_obj = NULL; - char *encoding; + const char *encoding; DWORD err; int ret = -1; @@ -7113,7 +7332,6 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes, BOOL usedDefaultChar = FALSE; BOOL *pusedDefaultChar = &usedDefaultChar; int outsize; - PyObject *exc = NULL; wchar_t *p; Py_ssize_t size; const DWORD flags = encode_code_page_flags(code_page, NULL); @@ -7222,7 +7440,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, PyObject *errorHandler = NULL; PyObject *exc = NULL; PyObject *encoding_obj = NULL; - char *encoding; + const char *encoding; Py_ssize_t newpos, newoutsize; PyObject *rep; int ret = -1; @@ -8080,7 +8298,7 @@ static int charmap_encoding_error( PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping, PyObject **exceptionObject, - int *known_errorHandler, PyObject **errorHandler, const char *errors, + _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors, PyObject **res, Py_ssize_t *respos) { PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ @@ -8127,23 +8345,15 @@ charmap_encoding_error( } /* cache callback name lookup * (if not done yet, i.e. it's the first error) */ - if (*known_errorHandler==-1) { - if ((errors==NULL) || (!strcmp(errors, "strict"))) - *known_errorHandler = 1; - else if (!strcmp(errors, "replace")) - *known_errorHandler = 2; - else if (!strcmp(errors, "ignore")) - *known_errorHandler = 3; - else if (!strcmp(errors, "xmlcharrefreplace")) - *known_errorHandler = 4; - else - *known_errorHandler = 0; - } - switch (*known_errorHandler) { - case 1: /* strict */ + if (*error_handler == _Py_ERROR_UNKNOWN) + *error_handler = get_error_handler(errors); + + switch (*error_handler) { + case _Py_ERROR_STRICT: raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason); return -1; - case 2: /* replace */ + + case _Py_ERROR_REPLACE: for (collpos = collstartpos; collpos<collendpos; ++collpos) { x = charmapencode_output('?', mapping, res, respos); if (x==enc_EXCEPTION) { @@ -8155,10 +8365,11 @@ charmap_encoding_error( } } /* fall through */ - case 3: /* ignore */ + case _Py_ERROR_IGNORE: *inpos = collendpos; break; - case 4: /* xmlcharrefreplace */ + + case _Py_ERROR_XMLCHARREFREPLACE: /* generate replacement (temporarily (mis)uses p) */ for (collpos = collstartpos; collpos < collendpos; ++collpos) { char buffer[2+29+1+1]; @@ -8176,8 +8387,9 @@ charmap_encoding_error( } *inpos = collendpos; break; + default: - repunicode = unicode_encode_call_errorhandler(errors, errorHandler, + repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj, encoding, reason, unicode, exceptionObject, collstartpos, collendpos, &newpos); if (repunicode == NULL) @@ -8240,12 +8452,9 @@ _PyUnicode_EncodeCharmap(PyObject *unicode, Py_ssize_t size; /* current output position */ Py_ssize_t respos = 0; - PyObject *errorHandler = NULL; + PyObject *error_handler_obj = NULL; PyObject *exc = NULL; - /* the following variable is used for caching string comparisons - * -1=not initialized, 0=unknown, 1=strict, 2=replace, - * 3=ignore, 4=xmlcharrefreplace */ - int known_errorHandler = -1; + _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; void *data; int kind; @@ -8276,7 +8485,7 @@ _PyUnicode_EncodeCharmap(PyObject *unicode, if (x==enc_FAILED) { /* unencodable character */ if (charmap_encoding_error(unicode, &inpos, mapping, &exc, - &known_errorHandler, &errorHandler, errors, + &error_handler, &error_handler_obj, errors, &res, &respos)) { goto onError; } @@ -8292,13 +8501,13 @@ _PyUnicode_EncodeCharmap(PyObject *unicode, goto onError; Py_XDECREF(exc); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); return res; onError: Py_XDECREF(res); Py_XDECREF(exc); - Py_XDECREF(errorHandler); + Py_XDECREF(error_handler_obj); return NULL; } @@ -8365,7 +8574,7 @@ unicode_translate_call_errorhandler(const char *errors, Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos) { - static char *argparse = "O!n;translating error handler must return (str, int) tuple"; + static const char *argparse = "O!n;translating error handler must return (str, int) tuple"; Py_ssize_t i_newpos; PyObject *restuple; @@ -8622,7 +8831,7 @@ exit: return res; } -PyObject * +static PyObject * _PyUnicode_TranslateCharmap(PyObject *input, PyObject *mapping, const char *errors) @@ -10899,6 +11108,12 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op) } int +_PyUnicode_EQ(PyObject *aa, PyObject *bb) +{ + return unicode_eq(aa, bb); +} + +int PyUnicode_Contains(PyObject *container, PyObject *element) { PyObject *str, *sub; @@ -11947,7 +12162,7 @@ unicode_lower(PyObject *self) #define BOTHSTRIP 2 /* Arrays indexed by above */ -static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; +static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; #define STRIPNAME(i) (stripformat[i]+3) @@ -13231,44 +13446,50 @@ unicode_endswith(PyObject *self, Py_LOCAL_INLINE(void) _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) { - if (!writer->readonly) + writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); + writer->data = PyUnicode_DATA(writer->buffer); + + if (!writer->readonly) { + writer->kind = PyUnicode_KIND(writer->buffer); writer->size = PyUnicode_GET_LENGTH(writer->buffer); + } else { + /* use a value smaller than PyUnicode_1BYTE_KIND() so + _PyUnicodeWriter_PrepareKind() will copy the buffer. */ + writer->kind = PyUnicode_WCHAR_KIND; + assert(writer->kind <= PyUnicode_1BYTE_KIND); + /* Copy-on-write mode: set buffer size to 0 so * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on * next write. */ writer->size = 0; } - writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); - writer->data = PyUnicode_DATA(writer->buffer); - writer->kind = PyUnicode_KIND(writer->buffer); } void _PyUnicodeWriter_Init(_PyUnicodeWriter *writer) { memset(writer, 0, sizeof(*writer)); -#ifdef Py_DEBUG - writer->kind = 5; /* invalid kind */ -#endif + + /* ASCII is the bare minimum */ writer->min_char = 127; + + /* use a value smaller than PyUnicode_1BYTE_KIND() so + _PyUnicodeWriter_PrepareKind() will copy the buffer. */ + writer->kind = PyUnicode_WCHAR_KIND; + assert(writer->kind <= PyUnicode_1BYTE_KIND); } int _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, Py_ssize_t length, Py_UCS4 maxchar) { -#ifdef MS_WINDOWS - /* On Windows, overallocate by 50% is the best factor */ -# define OVERALLOCATE_FACTOR 2 -#else - /* On Linux, overallocate by 25% is the best factor */ -# define OVERALLOCATE_FACTOR 4 -#endif Py_ssize_t newlen; PyObject *newbuffer; - assert(length > 0); + /* ensure that the _PyUnicodeWriter_Prepare macro was used */ + assert((maxchar > writer->maxchar && length >= 0) + || length > 0); if (length > PY_SSIZE_T_MAX - writer->pos) { PyErr_NoMemory(); @@ -13334,6 +13555,28 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, #undef OVERALLOCATE_FACTOR } +int +_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, + enum PyUnicode_Kind kind) +{ + Py_UCS4 maxchar; + + /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */ + assert(writer->kind < kind); + + switch (kind) + { + case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; + case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; + case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break; + default: + assert(0 && "invalid kind"); + return -1; + } + + return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); +} + Py_LOCAL_INLINE(int) _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch) { @@ -13504,17 +13747,26 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) assert(PyUnicode_GET_LENGTH(str) == writer->pos); return str; } - if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { - PyObject *newbuffer; - newbuffer = resize_compact(writer->buffer, writer->pos); - if (newbuffer == NULL) { - Py_CLEAR(writer->buffer); - return NULL; + if (writer->pos == 0) { + Py_CLEAR(writer->buffer); + + /* Get the empty Unicode string singleton ('') */ + _Py_INCREF_UNICODE_EMPTY(); + str = unicode_empty; + } + else { + str = writer->buffer; + writer->buffer = NULL; + + if (PyUnicode_GET_LENGTH(str) != writer->pos) { + PyObject *str2; + str2 = resize_compact(str, writer->pos); + if (str2 == NULL) + return NULL; + str = str2; } - writer->buffer = newbuffer; } - str = writer->buffer; - writer->buffer = NULL; + assert(_PyUnicode_CheckConsistency(str, 1)); return unicode_result_ready(str); } |