diff options
Diffstat (limited to 'Objects/unicodeobject.c')
| -rw-r--r-- | Objects/unicodeobject.c | 1450 | 
1 files changed, 740 insertions, 710 deletions
| diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index b146da952d..1e7cba604c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -42,6 +42,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.  #include "Python.h"  #include "ucnhash.h"  #include "bytes_methods.h" +#include "stringlib/eq.h"  #ifdef MS_WINDOWS  #include <windows.h> @@ -162,6 +163,14 @@ extern "C" {              *_to++ = (to_type) *_iter++;                \      } while (0) +#ifdef MS_WINDOWS +   /* On Windows, overallocate by 50% is the best factor */ +#  define OVERALLOCATE_FACTOR 2 +#else +   /* On Linux, overallocate by 25% is the best factor */ +#  define OVERALLOCATE_FACTOR 4 +#endif +  /* This dictionary holds all interned unicode strings.  Note that references     to strings in this dictionary are *not* counted in the string's ob_refcnt.     When the interned string reaches a refcnt of 0 the string deallocation @@ -263,7 +272,7 @@ raise_encode_exception(PyObject **exceptionObject,                         const char *reason);  /* Same for linebreaks */ -static unsigned char ascii_linebreak[] = { +static const unsigned char ascii_linebreak[] = {      0, 0, 0, 0, 0, 0, 0, 0,  /*         0x000A, * LINE FEED */  /*         0x000B, * LINE TABULATION */ @@ -292,6 +301,38 @@ static unsigned char ascii_linebreak[] = {  #include "clinic/unicodeobject.c.h" +typedef enum { +    _Py_ERROR_UNKNOWN=0, +    _Py_ERROR_STRICT, +    _Py_ERROR_SURROGATEESCAPE, +    _Py_ERROR_REPLACE, +    _Py_ERROR_IGNORE, +    _Py_ERROR_BACKSLASHREPLACE, +    _Py_ERROR_SURROGATEPASS, +    _Py_ERROR_XMLCHARREFREPLACE, +    _Py_ERROR_OTHER +} _Py_error_handler; + +static _Py_error_handler +get_error_handler(const char *errors) +{ +    if (errors == NULL || strcmp(errors, "strict") == 0) +        return _Py_ERROR_STRICT; +    if (strcmp(errors, "surrogateescape") == 0) +        return _Py_ERROR_SURROGATEESCAPE; +    if (strcmp(errors, "replace") == 0) +        return _Py_ERROR_REPLACE; +    if (strcmp(errors, "ignore") == 0) +        return _Py_ERROR_IGNORE; +    if (strcmp(errors, "backslashreplace") == 0) +        return _Py_ERROR_BACKSLASHREPLACE; +    if (strcmp(errors, "surrogatepass") == 0) +        return _Py_ERROR_SURROGATEPASS; +    if (strcmp(errors, "xmlcharrefreplace") == 0) +        return _Py_ERROR_XMLCHARREFREPLACE; +    return _Py_ERROR_OTHER; +} +  /* The max unicode value is always 0x10FFFF while using the PEP-393 API.     This function is kept for backward compatibility with the old API. */  Py_UNICODE @@ -521,6 +562,129 @@ unicode_result_unchanged(PyObject *unicode)          return _PyUnicode_Copy(unicode);  } +/* Implementation of the "backslashreplace" error handler for 8-bit encodings: +   ASCII, Latin1, UTF-8, etc. */ +static char* +backslashreplace(_PyBytesWriter *writer, char *str, +                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) +{ +    Py_ssize_t size, i; +    Py_UCS4 ch; +    enum PyUnicode_Kind kind; +    void *data; + +    assert(PyUnicode_IS_READY(unicode)); +    kind = PyUnicode_KIND(unicode); +    data = PyUnicode_DATA(unicode); + +    size = 0; +    /* determine replacement size */ +    for (i = collstart; i < collend; ++i) { +        Py_ssize_t incr; + +        ch = PyUnicode_READ(kind, data, i); +        if (ch < 0x100) +            incr = 2+2; +        else if (ch < 0x10000) +            incr = 2+4; +        else { +            assert(ch <= MAX_UNICODE); +            incr = 2+8; +        } +        if (size > PY_SSIZE_T_MAX - incr) { +            PyErr_SetString(PyExc_OverflowError, +                            "encoded result is too long for a Python string"); +            return NULL; +        } +        size += incr; +    } + +    str = _PyBytesWriter_Prepare(writer, str, size); +    if (str == NULL) +        return NULL; + +    /* generate replacement */ +    for (i = collstart; i < collend; ++i) { +        ch = PyUnicode_READ(kind, data, i); +        *str++ = '\\'; +        if (ch >= 0x00010000) { +            *str++ = 'U'; +            *str++ = Py_hexdigits[(ch>>28)&0xf]; +            *str++ = Py_hexdigits[(ch>>24)&0xf]; +            *str++ = Py_hexdigits[(ch>>20)&0xf]; +            *str++ = Py_hexdigits[(ch>>16)&0xf]; +            *str++ = Py_hexdigits[(ch>>12)&0xf]; +            *str++ = Py_hexdigits[(ch>>8)&0xf]; +        } +        else if (ch >= 0x100) { +            *str++ = 'u'; +            *str++ = Py_hexdigits[(ch>>12)&0xf]; +            *str++ = Py_hexdigits[(ch>>8)&0xf]; +        } +        else +            *str++ = 'x'; +        *str++ = Py_hexdigits[(ch>>4)&0xf]; +        *str++ = Py_hexdigits[ch&0xf]; +    } +    return str; +} + +/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings: +   ASCII, Latin1, UTF-8, etc. */ +static char* +xmlcharrefreplace(_PyBytesWriter *writer, char *str, +                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend) +{ +    Py_ssize_t size, i; +    Py_UCS4 ch; +    enum PyUnicode_Kind kind; +    void *data; + +    assert(PyUnicode_IS_READY(unicode)); +    kind = PyUnicode_KIND(unicode); +    data = PyUnicode_DATA(unicode); + +    size = 0; +    /* determine replacement size */ +    for (i = collstart; i < collend; ++i) { +        Py_ssize_t incr; + +        ch = PyUnicode_READ(kind, data, i); +        if (ch < 10) +            incr = 2+1+1; +        else if (ch < 100) +            incr = 2+2+1; +        else if (ch < 1000) +            incr = 2+3+1; +        else if (ch < 10000) +            incr = 2+4+1; +        else if (ch < 100000) +            incr = 2+5+1; +        else if (ch < 1000000) +            incr = 2+6+1; +        else { +            assert(ch <= MAX_UNICODE); +            incr = 2+7+1; +        } +        if (size > PY_SSIZE_T_MAX - incr) { +            PyErr_SetString(PyExc_OverflowError, +                            "encoded result is too long for a Python string"); +            return NULL; +        } +        size += incr; +    } + +    str = _PyBytesWriter_Prepare(writer, str, size); +    if (str == NULL) +        return NULL; + +    /* generate replacement */ +    for (i = collstart; i < collend; ++i) { +        str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); +    } +    return str; +} +  /* --- Bloom Filters ----------------------------------------------------- */  /* stuff to implement simple "bloom filters" for Unicode characters. @@ -587,6 +751,18 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)  #undef BLOOM_UPDATE  } +static int +ensure_unicode(PyObject *obj) +{ +    if (!PyUnicode_Check(obj)) { +        PyErr_Format(PyExc_TypeError, +                     "must be str, not %.100s", +                     Py_TYPE(obj)->tp_name); +        return -1; +    } +    return PyUnicode_READY(obj); +} +  /* Compilation of templated routines */  #include "stringlib/asciilib.h" @@ -647,27 +823,26 @@ Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,                                       Py_ssize_t size, Py_UCS4 ch,                                       int direction)  { -    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH; -      switch (kind) {      case PyUnicode_1BYTE_KIND: -        { -            Py_UCS1 ch1 = (Py_UCS1) ch; -            if (ch1 == ch) -                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode); -            else -                return -1; -        } +        if ((Py_UCS1) ch != ch) +            return -1; +        if (direction > 0) +            return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch); +        else +            return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);      case PyUnicode_2BYTE_KIND: -        { -            Py_UCS2 ch2 = (Py_UCS2) ch; -            if (ch2 == ch) -                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode); -            else -                return -1; -        } +        if ((Py_UCS2) ch != ch) +            return -1; +        if (direction > 0) +            return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch); +        else +            return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);      case PyUnicode_4BYTE_KIND: -        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode); +        if (direction > 0) +            return ucs4lib_find_char((Py_UCS4 *) s, size, ch); +        else +            return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);      default:          assert(0);          return -1; @@ -2903,7 +3078,7 @@ PyUnicode_FromEncodedObject(PyObject *obj,      /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */      if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {          PyErr_Format(PyExc_TypeError, -                     "coercing to str: need a bytes-like object, %.80s found", +                     "decoding to str: need a bytes-like object, %.80s found",                       Py_TYPE(obj)->tp_name);          return NULL;      } @@ -3167,24 +3342,22 @@ wcstombs_errorpos(const wchar_t *wstr)  static int  locale_error_handler(const char *errors, int *surrogateescape)  { -    if (errors == NULL) { -        *surrogateescape = 0; -        return 0; -    } - -    if (strcmp(errors, "strict") == 0) { +    _Py_error_handler error_handler = get_error_handler(errors); +    switch (error_handler) +    { +    case _Py_ERROR_STRICT:          *surrogateescape = 0;          return 0; -    } -    if (strcmp(errors, "surrogateescape") == 0) { +    case _Py_ERROR_SURROGATEESCAPE:          *surrogateescape = 1;          return 0; +    default: +        PyErr_Format(PyExc_ValueError, +                     "only 'strict' and 'surrogateescape' error handlers " +                     "are supported, not '%s'", +                     errors); +        return -1;      } -    PyErr_Format(PyExc_ValueError, -                 "only 'strict' and 'surrogateescape' error handlers " -                 "are supported, not '%s'", -                 errors); -    return -1;  }  PyObject * @@ -3626,19 +3799,17 @@ PyUnicode_FSConverter(PyObject* arg, void* addr)          output = arg;          Py_INCREF(output);      } -    else { -        arg = PyUnicode_FromObject(arg); -        if (!arg) -            return 0; +    else if (PyUnicode_Check(arg)) {          output = PyUnicode_EncodeFSDefault(arg); -        Py_DECREF(arg);          if (!output)              return 0; -        if (!PyBytes_Check(output)) { -            Py_DECREF(output); -            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); -            return 0; -        } +        assert(PyBytes_Check(output)); +    } +    else { +        PyErr_Format(PyExc_TypeError, +                     "must be str or bytes, not %.100s", +                     Py_TYPE(arg)->tp_name); +        return 0;      }      size = PyBytes_GET_SIZE(output);      data = PyBytes_AS_STRING(output); @@ -3710,7 +3881,7 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)      if (PyUnicode_UTF8(unicode) == NULL) {          assert(!PyUnicode_IS_COMPACT_ASCII(unicode)); -        bytes = _PyUnicode_AsUTF8String(unicode, "strict"); +        bytes = _PyUnicode_AsUTF8String(unicode, NULL);          if (bytes == NULL)              return NULL;          _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1); @@ -3976,7 +4147,7 @@ unicode_decode_call_errorhandler_wchar(      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,      PyObject **output, Py_ssize_t *outpos)  { -    static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; +    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";      PyObject *restuple = NULL;      PyObject *repunicode = NULL; @@ -4084,7 +4255,7 @@ unicode_decode_call_errorhandler_writer(      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,      _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)  { -    static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; +    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";      PyObject *restuple = NULL;      PyObject *repunicode = NULL; @@ -4690,8 +4861,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,      Py_ssize_t startinpos;      Py_ssize_t endinpos;      const char *errmsg = ""; -    PyObject *errorHandler = NULL; +    PyObject *error_handler_obj = NULL;      PyObject *exc = NULL; +    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;      if (size == 0) {          if (consumed) @@ -4716,6 +4888,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,      while (s < end) {          Py_UCS4 ch;          int kind = writer.kind; +          if (kind == PyUnicode_1BYTE_KIND) {              if (PyUnicode_IS_ASCII(writer.buffer))                  ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); @@ -4754,24 +4927,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s,              continue;          } -        if (unicode_decode_call_errorhandler_writer( -                errors, &errorHandler, -                "utf-8", errmsg, -                &starts, &end, &startinpos, &endinpos, &exc, &s, -                &writer)) -            goto onError; +        if (error_handler == _Py_ERROR_UNKNOWN) +            error_handler = get_error_handler(errors); + +        switch (error_handler) { +        case _Py_ERROR_IGNORE: +            s += (endinpos - startinpos); +            break; + +        case _Py_ERROR_REPLACE: +            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0) +                goto onError; +            s += (endinpos - startinpos); +            break; + +        case _Py_ERROR_SURROGATEESCAPE: +        { +            Py_ssize_t i; + +            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) +                goto onError; +            for (i=startinpos; i<endinpos; i++) { +                ch = (Py_UCS4)(unsigned char)(starts[i]); +                PyUnicode_WRITE(writer.kind, writer.data, writer.pos, +                                ch + 0xdc00); +                writer.pos++; +            } +            s += (endinpos - startinpos); +            break; +        } + +        default: +            if (unicode_decode_call_errorhandler_writer( +                    errors, &error_handler_obj, +                    "utf-8", errmsg, +                    &starts, &end, &startinpos, &endinpos, &exc, &s, +                    &writer)) +                goto onError; +        }      }  End:      if (consumed)          *consumed = s - starts; -    Py_XDECREF(errorHandler); +    Py_XDECREF(error_handler_obj);      Py_XDECREF(exc);      return _PyUnicodeWriter_Finish(&writer);  onError: -    Py_XDECREF(errorHandler); +    Py_XDECREF(error_handler_obj);      Py_XDECREF(exc);      _PyUnicodeWriter_Dealloc(&writer);      return NULL; @@ -5862,11 +6067,10 @@ PyObject *  PyUnicode_AsUnicodeEscapeString(PyObject *unicode)  {      Py_ssize_t i, len; -    PyObject *repr;      char *p;      int kind;      void *data; -    Py_ssize_t expandsize = 0; +    _PyBytesWriter writer;      /* Initial allocation is based on the longest-possible character         escape. @@ -5882,35 +6086,28 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)      }      if (PyUnicode_READY(unicode) == -1)          return NULL; + +    _PyBytesWriter_Init(&writer); +      len = PyUnicode_GET_LENGTH(unicode);      kind = PyUnicode_KIND(unicode);      data = PyUnicode_DATA(unicode); -    switch (kind) { -    case PyUnicode_1BYTE_KIND: expandsize = 4; break; -    case PyUnicode_2BYTE_KIND: expandsize = 6; break; -    case PyUnicode_4BYTE_KIND: expandsize = 10; break; -    } - -    if (len == 0) -        return PyBytes_FromStringAndSize(NULL, 0); - -    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) -        return PyErr_NoMemory(); - -    repr = PyBytes_FromStringAndSize(NULL, -                                     2 -                                     + expandsize*len -                                     + 1); -    if (repr == NULL) -        return NULL; -    p = PyBytes_AS_STRING(repr); +    p = _PyBytesWriter_Alloc(&writer, len); +    if (p == NULL) +        goto error; +    writer.overallocate = 1;      for (i = 0; i < len; i++) {          Py_UCS4 ch = PyUnicode_READ(kind, data, i);          /* Escape backslashes */          if (ch == '\\') { +            /* -1: substract 1 preallocated byte */ +            p = _PyBytesWriter_Prepare(&writer, p, 2-1); +            if (p == NULL) +                goto error; +              *p++ = '\\';              *p++ = (char) ch;              continue; @@ -5919,6 +6116,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)          /* Map 21-bit characters to '\U00xxxxxx' */          else if (ch >= 0x10000) {              assert(ch <= MAX_UNICODE); + +            p = _PyBytesWriter_Prepare(&writer, p, 10-1); +            if (p == NULL) +                goto error; +              *p++ = '\\';              *p++ = 'U';              *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; @@ -5934,6 +6136,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)          /* Map 16-bit characters to '\uxxxx' */          if (ch >= 256) { +            p = _PyBytesWriter_Prepare(&writer, p, 6-1); +            if (p == NULL) +                goto error; +              *p++ = '\\';              *p++ = 'u';              *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; @@ -5944,20 +6150,37 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)          /* Map special whitespace to '\t', \n', '\r' */          else if (ch == '\t') { +            p = _PyBytesWriter_Prepare(&writer, p, 2-1); +            if (p == NULL) +                goto error; +              *p++ = '\\';              *p++ = 't';          }          else if (ch == '\n') { +            p = _PyBytesWriter_Prepare(&writer, p, 2-1); +            if (p == NULL) +                goto error; +              *p++ = '\\';              *p++ = 'n';          }          else if (ch == '\r') { +            p = _PyBytesWriter_Prepare(&writer, p, 2-1); +            if (p == NULL) +                goto error; +              *p++ = '\\';              *p++ = 'r';          }          /* Map non-printable US ASCII to '\xhh' */          else if (ch < ' ' || ch >= 0x7F) { +            /* -1: substract 1 preallocated byte */ +            p = _PyBytesWriter_Prepare(&writer, p, 4-1); +            if (p == NULL) +                goto error; +              *p++ = '\\';              *p++ = 'x';              *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; @@ -5969,10 +6192,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)              *p++ = (char) ch;      } -    assert(p - PyBytes_AS_STRING(repr) > 0); -    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) -        return NULL; -    return repr; +    return _PyBytesWriter_Finish(&writer, p); + +error: +    _PyBytesWriter_Dealloc(&writer); +    return NULL;  }  PyObject * @@ -6101,13 +6325,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,  PyObject *  PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)  { -    PyObject *repr;      char *p; -    char *q; -    Py_ssize_t expandsize, pos; +    Py_ssize_t pos;      int kind;      void *data;      Py_ssize_t len; +    _PyBytesWriter writer;      if (!PyUnicode_Check(unicode)) {          PyErr_BadArgument(); @@ -6115,28 +6338,29 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)      }      if (PyUnicode_READY(unicode) == -1)          return NULL; + +    _PyBytesWriter_Init(&writer); +      kind = PyUnicode_KIND(unicode);      data = PyUnicode_DATA(unicode);      len = PyUnicode_GET_LENGTH(unicode); -    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 -       bytes, and 1 byte characters 4. */ -    expandsize = kind * 2 + 2; -    if (len > PY_SSIZE_T_MAX / expandsize) -        return PyErr_NoMemory(); - -    repr = PyBytes_FromStringAndSize(NULL, expandsize * len); -    if (repr == NULL) -        return NULL; -    if (len == 0) -        return repr; +    p = _PyBytesWriter_Alloc(&writer, len); +    if (p == NULL) +        goto error; +    writer.overallocate = 1; -    p = q = PyBytes_AS_STRING(repr);      for (pos = 0; pos < len; pos++) {          Py_UCS4 ch = PyUnicode_READ(kind, data, pos);          /* Map 32-bit characters to '\Uxxxxxxxx' */          if (ch >= 0x10000) {              assert(ch <= MAX_UNICODE); + +            /* -1: substract 1 preallocated byte */ +            p = _PyBytesWriter_Prepare(&writer, p, 10-1); +            if (p == NULL) +                goto error; +              *p++ = '\\';              *p++ = 'U';              *p++ = Py_hexdigits[(ch >> 28) & 0xf]; @@ -6150,6 +6374,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)          }          /* Map 16-bit characters to '\uxxxx' */          else if (ch >= 256) { +            /* -1: substract 1 preallocated byte */ +            p = _PyBytesWriter_Prepare(&writer, p, 6-1); +            if (p == NULL) +                goto error; +              *p++ = '\\';              *p++ = 'u';              *p++ = Py_hexdigits[(ch >> 12) & 0xf]; @@ -6162,10 +6391,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)              *p++ = (char) ch;      } -    assert(p > q); -    if (_PyBytes_Resize(&repr, p - q) < 0) -        return NULL; -    return repr; +    return _PyBytesWriter_Finish(&writer, p); + +error: +    _PyBytesWriter_Dealloc(&writer); +    return NULL;  }  PyObject * @@ -6342,7 +6572,7 @@ unicode_encode_call_errorhandler(const char *errors,                                   Py_ssize_t startpos, Py_ssize_t endpos,                                   Py_ssize_t *newpos)  { -    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple"; +    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";      Py_ssize_t len;      PyObject *restuple;      PyObject *resunicode; @@ -6396,25 +6626,22 @@ unicode_encode_call_errorhandler(const char *errors,  static PyObject *  unicode_encode_ucs1(PyObject *unicode,                      const char *errors, -                    unsigned int limit) +                    const Py_UCS4 limit)  {      /* input state */      Py_ssize_t pos=0, size;      int kind;      void *data; -    /* output object */ -    PyObject *res;      /* pointer into the output */      char *str; -    /* current output position */ -    Py_ssize_t ressize;      const char *encoding = (limit == 256) ? "latin-1" : "ascii";      const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; -    PyObject *errorHandler = NULL; +    PyObject *error_handler_obj = NULL;      PyObject *exc = NULL; -    /* the following variable is used for caching string comparisons -     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ -    int known_errorHandler = -1; +    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN; +    PyObject *rep = NULL; +    /* output object */ +    _PyBytesWriter writer;      if (PyUnicode_READY(unicode) == -1)          return NULL; @@ -6425,186 +6652,157 @@ unicode_encode_ucs1(PyObject *unicode,         replacements, if we need more, we'll resize */      if (size == 0)          return PyBytes_FromStringAndSize(NULL, 0); -    res = PyBytes_FromStringAndSize(NULL, size); -    if (res == NULL) + +    _PyBytesWriter_Init(&writer); +    str = _PyBytesWriter_Alloc(&writer, size); +    if (str == NULL)          return NULL; -    str = PyBytes_AS_STRING(res); -    ressize = size;      while (pos < size) { -        Py_UCS4 c = PyUnicode_READ(kind, data, pos); +        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);          /* can we encode this? */ -        if (c<limit) { +        if (ch < limit) {              /* no overflow check, because we know that the space is enough */ -            *str++ = (char)c; +            *str++ = (char)ch;              ++pos;          }          else { -            Py_ssize_t requiredsize; -            PyObject *repunicode; -            Py_ssize_t repsize, newpos, respos, i; +            Py_ssize_t newpos, i;              /* startpos for collecting unencodable chars */              Py_ssize_t collstart = pos; -            Py_ssize_t collend = pos; +            Py_ssize_t collend = collstart + 1;              /* find all unecodable characters */ +              while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))                  ++collend; + +            /* Only overallocate the buffer if it's not the last write */ +            writer.overallocate = (collend < size); +              /* cache callback name lookup (if not done yet, i.e. it's the first error) */ -            if (known_errorHandler==-1) { -                if ((errors==NULL) || (!strcmp(errors, "strict"))) -                    known_errorHandler = 1; -                else if (!strcmp(errors, "replace")) -                    known_errorHandler = 2; -                else if (!strcmp(errors, "ignore")) -                    known_errorHandler = 3; -                else if (!strcmp(errors, "xmlcharrefreplace")) -                    known_errorHandler = 4; -                else -                    known_errorHandler = 0; -            } -            switch (known_errorHandler) { -            case 1: /* strict */ +            if (error_handler == _Py_ERROR_UNKNOWN) +                error_handler = get_error_handler(errors); + +            switch (error_handler) { +            case _Py_ERROR_STRICT:                  raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);                  goto onError; -            case 2: /* replace */ -                while (collstart++ < collend) -                    *str++ = '?'; /* fall through */ -            case 3: /* ignore */ + +            case _Py_ERROR_REPLACE: +                memset(str, '?', collend - collstart); +                str += (collend - collstart); +                /* fall through ignore error handler */ +            case _Py_ERROR_IGNORE:                  pos = collend;                  break; -            case 4: /* xmlcharrefreplace */ -                respos = str - PyBytes_AS_STRING(res); -                requiredsize = respos; -                /* determine replacement size */ + +            case _Py_ERROR_BACKSLASHREPLACE: +                /* substract preallocated bytes */ +                writer.min_size -= (collend - collstart); +                str = backslashreplace(&writer, str, +                                       unicode, collstart, collend); +                if (str == NULL) +                    goto onError; +                pos = collend; +                break; + +            case _Py_ERROR_XMLCHARREFREPLACE: +                /* substract preallocated bytes */ +                writer.min_size -= (collend - collstart); +                str = xmlcharrefreplace(&writer, str, +                                        unicode, collstart, collend); +                if (str == NULL) +                    goto onError; +                pos = collend; +                break; + +            case _Py_ERROR_SURROGATEESCAPE:                  for (i = collstart; i < collend; ++i) { -                    Py_UCS4 ch = PyUnicode_READ(kind, data, i); -                    Py_ssize_t incr; -                    if (ch < 10) -                        incr = 2+1+1; -                    else if (ch < 100) -                        incr = 2+2+1; -                    else if (ch < 1000) -                        incr = 2+3+1; -                    else if (ch < 10000) -                        incr = 2+4+1; -                    else if (ch < 100000) -                        incr = 2+5+1; -                    else if (ch < 1000000) -                        incr = 2+6+1; -                    else { -                        assert(ch <= MAX_UNICODE); -                        incr = 2+7+1; +                    ch = PyUnicode_READ(kind, data, i); +                    if (ch < 0xdc80 || 0xdcff < ch) { +                        /* Not a UTF-8b surrogate */ +                        break;                      } -                    if (requiredsize > PY_SSIZE_T_MAX - incr) -                        goto overflow; -                    requiredsize += incr; -                } -                if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) -                    goto overflow; -                requiredsize += size - collend; -                if (requiredsize > ressize) { -                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) -                        requiredsize = 2*ressize; -                    if (_PyBytes_Resize(&res, requiredsize)) -                        goto onError; -                    str = PyBytes_AS_STRING(res) + respos; -                    ressize = requiredsize; +                    *str++ = (char)(ch - 0xdc00); +                    ++pos;                  } -                /* generate replacement */ -                for (i = collstart; i < collend; ++i) { -                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); -                } -                pos = collend; -                break; +                if (i >= collend) +                    break; +                collstart = pos; +                assert(collstart != collend); +                /* fallback to general error handling */ +              default: -                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, -                                                              encoding, reason, unicode, &exc, -                                                              collstart, collend, &newpos); -                if (repunicode == NULL || (PyUnicode_Check(repunicode) && -                                           PyUnicode_READY(repunicode) == -1)) +                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj, +                                                       encoding, reason, unicode, &exc, +                                                       collstart, collend, &newpos); +                if (rep == NULL)                      goto onError; -                if (PyBytes_Check(repunicode)) { + +                /* substract preallocated bytes */ +                writer.min_size -= 1; + +                if (PyBytes_Check(rep)) {                      /* Directly copy bytes result to output. */ -                    repsize = PyBytes_Size(repunicode); -                    if (repsize > 1) { -                        /* Make room for all additional bytes. */ -                        respos = str - PyBytes_AS_STRING(res); -                        if (ressize > PY_SSIZE_T_MAX - repsize - 1) { -                            Py_DECREF(repunicode); -                            goto overflow; -                        } -                        if (_PyBytes_Resize(&res, ressize+repsize-1)) { -                            Py_DECREF(repunicode); -                            goto onError; -                        } -                        str = PyBytes_AS_STRING(res) + respos; -                        ressize += repsize-1; -                    } -                    memcpy(str, PyBytes_AsString(repunicode), repsize); -                    str += repsize; -                    pos = newpos; -                    Py_DECREF(repunicode); -                    break; -                } -                /* need more space? (at least enough for what we -                   have+the replacement+the rest of the string, so -                   we won't have to check space for encodable characters) */ -                respos = str - PyBytes_AS_STRING(res); -                repsize = PyUnicode_GET_LENGTH(repunicode); -                requiredsize = respos; -                if (requiredsize > PY_SSIZE_T_MAX - repsize) -                    goto overflow; -                requiredsize += repsize; -                if (requiredsize > PY_SSIZE_T_MAX - (size - collend)) -                    goto overflow; -                requiredsize += size - collend; -                if (requiredsize > ressize) { -                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize) -                        requiredsize = 2*ressize; -                    if (_PyBytes_Resize(&res, requiredsize)) { -                        Py_DECREF(repunicode); +                    str = _PyBytesWriter_WriteBytes(&writer, str, +                                                    PyBytes_AS_STRING(rep), +                                                    PyBytes_GET_SIZE(rep)); +                    if (str == NULL)                          goto onError; -                    } -                    str = PyBytes_AS_STRING(res) + respos; -                    ressize = requiredsize;                  } -                /* check if there is anything unencodable in the replacement -                   and copy it to the output */ -                for (i = 0; repsize-->0; ++i, ++str) { -                    c = PyUnicode_READ_CHAR(repunicode, i); -                    if (c >= limit) { -                        raise_encode_exception(&exc, encoding, unicode, -                                               pos, pos+1, reason); -                        Py_DECREF(repunicode); +                else { +                    assert(PyUnicode_Check(rep)); + +                    if (PyUnicode_READY(rep) < 0)                          goto onError; + +                    if (PyUnicode_IS_ASCII(rep)) { +                        /* Fast path: all characters are smaller than limit */ +                        assert(limit >= 128); +                        assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); +                        str = _PyBytesWriter_WriteBytes(&writer, str, +                                                        PyUnicode_DATA(rep), +                                                        PyUnicode_GET_LENGTH(rep)); +                    } +                    else { +                        Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep); + +                        str = _PyBytesWriter_Prepare(&writer, str, repsize); +                        if (str == NULL) +                            goto onError; + +                        /* check if there is anything unencodable in the +                           replacement and copy it to the output */ +                        for (i = 0; repsize-->0; ++i, ++str) { +                            ch = PyUnicode_READ_CHAR(rep, i); +                            if (ch >= limit) { +                                raise_encode_exception(&exc, encoding, unicode, +                                                       pos, pos+1, reason); +                                goto onError; +                            } +                            *str = (char)ch; +                        }                      } -                    *str = (char)c;                  }                  pos = newpos; -                Py_DECREF(repunicode); +                Py_CLEAR(rep);              } + +            /* If overallocation was disabled, ensure that it was the last +               write. Otherwise, we missed an optimization */ +            assert(writer.overallocate || pos == size);          }      } -    /* Resize if we allocated to much */ -    size = str - PyBytes_AS_STRING(res); -    if (size < ressize) { /* If this falls res will be NULL */ -        assert(size >= 0); -        if (_PyBytes_Resize(&res, size) < 0) -            goto onError; -    } -    Py_XDECREF(errorHandler); +    Py_XDECREF(error_handler_obj);      Py_XDECREF(exc); -    return res; - -  overflow: -    PyErr_SetString(PyExc_OverflowError, -                    "encoded result is too long for a Python string"); +    return _PyBytesWriter_Finish(&writer, str);    onError: -    Py_XDECREF(res); -    Py_XDECREF(errorHandler); +    Py_XDECREF(rep); +    _PyBytesWriter_Dealloc(&writer); +    Py_XDECREF(error_handler_obj);      Py_XDECREF(exc);      return NULL;  } @@ -6664,8 +6862,9 @@ PyUnicode_DecodeASCII(const char *s,      Py_ssize_t endinpos;      Py_ssize_t outpos;      const char *e; -    PyObject *errorHandler = NULL; +    PyObject *error_handler_obj = NULL;      PyObject *exc = NULL; +    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;      if (size == 0)          _Py_RETURN_UNICODE_EMPTY(); @@ -6694,12 +6893,42 @@ PyUnicode_DecodeASCII(const char *s,              PyUnicode_WRITE(kind, data, writer.pos, c);              writer.pos++;              ++s; +            continue;          } -        else { + +        /* byte outsize range 0x00..0x7f: call the error handler */ + +        if (error_handler == _Py_ERROR_UNKNOWN) +            error_handler = get_error_handler(errors); + +        switch (error_handler) +        { +        case _Py_ERROR_REPLACE: +        case _Py_ERROR_SURROGATEESCAPE: +            /* Fast-path: the error handler only writes one character, +               but we may switch to UCS2 at the first write */ +            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) +                goto onError; +            kind = writer.kind; +            data = writer.data; + +            if (error_handler == _Py_ERROR_REPLACE) +                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd); +            else +                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00); +            writer.pos++; +            ++s; +            break; + +        case _Py_ERROR_IGNORE: +            ++s; +            break; + +        default:              startinpos = s-starts;              endinpos = startinpos + 1;              if (unicode_decode_call_errorhandler_writer( -                    errors, &errorHandler, +                    errors, &error_handler_obj,                      "ascii", "ordinal not in range(128)",                      &starts, &e, &startinpos, &endinpos, &exc, &s,                      &writer)) @@ -6708,13 +6937,13 @@ PyUnicode_DecodeASCII(const char *s,              data = writer.data;          }      } -    Py_XDECREF(errorHandler); +    Py_XDECREF(error_handler_obj);      Py_XDECREF(exc);      return _PyUnicodeWriter_Finish(&writer);    onError:      _PyUnicodeWriter_Dealloc(&writer); -    Py_XDECREF(errorHandler); +    Py_XDECREF(error_handler_obj);      Py_XDECREF(exc);      return NULL;  } @@ -6769,7 +6998,7 @@ PyUnicode_AsASCIIString(PyObject *unicode)  #  define WC_ERR_INVALID_CHARS 0x0080  #endif -static char* +static const char*  code_page_name(UINT code_page, PyObject **obj)  {      *obj = NULL; @@ -6877,7 +7106,7 @@ decode_code_page_errors(UINT code_page,      PyObject *errorHandler = NULL;      PyObject *exc = NULL;      PyObject *encoding_obj = NULL; -    char *encoding; +    const char *encoding;      DWORD err;      int ret = -1; @@ -7113,7 +7342,6 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,      BOOL usedDefaultChar = FALSE;      BOOL *pusedDefaultChar = &usedDefaultChar;      int outsize; -    PyObject *exc = NULL;      wchar_t *p;      Py_ssize_t size;      const DWORD flags = encode_code_page_flags(code_page, NULL); @@ -7222,7 +7450,7 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes,      PyObject *errorHandler = NULL;      PyObject *exc = NULL;      PyObject *encoding_obj = NULL; -    char *encoding; +    const char *encoding;      Py_ssize_t newpos, newoutsize;      PyObject *rep;      int ret = -1; @@ -8080,7 +8308,7 @@ static int  charmap_encoding_error(      PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,      PyObject **exceptionObject, -    int *known_errorHandler, PyObject **errorHandler, const char *errors, +    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,      PyObject **res, Py_ssize_t *respos)  {      PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ @@ -8127,23 +8355,15 @@ charmap_encoding_error(      }      /* cache callback name lookup       * (if not done yet, i.e. it's the first error) */ -    if (*known_errorHandler==-1) { -        if ((errors==NULL) || (!strcmp(errors, "strict"))) -            *known_errorHandler = 1; -        else if (!strcmp(errors, "replace")) -            *known_errorHandler = 2; -        else if (!strcmp(errors, "ignore")) -            *known_errorHandler = 3; -        else if (!strcmp(errors, "xmlcharrefreplace")) -            *known_errorHandler = 4; -        else -            *known_errorHandler = 0; -    } -    switch (*known_errorHandler) { -    case 1: /* strict */ +    if (*error_handler == _Py_ERROR_UNKNOWN) +        *error_handler = get_error_handler(errors); + +    switch (*error_handler) { +    case _Py_ERROR_STRICT:          raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);          return -1; -    case 2: /* replace */ + +    case _Py_ERROR_REPLACE:          for (collpos = collstartpos; collpos<collendpos; ++collpos) {              x = charmapencode_output('?', mapping, res, respos);              if (x==enc_EXCEPTION) { @@ -8155,10 +8375,11 @@ charmap_encoding_error(              }          }          /* fall through */ -    case 3: /* ignore */ +    case _Py_ERROR_IGNORE:          *inpos = collendpos;          break; -    case 4: /* xmlcharrefreplace */ + +    case _Py_ERROR_XMLCHARREFREPLACE:          /* generate replacement (temporarily (mis)uses p) */          for (collpos = collstartpos; collpos < collendpos; ++collpos) {              char buffer[2+29+1+1]; @@ -8176,8 +8397,9 @@ charmap_encoding_error(          }          *inpos = collendpos;          break; +      default: -        repunicode = unicode_encode_call_errorhandler(errors, errorHandler, +        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,                                                        encoding, reason, unicode, exceptionObject,                                                        collstartpos, collendpos, &newpos);          if (repunicode == NULL) @@ -8240,12 +8462,9 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,      Py_ssize_t size;      /* current output position */      Py_ssize_t respos = 0; -    PyObject *errorHandler = NULL; +    PyObject *error_handler_obj = NULL;      PyObject *exc = NULL; -    /* the following variable is used for caching string comparisons -     * -1=not initialized, 0=unknown, 1=strict, 2=replace, -     * 3=ignore, 4=xmlcharrefreplace */ -    int known_errorHandler = -1; +    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;      void *data;      int kind; @@ -8276,7 +8495,7 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,          if (x==enc_FAILED) { /* unencodable character */              if (charmap_encoding_error(unicode, &inpos, mapping,                                         &exc, -                                       &known_errorHandler, &errorHandler, errors, +                                       &error_handler, &error_handler_obj, errors,                                         &res, &respos)) {                  goto onError;              } @@ -8292,13 +8511,13 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,              goto onError;      Py_XDECREF(exc); -    Py_XDECREF(errorHandler); +    Py_XDECREF(error_handler_obj);      return res;    onError:      Py_XDECREF(res);      Py_XDECREF(exc); -    Py_XDECREF(errorHandler); +    Py_XDECREF(error_handler_obj);      return NULL;  } @@ -8365,7 +8584,7 @@ unicode_translate_call_errorhandler(const char *errors,                                      Py_ssize_t startpos, Py_ssize_t endpos,                                      Py_ssize_t *newpos)  { -    static char *argparse = "O!n;translating error handler must return (str, int) tuple"; +    static const char *argparse = "O!n;translating error handler must return (str, int) tuple";      Py_ssize_t i_newpos;      PyObject *restuple; @@ -8622,7 +8841,7 @@ exit:      return res;  } -PyObject * +static PyObject *  _PyUnicode_TranslateCharmap(PyObject *input,                              PyObject *mapping,                              const char *errors) @@ -8651,10 +8870,8 @@ _PyUnicode_TranslateCharmap(PyObject *input,      kind = PyUnicode_KIND(input);      size = PyUnicode_GET_LENGTH(input); -    if (size == 0) { -        Py_INCREF(input); -        return input; -    } +    if (size == 0) +        return PyUnicode_FromObject(input);      /* allocate enough for a simple 1:1 translation without         replacements, if we need more, we'll resize */ @@ -8765,14 +8982,9 @@ PyUnicode_Translate(PyObject *str,                      PyObject *mapping,                      const char *errors)  { -    PyObject *result; - -    str = PyUnicode_FromObject(str); -    if (str == NULL) +    if (ensure_unicode(str) < 0)          return NULL; -    result = _PyUnicode_TranslateCharmap(str, mapping, errors); -    Py_DECREF(str); -    return result; +    return _PyUnicode_TranslateCharmap(str, mapping, errors);  }  static Py_UCS4 @@ -8954,9 +9166,10 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,      }  static Py_ssize_t -any_find_slice(int direction, PyObject* s1, PyObject* s2, +any_find_slice(PyObject* s1, PyObject* s2,                 Py_ssize_t start, -               Py_ssize_t end) +               Py_ssize_t end, +               int direction)  {      int kind1, kind2;      void *buf1, *buf2; @@ -9125,54 +9338,35 @@ PyUnicode_Count(PyObject *str,                  Py_ssize_t end)  {      Py_ssize_t result; -    PyObject* str_obj; -    PyObject* sub_obj;      int kind1, kind2;      void *buf1 = NULL, *buf2 = NULL;      Py_ssize_t len1, len2; -    str_obj = PyUnicode_FromObject(str); -    if (!str_obj) +    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)          return -1; -    sub_obj = PyUnicode_FromObject(substr); -    if (!sub_obj) { -        Py_DECREF(str_obj); -        return -1; -    } -    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) { -        Py_DECREF(sub_obj); -        Py_DECREF(str_obj); -        return -1; -    } -    kind1 = PyUnicode_KIND(str_obj); -    kind2 = PyUnicode_KIND(sub_obj); -    if (kind1 < kind2) { -        Py_DECREF(sub_obj); -        Py_DECREF(str_obj); +    kind1 = PyUnicode_KIND(str); +    kind2 = PyUnicode_KIND(substr); +    if (kind1 < kind2)          return 0; -    } -    len1 = PyUnicode_GET_LENGTH(str_obj); -    len2 = PyUnicode_GET_LENGTH(sub_obj); +    len1 = PyUnicode_GET_LENGTH(str); +    len2 = PyUnicode_GET_LENGTH(substr);      ADJUST_INDICES(start, end, len1); -    if (end - start < len2) { -        Py_DECREF(sub_obj); -        Py_DECREF(str_obj); +    if (end - start < len2)          return 0; -    } -    buf1 = PyUnicode_DATA(str_obj); -    buf2 = PyUnicode_DATA(sub_obj); +    buf1 = PyUnicode_DATA(str); +    buf2 = PyUnicode_DATA(substr);      if (kind2 != kind1) { -        buf2 = _PyUnicode_AsKind(sub_obj, kind1); +        buf2 = _PyUnicode_AsKind(substr, kind1);          if (!buf2)              goto onError;      }      switch (kind1) {      case PyUnicode_1BYTE_KIND: -        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) +        if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))              result = asciilib_count(                  ((Py_UCS1*)buf1) + start, end - start,                  buf2, len2, PY_SSIZE_T_MAX @@ -9199,16 +9393,11 @@ PyUnicode_Count(PyObject *str,          assert(0); result = 0;      } -    Py_DECREF(sub_obj); -    Py_DECREF(str_obj); -      if (kind2 != kind1)          PyMem_Free(buf2);      return result;    onError: -    Py_DECREF(sub_obj); -    Py_DECREF(str_obj);      if (kind2 != kind1 && buf2)          PyMem_Free(buf2);      return -1; @@ -9216,35 +9405,15 @@ PyUnicode_Count(PyObject *str,  Py_ssize_t  PyUnicode_Find(PyObject *str, -               PyObject *sub, +               PyObject *substr,                 Py_ssize_t start,                 Py_ssize_t end,                 int direction)  { -    Py_ssize_t result; - -    str = PyUnicode_FromObject(str); -    if (!str) -        return -2; -    sub = PyUnicode_FromObject(sub); -    if (!sub) { -        Py_DECREF(str); +    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)          return -2; -    } -    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) { -        Py_DECREF(sub); -        Py_DECREF(str); -        return -2; -    } -    result = any_find_slice(direction, -        str, sub, start, end -        ); - -    Py_DECREF(str); -    Py_DECREF(sub); - -    return result; +    return any_find_slice(str, substr, start, end, direction);  }  Py_ssize_t @@ -9347,22 +9516,10 @@ PyUnicode_Tailmatch(PyObject *str,                      Py_ssize_t end,                      int direction)  { -    Py_ssize_t result; - -    str = PyUnicode_FromObject(str); -    if (str == NULL) -        return -1; -    substr = PyUnicode_FromObject(substr); -    if (substr == NULL) { -        Py_DECREF(str); +    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)          return -1; -    } -    result = tailmatch(str, substr, -                       start, end, direction); -    Py_DECREF(str); -    Py_DECREF(substr); -    return result; +    return tailmatch(str, substr, start, end, direction);  }  /* Apply fixfct filter to the Unicode object self and return a @@ -9968,13 +10125,8 @@ PyUnicode_Splitlines(PyObject *string, int keepends)  {      PyObject *list; -    string = PyUnicode_FromObject(string); -    if (string == NULL) +    if (ensure_unicode(string) < 0)          return NULL; -    if (PyUnicode_READY(string) == -1) { -        Py_DECREF(string); -        return NULL; -    }      switch (PyUnicode_KIND(string)) {      case PyUnicode_1BYTE_KIND: @@ -10001,7 +10153,6 @@ PyUnicode_Splitlines(PyObject *string, int keepends)          assert(0);          list = 0;      } -    Py_DECREF(string);      return list;  } @@ -10562,28 +10713,27 @@ unicode_casefold(PyObject *self)  } -/* Argument converter.  Coerces to a single unicode character */ +/* Argument converter. Accepts a single Unicode character. */  static int  convert_uc(PyObject *obj, void *addr)  {      Py_UCS4 *fillcharloc = (Py_UCS4 *)addr; -    PyObject *uniobj; -    uniobj = PyUnicode_FromObject(obj); -    if (uniobj == NULL) { -        PyErr_SetString(PyExc_TypeError, -                        "The fill character cannot be converted to Unicode"); +    if (!PyUnicode_Check(obj)) { +        PyErr_Format(PyExc_TypeError, +                     "The fill character must be a unicode character, " +                     "not %.100s", Py_TYPE(obj)->tp_name);          return 0;      } -    if (PyUnicode_GET_LENGTH(uniobj) != 1) { +    if (PyUnicode_READY(obj) < 0) +        return 0; +    if (PyUnicode_GET_LENGTH(obj) != 1) {          PyErr_SetString(PyExc_TypeError,                          "The fill character must be exactly one character long"); -        Py_DECREF(uniobj);          return 0;      } -    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0); -    Py_DECREF(uniobj); +    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);      return 1;  } @@ -10899,59 +11049,49 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)  }  int -PyUnicode_Contains(PyObject *container, PyObject *element) +_PyUnicode_EQ(PyObject *aa, PyObject *bb) +{ +    return unicode_eq(aa, bb); +} + +int +PyUnicode_Contains(PyObject *str, PyObject *substr)  { -    PyObject *str, *sub;      int kind1, kind2;      void *buf1, *buf2;      Py_ssize_t len1, len2;      int result; -    /* Coerce the two arguments */ -    sub = PyUnicode_FromObject(element); -    if (!sub) { +    if (!PyUnicode_Check(substr)) {          PyErr_Format(PyExc_TypeError, -                     "'in <string>' requires string as left operand, not %s", -                     element->ob_type->tp_name); +                     "'in <string>' requires string as left operand, not %.100s", +                     Py_TYPE(substr)->tp_name);          return -1;      } - -    str = PyUnicode_FromObject(container); -    if (!str) { -        Py_DECREF(sub); +    if (PyUnicode_READY(substr) == -1) +        return -1; +    if (ensure_unicode(str) < 0)          return -1; -    }      kind1 = PyUnicode_KIND(str); -    kind2 = PyUnicode_KIND(sub); -    if (kind1 < kind2) { -        Py_DECREF(sub); -        Py_DECREF(str); +    kind2 = PyUnicode_KIND(substr); +    if (kind1 < kind2)          return 0; -    }      len1 = PyUnicode_GET_LENGTH(str); -    len2 = PyUnicode_GET_LENGTH(sub); -    if (len1 < len2) { -        Py_DECREF(sub); -        Py_DECREF(str); +    len2 = PyUnicode_GET_LENGTH(substr); +    if (len1 < len2)          return 0; -    }      buf1 = PyUnicode_DATA(str); -    buf2 = PyUnicode_DATA(sub); +    buf2 = PyUnicode_DATA(substr);      if (len2 == 1) {          Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);          result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1; -        Py_DECREF(sub); -        Py_DECREF(str);          return result;      }      if (kind2 != kind1) { -        buf2 = _PyUnicode_AsKind(sub, kind1); -        if (!buf2) { -            Py_DECREF(sub); -            Py_DECREF(str); +        buf2 = _PyUnicode_AsKind(substr, kind1); +        if (!buf2)              return -1; -        }      }      switch (kind1) { @@ -10969,9 +11109,6 @@ PyUnicode_Contains(PyObject *container, PyObject *element)          assert(0);      } -    Py_DECREF(str); -    Py_DECREF(sub); -      if (kind2 != kind1)          PyMem_Free(buf2); @@ -10983,56 +11120,40 @@ PyUnicode_Contains(PyObject *container, PyObject *element)  PyObject *  PyUnicode_Concat(PyObject *left, PyObject *right)  { -    PyObject *u = NULL, *v = NULL, *w; +    PyObject *result;      Py_UCS4 maxchar, maxchar2; -    Py_ssize_t u_len, v_len, new_len; +    Py_ssize_t left_len, right_len, new_len; -    /* Coerce the two arguments */ -    u = PyUnicode_FromObject(left); -    if (u == NULL) -        goto onError; -    v = PyUnicode_FromObject(right); -    if (v == NULL) -        goto onError; +    if (ensure_unicode(left) < 0 || ensure_unicode(right) < 0) +        return NULL;      /* Shortcuts */ -    if (v == unicode_empty) { -        Py_DECREF(v); -        return u; -    } -    if (u == unicode_empty) { -        Py_DECREF(u); -        return v; -    } +    if (left == unicode_empty) +        return PyUnicode_FromObject(right); +    if (right == unicode_empty) +        return PyUnicode_FromObject(left); -    u_len = PyUnicode_GET_LENGTH(u); -    v_len = PyUnicode_GET_LENGTH(v); -    if (u_len > PY_SSIZE_T_MAX - v_len) { +    left_len = PyUnicode_GET_LENGTH(left); +    right_len = PyUnicode_GET_LENGTH(right); +    if (left_len > PY_SSIZE_T_MAX - right_len) {          PyErr_SetString(PyExc_OverflowError,                          "strings are too large to concat"); -        goto onError; +        return NULL;      } -    new_len = u_len + v_len; +    new_len = left_len + right_len; -    maxchar = PyUnicode_MAX_CHAR_VALUE(u); -    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); +    maxchar = PyUnicode_MAX_CHAR_VALUE(left); +    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);      maxchar = Py_MAX(maxchar, maxchar2);      /* Concat the two Unicode strings */ -    w = PyUnicode_New(new_len, maxchar); -    if (w == NULL) -        goto onError; -    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len); -    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len); -    Py_DECREF(u); -    Py_DECREF(v); -    assert(_PyUnicode_CheckConsistency(w, 1)); -    return w; - -  onError: -    Py_XDECREF(u); -    Py_XDECREF(v); -    return NULL; +    result = PyUnicode_New(new_len, maxchar); +    if (result == NULL) +        return NULL; +    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len); +    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len); +    assert(_PyUnicode_CheckConsistency(result, 1)); +    return result;  }  void @@ -11123,6 +11244,25 @@ PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)      Py_XDECREF(right);  } +/* +Wraps stringlib_parse_args_finds() and additionally ensures that the +first argument is a unicode object. +*/ + +Py_LOCAL_INLINE(int) +parse_args_finds_unicode(const char * function_name, PyObject *args, +                         PyObject **substring, +                         Py_ssize_t *start, Py_ssize_t *end) +{ +    if(stringlib_parse_args_finds(function_name, args, substring, +                                  start, end)) { +        if (ensure_unicode(*substring) < 0) +            return 0; +        return 1; +    } +    return 0; +} +  PyDoc_STRVAR(count__doc__,               "S.count(sub[, start[, end]]) -> int\n\  \n\ @@ -11141,31 +11281,26 @@ unicode_count(PyObject *self, PyObject *args)      void *buf1, *buf2;      Py_ssize_t len1, len2, iresult; -    if (!stringlib_parse_args_finds_unicode("count", args, &substring, -                                            &start, &end)) +    if (!parse_args_finds_unicode("count", args, &substring, &start, &end))          return NULL;      kind1 = PyUnicode_KIND(self);      kind2 = PyUnicode_KIND(substring); -    if (kind1 < kind2) { -        Py_DECREF(substring); +    if (kind1 < kind2)          return PyLong_FromLong(0); -    } +      len1 = PyUnicode_GET_LENGTH(self);      len2 = PyUnicode_GET_LENGTH(substring);      ADJUST_INDICES(start, end, len1); -    if (end - start < len2) { -        Py_DECREF(substring); +    if (end - start < len2)          return PyLong_FromLong(0); -    } +      buf1 = PyUnicode_DATA(self);      buf2 = PyUnicode_DATA(substring);      if (kind2 != kind1) {          buf2 = _PyUnicode_AsKind(substring, kind1); -        if (!buf2) { -            Py_DECREF(substring); +        if (!buf2)              return NULL; -        }      }      switch (kind1) {      case PyUnicode_1BYTE_KIND: @@ -11195,8 +11330,6 @@ unicode_count(PyObject *self, PyObject *args)      if (kind2 != kind1)          PyMem_Free(buf2); -    Py_DECREF(substring); -      return result;  } @@ -11330,22 +11463,13 @@ unicode_find(PyObject *self, PyObject *args)      Py_ssize_t end = 0;      Py_ssize_t result; -    if (!stringlib_parse_args_finds_unicode("find", args, &substring, -                                            &start, &end)) +    if (!parse_args_finds_unicode("find", args, &substring, &start, &end))          return NULL; -    if (PyUnicode_READY(self) == -1) { -        Py_DECREF(substring); -        return NULL; -    } -    if (PyUnicode_READY(substring) == -1) { -        Py_DECREF(substring); +    if (PyUnicode_READY(self) == -1)          return NULL; -    } -    result = any_find_slice(1, self, substring, start, end); - -    Py_DECREF(substring); +    result = any_find_slice(self, substring, start, end, 1);      if (result == -2)          return NULL; @@ -11418,22 +11542,13 @@ unicode_index(PyObject *self, PyObject *args)      Py_ssize_t start = 0;      Py_ssize_t end = 0; -    if (!stringlib_parse_args_finds_unicode("index", args, &substring, -                                            &start, &end)) +    if (!parse_args_finds_unicode("index", args, &substring, &start, &end))          return NULL; -    if (PyUnicode_READY(self) == -1) { -        Py_DECREF(substring); -        return NULL; -    } -    if (PyUnicode_READY(substring) == -1) { -        Py_DECREF(substring); +    if (PyUnicode_READY(self) == -1)          return NULL; -    } -    result = any_find_slice(1, self, substring, start, end); - -    Py_DECREF(substring); +    result = any_find_slice(self, substring, start, end, 1);      if (result == -2)          return NULL; @@ -11947,7 +12062,7 @@ unicode_lower(PyObject *self)  #define BOTHSTRIP 2  /* Arrays indexed by above */ -static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; +static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};  #define STRIPNAME(i) (stripformat[i]+3) @@ -12242,40 +12357,15 @@ unicode_repeat(PyObject *str, Py_ssize_t len)  }  PyObject * -PyUnicode_Replace(PyObject *obj, -                  PyObject *subobj, -                  PyObject *replobj, +PyUnicode_Replace(PyObject *str, +                  PyObject *substr, +                  PyObject *replstr,                    Py_ssize_t maxcount)  { -    PyObject *self; -    PyObject *str1; -    PyObject *str2; -    PyObject *result; - -    self = PyUnicode_FromObject(obj); -    if (self == NULL) +    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 || +            ensure_unicode(replstr) < 0)          return NULL; -    str1 = PyUnicode_FromObject(subobj); -    if (str1 == NULL) { -        Py_DECREF(self); -        return NULL; -    } -    str2 = PyUnicode_FromObject(replobj); -    if (str2 == NULL) { -        Py_DECREF(self); -        Py_DECREF(str1); -        return NULL; -    } -    if (PyUnicode_READY(self) == -1 || -        PyUnicode_READY(str1) == -1 || -        PyUnicode_READY(str2) == -1) -        result = NULL; -    else -        result = replace(self, str1, str2, maxcount); -    Py_DECREF(self); -    Py_DECREF(str1); -    Py_DECREF(str2); -    return result; +    return replace(str, substr, replstr, maxcount);  }  PyDoc_STRVAR(replace__doc__, @@ -12291,28 +12381,12 @@ unicode_replace(PyObject *self, PyObject *args)      PyObject *str1;      PyObject *str2;      Py_ssize_t maxcount = -1; -    PyObject *result; -    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) +    if (!PyArg_ParseTuple(args, "UU|n:replace", &str1, &str2, &maxcount))          return NULL;      if (PyUnicode_READY(self) == -1)          return NULL; -    str1 = PyUnicode_FromObject(str1); -    if (str1 == NULL) -        return NULL; -    str2 = PyUnicode_FromObject(str2); -    if (str2 == NULL) { -        Py_DECREF(str1); -        return NULL; -    } -    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1) -        result = NULL; -    else -        result = replace(self, str1, str2, maxcount); - -    Py_DECREF(str1); -    Py_DECREF(str2); -    return result; +    return replace(self, str1, str2, maxcount);  }  static PyObject * @@ -12497,22 +12571,13 @@ unicode_rfind(PyObject *self, PyObject *args)      Py_ssize_t end = 0;      Py_ssize_t result; -    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring, -                                            &start, &end)) +    if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))          return NULL; -    if (PyUnicode_READY(self) == -1) { -        Py_DECREF(substring); -        return NULL; -    } -    if (PyUnicode_READY(substring) == -1) { -        Py_DECREF(substring); +    if (PyUnicode_READY(self) == -1)          return NULL; -    } -    result = any_find_slice(-1, self, substring, start, end); - -    Py_DECREF(substring); +    result = any_find_slice(self, substring, start, end, -1);      if (result == -2)          return NULL; @@ -12534,22 +12599,13 @@ unicode_rindex(PyObject *self, PyObject *args)      Py_ssize_t end = 0;      Py_ssize_t result; -    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring, -                                            &start, &end)) +    if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))          return NULL; -    if (PyUnicode_READY(self) == -1) { -        Py_DECREF(substring); -        return NULL; -    } -    if (PyUnicode_READY(substring) == -1) { -        Py_DECREF(substring); +    if (PyUnicode_READY(self) == -1)          return NULL; -    } - -    result = any_find_slice(-1, self, substring, start, end); -    Py_DECREF(substring); +    result = any_find_slice(self, substring, start, end, -1);      if (result == -2)          return NULL; @@ -12589,24 +12645,10 @@ unicode_rjust(PyObject *self, PyObject *args)  PyObject *  PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)  { -    PyObject *result; - -    s = PyUnicode_FromObject(s); -    if (s == NULL) +    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))          return NULL; -    if (sep != NULL) { -        sep = PyUnicode_FromObject(sep); -        if (sep == NULL) { -            Py_DECREF(s); -            return NULL; -        } -    } - -    result = split(s, sep, maxsplit); -    Py_DECREF(s); -    Py_XDECREF(sep); -    return result; +    return split(s, sep, maxsplit);  }  PyDoc_STRVAR(split__doc__, @@ -12631,35 +12673,26 @@ unicode_split(PyObject *self, PyObject *args, PyObject *kwds)      if (substring == Py_None)          return split(self, NULL, maxcount); -    else if (PyUnicode_Check(substring)) + +    if (PyUnicode_Check(substring))          return split(self, substring, maxcount); -    else -        return PyUnicode_Split(self, substring, maxcount); + +    PyErr_Format(PyExc_TypeError, +                 "must be str or None, not %.100s", +                 Py_TYPE(substring)->tp_name); +    return NULL;  }  PyObject * -PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) +PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)  { -    PyObject* str_obj; -    PyObject* sep_obj;      PyObject* out;      int kind1, kind2;      void *buf1, *buf2;      Py_ssize_t len1, len2; -    str_obj = PyUnicode_FromObject(str_in); -    if (!str_obj) +    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)          return NULL; -    sep_obj = PyUnicode_FromObject(sep_in); -    if (!sep_obj) { -        Py_DECREF(str_obj); -        return NULL; -    } -    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) { -        Py_DECREF(sep_obj); -        Py_DECREF(str_obj); -        return NULL; -    }      kind1 = PyUnicode_KIND(str_obj);      kind2 = PyUnicode_KIND(sep_obj); @@ -12673,8 +12706,6 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)              out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);              Py_DECREF(unicode_empty);          } -        Py_DECREF(sep_obj); -        Py_DECREF(str_obj);          return out;      }      buf1 = PyUnicode_DATA(str_obj); @@ -12682,7 +12713,7 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)      if (kind2 != kind1) {          buf2 = _PyUnicode_AsKind(sep_obj, kind1);          if (!buf2) -            goto onError; +            return NULL;      }      switch (kind1) { @@ -12703,39 +12734,23 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)          out = 0;      } -    Py_DECREF(sep_obj); -    Py_DECREF(str_obj);      if (kind2 != kind1)          PyMem_Free(buf2);      return out; -  onError: -    Py_DECREF(sep_obj); -    Py_DECREF(str_obj); -    if (kind2 != kind1 && buf2) -        PyMem_Free(buf2); -    return NULL;  }  PyObject * -PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) +PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)  { -    PyObject* str_obj; -    PyObject* sep_obj;      PyObject* out;      int kind1, kind2;      void *buf1, *buf2;      Py_ssize_t len1, len2; -    str_obj = PyUnicode_FromObject(str_in); -    if (!str_obj) -        return NULL; -    sep_obj = PyUnicode_FromObject(sep_in); -    if (!sep_obj) { -        Py_DECREF(str_obj); +    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)          return NULL; -    }      kind1 = PyUnicode_KIND(str_obj);      kind2 = PyUnicode_KIND(sep_obj); @@ -12749,8 +12764,6 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)              out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);              Py_DECREF(unicode_empty);          } -        Py_DECREF(sep_obj); -        Py_DECREF(str_obj);          return out;      }      buf1 = PyUnicode_DATA(str_obj); @@ -12758,7 +12771,7 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)      if (kind2 != kind1) {          buf2 = _PyUnicode_AsKind(sep_obj, kind1);          if (!buf2) -            goto onError; +            return NULL;      }      switch (kind1) { @@ -12779,18 +12792,10 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)          out = 0;      } -    Py_DECREF(sep_obj); -    Py_DECREF(str_obj);      if (kind2 != kind1)          PyMem_Free(buf2);      return out; -  onError: -    Py_DECREF(sep_obj); -    Py_DECREF(str_obj); -    if (kind2 != kind1 && buf2) -        PyMem_Free(buf2); -    return NULL;  }  PyDoc_STRVAR(partition__doc__, @@ -12822,24 +12827,10 @@ unicode_rpartition(PyObject *self, PyObject *separator)  PyObject *  PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)  { -    PyObject *result; - -    s = PyUnicode_FromObject(s); -    if (s == NULL) +    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))          return NULL; -    if (sep != NULL) { -        sep = PyUnicode_FromObject(sep); -        if (sep == NULL) { -            Py_DECREF(s); -            return NULL; -        } -    } -    result = rsplit(s, sep, maxsplit); - -    Py_DECREF(s); -    Py_XDECREF(sep); -    return result; +    return rsplit(s, sep, maxsplit);  }  PyDoc_STRVAR(rsplit__doc__, @@ -12864,10 +12855,14 @@ unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)      if (substring == Py_None)          return rsplit(self, NULL, maxcount); -    else if (PyUnicode_Check(substring)) + +    if (PyUnicode_Check(substring))          return rsplit(self, substring, maxcount); -    else -        return PyUnicode_RSplit(self, substring, maxcount); + +    PyErr_Format(PyExc_TypeError, +                 "must be str or None, not %.100s", +                 Py_TYPE(substring)->tp_name); +    return NULL;  }  PyDoc_STRVAR(splitlines__doc__, @@ -13148,11 +13143,15 @@ unicode_startswith(PyObject *self,      if (PyTuple_Check(subobj)) {          Py_ssize_t i;          for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { -            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i)); -            if (substring == NULL) +            substring = PyTuple_GET_ITEM(subobj, i); +            if (!PyUnicode_Check(substring)) { +                PyErr_Format(PyExc_TypeError, +                             "tuple for startswith must only contain str, " +                             "not %.100s", +                             Py_TYPE(substring)->tp_name);                  return NULL; +            }              result = tailmatch(self, substring, start, end, -1); -            Py_DECREF(substring);              if (result == -1)                  return NULL;              if (result) { @@ -13162,15 +13161,13 @@ unicode_startswith(PyObject *self,          /* nothing matched */          Py_RETURN_FALSE;      } -    substring = PyUnicode_FromObject(subobj); -    if (substring == NULL) { -        if (PyErr_ExceptionMatches(PyExc_TypeError)) -            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or " -                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); +    if (!PyUnicode_Check(subobj)) { +        PyErr_Format(PyExc_TypeError, +                     "startswith first arg must be str or " +                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);          return NULL;      } -    result = tailmatch(self, substring, start, end, -1); -    Py_DECREF(substring); +    result = tailmatch(self, subobj, start, end, -1);      if (result == -1)          return NULL;      return PyBool_FromLong(result); @@ -13200,12 +13197,15 @@ unicode_endswith(PyObject *self,      if (PyTuple_Check(subobj)) {          Py_ssize_t i;          for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { -            substring = PyUnicode_FromObject( -                PyTuple_GET_ITEM(subobj, i)); -            if (substring == NULL) +            substring = PyTuple_GET_ITEM(subobj, i); +            if (!PyUnicode_Check(substring)) { +                PyErr_Format(PyExc_TypeError, +                             "tuple for endswith must only contain str, " +                             "not %.100s", +                             Py_TYPE(substring)->tp_name);                  return NULL; +            }              result = tailmatch(self, substring, start, end, +1); -            Py_DECREF(substring);              if (result == -1)                  return NULL;              if (result) { @@ -13214,15 +13214,13 @@ unicode_endswith(PyObject *self,          }          Py_RETURN_FALSE;      } -    substring = PyUnicode_FromObject(subobj); -    if (substring == NULL) { -        if (PyErr_ExceptionMatches(PyExc_TypeError)) -            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or " -                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name); +    if (!PyUnicode_Check(subobj)) { +        PyErr_Format(PyExc_TypeError, +                     "endswith first arg must be str or " +                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);          return NULL;      } -    result = tailmatch(self, substring, start, end, +1); -    Py_DECREF(substring); +    result = tailmatch(self, subobj, start, end, +1);      if (result == -1)          return NULL;      return PyBool_FromLong(result); @@ -13231,44 +13229,50 @@ unicode_endswith(PyObject *self,  Py_LOCAL_INLINE(void)  _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)  { -    if (!writer->readonly) +    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); +    writer->data = PyUnicode_DATA(writer->buffer); + +    if (!writer->readonly) { +        writer->kind = PyUnicode_KIND(writer->buffer);          writer->size = PyUnicode_GET_LENGTH(writer->buffer); +    }      else { +        /* use a value smaller than PyUnicode_1BYTE_KIND() so +           _PyUnicodeWriter_PrepareKind() will copy the buffer. */ +        writer->kind = PyUnicode_WCHAR_KIND; +        assert(writer->kind <= PyUnicode_1BYTE_KIND); +          /* Copy-on-write mode: set buffer size to 0 so           * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on           * next write. */          writer->size = 0;      } -    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); -    writer->data = PyUnicode_DATA(writer->buffer); -    writer->kind = PyUnicode_KIND(writer->buffer);  }  void  _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)  {      memset(writer, 0, sizeof(*writer)); -#ifdef Py_DEBUG -    writer->kind = 5;    /* invalid kind */ -#endif + +    /* ASCII is the bare minimum */      writer->min_char = 127; + +    /* use a value smaller than PyUnicode_1BYTE_KIND() so +       _PyUnicodeWriter_PrepareKind() will copy the buffer. */ +    writer->kind = PyUnicode_WCHAR_KIND; +    assert(writer->kind <= PyUnicode_1BYTE_KIND);  }  int  _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,                                   Py_ssize_t length, Py_UCS4 maxchar)  { -#ifdef MS_WINDOWS -   /* On Windows, overallocate by 50% is the best factor */ -#  define OVERALLOCATE_FACTOR 2 -#else -   /* On Linux, overallocate by 25% is the best factor */ -#  define OVERALLOCATE_FACTOR 4 -#endif      Py_ssize_t newlen;      PyObject *newbuffer; -    assert(length > 0); +    /* ensure that the _PyUnicodeWriter_Prepare macro was used */ +    assert((maxchar > writer->maxchar && length >= 0) +           || length > 0);      if (length > PY_SSIZE_T_MAX - writer->pos) {          PyErr_NoMemory(); @@ -13334,6 +13338,28 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,  #undef OVERALLOCATE_FACTOR  } +int +_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer, +                                     enum PyUnicode_Kind kind) +{ +    Py_UCS4 maxchar; + +    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */ +    assert(writer->kind < kind); + +    switch (kind) +    { +    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break; +    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break; +    case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break; +    default: +        assert(0 && "invalid kind"); +        return -1; +    } + +    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar); +} +  Py_LOCAL_INLINE(int)  _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)  { @@ -13504,17 +13530,26 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)          assert(PyUnicode_GET_LENGTH(str) == writer->pos);          return str;      } -    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) { -        PyObject *newbuffer; -        newbuffer = resize_compact(writer->buffer, writer->pos); -        if (newbuffer == NULL) { -            Py_CLEAR(writer->buffer); -            return NULL; +    if (writer->pos == 0) { +        Py_CLEAR(writer->buffer); + +        /* Get the empty Unicode string singleton ('') */ +        _Py_INCREF_UNICODE_EMPTY(); +        str  = unicode_empty; +    } +    else { +        str = writer->buffer; +        writer->buffer = NULL; + +        if (PyUnicode_GET_LENGTH(str) != writer->pos) { +            PyObject *str2; +            str2 = resize_compact(str, writer->pos); +            if (str2 == NULL) +                return NULL; +            str = str2;          } -        writer->buffer = newbuffer;      } -    str = writer->buffer; -    writer->buffer = NULL; +      assert(_PyUnicode_CheckConsistency(str, 1));      return unicode_result_ready(str);  } @@ -14655,13 +14690,10 @@ PyUnicode_Format(PyObject *format, PyObject *args)          return NULL;      } -    ctx.fmtstr = PyUnicode_FromObject(format); -    if (ctx.fmtstr == NULL) -        return NULL; -    if (PyUnicode_READY(ctx.fmtstr) == -1) { -        Py_DECREF(ctx.fmtstr); +    if (ensure_unicode(format) < 0)          return NULL; -    } + +    ctx.fmtstr = format;      ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);      ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);      ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); @@ -14721,11 +14753,9 @@ PyUnicode_Format(PyObject *format, PyObject *args)      if (ctx.args_owned) {          Py_DECREF(ctx.args);      } -    Py_DECREF(ctx.fmtstr);      return _PyUnicodeWriter_Finish(&ctx.writer);    onError: -    Py_DECREF(ctx.fmtstr);      _PyUnicodeWriter_Dealloc(&ctx.writer);      if (ctx.args_owned) {          Py_DECREF(ctx.args); | 
