diff options
Diffstat (limited to 'Objects/unicodeobject.c')
| -rw-r--r-- | Objects/unicodeobject.c | 235 | 
1 files changed, 98 insertions, 137 deletions
| diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 2cd9cbfa00..8f6f6c675f 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1029,8 +1029,7 @@ resize_copy(PyObject *unicode, Py_ssize_t length)      if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {          PyObject *copy; -        if (PyUnicode_READY(unicode) == -1) -            return NULL; +        assert(PyUnicode_IS_READY(unicode));          copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));          if (copy == NULL) @@ -1974,14 +1973,11 @@ unicode_char(Py_UCS4 ch)      unicode = PyUnicode_New(1, ch);      if (unicode == NULL)          return NULL; -    switch (PyUnicode_KIND(unicode)) { -    case PyUnicode_1BYTE_KIND: -        PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch; -        break; -    case PyUnicode_2BYTE_KIND: + +    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND); +    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {          PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch; -        break; -    default: +    } else {          assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);          PyUnicode_4BYTE_DATA(unicode)[0] = ch;      } @@ -1992,12 +1988,32 @@ unicode_char(Py_UCS4 ch)  PyObject *  PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)  { +    if (u == NULL) +        return (PyObject*)_PyUnicode_New(size); + +    if (size < 0) { +        PyErr_BadInternalCall(); +        return NULL; +    } + +    return PyUnicode_FromWideChar(u, size); +} + +PyObject * +PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) +{      PyObject *unicode;      Py_UCS4 maxchar = 0;      Py_ssize_t num_surrogates; -    if (u == NULL) -        return (PyObject*)_PyUnicode_New(size); +    if (u == NULL && size != 0) { +        PyErr_BadInternalCall(); +        return NULL; +    } + +    if (size == -1) { +        size = wcslen(u); +    }      /* If the Unicode data is known at construction time, we can apply         some optimizations which share commonly used objects. */ @@ -2482,27 +2498,6 @@ PyUnicode_AsUCS4Copy(PyObject *string)      return as_ucs4(string, NULL, 0, 1);  } -#ifdef HAVE_WCHAR_H - -PyObject * -PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size) -{ -    if (w == NULL) { -        if (size == 0) -            _Py_RETURN_UNICODE_EMPTY(); -        PyErr_BadInternalCall(); -        return NULL; -    } - -    if (size == -1) { -        size = wcslen(w); -    } - -    return PyUnicode_FromUnicode(w, size); -} - -#endif /* HAVE_WCHAR_H */ -  /* maximum number of characters required for output of %lld or %p.     We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,     plus 1 for the sign.  53/22 is an upper bound for log10(256). */ @@ -3300,7 +3295,7 @@ PyUnicode_Encode(const Py_UNICODE *s,  {      PyObject *v, *unicode; -    unicode = PyUnicode_FromUnicode(s, size); +    unicode = PyUnicode_FromWideChar(s, size);      if (unicode == NULL)          return NULL;      v = PyUnicode_AsEncodedString(unicode, encoding, errors); @@ -3412,11 +3407,9 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)  {      Py_ssize_t wlen, wlen2;      wchar_t *wstr; -    PyObject *bytes = NULL;      char *errmsg; -    PyObject *reason = NULL; -    PyObject *exc; -    size_t error_pos; +    PyObject *bytes, *reason, *exc; +    size_t error_pos, errlen;      int surrogateescape;      if (locale_error_handler(errors, &surrogateescape) < 0) @@ -3471,6 +3464,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)          len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);          if (len2 == (size_t)-1 || len2 > len) { +            Py_DECREF(bytes);              error_pos = (size_t)-1;              goto encode_error;          } @@ -3486,17 +3480,15 @@ encode_error:          error_pos = wcstombs_errorpos(wstr);      PyMem_Free(wstr); -    Py_XDECREF(bytes); - -    if (errmsg != NULL) { -        size_t errlen; -        wstr = Py_DecodeLocale(errmsg, &errlen); -        if (wstr != NULL) { -            reason = PyUnicode_FromWideChar(wstr, errlen); -            PyMem_RawFree(wstr); -        } else -            errmsg = NULL; + +    wstr = Py_DecodeLocale(errmsg, &errlen); +    if (wstr != NULL) { +        reason = PyUnicode_FromWideChar(wstr, errlen); +        PyMem_RawFree(wstr); +    } else { +        errmsg = NULL;      } +      if (errmsg == NULL)          reason = PyUnicode_FromString(              "wcstombs() encountered an unencodable " @@ -3512,7 +3504,7 @@ encode_error:      Py_DECREF(reason);      if (exc != NULL) {          PyCodec_StrictErrors(exc); -        Py_XDECREF(exc); +        Py_DECREF(exc);      }      return NULL;  } @@ -3719,10 +3711,9 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,      size_t wlen, wlen2;      PyObject *unicode;      int surrogateescape; -    size_t error_pos; +    size_t error_pos, errlen;      char *errmsg; -    PyObject *reason = NULL;   /* initialize to prevent gcc warning */ -    PyObject *exc; +    PyObject *exc, *reason = NULL;   /* initialize to prevent gcc warning */      if (locale_error_handler(errors, &surrogateescape) < 0)          return NULL; @@ -3780,19 +3771,16 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,      return unicode;  decode_error: -    reason = NULL;      errmsg = strerror(errno);      assert(errmsg != NULL);      error_pos = mbstowcs_errorpos(str, len); -    if (errmsg != NULL) { -        size_t errlen; -        wstr = Py_DecodeLocale(errmsg, &errlen); -        if (wstr != NULL) { -            reason = PyUnicode_FromWideChar(wstr, errlen); -            PyMem_RawFree(wstr); -        } +    wstr = Py_DecodeLocale(errmsg, &errlen); +    if (wstr != NULL) { +        reason = PyUnicode_FromWideChar(wstr, errlen); +        PyMem_RawFree(wstr);      } +      if (reason == NULL)          reason = PyUnicode_FromString(              "mbstowcs() encountered an invalid multibyte sequence"); @@ -3807,7 +3795,7 @@ decode_error:      Py_DECREF(reason);      if (exc != NULL) {          PyCodec_StrictErrors(exc); -        Py_XDECREF(exc); +        Py_DECREF(exc);      }      return NULL;  } @@ -4140,7 +4128,11 @@ PyUnicode_GetSize(PyObject *unicode)          PyErr_BadArgument();          goto onError;      } -    return PyUnicode_GET_SIZE(unicode); +    if (_PyUnicode_WSTR(unicode) == NULL) { +        if (PyUnicode_AsUnicode(unicode) == NULL) +            goto onError; +    } +    return PyUnicode_WSTR_LENGTH(unicode);    onError:      return -1; @@ -4248,7 +4240,7 @@ unicode_decode_call_errorhandler_wchar(      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,      PyObject **output, Py_ssize_t *outpos)  { -    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple"; +    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";      PyObject *restuple = NULL;      PyObject *repunicode = NULL; @@ -4277,14 +4269,14 @@ unicode_decode_call_errorhandler_wchar(      if (*exceptionObject == NULL)          goto onError; -    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); +    restuple = _PyObject_CallArg1(*errorHandler, *exceptionObject);      if (restuple == NULL)          goto onError;      if (!PyTuple_Check(restuple)) { -        PyErr_SetString(PyExc_TypeError, &argparse[4]); +        PyErr_SetString(PyExc_TypeError, &argparse[3]);          goto onError;      } -    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) +    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))          goto onError;      /* Copy back the bytes variables, which might have been modified by the @@ -4292,9 +4284,6 @@ unicode_decode_call_errorhandler_wchar(      inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);      if (!inputobj)          goto onError; -    if (!PyBytes_Check(inputobj)) { -        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); -    }      *input = PyBytes_AS_STRING(inputobj);      insize = PyBytes_GET_SIZE(inputobj);      *inend = *input + insize; @@ -4335,7 +4324,7 @@ unicode_decode_call_errorhandler_wchar(      *inptr = *input + newpos;      /* we made it! */ -    Py_XDECREF(restuple); +    Py_DECREF(restuple);      return 0;    overflow: @@ -4356,7 +4345,7 @@ unicode_decode_call_errorhandler_writer(      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,      _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)  { -    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple"; +    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";      PyObject *restuple = NULL;      PyObject *repunicode = NULL; @@ -4379,14 +4368,14 @@ unicode_decode_call_errorhandler_writer(      if (*exceptionObject == NULL)          goto onError; -    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); +    restuple = _PyObject_CallArg1(*errorHandler, *exceptionObject);      if (restuple == NULL)          goto onError;      if (!PyTuple_Check(restuple)) { -        PyErr_SetString(PyExc_TypeError, &argparse[4]); +        PyErr_SetString(PyExc_TypeError, &argparse[3]);          goto onError;      } -    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) +    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))          goto onError;      /* Copy back the bytes variables, which might have been modified by the @@ -4394,9 +4383,6 @@ unicode_decode_call_errorhandler_writer(      inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);      if (!inputobj)          goto onError; -    if (!PyBytes_Check(inputobj)) { -        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); -    }      *input = PyBytes_AS_STRING(inputobj);      insize = PyBytes_GET_SIZE(inputobj);      *inend = *input + insize; @@ -4411,8 +4397,6 @@ unicode_decode_call_errorhandler_writer(          goto onError;      } -    if (PyUnicode_READY(repunicode) < 0) -        goto onError;      replen = PyUnicode_GET_LENGTH(repunicode);      if (replen > 1) {          writer->min_length += replen - 1; @@ -4428,7 +4412,7 @@ unicode_decode_call_errorhandler_writer(      *inptr = *input + newpos;      /* we made it! */ -    Py_XDECREF(restuple); +    Py_DECREF(restuple);      return 0;    onError: @@ -4834,7 +4818,7 @@ PyUnicode_EncodeUTF7(const Py_UNICODE *s,                       const char *errors)  {      PyObject *result; -    PyObject *tmp = PyUnicode_FromUnicode(s, size); +    PyObject *tmp = PyUnicode_FromWideChar(s, size);      if (tmp == NULL)          return NULL;      result = _PyUnicode_EncodeUTF7(tmp, base64SetO, @@ -5190,7 +5174,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,  {      PyObject *v, *unicode; -    unicode = PyUnicode_FromUnicode(s, size); +    unicode = PyUnicode_FromWideChar(s, size);      if (unicode == NULL)          return NULL;      v = _PyUnicode_AsUTF8String(unicode, errors); @@ -5515,7 +5499,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,                        int byteorder)  {      PyObject *result; -    PyObject *tmp = PyUnicode_FromUnicode(s, size); +    PyObject *tmp = PyUnicode_FromWideChar(s, size);      if (tmp == NULL)          return NULL;      result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder); @@ -5868,7 +5852,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,                        int byteorder)  {      PyObject *result; -    PyObject *tmp = PyUnicode_FromUnicode(s, size); +    PyObject *tmp = PyUnicode_FromWideChar(s, size);      if (tmp == NULL)          return NULL;      result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder); @@ -6259,7 +6243,7 @@ PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,                                Py_ssize_t size)  {      PyObject *result; -    PyObject *tmp = PyUnicode_FromUnicode(s, size); +    PyObject *tmp = PyUnicode_FromWideChar(s, size);      if (tmp == NULL) {          return NULL;      } @@ -6476,7 +6460,7 @@ PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,                                   Py_ssize_t size)  {      PyObject *result; -    PyObject *tmp = PyUnicode_FromUnicode(s, size); +    PyObject *tmp = PyUnicode_FromWideChar(s, size);      if (tmp == NULL)          return NULL;      result = PyUnicode_AsRawUnicodeEscapeString(tmp); @@ -6665,8 +6649,7 @@ unicode_encode_call_errorhandler(const char *errors,      if (*exceptionObject == NULL)          return NULL; -    restuple = PyObject_CallFunctionObjArgs( -        *errorHandler, *exceptionObject, NULL); +    restuple = _PyObject_CallArg1(*errorHandler, *exceptionObject);      if (restuple == NULL)          return NULL;      if (!PyTuple_Check(restuple)) { @@ -6814,7 +6797,7 @@ unicode_encode_ucs1(PyObject *unicode,                      goto onError;                  /* subtract preallocated bytes */ -                writer.min_size -= 1; +                writer.min_size -= newpos - collstart;                  if (PyBytes_Check(rep)) {                      /* Directly copy bytes result to output. */ @@ -6830,33 +6813,19 @@ unicode_encode_ucs1(PyObject *unicode,                      if (PyUnicode_READY(rep) < 0)                          goto onError; -                    if (PyUnicode_IS_ASCII(rep)) { -                        /* Fast path: all characters are smaller than limit */ -                        assert(limit >= 128); -                        assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); -                        str = _PyBytesWriter_WriteBytes(&writer, str, -                                                        PyUnicode_DATA(rep), -                                                        PyUnicode_GET_LENGTH(rep)); -                    } -                    else { -                        Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep); - -                        str = _PyBytesWriter_Prepare(&writer, str, repsize); -                        if (str == NULL) -                            goto onError; - -                        /* check if there is anything unencodable in the -                           replacement and copy it to the output */ -                        for (i = 0; repsize-->0; ++i, ++str) { -                            ch = PyUnicode_READ_CHAR(rep, i); -                            if (ch >= limit) { -                                raise_encode_exception(&exc, encoding, unicode, -                                                       pos, pos+1, reason); -                                goto onError; -                            } -                            *str = (char)ch; -                        } +                    if (limit == 256 ? +                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND : +                        !PyUnicode_IS_ASCII(rep)) +                    { +                        /* Not all characters are smaller than limit */ +                        raise_encode_exception(&exc, encoding, unicode, +                                               collstart, collend, reason); +                        goto onError;                      } +                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); +                    str = _PyBytesWriter_WriteBytes(&writer, str, +                                                    PyUnicode_DATA(rep), +                                                    PyUnicode_GET_LENGTH(rep));                  }                  pos = newpos;                  Py_CLEAR(rep); @@ -6887,7 +6856,7 @@ PyUnicode_EncodeLatin1(const Py_UNICODE *p,                         const char *errors)  {      PyObject *result; -    PyObject *unicode = PyUnicode_FromUnicode(p, size); +    PyObject *unicode = PyUnicode_FromWideChar(p, size);      if (unicode == NULL)          return NULL;      result = unicode_encode_ucs1(unicode, errors, 256); @@ -7028,7 +6997,7 @@ PyUnicode_EncodeASCII(const Py_UNICODE *p,                        const char *errors)  {      PyObject *result; -    PyObject *unicode = PyUnicode_FromUnicode(p, size); +    PyObject *unicode = PyUnicode_FromWideChar(p, size);      if (unicode == NULL)          return NULL;      result = unicode_encode_ucs1(unicode, errors, 128); @@ -7754,7 +7723,7 @@ PyUnicode_EncodeMBCS(const Py_UNICODE *p,                       const char *errors)  {      PyObject *unicode, *res; -    unicode = PyUnicode_FromUnicode(p, size); +    unicode = PyUnicode_FromWideChar(p, size);      if (unicode == NULL)          return NULL;      res = encode_code_page(CP_ACP, unicode, errors); @@ -8602,7 +8571,7 @@ PyUnicode_EncodeCharmap(const Py_UNICODE *p,                          const char *errors)  {      PyObject *result; -    PyObject *unicode = PyUnicode_FromUnicode(p, size); +    PyObject *unicode = PyUnicode_FromWideChar(p, size);      if (unicode == NULL)          return NULL;      result = _PyUnicode_EncodeCharmap(unicode, mapping, errors); @@ -8657,7 +8626,7 @@ unicode_translate_call_errorhandler(const char *errors,                                      Py_ssize_t startpos, Py_ssize_t endpos,                                      Py_ssize_t *newpos)  { -    static const char *argparse = "O!n;translating error handler must return (str, int) tuple"; +    static const char *argparse = "Un;translating error handler must return (str, int) tuple";      Py_ssize_t i_newpos;      PyObject *restuple; @@ -8674,16 +8643,15 @@ unicode_translate_call_errorhandler(const char *errors,      if (*exceptionObject == NULL)          return NULL; -    restuple = PyObject_CallFunctionObjArgs( -        *errorHandler, *exceptionObject, NULL); +    restuple = _PyObject_CallArg1(*errorHandler, *exceptionObject);      if (restuple == NULL)          return NULL;      if (!PyTuple_Check(restuple)) { -        PyErr_SetString(PyExc_TypeError, &argparse[4]); +        PyErr_SetString(PyExc_TypeError, &argparse[3]);          Py_DECREF(restuple);          return NULL;      } -    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, +    if (!PyArg_ParseTuple(restuple, argparse,                            &resunicode, &i_newpos)) {          Py_DECREF(restuple);          return NULL; @@ -9042,7 +9010,7 @@ PyUnicode_TranslateCharmap(const Py_UNICODE *p,                             const char *errors)  {      PyObject *result; -    PyObject *unicode = PyUnicode_FromUnicode(p, size); +    PyObject *unicode = PyUnicode_FromWideChar(p, size);      if (!unicode)          return NULL;      result = _PyUnicode_TranslateCharmap(unicode, mapping, errors); @@ -9170,14 +9138,10 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,          return -1;      } -    unicode = PyUnicode_FromUnicode(s, length); +    unicode = PyUnicode_FromWideChar(s, length);      if (unicode == NULL)          return -1; -    if (PyUnicode_READY(unicode) == -1) { -        Py_DECREF(unicode); -        return -1; -    }      kind = PyUnicode_KIND(unicode);      data = PyUnicode_DATA(unicode); @@ -15345,7 +15309,7 @@ unicodeiter_reduce(unicodeiterobject *it)          return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),                               it->it_seq, it->it_index);      } else { -        PyObject *u = PyUnicode_FromUnicode(NULL, 0); +        PyObject *u = (PyObject *)_PyUnicode_New(0);          if (u == NULL)              return NULL;          return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u); @@ -15440,10 +15404,7 @@ unicode_iter(PyObject *seq)  size_t  Py_UNICODE_strlen(const Py_UNICODE *u)  { -    int res = 0; -    while(*u++) -        res++; -    return res; +    return wcslen(u);  }  Py_UNICODE* @@ -15468,8 +15429,8 @@ Py_UNICODE*  Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)  {      Py_UNICODE *u1 = s1; -    u1 += Py_UNICODE_strlen(u1); -    Py_UNICODE_strcpy(u1, s2); +    u1 += wcslen(u1); +    while ((*u1++ = *s2++));      return s1;  } @@ -15518,7 +15479,7 @@ Py_UNICODE*  Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)  {      const Py_UNICODE *p; -    p = s + Py_UNICODE_strlen(s); +    p = s + wcslen(s);      while (p != s) {          p--;          if (*p == c) | 
