summary | refs | log | tree | commit | diff
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- Objects/unicodeobject.c 225
1 files changed, 94 insertions, 131 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 9c998f7ab3..7f58129f28 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1029,8 +1029,7 @@ resize_copy(PyObject *unicode, Py_ssize_t length)
if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
PyObject *copy;
- if (PyUnicode_READY(unicode) == -1)
- return NULL;
+ assert(PyUnicode_IS_READY(unicode));
copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
if (copy == NULL)
@@ -1974,14 +1973,11 @@ unicode_char(Py_UCS4 ch)
unicode = PyUnicode_New(1, ch);
if (unicode == NULL)
return NULL;
- switch (PyUnicode_KIND(unicode)) {
- case PyUnicode_1BYTE_KIND:
- PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
- break;
- case PyUnicode_2BYTE_KIND:
+
+ assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
+ if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
- break;
- default:
+ } else {
assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
PyUnicode_4BYTE_DATA(unicode)[0] = ch;
}
@@ -1992,12 +1988,32 @@ unicode_char(Py_UCS4 ch)
PyObject *
PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
{
+ if (u == NULL)
+ return (PyObject*)_PyUnicode_New(size);
+
+ if (size < 0) {
+ PyErr_BadInternalCall();
+ return NULL;
+ }
+
+ return PyUnicode_FromWideChar(u, size);
+}
+
+PyObject *
+PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
+{
PyObject *unicode;
Py_UCS4 maxchar = 0;
Py_ssize_t num_surrogates;
- if (u == NULL)
- return (PyObject*)_PyUnicode_New(size);
+ if (u == NULL && size != 0) {
+ PyErr_BadInternalCall();
+ return NULL;
+ }
+
+ if (size == -1) {
+ size = wcslen(u);
+ }
/* If the Unicode data is known at construction time, we can apply
some optimizations which share commonly used objects. */
@@ -2482,27 +2498,6 @@ PyUnicode_AsUCS4Copy(PyObject *string)
return as_ucs4(string, NULL, 0, 1);
}
-#ifdef HAVE_WCHAR_H
-
-PyObject *
-PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
-{
- if (w == NULL) {
- if (size == 0)
- _Py_RETURN_UNICODE_EMPTY();
- PyErr_BadInternalCall();
- return NULL;
- }
-
- if (size == -1) {
- size = wcslen(w);
- }
-
- return PyUnicode_FromUnicode(w, size);
-}
-
-#endif /* HAVE_WCHAR_H */
-
/* maximum number of characters required for output of %lld or %p.
We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
plus 1 for the sign. 53/22 is an upper bound for log10(256). */
@@ -3300,7 +3295,7 @@ PyUnicode_Encode(const Py_UNICODE *s,
{
PyObject *v, *unicode;
- unicode = PyUnicode_FromUnicode(s, size);
+ unicode = PyUnicode_FromWideChar(s, size);
if (unicode == NULL)
return NULL;
v = PyUnicode_AsEncodedString(unicode, encoding, errors);
@@ -3412,11 +3407,9 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
Py_ssize_t wlen, wlen2;
wchar_t *wstr;
- PyObject *bytes = NULL;
char *errmsg;
- PyObject *reason = NULL;
- PyObject *exc;
- size_t error_pos;
+ PyObject *bytes, *reason, *exc;
+ size_t error_pos, errlen;
int surrogateescape;
if (locale_error_handler(errors, &surrogateescape) < 0)
@@ -3471,6 +3464,7 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
if (len2 == (size_t)-1 || len2 > len) {
+ Py_DECREF(bytes);
error_pos = (size_t)-1;
goto encode_error;
}
@@ -3486,17 +3480,15 @@ encode_error:
error_pos = wcstombs_errorpos(wstr);
PyMem_Free(wstr);
- Py_XDECREF(bytes);
-
- if (errmsg != NULL) {
- size_t errlen;
- wstr = Py_DecodeLocale(errmsg, &errlen);
- if (wstr != NULL) {
- reason = PyUnicode_FromWideChar(wstr, errlen);
- PyMem_RawFree(wstr);
- } else
- errmsg = NULL;
+
+ wstr = Py_DecodeLocale(errmsg, &errlen);
+ if (wstr != NULL) {
+ reason = PyUnicode_FromWideChar(wstr, errlen);
+ PyMem_RawFree(wstr);
+ } else {
+ errmsg = NULL;
}
+
if (errmsg == NULL)
reason = PyUnicode_FromString(
"wcstombs() encountered an unencodable "
@@ -3512,7 +3504,7 @@ encode_error:
Py_DECREF(reason);
if (exc != NULL) {
PyCodec_StrictErrors(exc);
- Py_XDECREF(exc);
+ Py_DECREF(exc);
}
return NULL;
}
@@ -3719,10 +3711,9 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
size_t wlen, wlen2;
PyObject *unicode;
int surrogateescape;
- size_t error_pos;
+ size_t error_pos, errlen;
char *errmsg;
- PyObject *reason = NULL; /* initialize to prevent gcc warning */
- PyObject *exc;
+ PyObject *exc, *reason = NULL; /* initialize to prevent gcc warning */
if (locale_error_handler(errors, &surrogateescape) < 0)
return NULL;
@@ -3780,19 +3771,16 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
return unicode;
decode_error:
- reason = NULL;
errmsg = strerror(errno);
assert(errmsg != NULL);
error_pos = mbstowcs_errorpos(str, len);
- if (errmsg != NULL) {
- size_t errlen;
- wstr = Py_DecodeLocale(errmsg, &errlen);
- if (wstr != NULL) {
- reason = PyUnicode_FromWideChar(wstr, errlen);
- PyMem_RawFree(wstr);
- }
+ wstr = Py_DecodeLocale(errmsg, &errlen);
+ if (wstr != NULL) {
+ reason = PyUnicode_FromWideChar(wstr, errlen);
+ PyMem_RawFree(wstr);
}
+
if (reason == NULL)
reason = PyUnicode_FromString(
"mbstowcs() encountered an invalid multibyte sequence");
@@ -3807,7 +3795,7 @@ decode_error:
Py_DECREF(reason);
if (exc != NULL) {
PyCodec_StrictErrors(exc);
- Py_XDECREF(exc);
+ Py_DECREF(exc);
}
return NULL;
}
@@ -4140,7 +4128,11 @@ PyUnicode_GetSize(PyObject *unicode)
PyErr_BadArgument();
goto onError;
}
- return PyUnicode_GET_SIZE(unicode);
+ if (_PyUnicode_WSTR(unicode) == NULL) {
+ if (PyUnicode_AsUnicode(unicode) == NULL)
+ goto onError;
+ }
+ return PyUnicode_WSTR_LENGTH(unicode);
onError:
return -1;
@@ -4248,7 +4240,7 @@ unicode_decode_call_errorhandler_wchar(
Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
PyObject **output, Py_ssize_t *outpos)
{
- static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
+ static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
PyObject *restuple = NULL;
PyObject *repunicode = NULL;
@@ -4281,10 +4273,10 @@ unicode_decode_call_errorhandler_wchar(
if (restuple == NULL)
goto onError;
if (!PyTuple_Check(restuple)) {
- PyErr_SetString(PyExc_TypeError, &argparse[4]);
+ PyErr_SetString(PyExc_TypeError, &argparse[3]);
goto onError;
}
- if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
+ if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
goto onError;
/* Copy back the bytes variables, which might have been modified by the
@@ -4292,9 +4284,6 @@ unicode_decode_call_errorhandler_wchar(
inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
if (!inputobj)
goto onError;
- if (!PyBytes_Check(inputobj)) {
- PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
- }
*input = PyBytes_AS_STRING(inputobj);
insize = PyBytes_GET_SIZE(inputobj);
*inend = *input + insize;
@@ -4335,7 +4324,7 @@ unicode_decode_call_errorhandler_wchar(
*inptr = *input + newpos;
/* we made it! */
- Py_XDECREF(restuple);
+ Py_DECREF(restuple);
return 0;
overflow:
@@ -4356,7 +4345,7 @@ unicode_decode_call_errorhandler_writer(
Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
_PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
- static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";
+ static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
PyObject *restuple = NULL;
PyObject *repunicode = NULL;
@@ -4383,10 +4372,10 @@ unicode_decode_call_errorhandler_writer(
if (restuple == NULL)
goto onError;
if (!PyTuple_Check(restuple)) {
- PyErr_SetString(PyExc_TypeError, &argparse[4]);
+ PyErr_SetString(PyExc_TypeError, &argparse[3]);
goto onError;
}
- if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
+ if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
goto onError;
/* Copy back the bytes variables, which might have been modified by the
@@ -4394,9 +4383,6 @@ unicode_decode_call_errorhandler_writer(
inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
if (!inputobj)
goto onError;
- if (!PyBytes_Check(inputobj)) {
- PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
- }
*input = PyBytes_AS_STRING(inputobj);
insize = PyBytes_GET_SIZE(inputobj);
*inend = *input + insize;
@@ -4411,8 +4397,6 @@ unicode_decode_call_errorhandler_writer(
goto onError;
}
- if (PyUnicode_READY(repunicode) < 0)
- goto onError;
replen = PyUnicode_GET_LENGTH(repunicode);
if (replen > 1) {
writer->min_length += replen - 1;
@@ -4428,7 +4412,7 @@ unicode_decode_call_errorhandler_writer(
*inptr = *input + newpos;
/* we made it! */
- Py_XDECREF(restuple);
+ Py_DECREF(restuple);
return 0;
onError:
@@ -4834,7 +4818,7 @@ PyUnicode_EncodeUTF7(const Py_UNICODE *s,
const char *errors)
{
PyObject *result;
- PyObject *tmp = PyUnicode_FromUnicode(s, size);
+ PyObject *tmp = PyUnicode_FromWideChar(s, size);
if (tmp == NULL)
return NULL;
result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
@@ -5190,7 +5174,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
{
PyObject *v, *unicode;
- unicode = PyUnicode_FromUnicode(s, size);
+ unicode = PyUnicode_FromWideChar(s, size);
if (unicode == NULL)
return NULL;
v = _PyUnicode_AsUTF8String(unicode, errors);
@@ -5515,7 +5499,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
int byteorder)
{
PyObject *result;
- PyObject *tmp = PyUnicode_FromUnicode(s, size);
+ PyObject *tmp = PyUnicode_FromWideChar(s, size);
if (tmp == NULL)
return NULL;
result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
@@ -5868,7 +5852,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
int byteorder)
{
PyObject *result;
- PyObject *tmp = PyUnicode_FromUnicode(s, size);
+ PyObject *tmp = PyUnicode_FromWideChar(s, size);
if (tmp == NULL)
return NULL;
result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
@@ -6259,7 +6243,7 @@ PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Py_ssize_t size)
{
PyObject *result;
- PyObject *tmp = PyUnicode_FromUnicode(s, size);
+ PyObject *tmp = PyUnicode_FromWideChar(s, size);
if (tmp == NULL) {
return NULL;
}
@@ -6476,7 +6460,7 @@ PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Py_ssize_t size)
{
PyObject *result;
- PyObject *tmp = PyUnicode_FromUnicode(s, size);
+ PyObject *tmp = PyUnicode_FromWideChar(s, size);
if (tmp == NULL)
return NULL;
result = PyUnicode_AsRawUnicodeEscapeString(tmp);
@@ -6814,7 +6798,7 @@ unicode_encode_ucs1(PyObject *unicode,
goto onError;
/* subtract preallocated bytes */
- writer.min_size -= 1;
+ writer.min_size -= newpos - collstart;
if (PyBytes_Check(rep)) {
/* Directly copy bytes result to output. */
@@ -6830,33 +6814,19 @@ unicode_encode_ucs1(PyObject *unicode,
if (PyUnicode_READY(rep) < 0)
goto onError;
- if (PyUnicode_IS_ASCII(rep)) {
- /* Fast path: all characters are smaller than limit */
- assert(limit >= 128);
- assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
- str = _PyBytesWriter_WriteBytes(&writer, str,
- PyUnicode_DATA(rep),
- PyUnicode_GET_LENGTH(rep));
- }
- else {
- Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
-
- str = _PyBytesWriter_Prepare(&writer, str, repsize);
- if (str == NULL)
- goto onError;
-
- /* check if there is anything unencodable in the
- replacement and copy it to the output */
- for (i = 0; repsize-->0; ++i, ++str) {
- ch = PyUnicode_READ_CHAR(rep, i);
- if (ch >= limit) {
- raise_encode_exception(&exc, encoding, unicode,
- pos, pos+1, reason);
- goto onError;
- }
- *str = (char)ch;
- }
+ if (limit == 256 ?
+ PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
+ !PyUnicode_IS_ASCII(rep))
+ {
+ /* Not all characters are smaller than limit */
+ raise_encode_exception(&exc, encoding, unicode,
+ collstart, collend, reason);
+ goto onError;
}
+ assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
+ str = _PyBytesWriter_WriteBytes(&writer, str,
+ PyUnicode_DATA(rep),
+ PyUnicode_GET_LENGTH(rep));
}
pos = newpos;
Py_CLEAR(rep);
@@ -6887,7 +6857,7 @@ PyUnicode_EncodeLatin1(const Py_UNICODE *p,
const char *errors)
{
PyObject *result;
- PyObject *unicode = PyUnicode_FromUnicode(p, size);
+ PyObject *unicode = PyUnicode_FromWideChar(p, size);
if (unicode == NULL)
return NULL;
result = unicode_encode_ucs1(unicode, errors, 256);
@@ -7028,7 +6998,7 @@ PyUnicode_EncodeASCII(const Py_UNICODE *p,
const char *errors)
{
PyObject *result;
- PyObject *unicode = PyUnicode_FromUnicode(p, size);
+ PyObject *unicode = PyUnicode_FromWideChar(p, size);
if (unicode == NULL)
return NULL;
result = unicode_encode_ucs1(unicode, errors, 128);
@@ -7754,7 +7724,7 @@ PyUnicode_EncodeMBCS(const Py_UNICODE *p,
const char *errors)
{
PyObject *unicode, *res;
- unicode = PyUnicode_FromUnicode(p, size);
+ unicode = PyUnicode_FromWideChar(p, size);
if (unicode == NULL)
return NULL;
res = encode_code_page(CP_ACP, unicode, errors);
@@ -8602,7 +8572,7 @@ PyUnicode_EncodeCharmap(const Py_UNICODE *p,
const char *errors)
{
PyObject *result;
- PyObject *unicode = PyUnicode_FromUnicode(p, size);
+ PyObject *unicode = PyUnicode_FromWideChar(p, size);
if (unicode == NULL)
return NULL;
result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
@@ -8657,7 +8627,7 @@ unicode_translate_call_errorhandler(const char *errors,
Py_ssize_t startpos, Py_ssize_t endpos,
Py_ssize_t *newpos)
{
- static const char *argparse = "O!n;translating error handler must return (str, int) tuple";
+ static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Py_ssize_t i_newpos;
PyObject *restuple;
@@ -8679,11 +8649,11 @@ unicode_translate_call_errorhandler(const char *errors,
if (restuple == NULL)
return NULL;
if (!PyTuple_Check(restuple)) {
- PyErr_SetString(PyExc_TypeError, &argparse[4]);
+ PyErr_SetString(PyExc_TypeError, &argparse[3]);
Py_DECREF(restuple);
return NULL;
}
- if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
+ if (!PyArg_ParseTuple(restuple, argparse,
&resunicode, &i_newpos)) {
Py_DECREF(restuple);
return NULL;
@@ -9042,7 +9012,7 @@ PyUnicode_TranslateCharmap(const Py_UNICODE *p,
const char *errors)
{
PyObject *result;
- PyObject *unicode = PyUnicode_FromUnicode(p, size);
+ PyObject *unicode = PyUnicode_FromWideChar(p, size);
if (!unicode)
return NULL;
result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
@@ -9170,14 +9140,10 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
return -1;
}
- unicode = PyUnicode_FromUnicode(s, length);
+ unicode = PyUnicode_FromWideChar(s, length);
if (unicode == NULL)
return -1;
- if (PyUnicode_READY(unicode) == -1) {
- Py_DECREF(unicode);
- return -1;
- }
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
@@ -15359,7 +15325,7 @@ unicodeiter_reduce(unicodeiterobject *it)
return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
it->it_seq, it->it_index);
} else {
- PyObject *u = PyUnicode_FromUnicode(NULL, 0);
+ PyObject *u = (PyObject *)_PyUnicode_New(0);
if (u == NULL)
return NULL;
return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
@@ -15454,10 +15420,7 @@ unicode_iter(PyObject *seq)
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
- int res = 0;
- while(*u++)
- res++;
- return res;
+ return wcslen(u);
}
Py_UNICODE*
@@ -15482,8 +15445,8 @@ Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
Py_UNICODE *u1 = s1;
- u1 += Py_UNICODE_strlen(u1);
- Py_UNICODE_strcpy(u1, s2);
+ u1 += wcslen(u1);
+ while ((*u1++ = *s2++));
return s1;
}
@@ -15532,7 +15495,7 @@ Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
const Py_UNICODE *p;
- p = s + Py_UNICODE_strlen(s);
+ p = s + wcslen(s);
while (p != s) {
p--;
if (*p == c)