diff options
Diffstat (limited to 'Python/codecs.c')
-rw-r--r-- | Python/codecs.c | 438 |
1 files changed, 309 insertions, 129 deletions
diff --git a/Python/codecs.c b/Python/codecs.c index 7334eb3e36..fd67d1b9e1 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -11,6 +11,8 @@ Copyright (c) Corporation for National Research Initiatives. #include "Python.h" #include <ctype.h> +const char *Py_hexdigits = "0123456789abcdef"; + /* --- Codec Registry ----------------------------------------------------- */ /* Import the standard encodings package which will register the first @@ -61,10 +63,9 @@ PyObject *normalizestring(const char *string) return NULL; } - v = PyString_FromStringAndSize(NULL, len); - if (v == NULL) + p = PyMem_Malloc(len + 1); + if (p == NULL) return NULL; - p = PyString_AS_STRING(v); for (i = 0; i < len; i++) { register char ch = string[i]; if (ch == ' ') @@ -73,6 +74,11 @@ PyObject *normalizestring(const char *string) ch = Py_TOLOWER(Py_CHARMASK(ch)); p[i] = ch; } + p[i] = '\0'; + v = PyUnicode_FromString(p); + if (v == NULL) + return NULL; + PyMem_Free(p); return v; } @@ -112,7 +118,7 @@ PyObject *_PyCodec_Lookup(const char *encoding) v = normalizestring(encoding); if (v == NULL) goto onError; - PyString_InternInPlace(&v); + PyUnicode_InternInPlace(&v); /* First, try to lookup the name in the registry dictionary */ result = PyDict_GetItem(interp->codec_search_cache, v); @@ -167,7 +173,10 @@ PyObject *_PyCodec_Lookup(const char *encoding) } /* Cache and return the result */ - PyDict_SetItem(interp->codec_search_cache, v, result); + if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) { + Py_DECREF(result); + goto onError; + } Py_DECREF(args); return result; @@ -176,6 +185,23 @@ PyObject *_PyCodec_Lookup(const char *encoding) return NULL; } +/* Codec registry encoding check API. */ + +int PyCodec_KnownEncoding(const char *encoding) +{ + PyObject *codecs; + + codecs = _PyCodec_Lookup(encoding); + if (!codecs) { + PyErr_Clear(); + return 0; + } + else { + Py_DECREF(codecs); + return 1; + } +} + static PyObject *args_tuple(PyObject *object, const char *errors) @@ -190,7 +216,7 @@ PyObject *args_tuple(PyObject *object, if (errors) { PyObject *v; - v = PyString_FromString(errors); + v = PyUnicode_FromString(errors); if (v == NULL) { Py_DECREF(args); return NULL; @@ -317,7 +343,7 @@ PyObject *PyCodec_Encode(PyObject *object, { PyObject *encoder = NULL; PyObject *args = NULL, *result = NULL; - PyObject *v; + PyObject *v = NULL; encoder = PyCodec_Encoder(encoding); if (encoder == NULL) @@ -327,14 +353,14 @@ PyObject *PyCodec_Encode(PyObject *object, if (args == NULL) goto onError; - result = PyEval_CallObject(encoder,args); + result = PyEval_CallObject(encoder, args); if (result == NULL) goto onError; if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 2) { PyErr_SetString(PyExc_TypeError, - "encoder must return a tuple (object,integer)"); + "encoder must return a tuple (object, integer)"); goto onError; } v = PyTuple_GET_ITEM(result,0); @@ -441,19 +467,16 @@ PyObject *PyCodec_LookupError(const char *name) static void wrong_exception_type(PyObject *exc) { - PyObject *type = PyObject_GetAttrString(exc, "__class__"); + _Py_IDENTIFIER(__class__); + _Py_IDENTIFIER(__name__); + PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__); if (type != NULL) { - PyObject *name = PyObject_GetAttrString(type, "__name__"); + PyObject *name = _PyObject_GetAttrId(type, &PyId___name__); Py_DECREF(type); if (name != NULL) { - PyObject *string = PyObject_Str(name); + PyErr_Format(PyExc_TypeError, + "don't know how to handle %S in error callback", name); Py_DECREF(name); - if (string != NULL) { - PyErr_Format(PyExc_TypeError, - "don't know how to handle %.400s in error callback", - PyString_AS_STRING(string)); - Py_DECREF(string); - } } } } @@ -468,7 +491,6 @@ PyObject *PyCodec_StrictErrors(PyObject *exc) } -#ifdef Py_USING_UNICODE PyObject *PyCodec_IgnoreErrors(PyObject *exc) { Py_ssize_t end; @@ -488,57 +510,58 @@ PyObject *PyCodec_IgnoreErrors(PyObject *exc) wrong_exception_type(exc); return NULL; } - /* ouch: passing NULL, 0, pos gives None instead of u'' */ - return Py_BuildValue("(u#n)", &end, 0, end); + return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end); } PyObject *PyCodec_ReplaceErrors(PyObject *exc) { - PyObject *restuple; - Py_ssize_t start; - Py_ssize_t end; - Py_ssize_t i; + Py_ssize_t start, end, i, len; if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { PyObject *res; - Py_UNICODE *p; + int kind; + void *data; if (PyUnicodeEncodeError_GetStart(exc, &start)) return NULL; if (PyUnicodeEncodeError_GetEnd(exc, &end)) return NULL; - res = PyUnicode_FromUnicode(NULL, end-start); + len = end - start; + res = PyUnicode_New(len, '?'); if (res == NULL) return NULL; - for (p = PyUnicode_AS_UNICODE(res), i = start; - i<end; ++p, ++i) - *p = '?'; - restuple = Py_BuildValue("(On)", res, end); - Py_DECREF(res); - return restuple; + kind = PyUnicode_KIND(res); + data = PyUnicode_DATA(res); + for (i = 0; i < len; ++i) + PyUnicode_WRITE(kind, data, i, '?'); + assert(_PyUnicode_CheckConsistency(res, 1)); + return Py_BuildValue("(Nn)", res, end); } else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { - Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER; if (PyUnicodeDecodeError_GetEnd(exc, &end)) return NULL; - return Py_BuildValue("(u#n)", &res, 1, end); + return Py_BuildValue("(Cn)", + (int)Py_UNICODE_REPLACEMENT_CHARACTER, + end); } else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) { PyObject *res; - Py_UNICODE *p; + int kind; + void *data; if (PyUnicodeTranslateError_GetStart(exc, &start)) return NULL; if (PyUnicodeTranslateError_GetEnd(exc, &end)) return NULL; - res = PyUnicode_FromUnicode(NULL, end-start); + len = end - start; + res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER); if (res == NULL) return NULL; - for (p = PyUnicode_AS_UNICODE(res), i = start; - i<end; ++p, ++i) - *p = Py_UNICODE_REPLACEMENT_CHARACTER; - restuple = Py_BuildValue("(On)", res, end); - Py_DECREF(res); - return restuple; + kind = PyUnicode_KIND(res); + data = PyUnicode_DATA(res); + for (i=0; i < len; i++) + PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER); + assert(_PyUnicode_CheckConsistency(res, 1)); + return Py_BuildValue("(Nn)", res, end); } else { wrong_exception_type(exc); @@ -551,82 +574,72 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { PyObject *restuple; PyObject *object; + Py_ssize_t i; Py_ssize_t start; Py_ssize_t end; PyObject *res; - Py_UNICODE *p; - Py_UNICODE *startp; - Py_UNICODE *outp; + unsigned char *outp; int ressize; + Py_UCS4 ch; if (PyUnicodeEncodeError_GetStart(exc, &start)) return NULL; if (PyUnicodeEncodeError_GetEnd(exc, &end)) return NULL; if (!(object = PyUnicodeEncodeError_GetObject(exc))) return NULL; - startp = PyUnicode_AS_UNICODE(object); - for (p = startp+start, ressize = 0; p < startp+end; ++p) { - if (*p<10) + for (i = start, ressize = 0; i < end; ++i) { + /* object is guaranteed to be "ready" */ + ch = PyUnicode_READ_CHAR(object, i); + if (ch<10) ressize += 2+1+1; - else if (*p<100) + else if (ch<100) ressize += 2+2+1; - else if (*p<1000) + else if (ch<1000) ressize += 2+3+1; - else if (*p<10000) + else if (ch<10000) ressize += 2+4+1; -#ifndef Py_UNICODE_WIDE - else - ressize += 2+5+1; -#else - else if (*p<100000) + else if (ch<100000) ressize += 2+5+1; - else if (*p<1000000) + else if (ch<1000000) ressize += 2+6+1; else ressize += 2+7+1; -#endif } /* allocate replacement */ - res = PyUnicode_FromUnicode(NULL, ressize); + res = PyUnicode_New(ressize, 127); if (res == NULL) { Py_DECREF(object); return NULL; } + outp = PyUnicode_1BYTE_DATA(res); /* generate replacement */ - for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); - p < startp+end; ++p) { - Py_UNICODE c = *p; + for (i = start; i < end; ++i) { int digits; int base; + ch = PyUnicode_READ_CHAR(object, i); *outp++ = '&'; *outp++ = '#'; - if (*p<10) { + if (ch<10) { digits = 1; base = 1; } - else if (*p<100) { + else if (ch<100) { digits = 2; base = 10; } - else if (*p<1000) { + else if (ch<1000) { digits = 3; base = 100; } - else if (*p<10000) { + else if (ch<10000) { digits = 4; base = 1000; } -#ifndef Py_UNICODE_WIDE - else { - digits = 5; - base = 10000; - } -#else - else if (*p<100000) { + else if (ch<100000) { digits = 5; base = 10000; } - else if (*p<1000000) { + else if (ch<1000000) { digits = 6; base = 100000; } @@ -634,16 +647,15 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) digits = 7; base = 1000000; } -#endif while (digits-->0) { - *outp++ = '0' + c/base; - c %= base; + *outp++ = '0' + ch/base; + ch %= base; base /= 10; } *outp++ = ';'; } - restuple = Py_BuildValue("(On)", res, end); - Py_DECREF(res); + assert(_PyUnicode_CheckConsistency(res, 1)); + restuple = Py_BuildValue("(Nn)", res, end); Py_DECREF(object); return restuple; } @@ -653,83 +665,237 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) } } -static Py_UNICODE hexdigits[] = { - '0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' -}; - PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) { if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { PyObject *restuple; PyObject *object; + Py_ssize_t i; Py_ssize_t start; Py_ssize_t end; PyObject *res; - Py_UNICODE *p; - Py_UNICODE *startp; - Py_UNICODE *outp; + unsigned char *outp; int ressize; + Py_UCS4 c; if (PyUnicodeEncodeError_GetStart(exc, &start)) return NULL; if (PyUnicodeEncodeError_GetEnd(exc, &end)) return NULL; if (!(object = PyUnicodeEncodeError_GetObject(exc))) return NULL; - startp = PyUnicode_AS_UNICODE(object); - for (p = startp+start, ressize = 0; p < startp+end; ++p) { -#ifdef Py_UNICODE_WIDE - if (*p >= 0x00010000) + for (i = start, ressize = 0; i < end; ++i) { + /* object is guaranteed to be "ready" */ + c = PyUnicode_READ_CHAR(object, i); + if (c >= 0x10000) { ressize += 1+1+8; - else -#endif - if (*p >= 0x100) { + } + else if (c >= 0x100) { ressize += 1+1+4; } else ressize += 1+1+2; } - res = PyUnicode_FromUnicode(NULL, ressize); + res = PyUnicode_New(ressize, 127); if (res==NULL) return NULL; - for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); - p < startp+end; ++p) { - Py_UNICODE c = *p; + for (i = start, outp = PyUnicode_1BYTE_DATA(res); + i < end; ++i) { + c = PyUnicode_READ_CHAR(object, i); *outp++ = '\\'; -#ifdef Py_UNICODE_WIDE if (c >= 0x00010000) { *outp++ = 'U'; - *outp++ = hexdigits[(c>>28)&0xf]; - *outp++ = hexdigits[(c>>24)&0xf]; - *outp++ = hexdigits[(c>>20)&0xf]; - *outp++ = hexdigits[(c>>16)&0xf]; - *outp++ = hexdigits[(c>>12)&0xf]; - *outp++ = hexdigits[(c>>8)&0xf]; + *outp++ = Py_hexdigits[(c>>28)&0xf]; + *outp++ = Py_hexdigits[(c>>24)&0xf]; + *outp++ = Py_hexdigits[(c>>20)&0xf]; + *outp++ = Py_hexdigits[(c>>16)&0xf]; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; } - else -#endif - if (c >= 0x100) { + else if (c >= 0x100) { *outp++ = 'u'; - *outp++ = hexdigits[(c>>12)&0xf]; - *outp++ = hexdigits[(c>>8)&0xf]; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; } else *outp++ = 'x'; - *outp++ = hexdigits[(c>>4)&0xf]; - *outp++ = hexdigits[c&0xf]; + *outp++ = Py_hexdigits[(c>>4)&0xf]; + *outp++ = Py_hexdigits[c&0xf]; + } + + assert(_PyUnicode_CheckConsistency(res, 1)); + restuple = Py_BuildValue("(Nn)", res, end); + Py_DECREF(object); + return restuple; + } + else { + wrong_exception_type(exc); + return NULL; + } +} + +/* This handler is declared static until someone demonstrates + a need to call it directly. */ +static PyObject * +PyCodec_SurrogatePassErrors(PyObject *exc) +{ + PyObject *restuple; + PyObject *object; + Py_ssize_t i; + Py_ssize_t start; + Py_ssize_t end; + PyObject *res; + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + char *outp; + if (PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeEncodeError_GetObject(exc))) + return NULL; + res = PyBytes_FromStringAndSize(NULL, 3*(end-start)); + if (!res) { + Py_DECREF(object); + return NULL; + } + outp = PyBytes_AsString(res); + for (i = start; i < end; i++) { + /* object is guaranteed to be "ready" */ + Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); + if (ch < 0xd800 || ch > 0xdfff) { + /* Not a surrogate, fail with original exception */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + Py_DECREF(res); + Py_DECREF(object); + return NULL; + } + *outp++ = (char)(0xe0 | (ch >> 12)); + *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *outp++ = (char)(0x80 | (ch & 0x3f)); + } + restuple = Py_BuildValue("(On)", res, end); + Py_DECREF(res); + Py_DECREF(object); + return restuple; + } + else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { + unsigned char *p; + Py_UCS4 ch = 0; + if (PyUnicodeDecodeError_GetStart(exc, &start)) + return NULL; + if (!(object = PyUnicodeDecodeError_GetObject(exc))) + return NULL; + if (!(p = (unsigned char*)PyBytes_AsString(object))) { + Py_DECREF(object); + return NULL; + } + /* Try decoding a single surrogate character. If + there are more, let the codec call us again. */ + p += start; + if (PyBytes_GET_SIZE(object) - start >= 3 && + (p[0] & 0xf0) == 0xe0 && + (p[1] & 0xc0) == 0x80 && + (p[2] & 0xc0) == 0x80) { + /* it's a three-byte code */ + ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); + if (ch < 0xd800 || ch > 0xdfff) + /* it's not a surrogate - fail */ + ch = 0; + } + Py_DECREF(object); + if (ch == 0) { + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + return NULL; } + res = PyUnicode_FromOrdinal(ch); + if (res == NULL) + return NULL; + return Py_BuildValue("(Nn)", res, start+3); + } + else { + wrong_exception_type(exc); + return NULL; + } +} +static PyObject * +PyCodec_SurrogateEscapeErrors(PyObject *exc) +{ + PyObject *restuple; + PyObject *object; + Py_ssize_t i; + Py_ssize_t start; + Py_ssize_t end; + PyObject *res; + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + char *outp; + if (PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeEncodeError_GetObject(exc))) + return NULL; + res = PyBytes_FromStringAndSize(NULL, end-start); + if (!res) { + Py_DECREF(object); + return NULL; + } + outp = PyBytes_AsString(res); + for (i = start; i < end; i++) { + /* object is guaranteed to be "ready" */ + Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); + if (ch < 0xdc80 || ch > 0xdcff) { + /* Not a UTF-8b surrogate, fail with original exception */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + Py_DECREF(res); + Py_DECREF(object); + return NULL; + } + *outp++ = ch - 0xdc00; + } restuple = Py_BuildValue("(On)", res, end); Py_DECREF(res); Py_DECREF(object); return restuple; } + else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { + PyObject *str; + unsigned char *p; + Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ + int consumed = 0; + if (PyUnicodeDecodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeDecodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeDecodeError_GetObject(exc))) + return NULL; + if (!(p = (unsigned char*)PyBytes_AsString(object))) { + Py_DECREF(object); + return NULL; + } + while (consumed < 4 && consumed < end-start) { + /* Refuse to escape ASCII bytes. */ + if (p[start+consumed] < 128) + break; + ch[consumed] = 0xdc00 + p[start+consumed]; + consumed++; + } + Py_DECREF(object); + if (!consumed) { + /* codec complained about ASCII byte. */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + return NULL; + } + str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed); + if (str == NULL) + return NULL; + return Py_BuildValue("(Nn)", str, start+consumed); + } else { wrong_exception_type(exc); return NULL; } } -#endif + static PyObject *strict_errors(PyObject *self, PyObject *exc) { @@ -737,7 +903,6 @@ static PyObject *strict_errors(PyObject *self, PyObject *exc) } -#ifdef Py_USING_UNICODE static PyObject *ignore_errors(PyObject *self, PyObject *exc) { return PyCodec_IgnoreErrors(exc); @@ -760,7 +925,16 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) { return PyCodec_BackslashReplaceErrors(exc); } -#endif + +static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_SurrogatePassErrors(exc); +} + +static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_SurrogateEscapeErrors(exc); +} static int _PyCodecRegistry_Init(void) { @@ -779,7 +953,6 @@ static int _PyCodecRegistry_Init(void) "raises a UnicodeError on coding errors.") } }, -#ifdef Py_USING_UNICODE { "ignore", { @@ -821,8 +994,23 @@ static int _PyCodecRegistry_Init(void) "which replaces an unencodable character with a " "backslashed escape sequence.") } + }, + { + "surrogatepass", + { + "surrogatepass", + surrogatepass_errors, + METH_O + } + }, + { + "surrogateescape", + { + "surrogateescape", + surrogateescape_errors, + METH_O + } } -#endif }; PyInterpreterState *interp = PyThreadState_GET()->interp; @@ -837,7 +1025,7 @@ static int _PyCodecRegistry_Init(void) interp->codec_error_registry = PyDict_New(); if (interp->codec_error_registry) { - for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) { + for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) { PyObject *func = PyCFunction_New(&methods[i].def, NULL); int res; if (!func) @@ -854,19 +1042,11 @@ static int _PyCodecRegistry_Init(void) interp->codec_error_registry == NULL) Py_FatalError("can't initialize codec registry"); - mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0); + mod = PyImport_ImportModuleNoBlock("encodings"); if (mod == NULL) { - if (PyErr_ExceptionMatches(PyExc_ImportError)) { - /* Ignore ImportErrors... this is done so that - distributions can disable the encodings package. Note - that other errors are not masked, e.g. SystemErrors - raised to inform the user of an error in the Python - configuration are still reported back to the user. */ - PyErr_Clear(); - return 0; - } return -1; } Py_DECREF(mod); + interp->codecs_initialized = 1; return 0; } |