diff options
Diffstat (limited to 'Python/codecs.c')
-rw-r--r-- | Python/codecs.c | 280 |
1 files changed, 242 insertions, 38 deletions
diff --git a/Python/codecs.c b/Python/codecs.c index 7334eb3e36..e21834a5c1 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -61,10 +61,9 @@ PyObject *normalizestring(const char *string) return NULL; } - v = PyString_FromStringAndSize(NULL, len); - if (v == NULL) + p = PyMem_Malloc(len + 1); + if (p == NULL) return NULL; - p = PyString_AS_STRING(v); for (i = 0; i < len; i++) { register char ch = string[i]; if (ch == ' ') @@ -73,6 +72,11 @@ PyObject *normalizestring(const char *string) ch = Py_TOLOWER(Py_CHARMASK(ch)); p[i] = ch; } + p[i] = '\0'; + v = PyUnicode_FromString(p); + if (v == NULL) + return NULL; + PyMem_Free(p); return v; } @@ -112,7 +116,7 @@ PyObject *_PyCodec_Lookup(const char *encoding) v = normalizestring(encoding); if (v == NULL) goto onError; - PyString_InternInPlace(&v); + PyUnicode_InternInPlace(&v); /* First, try to lookup the name in the registry dictionary */ result = PyDict_GetItem(interp->codec_search_cache, v); @@ -167,7 +171,10 @@ PyObject *_PyCodec_Lookup(const char *encoding) } /* Cache and return the result */ - PyDict_SetItem(interp->codec_search_cache, v, result); + if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) { + Py_DECREF(result); + goto onError; + } Py_DECREF(args); return result; @@ -176,6 +183,23 @@ PyObject *_PyCodec_Lookup(const char *encoding) return NULL; } +/* Codec registry encoding check API. */ + +int PyCodec_KnownEncoding(const char *encoding) +{ + PyObject *codecs; + + codecs = _PyCodec_Lookup(encoding); + if (!codecs) { + PyErr_Clear(); + return 0; + } + else { + Py_DECREF(codecs); + return 1; + } +} + static PyObject *args_tuple(PyObject *object, const char *errors) @@ -190,7 +214,7 @@ PyObject *args_tuple(PyObject *object, if (errors) { PyObject *v; - v = PyString_FromString(errors); + v = PyUnicode_FromString(errors); if (v == NULL) { Py_DECREF(args); return NULL; @@ -317,7 +341,7 @@ PyObject *PyCodec_Encode(PyObject *object, { PyObject *encoder = NULL; PyObject *args = NULL, *result = NULL; - PyObject *v; + PyObject *v = NULL; encoder = PyCodec_Encoder(encoding); if (encoder == NULL) @@ -327,14 +351,14 @@ PyObject *PyCodec_Encode(PyObject *object, if (args == NULL) goto onError; - result = PyEval_CallObject(encoder,args); + result = PyEval_CallObject(encoder, args); if (result == NULL) goto onError; if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 2) { PyErr_SetString(PyExc_TypeError, - "encoder must return a tuple (object,integer)"); + "encoder must return a tuple (object, integer)"); goto onError; } v = PyTuple_GET_ITEM(result,0); @@ -446,14 +470,9 @@ static void wrong_exception_type(PyObject *exc) PyObject *name = PyObject_GetAttrString(type, "__name__"); Py_DECREF(type); if (name != NULL) { - PyObject *string = PyObject_Str(name); + PyErr_Format(PyExc_TypeError, + "don't know how to handle %S in error callback", name); Py_DECREF(name); - if (string != NULL) { - PyErr_Format(PyExc_TypeError, - "don't know how to handle %.400s in error callback", - PyString_AS_STRING(string)); - Py_DECREF(string); - } } } } @@ -468,7 +487,6 @@ PyObject *PyCodec_StrictErrors(PyObject *exc) } -#ifdef Py_USING_UNICODE PyObject *PyCodec_IgnoreErrors(PyObject *exc) { Py_ssize_t end; @@ -660,6 +678,13 @@ static Py_UNICODE hexdigits[] = { PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) { +#ifndef Py_UNICODE_WIDE +#define IS_SURROGATE_PAIR(p, end) \ + (*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \ + *(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF) +#else +#define IS_SURROGATE_PAIR(p, end) 0 +#endif if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { PyObject *restuple; PyObject *object; @@ -684,7 +709,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) else #endif if (*p >= 0x100) { - ressize += 1+1+4; + if (IS_SURROGATE_PAIR(p, startp+end)) { + ressize += 1+1+8; + ++p; + } + else + ressize += 1+1+4; } else ressize += 1+1+2; @@ -694,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) return NULL; for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); p < startp+end; ++p) { - Py_UNICODE c = *p; + Py_UCS4 c = (Py_UCS4) *p; *outp++ = '\\'; -#ifdef Py_UNICODE_WIDE + if (IS_SURROGATE_PAIR(p, startp+end)) { + c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000; + ++p; + } if (c >= 0x00010000) { *outp++ = 'U'; *outp++ = hexdigits[(c>>28)&0xf]; @@ -706,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) *outp++ = hexdigits[(c>>12)&0xf]; *outp++ = hexdigits[(c>>8)&0xf]; } - else -#endif - if (c >= 0x100) { + else if (c >= 0x100) { *outp++ = 'u'; *outp++ = hexdigits[(c>>12)&0xf]; *outp++ = hexdigits[(c>>8)&0xf]; @@ -728,8 +759,167 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) wrong_exception_type(exc); return NULL; } +#undef IS_SURROGATE_PAIR } -#endif + +/* This handler is declared static until someone demonstrates + a need to call it directly. */ +static PyObject * +PyCodec_SurrogatePassErrors(PyObject *exc) +{ + PyObject *restuple; + PyObject *object; + Py_ssize_t start; + Py_ssize_t end; + PyObject *res; + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + Py_UNICODE *p; + Py_UNICODE *startp; + char *outp; + if (PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeEncodeError_GetObject(exc))) + return NULL; + startp = PyUnicode_AS_UNICODE(object); + res = PyBytes_FromStringAndSize(NULL, 3*(end-start)); + if (!res) { + Py_DECREF(object); + return NULL; + } + outp = PyBytes_AsString(res); + for (p = startp+start; p < startp+end; p++) { + Py_UNICODE ch = *p; + if (ch < 0xd800 || ch > 0xdfff) { + /* Not a surrogate, fail with original exception */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + Py_DECREF(res); + Py_DECREF(object); + return NULL; + } + *outp++ = (char)(0xe0 | (ch >> 12)); + *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *outp++ = (char)(0x80 | (ch & 0x3f)); + } + restuple = Py_BuildValue("(On)", res, end); + Py_DECREF(res); + Py_DECREF(object); + return restuple; + } + else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { + unsigned char *p; + Py_UNICODE ch = 0; + if (PyUnicodeDecodeError_GetStart(exc, &start)) + return NULL; + if (!(object = PyUnicodeDecodeError_GetObject(exc))) + return NULL; + if (!(p = (unsigned char*)PyBytes_AsString(object))) { + Py_DECREF(object); + return NULL; + } + /* Try decoding a single surrogate character. If + there are more, let the codec call us again. */ + p += start; + if (PyBytes_GET_SIZE(object) - start >= 3 && + (p[0] & 0xf0) == 0xe0 && + (p[1] & 0xc0) == 0x80 && + (p[2] & 0xc0) == 0x80) { + /* it's a three-byte code */ + ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); + if (ch < 0xd800 || ch > 0xdfff) + /* it's not a surrogate - fail */ + ch = 0; + } + Py_DECREF(object); + if (ch == 0) { + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + return NULL; + } + return Py_BuildValue("(u#n)", &ch, 1, start+3); + } + else { + wrong_exception_type(exc); + return NULL; + } +} + +static PyObject * +PyCodec_SurrogateEscapeErrors(PyObject *exc) +{ + PyObject *restuple; + PyObject *object; + Py_ssize_t start; + Py_ssize_t end; + PyObject *res; + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + Py_UNICODE *p; + Py_UNICODE *startp; + char *outp; + if (PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeEncodeError_GetObject(exc))) + return NULL; + startp = PyUnicode_AS_UNICODE(object); + res = PyBytes_FromStringAndSize(NULL, end-start); + if (!res) { + Py_DECREF(object); + return NULL; + } + outp = PyBytes_AsString(res); + for (p = startp+start; p < startp+end; p++) { + Py_UNICODE ch = *p; + if (ch < 0xdc80 || ch > 0xdcff) { + /* Not a UTF-8b surrogate, fail with original exception */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + Py_DECREF(res); + Py_DECREF(object); + return NULL; + } + *outp++ = ch - 0xdc00; + } + restuple = Py_BuildValue("(On)", res, end); + Py_DECREF(res); + Py_DECREF(object); + return restuple; + } + else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { + unsigned char *p; + Py_UNICODE ch[4]; /* decode up to 4 bad bytes. */ + int consumed = 0; + if (PyUnicodeDecodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeDecodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeDecodeError_GetObject(exc))) + return NULL; + if (!(p = (unsigned char*)PyBytes_AsString(object))) { + Py_DECREF(object); + return NULL; + } + while (consumed < 4 && consumed < end-start) { + /* Refuse to escape ASCII bytes. */ + if (p[start+consumed] < 128) + break; + ch[consumed] = 0xdc00 + p[start+consumed]; + consumed++; + } + Py_DECREF(object); + if (!consumed) { + /* codec complained about ASCII byte. */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + return NULL; + } + return Py_BuildValue("(u#n)", ch, consumed, start+consumed); + } + else { + wrong_exception_type(exc); + return NULL; + } +} + static PyObject *strict_errors(PyObject *self, PyObject *exc) { @@ -737,7 +927,6 @@ static PyObject *strict_errors(PyObject *self, PyObject *exc) } -#ifdef Py_USING_UNICODE static PyObject *ignore_errors(PyObject *self, PyObject *exc) { return PyCodec_IgnoreErrors(exc); @@ -760,7 +949,16 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) { return PyCodec_BackslashReplaceErrors(exc); } -#endif + +static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_SurrogatePassErrors(exc); +} + +static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_SurrogateEscapeErrors(exc); +} static int _PyCodecRegistry_Init(void) { @@ -779,7 +977,6 @@ static int _PyCodecRegistry_Init(void) "raises a UnicodeError on coding errors.") } }, -#ifdef Py_USING_UNICODE { "ignore", { @@ -821,8 +1018,23 @@ static int _PyCodecRegistry_Init(void) "which replaces an unencodable character with a " "backslashed escape sequence.") } + }, + { + "surrogatepass", + { + "surrogatepass", + surrogatepass_errors, + METH_O + } + }, + { + "surrogateescape", + { + "surrogateescape", + surrogateescape_errors, + METH_O + } } -#endif }; PyInterpreterState *interp = PyThreadState_GET()->interp; @@ -854,19 +1066,11 @@ static int _PyCodecRegistry_Init(void) interp->codec_error_registry == NULL) Py_FatalError("can't initialize codec registry"); - mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0); + mod = PyImport_ImportModuleNoBlock("encodings"); if (mod == NULL) { - if (PyErr_ExceptionMatches(PyExc_ImportError)) { - /* Ignore ImportErrors... this is done so that - distributions can disable the encodings package. Note - that other errors are not masked, e.g. SystemErrors - raised to inform the user of an error in the Python - configuration are still reported back to the user. */ - PyErr_Clear(); - return 0; - } return -1; } Py_DECREF(mod); + interp->codecs_initialized = 1; return 0; } |