diff options
Diffstat (limited to 'Python/codecs.c')
-rw-r--r-- | Python/codecs.c | 141 |
1 files changed, 137 insertions, 4 deletions
diff --git a/Python/codecs.c b/Python/codecs.c index 6849c0f9a4..a0a540304a 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -9,6 +9,7 @@ Copyright (c) Corporation for National Research Initiatives. ------------------------------------------------------------------------ */ #include "Python.h" +#include "ucnhash.h" #include <ctype.h> const char *Py_hexdigits = "0123456789abcdef"; @@ -933,6 +934,104 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) } } +static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; +static int ucnhash_initialized = 0; + +PyObject *PyCodec_NameReplaceErrors(PyObject *exc) +{ + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + PyObject *restuple; + PyObject *object; + Py_ssize_t i; + Py_ssize_t start; + Py_ssize_t end; + PyObject *res; + unsigned char *outp; + Py_ssize_t ressize; + int replsize; + Py_UCS4 c; + char buffer[256]; /* NAME_MAXLEN */ + if (PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeEncodeError_GetObject(exc))) + return NULL; + if (!ucnhash_initialized) { + /* load the unicode data module */ + ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( + PyUnicodeData_CAPSULE_NAME, 1); + ucnhash_initialized = 1; + } + for (i = start, ressize = 0; i < end; ++i) { + /* object is guaranteed to be "ready" */ + c = PyUnicode_READ_CHAR(object, i); + if (ucnhash_CAPI && + ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { + replsize = 1+1+1+strlen(buffer)+1; + } + else if (c >= 0x10000) { + replsize = 1+1+8; + } + else if (c >= 0x100) { + replsize = 1+1+4; + } + else + replsize = 1+1+2; + if (ressize > PY_SSIZE_T_MAX - replsize) + break; + ressize += replsize; + } + end = i; + res = PyUnicode_New(ressize, 127); + if (res==NULL) + return NULL; + for (i = start, outp = PyUnicode_1BYTE_DATA(res); + i < end; ++i) { + c = PyUnicode_READ_CHAR(object, i); + *outp++ = '\\'; + if (ucnhash_CAPI && + ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { + *outp++ = 'N'; + *outp++ = '{'; + strcpy((char *)outp, buffer); + outp += strlen(buffer); + *outp++ = '}'; + continue; + } + if (c >= 0x00010000) { + *outp++ = 'U'; + *outp++ = Py_hexdigits[(c>>28)&0xf]; + *outp++ = Py_hexdigits[(c>>24)&0xf]; + *outp++ = Py_hexdigits[(c>>20)&0xf]; + *outp++ = Py_hexdigits[(c>>16)&0xf]; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; + } + else if (c >= 0x100) { + *outp++ = 'u'; + *outp++ = Py_hexdigits[(c>>12)&0xf]; + *outp++ = Py_hexdigits[(c>>8)&0xf]; + } + else + *outp++ = 'x'; + *outp++ = Py_hexdigits[(c>>4)&0xf]; + *outp++ = Py_hexdigits[c&0xf]; + } + + assert(outp == PyUnicode_1BYTE_DATA(res) + ressize); + assert(_PyUnicode_CheckConsistency(res, 1)); + restuple = Py_BuildValue("(Nn)", res, end); + Py_DECREF(object); + return restuple; + } + else { + wrong_exception_type(exc); + return NULL; + } +} + +#define ENC_UNKNOWN -1 #define ENC_UTF8 0 #define ENC_UTF16BE 1 #define ENC_UTF16LE 2 @@ -948,7 +1047,11 @@ get_standard_encoding(const char *encoding, int *bytelength) encoding += 3; if (*encoding == '-' || *encoding == '_' ) encoding++; - if (encoding[0] == '1' && encoding[1] == '6') { + if (encoding[0] == '8' && encoding[1] == '\0') { + *bytelength = 3; + return ENC_UTF8; + } + else if (encoding[0] == '1' && encoding[1] == '6') { encoding += 2; *bytelength = 2; if (*encoding == '\0') { @@ -987,9 +1090,11 @@ get_standard_encoding(const char *encoding, int *bytelength) } } } - /* utf-8 */ - *bytelength = 3; - return ENC_UTF8; + else if (strcmp(encoding, "CP_UTF8") == 0) { + *bytelength = 3; + return ENC_UTF8; + } + return ENC_UNKNOWN; } /* This handler is declared static until someone demonstrates @@ -1026,6 +1131,12 @@ PyCodec_SurrogatePassErrors(PyObject *exc) } code = get_standard_encoding(encoding, &bytelength); Py_DECREF(encode); + if (code == ENC_UNKNOWN) { + /* Not supported, fail with original exception */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + Py_DECREF(object); + return NULL; + } if (end - start > PY_SSIZE_T_MAX / bytelength) end = start + PY_SSIZE_T_MAX / bytelength; @@ -1102,6 +1213,12 @@ PyCodec_SurrogatePassErrors(PyObject *exc) } code = get_standard_encoding(encoding, &bytelength); Py_DECREF(encode); + if (code == ENC_UNKNOWN) { + /* Not supported, fail with original exception */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + Py_DECREF(object); + return NULL; + } /* Try decoding a single surrogate character. If there are more, let the codec call us again. */ @@ -1257,6 +1374,11 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) return PyCodec_BackslashReplaceErrors(exc); } +static PyObject *namereplace_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_NameReplaceErrors(exc); +} + static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc) { return PyCodec_SurrogatePassErrors(exc); @@ -1327,6 +1449,17 @@ static int _PyCodecRegistry_Init(void) } }, { + "namereplace", + { + "namereplace_errors", + namereplace_errors, + METH_O, + PyDoc_STR("Implements the 'namereplace' error handling, " + "which replaces an unencodable character with a " + "\\N{...} escape sequence.") + } + }, + { "surrogatepass", { "surrogatepass", |