summary | refs | log | tree | commit | diff
path: root/Python/codecs.c
diff options
context:
space:
mode:
Diffstat (limited to 'Python/codecs.c')
-rw-r--r-- Python/codecs.c 280
1 files changed, 242 insertions, 38 deletions
diff --git a/Python/codecs.c b/Python/codecs.c
index 7334eb3e36..e21834a5c1 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -61,10 +61,9 @@ PyObject *normalizestring(const char *string)
return NULL;
}
- v = PyString_FromStringAndSize(NULL, len);
- if (v == NULL)
+ p = PyMem_Malloc(len + 1);
+ if (p == NULL)
return NULL;
- p = PyString_AS_STRING(v);
for (i = 0; i < len; i++) {
register char ch = string[i];
if (ch == ' ')
@@ -73,6 +72,11 @@ PyObject *normalizestring(const char *string)
ch = Py_TOLOWER(Py_CHARMASK(ch));
p[i] = ch;
}
+ p[i] = '\0';
+ v = PyUnicode_FromString(p);
+ if (v == NULL)
+ return NULL;
+ PyMem_Free(p);
return v;
}
@@ -112,7 +116,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
v = normalizestring(encoding);
if (v == NULL)
goto onError;
- PyString_InternInPlace(&v);
+ PyUnicode_InternInPlace(&v);
/* First, try to lookup the name in the registry dictionary */
result = PyDict_GetItem(interp->codec_search_cache, v);
@@ -167,7 +171,10 @@ PyObject *_PyCodec_Lookup(const char *encoding)
}
/* Cache and return the result */
- PyDict_SetItem(interp->codec_search_cache, v, result);
+ if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
+ Py_DECREF(result);
+ goto onError;
+ }
Py_DECREF(args);
return result;
@@ -176,6 +183,23 @@ PyObject *_PyCodec_Lookup(const char *encoding)
return NULL;
}
+/* Codec registry encoding check API. */
+
+int PyCodec_KnownEncoding(const char *encoding)
+{
+ PyObject *codecs;
+
+ codecs = _PyCodec_Lookup(encoding);
+ if (!codecs) {
+ PyErr_Clear();
+ return 0;
+ }
+ else {
+ Py_DECREF(codecs);
+ return 1;
+ }
+}
+
static
PyObject *args_tuple(PyObject *object,
const char *errors)
@@ -190,7 +214,7 @@ PyObject *args_tuple(PyObject *object,
if (errors) {
PyObject *v;
- v = PyString_FromString(errors);
+ v = PyUnicode_FromString(errors);
if (v == NULL) {
Py_DECREF(args);
return NULL;
@@ -317,7 +341,7 @@ PyObject *PyCodec_Encode(PyObject *object,
{
PyObject *encoder = NULL;
PyObject *args = NULL, *result = NULL;
- PyObject *v;
+ PyObject *v = NULL;
encoder = PyCodec_Encoder(encoding);
if (encoder == NULL)
@@ -327,14 +351,14 @@ PyObject *PyCodec_Encode(PyObject *object,
if (args == NULL)
goto onError;
- result = PyEval_CallObject(encoder,args);
+ result = PyEval_CallObject(encoder, args);
if (result == NULL)
goto onError;
if (!PyTuple_Check(result) ||
PyTuple_GET_SIZE(result) != 2) {
PyErr_SetString(PyExc_TypeError,
- "encoder must return a tuple (object,integer)");
+ "encoder must return a tuple (object, integer)");
goto onError;
}
v = PyTuple_GET_ITEM(result,0);
@@ -446,14 +470,9 @@ static void wrong_exception_type(PyObject *exc)
PyObject *name = PyObject_GetAttrString(type, "__name__");
Py_DECREF(type);
if (name != NULL) {
- PyObject *string = PyObject_Str(name);
+ PyErr_Format(PyExc_TypeError,
+ "don't know how to handle %S in error callback", name);
Py_DECREF(name);
- if (string != NULL) {
- PyErr_Format(PyExc_TypeError,
- "don't know how to handle %.400s in error callback",
- PyString_AS_STRING(string));
- Py_DECREF(string);
- }
}
}
}
@@ -468,7 +487,6 @@ PyObject *PyCodec_StrictErrors(PyObject *exc)
}
-#ifdef Py_USING_UNICODE
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
Py_ssize_t end;
@@ -660,6 +678,13 @@ static Py_UNICODE hexdigits[] = {
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
+#ifndef Py_UNICODE_WIDE
+#define IS_SURROGATE_PAIR(p, end) \
+ (*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
+ *(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
+#else
+#define IS_SURROGATE_PAIR(p, end) 0
+#endif
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
PyObject *restuple;
PyObject *object;
@@ -684,7 +709,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
else
#endif
if (*p >= 0x100) {
- ressize += 1+1+4;
+ if (IS_SURROGATE_PAIR(p, startp+end)) {
+ ressize += 1+1+8;
+ ++p;
+ }
+ else
+ ressize += 1+1+4;
}
else
ressize += 1+1+2;
@@ -694,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
return NULL;
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
p < startp+end; ++p) {
- Py_UNICODE c = *p;
+ Py_UCS4 c = (Py_UCS4) *p;
*outp++ = '\\';
-#ifdef Py_UNICODE_WIDE
+ if (IS_SURROGATE_PAIR(p, startp+end)) {
+ c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
+ ++p;
+ }
if (c >= 0x00010000) {
*outp++ = 'U';
*outp++ = hexdigits[(c>>28)&0xf];
@@ -706,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
*outp++ = hexdigits[(c>>12)&0xf];
*outp++ = hexdigits[(c>>8)&0xf];
}
- else
-#endif
- if (c >= 0x100) {
+ else if (c >= 0x100) {
*outp++ = 'u';
*outp++ = hexdigits[(c>>12)&0xf];
*outp++ = hexdigits[(c>>8)&0xf];
@@ -728,8 +759,167 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
wrong_exception_type(exc);
return NULL;
}
+#undef IS_SURROGATE_PAIR
}
-#endif
+
+/* This handler is declared static until someone demonstrates
+ a need to call it directly. */
+static PyObject *
+PyCodec_SurrogatePassErrors(PyObject *exc)
+{
+ PyObject *restuple;
+ PyObject *object;
+ Py_ssize_t start;
+ Py_ssize_t end;
+ PyObject *res;
+ if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
+ Py_UNICODE *p;
+ Py_UNICODE *startp;
+ char *outp;
+ if (PyUnicodeEncodeError_GetStart(exc, &start))
+ return NULL;
+ if (PyUnicodeEncodeError_GetEnd(exc, &end))
+ return NULL;
+ if (!(object = PyUnicodeEncodeError_GetObject(exc)))
+ return NULL;
+ startp = PyUnicode_AS_UNICODE(object);
+ res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
+ if (!res) {
+ Py_DECREF(object);
+ return NULL;
+ }
+ outp = PyBytes_AsString(res);
+ for (p = startp+start; p < startp+end; p++) {
+ Py_UNICODE ch = *p;
+ if (ch < 0xd800 || ch > 0xdfff) {
+ /* Not a surrogate, fail with original exception */
+ PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+ Py_DECREF(res);
+ Py_DECREF(object);
+ return NULL;
+ }
+ *outp++ = (char)(0xe0 | (ch >> 12));
+ *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+ *outp++ = (char)(0x80 | (ch & 0x3f));
+ }
+ restuple = Py_BuildValue("(On)", res, end);
+ Py_DECREF(res);
+ Py_DECREF(object);
+ return restuple;
+ }
+ else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
+ unsigned char *p;
+ Py_UNICODE ch = 0;
+ if (PyUnicodeDecodeError_GetStart(exc, &start))
+ return NULL;
+ if (!(object = PyUnicodeDecodeError_GetObject(exc)))
+ return NULL;
+ if (!(p = (unsigned char*)PyBytes_AsString(object))) {
+ Py_DECREF(object);
+ return NULL;
+ }
+ /* Try decoding a single surrogate character. If
+ there are more, let the codec call us again. */
+ p += start;
+ if (PyBytes_GET_SIZE(object) - start >= 3 &&
+ (p[0] & 0xf0) == 0xe0 &&
+ (p[1] & 0xc0) == 0x80 &&
+ (p[2] & 0xc0) == 0x80) {
+ /* it's a three-byte code */
+ ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
+ if (ch < 0xd800 || ch > 0xdfff)
+ /* it's not a surrogate - fail */
+ ch = 0;
+ }
+ Py_DECREF(object);
+ if (ch == 0) {
+ PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+ return NULL;
+ }
+ return Py_BuildValue("(u#n)", &ch, 1, start+3);
+ }
+ else {
+ wrong_exception_type(exc);
+ return NULL;
+ }
+}
+
+static PyObject *
+PyCodec_SurrogateEscapeErrors(PyObject *exc)
+{
+ PyObject *restuple;
+ PyObject *object;
+ Py_ssize_t start;
+ Py_ssize_t end;
+ PyObject *res;
+ if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
+ Py_UNICODE *p;
+ Py_UNICODE *startp;
+ char *outp;
+ if (PyUnicodeEncodeError_GetStart(exc, &start))
+ return NULL;
+ if (PyUnicodeEncodeError_GetEnd(exc, &end))
+ return NULL;
+ if (!(object = PyUnicodeEncodeError_GetObject(exc)))
+ return NULL;
+ startp = PyUnicode_AS_UNICODE(object);
+ res = PyBytes_FromStringAndSize(NULL, end-start);
+ if (!res) {
+ Py_DECREF(object);
+ return NULL;
+ }
+ outp = PyBytes_AsString(res);
+ for (p = startp+start; p < startp+end; p++) {
+ Py_UNICODE ch = *p;
+ if (ch < 0xdc80 || ch > 0xdcff) {
+ /* Not a UTF-8b surrogate, fail with original exception */
+ PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+ Py_DECREF(res);
+ Py_DECREF(object);
+ return NULL;
+ }
+ *outp++ = ch - 0xdc00;
+ }
+ restuple = Py_BuildValue("(On)", res, end);
+ Py_DECREF(res);
+ Py_DECREF(object);
+ return restuple;
+ }
+ else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
+ unsigned char *p;
+ Py_UNICODE ch[4]; /* decode up to 4 bad bytes. */
+ int consumed = 0;
+ if (PyUnicodeDecodeError_GetStart(exc, &start))
+ return NULL;
+ if (PyUnicodeDecodeError_GetEnd(exc, &end))
+ return NULL;
+ if (!(object = PyUnicodeDecodeError_GetObject(exc)))
+ return NULL;
+ if (!(p = (unsigned char*)PyBytes_AsString(object))) {
+ Py_DECREF(object);
+ return NULL;
+ }
+ while (consumed < 4 && consumed < end-start) {
+ /* Refuse to escape ASCII bytes. */
+ if (p[start+consumed] < 128)
+ break;
+ ch[consumed] = 0xdc00 + p[start+consumed];
+ consumed++;
+ }
+ Py_DECREF(object);
+ if (!consumed) {
+ /* codec complained about ASCII byte. */
+ PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
+ return NULL;
+ }
+ return Py_BuildValue("(u#n)", ch, consumed, start+consumed);
+ }
+ else {
+ wrong_exception_type(exc);
+ return NULL;
+ }
+}
+
static PyObject *strict_errors(PyObject *self, PyObject *exc)
{
@@ -737,7 +927,6 @@ static PyObject *strict_errors(PyObject *self, PyObject *exc)
}
-#ifdef Py_USING_UNICODE
static PyObject *ignore_errors(PyObject *self, PyObject *exc)
{
return PyCodec_IgnoreErrors(exc);
@@ -760,7 +949,16 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
{
return PyCodec_BackslashReplaceErrors(exc);
}
-#endif
+
+static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
+{
+ return PyCodec_SurrogatePassErrors(exc);
+}
+
+static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
+{
+ return PyCodec_SurrogateEscapeErrors(exc);
+}
static int _PyCodecRegistry_Init(void)
{
@@ -779,7 +977,6 @@ static int _PyCodecRegistry_Init(void)
"raises a UnicodeError on coding errors.")
}
},
-#ifdef Py_USING_UNICODE
{
"ignore",
{
@@ -821,8 +1018,23 @@ static int _PyCodecRegistry_Init(void)
"which replaces an unencodable character with a "
"backslashed escape sequence.")
}
+ },
+ {
+ "surrogatepass",
+ {
+ "surrogatepass",
+ surrogatepass_errors,
+ METH_O
+ }
+ },
+ {
+ "surrogateescape",
+ {
+ "surrogateescape",
+ surrogateescape_errors,
+ METH_O
+ }
}
-#endif
};
PyInterpreterState *interp = PyThreadState_GET()->interp;
@@ -854,19 +1066,11 @@ static int _PyCodecRegistry_Init(void)
interp->codec_error_registry == NULL)
Py_FatalError("can't initialize codec registry");
- mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
+ mod = PyImport_ImportModuleNoBlock("encodings");
if (mod == NULL) {
- if (PyErr_ExceptionMatches(PyExc_ImportError)) {
- /* Ignore ImportErrors... this is done so that
- distributions can disable the encodings package. Note
- that other errors are not masked, e.g. SystemErrors
- raised to inform the user of an error in the Python
- configuration are still reported back to the user. */
- PyErr_Clear();
- return 0;
- }
return -1;
}
Py_DECREF(mod);
+ interp->codecs_initialized = 1;
return 0;
}