diff options
| author | Walter Dörwald <walter@livinglogic.de> | 2002-09-02 13:14:32 +0000 | 
|---|---|---|
| committer | Walter Dörwald <walter@livinglogic.de> | 2002-09-02 13:14:32 +0000 | 
| commit | 3aeb632c3152fa082132ce55b9a880e0d16b04ae (patch) | |
| tree | 192bc1543ea77a826d0c940d024dbc8ebba82156 | |
| parent | 94fab762de532de551987e1f48a125145f85304b (diff) | |
| download | cpython-git-3aeb632c3152fa082132ce55b9a880e0d16b04ae.tar.gz | |
PEP 293 implementation (from SF patch http://www.python.org/sf/432401)
| -rw-r--r-- | Doc/lib/libcodecs.tex | 39 | ||||
| -rw-r--r-- | Doc/lib/libexcs.tex | 21 | ||||
| -rw-r--r-- | Include/codecs.h | 30 | ||||
| -rw-r--r-- | Include/pyerrors.h | 66 | ||||
| -rw-r--r-- | Lib/codecs.py | 13 | ||||
| -rw-r--r-- | Lib/test/test_codeccallbacks.py | 483 | ||||
| -rw-r--r-- | Misc/NEWS | 3 | ||||
| -rw-r--r-- | Modules/_codecsmodule.c | 28 | ||||
| -rw-r--r-- | Objects/stringobject.c | 8 | ||||
| -rw-r--r-- | Objects/unicodeobject.c | 1792 | ||||
| -rw-r--r-- | Python/codecs.c | 399 | ||||
| -rw-r--r-- | Python/exceptions.c | 603 | 
12 files changed, 2929 insertions, 556 deletions
diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex index 136c528992..85ca7a5726 100644 --- a/Doc/lib/libcodecs.tex +++ b/Doc/lib/libcodecs.tex @@ -17,7 +17,7 @@  This module defines base classes for standard Python codecs (encoders  and decoders) and provides access to the internal Python codec -registry which manages the codec lookup process. +registry which manages the codec and error handling lookup process.  It defines the following functions: @@ -98,6 +98,43 @@ Raises a \exception{LookupError} in case the encoding cannot be found.  To simplify working with encoded files or stream, the module  also defines these utility functions: +\begin{funcdesc}{register_error}{name, error_handler} +Register the error handling function \var{error_handler} under the +name \var{name}. \var{error_handler} will be called during encoding +and decoding in case of an error, when \var{name} is specified as the +errors parameter. \var{error_handler} will be called with an +\exception{UnicodeEncodeError}, \exception{UnicodeDecodeError} or +\exception{UnicodeTranslateError} instance and must return a tuple +with a replacement for the unencodable/undecodable part of the input +and a position where encoding/decoding should continue. +\end{funcdesc} + +\begin{funcdesc}{lookup_error}{name} +Return the error handler previously registered under the name \var{name}. + +Raises a \exception{LookupError} in case the handler cannot be found. +\end{funcdesc} + +\begin{funcdesc}{strict_errors}{exception} +Implements the \code{strict} error handling. +\end{funcdesc} + +\begin{funcdesc}{replace_errors}{exception} +Implements the \code{replace} error handling. +\end{funcdesc} + +\begin{funcdesc}{ignore_errors}{exception} +Implements the \code{ignore} error handling. +\end{funcdesc} + +\begin{funcdesc}{xmlcharrefreplace_errors}{exception} +Implements the \code{xmlcharrefreplace} error handling. 
+\end{funcdesc} + +\begin{funcdesc}{backslashreplace_errors}{exception} +Implements the \code{backslashreplace} error handling. +\end{funcdesc} +  \begin{funcdesc}{open}{filename, mode\optional{, encoding\optional{,                         errors\optional{, buffering}}}}  Open an encoded file using the given \var{mode} and return diff --git a/Doc/lib/libexcs.tex b/Doc/lib/libexcs.tex index 078fe3c12a..54b141a9a1 100644 --- a/Doc/lib/libexcs.tex +++ b/Doc/lib/libexcs.tex @@ -335,6 +335,24 @@ Raised when an \keyword{assert} statement fails.  \versionadded{2.0}  \end{excdesc} +\begin{excdesc}{UnicodeEncodeError} +  Raised when a Unicode-related error occurs during encoding.  It +  is a subclass of \exception{UnicodeError}. +\versionadded{2.3} +\end{excdesc} + +\begin{excdesc}{UnicodeDecodeError} +  Raised when a Unicode-related error occurs during decoding.  It +  is a subclass of \exception{UnicodeError}. +\versionadded{2.3} +\end{excdesc} + +\begin{excdesc}{UnicodeTranslateError} +  Raised when a Unicode-related error occurs during translating.  It +  is a subclass of \exception{UnicodeError}. 
+\versionadded{2.3} +\end{excdesc} +  \begin{excdesc}{ValueError}    Raised when a built-in operation or function receives an argument    that has the right type but an inappropriate value, and the @@ -426,6 +444,9 @@ The class hierarchy for built-in exceptions is:       |    |    +-- FloatingPointError       |    +-- ValueError       |    |    +-- UnicodeError +     |    |        +-- UnicodeEncodeError +     |    |        +-- UnicodeDecodeError +     |    |        +-- UnicodeTranslateError       |    +-- ReferenceError       |    +-- SystemError       |    +-- MemoryError diff --git a/Include/codecs.h b/Include/codecs.h index 2cc4d7d350..82f18cdc5e 100644 --- a/Include/codecs.h +++ b/Include/codecs.h @@ -117,6 +117,36 @@ PyAPI_FUNC(PyObject *) PyCodec_StreamWriter(         const char *errors         ); +/* Unicode encoding error handling callback registry API */ + +/* Register the error handling callback function error under the name +   name. This function will be called by the codec when it encounters +   unencodable characters/undecodable bytes and doesn't know the +   callback name, when name is specified as the error parameter +   in the call to the encode/decode function. +   Return 0 on success, -1 on error */ +PyAPI_FUNC(int) PyCodec_RegisterError(const char *name, PyObject *error); + +/* Lookup the error handling callback function registered under the +   name error. As a special case NULL can be passed, in which case +   the error handling callback for "strict" will be returned. */ +PyAPI_FUNC(PyObject *) PyCodec_LookupError(const char *name); + +/* raise exc as an exception */ +PyAPI_FUNC(PyObject *) PyCodec_StrictErrors(PyObject *exc); + +/* ignore the unicode error, skipping the faulty input */ +PyAPI_FUNC(PyObject *) PyCodec_IgnoreErrors(PyObject *exc); + +/* replace the unicode error with ? 
or U+FFFD */ +PyAPI_FUNC(PyObject *) PyCodec_ReplaceErrors(PyObject *exc); + +/* replace the unicode encode error with XML character references */ +PyAPI_FUNC(PyObject *) PyCodec_XMLCharRefReplaceErrors(PyObject *exc); + +/* replace the unicode encode error with backslash escapes (\x, \u and \U) */ +PyAPI_FUNC(PyObject *) PyCodec_BackslashReplaceErrors(PyObject *exc); +  #ifdef __cplusplus  }  #endif diff --git a/Include/pyerrors.h b/Include/pyerrors.h index b783b7ba1d..756c4b2fce 100644 --- a/Include/pyerrors.h +++ b/Include/pyerrors.h @@ -54,6 +54,9 @@ PyAPI_DATA(PyObject *) PyExc_SystemExit;  PyAPI_DATA(PyObject *) PyExc_TypeError;  PyAPI_DATA(PyObject *) PyExc_UnboundLocalError;  PyAPI_DATA(PyObject *) PyExc_UnicodeError; +PyAPI_DATA(PyObject *) PyExc_UnicodeEncodeError; +PyAPI_DATA(PyObject *) PyExc_UnicodeDecodeError; +PyAPI_DATA(PyObject *) PyExc_UnicodeTranslateError;  PyAPI_DATA(PyObject *) PyExc_ValueError;  PyAPI_DATA(PyObject *) PyExc_ZeroDivisionError;  #ifdef MS_WINDOWS @@ -114,6 +117,69 @@ PyAPI_FUNC(void) PyErr_SetInterrupt(void);  PyAPI_FUNC(void) PyErr_SyntaxLocation(char *, int);  PyAPI_FUNC(PyObject *) PyErr_ProgramText(char *, int); +/* The following functions are used to create and modify unicode +   exceptions from C */ +/* create a UnicodeDecodeError object */ +PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_Create( +	const char *, const char *, int, int, int, const char *); + +/* create a UnicodeEncodeError object */ +PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_Create( +	const char *, const Py_UNICODE *, int, int, int, const char *); + +/* create a UnicodeTranslateError object */ +PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_Create( +	const Py_UNICODE *, int, int, int, const char *); + +/* get the encoding attribute */ +PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetEncoding(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetEncoding(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_GetEncoding(PyObject *); + +/* get the 
object attribute */ +PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetObject(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetObject(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_GetObject(PyObject *); + +/* get the value of the start attribute (the int * may not be NULL) +   return 0 on success, -1 on failure */ +PyAPI_FUNC(int) PyUnicodeEncodeError_GetStart(PyObject *, int *); +PyAPI_FUNC(int) PyUnicodeDecodeError_GetStart(PyObject *, int *); +PyAPI_FUNC(int) PyUnicodeTranslateError_GetStart(PyObject *, int *); + +/* assign a new value to the start attribute +   return 0 on success, -1 on failure */ +PyAPI_FUNC(int) PyUnicodeEncodeError_SetStart(PyObject *, int); +PyAPI_FUNC(int) PyUnicodeDecodeError_SetStart(PyObject *, int); +PyAPI_FUNC(int) PyUnicodeTranslateError_SetStart(PyObject *, int); + +/* get the value of the end attribute (the int *may not be NULL) + return 0 on success, -1 on failure */ +PyAPI_FUNC(int) PyUnicodeEncodeError_GetEnd(PyObject *, int *); +PyAPI_FUNC(int) PyUnicodeDecodeError_GetEnd(PyObject *, int *); +PyAPI_FUNC(int) PyUnicodeTranslateError_GetEnd(PyObject *, int *); + +/* assign a new value to the end attribute +   return 0 on success, -1 on failure */ +PyAPI_FUNC(int) PyUnicodeEncodeError_SetEnd(PyObject *, int); +PyAPI_FUNC(int) PyUnicodeDecodeError_SetEnd(PyObject *, int); +PyAPI_FUNC(int) PyUnicodeTranslateError_SetEnd(PyObject *, int); + +/* get the value of the reason attribute */ +PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetReason(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetReason(PyObject *); +PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_GetReason(PyObject *); + +/* assign a new value to the reason attribute +   return 0 on success, -1 on failure */ +PyAPI_FUNC(int) PyUnicodeEncodeError_SetReason( +	PyObject *, const char *); +PyAPI_FUNC(int) PyUnicodeDecodeError_SetReason( +	PyObject *, const char *); +PyAPI_FUNC(int) PyUnicodeTranslateError_SetReason( +	PyObject *, const char *); 
+ +  /* These APIs aren't really part of the error implementation, but     often needed to format error messages; the native C lib APIs are     not available on all platforms, which is why we provide emulations diff --git a/Lib/codecs.py b/Lib/codecs.py index b089e90766..40f0a2e226 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -20,7 +20,10 @@ except ImportError, why:  __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",             "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",             "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", -           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE"] +           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", +           "strict_errors", "ignore_errors", "replace_errors", +           "xmlcharrefreplace_errors", +           "register_error", "lookup_error"]  ### Constants @@ -632,6 +635,14 @@ def make_encoding_map(decoding_map):              m[v] = None      return m +### error handlers + +strict_errors = lookup_error("strict") +ignore_errors = lookup_error("ignore") +replace_errors = lookup_error("replace") +xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") +backslashreplace_errors = lookup_error("backslashreplace") +  # Tell modulefinder that using codecs probably needs the encodings  # package  _false = 0 diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py new file mode 100644 index 0000000000..1650965a99 --- /dev/null +++ b/Lib/test/test_codeccallbacks.py @@ -0,0 +1,483 @@ +import test.test_support, unittest +import sys, codecs, htmlentitydefs, unicodedata + +class CodecCallbackTest(unittest.TestCase): + +    def test_xmlcharrefreplace(self): +        # replace unencodable characters which numeric character entities. +        # For ascii, latin-1 and charmaps this is completely implemented +        # in C and should be reasonably fast. 
+        s = u"\u30b9\u30d1\u30e2 \xe4nd eggs" +        self.assertEqual( +            s.encode("ascii", "xmlcharrefreplace"), +            "スパモ änd eggs" +        ) +        self.assertEqual( +            s.encode("latin-1", "xmlcharrefreplace"), +            "スパモ \xe4nd eggs" +        ) + +    def test_xmlcharnamereplace(self): +        # This time use a named character entity for unencodable +        # characters, if one is available. +        names = {} +        for (key, value) in htmlentitydefs.entitydefs.items(): +            if len(value)==1: +                names[unicode(value, "latin-1")] = unicode(key, "latin-1") +            else: +                names[unichr(int(value[2:-1]))] = unicode(key, "latin-1") + +        def xmlcharnamereplace(exc): +            if not isinstance(exc, UnicodeEncodeError): +                raise TypeError("don't know how to handle %r" % exc) +            l = [] +            for c in exc.object[exc.start:exc.end]: +                try: +                    l.append(u"&%s;" % names[c]) +                except KeyError: +                    l.append(u"&#%d;" % ord(c)) +            return (u"".join(l), exc.end) + +        codecs.register_error( +            "test.xmlcharnamereplace", xmlcharnamereplace) + +        sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a" +        sout = "«ℜ» = ⟨ሴ€⟩" +        self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout) +        sout = "\xabℜ\xbb = ⟨ሴ€⟩" +        self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout) +        sout = "\xabℜ\xbb = ⟨ሴ\xa4⟩" +        self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout) + +    def test_uninamereplace(self): +        # We're using the names from the unicode database this time, +        # and we're doing "systax highlighting" here, i.e. we include +        # the replaced text in ANSI escape sequences. 
For this it is +        # useful that the error handler is not called for every single +        # unencodable character, but for a complete sequence of +        # unencodable characters, otherwise we would output many +        # unneccessary escape sequences. + +        def uninamereplace(exc): +            if not isinstance(exc, UnicodeEncodeError): +                raise TypeError("don't know how to handle %r" % exc) +            l = [] +            for c in exc.object[exc.start:exc.end]: +                l.append(unicodedata.name(c, u"0x%x" % ord(c))) +            return (u"\033[1m%s\033[0m" % u", ".join(l), exc.end) + +        codecs.register_error( +            "test.uninamereplace", uninamereplace) + +        sin = u"\xac\u1234\u20ac\u8000" +        sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, 0x8000\033[0m" +        self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout) + +        sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, 0x8000\033[0m" +        self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout) + +        sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1m0x8000\033[0m" +        self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout) + +    def test_backslashescape(self): +        # Does the same as the "unicode-escape" encoding, but with different +        # base encodings. 
+        sin = u"a\xac\u1234\u20ac\u8000" +        if sys.maxunicode > 0xffff: +            sin += unichr(sys.maxunicode) +        sout = "a\\xac\\u1234\\u20ac\\u8000" +        if sys.maxunicode > 0xffff: +            sout += "\\U%08x" % sys.maxunicode +        self.assertEqual(sin.encode("ascii", "backslashreplace"), sout) + +        sout = "a\xac\\u1234\\u20ac\\u8000" +        if sys.maxunicode > 0xffff: +            sout += "\\U%08x" % sys.maxunicode +        self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout) + +        sout = "a\xac\\u1234\xa4\\u8000" +        if sys.maxunicode > 0xffff: +            sout += "\\U%08x" % sys.maxunicode +        self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout) + +    def test_relaxedutf8(self): +        # This is the test for a decoding callback handler, +        # that relaxes the UTF-8 minimal encoding restriction. +        # A null byte that is encoded as "\xc0\x80" will be +        # decoded as a null byte. All other illegal sequences +        # will be handled strictly. +        def relaxedutf8(exc): +            if not isinstance(exc, UnicodeDecodeError): +                raise TypeError("don't know how to handle %r" % exc) +            if exc.object[exc.start:exc.end].startswith("\xc0\x80"): +                return (u"\x00", exc.start+2) # retry after two bytes +            else: +                raise exc + +        codecs.register_error( +            "test.relaxedutf8", relaxedutf8) + +        sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80" +        sout = u"a\x00b\x00c\xfc\x00\x00" +        self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout) +        sin = "\xc0\x80\xc0\x81" +        self.assertRaises(UnicodeError, sin.decode, "utf-8", "test.relaxedutf8") + +    def test_charmapencode(self): +        # For charmap encodings the replacement string will be +        # mapped through the encoding again. This means, that +        # to be able to use e.g. 
the "replace" handler, the +        # charmap has to have a mapping for "?". +        charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"]) +        sin = u"abc" +        sout = "AABBCC" +        self.assertEquals(codecs.charmap_encode(sin, "strict", charmap)[0], sout) + +        sin = u"abcA" +        self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap) + +        charmap[ord("?")] = "XYZ" +        sin = u"abcDEF" +        sout = "AABBCCXYZXYZXYZ" +        self.assertEquals(codecs.charmap_encode(sin, "replace", charmap)[0], sout) + +        charmap[ord("?")] = u"XYZ" +        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) + +        charmap[ord("?")] = u"XYZ" +        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) + +    def test_callbacks(self): +        def handler1(exc): +            if not isinstance(exc, UnicodeEncodeError) \ +               and not isinstance(exc, UnicodeDecodeError): +                raise TypeError("don't know how to handle %r" % exc) +            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)] +            return (u"[%s]" % u"".join(l), exc.end) + +        codecs.register_error("test.handler1", handler1) + +        def handler2(exc): +            if not isinstance(exc, UnicodeDecodeError): +                raise TypeError("don't know how to handle %r" % exc) +            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)] +            return (u"[%s]" % u"".join(l), exc.end+1) # skip one character + +        codecs.register_error("test.handler2", handler2) + +        s = "\x00\x81\x7f\x80\xff" + +        self.assertEqual( +            s.decode("ascii", "test.handler1"), +            u"\x00[<129>]\x7f[<128>][<255>]" +        ) +        self.assertEqual( +            s.decode("ascii", "test.handler2"), +            u"\x00[<129>][<128>]" +        ) + +        self.assertEqual( +            
"\\u3042\u3xxx".decode("unicode-escape", "test.handler1"), +            u"\u3042[<92><117><51><120>]xx" +        ) + +        self.assertEqual( +            "\\u3042\u3xx".decode("unicode-escape", "test.handler1"), +            u"\u3042[<92><117><51><120><120>]" +        ) + +        self.assertEqual( +            codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0], +            u"z[<98>][<99>]" +        ) + +        self.assertEqual( +            u"g\xfc\xdfrk".encode("ascii", "test.handler1"), +            u"g[<252><223>]rk" +        ) + +        self.assertEqual( +            u"g\xfc\xdf".encode("ascii", "test.handler1"), +            u"g[<252><223>]" +        ) + +    def test_longstrings(self): +        # test long strings to check for memory overflow problems +        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"] +        # register the handlers under different names, +        # to prevent the codec from recognizing the name +        for err in errors: +            codecs.register_error("test." + err, codecs.lookup_error(err)) +        l = 1000 +        errors += [ "test." 
+ err for err in errors ] +        for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]: +            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"): +                for err in errors: +                   try: +                       uni.encode(enc, err) +                   except UnicodeError: +                       pass + +    def check_exceptionobjectargs(self, exctype, args, msg): +        # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion +        # check with one missing argument +        self.assertRaises(TypeError, exctype, *args[:-1]) +        # check with one missing argument +        self.assertRaises(TypeError, exctype, *(args + ["too much"])) +        # check with one argument of the wrong type +        wrongargs = [ "spam", u"eggs", 42, 1.0, None ] +        for i in xrange(len(args)): +            for wrongarg in wrongargs: +                if type(wrongarg) is type(args[i]): +                   continue +                # build argument array +                callargs = [] +                for j in xrange(len(args)): +                    if i==j: +                        callargs.append(wrongarg) +                    else: +                        callargs.append(args[i]) +                self.assertRaises(TypeError, exctype, *callargs) +        exc = exctype(*args) +        self.assertEquals(str(exc), msg) + +    def test_unicodeencodeerror(self): +        self.check_exceptionobjectargs( +            UnicodeEncodeError, +            ["ascii", u"g\xfcrk", 1, 2, "ouch"], +            "'ascii' codec can't encode character '\ufc' in position 1: ouch" +        ) +        self.check_exceptionobjectargs( +            UnicodeEncodeError, +            ["ascii", u"g\xfcrk", 1, 4, "ouch"], +            "'ascii' codec can't encode characters in position 1-3: ouch" +        ) +        self.check_exceptionobjectargs( +            UnicodeEncodeError, +            ["ascii", 
u"\xfcx", 0, 1, "ouch"], +            "'ascii' codec can't encode character '\ufc' in position 0: ouch" +        ) + +    def test_unicodedecodeerror(self): +        self.check_exceptionobjectargs( +            UnicodeDecodeError, +            ["ascii", "g\xfcrk", 1, 2, "ouch"], +            "'ascii' codec can't decode byte 0xfc in position 1: ouch" +        ) +        self.check_exceptionobjectargs( +            UnicodeDecodeError, +            ["ascii", "g\xfcrk", 1, 3, "ouch"], +            "'ascii' codec can't decode bytes in position 1-2: ouch" +        ) + +    def test_unicodetranslateerror(self): +        self.check_exceptionobjectargs( +            UnicodeTranslateError, +            [u"g\xfcrk", 1, 2, "ouch"], +            "can't translate character '\\ufc' in position 1: ouch" +        ) +        self.check_exceptionobjectargs( +            UnicodeTranslateError, +            [u"g\xfcrk", 1, 3, "ouch"], +            "can't translate characters in position 1-2: ouch" +        ) + +    def test_badandgoodstrictexceptions(self): +        self.assertRaises( +            TypeError, +            codecs.strict_errors, +            42 +        ) +        self.assertRaises( +            Exception, +            codecs.strict_errors, +            Exception("ouch") +        ) + +        self.assertRaises( +            UnicodeEncodeError, +            codecs.strict_errors, +            UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch") +        ) + +    def test_badandgoodignoreexceptions(self): +        self.assertRaises( +           TypeError, +           codecs.ignore_errors, +           42 +        ) +        self.assertRaises( +           TypeError, +           codecs.ignore_errors, +           UnicodeError("ouch") +        ) +        self.assertEquals( +            codecs.ignore_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")), +            (u"", 1) +        ) +        self.assertEquals( +            
codecs.ignore_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")), +            (u"", 1) +        ) +        self.assertEquals( +            codecs.ignore_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")), +            (u"", 1) +        ) + +    def test_badandgoodreplaceexceptions(self): +        self.assertRaises( +           TypeError, +           codecs.replace_errors, +           42 +        ) +        self.assertRaises( +           TypeError, +           codecs.replace_errors, +           UnicodeError("ouch") +        ) +        self.assertEquals( +            codecs.replace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")), +            (u"?", 1) +        ) +        self.assertEquals( +            codecs.replace_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")), +            (u"\ufffd", 1) +        ) +        self.assertEquals( +            codecs.replace_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")), +            (u"\ufffd", 1) +        ) + +    def test_badandgoodxmlcharrefreplaceexceptions(self): +        self.assertRaises( +           TypeError, +           codecs.xmlcharrefreplace_errors, +           42 +        ) +        self.assertRaises( +           TypeError, +           codecs.xmlcharrefreplace_errors, +           UnicodeError("ouch") +        ) +        self.assertEquals( +            codecs.xmlcharrefreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")), +            (u"&#%d;" % 0x3042, 1) +        ) +        self.assertRaises( +            TypeError, +            codecs.xmlcharrefreplace_errors, +            UnicodeError("ouch") +        ) +        self.assertRaises( +            TypeError, +            codecs.xmlcharrefreplace_errors, +            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch") +        ) +        self.assertRaises( +            TypeError, +            codecs.xmlcharrefreplace_errors, +            UnicodeTranslateError(u"\u3042", 0, 1, "ouch") +        ) + +    def 
test_badandgoodbackslashreplaceexceptions(self): +        self.assertRaises( +           TypeError, +           codecs.backslashreplace_errors, +           42 +        ) +        self.assertRaises( +           TypeError, +           codecs.backslashreplace_errors, +           UnicodeError("ouch") +        ) +        self.assertEquals( +            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")), +            (u"\\u3042", 1) +        ) +        self.assertEquals( +            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\x00", 0, 1, "ouch")), +            (u"\\x00", 1) +        ) +        self.assertEquals( +            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\xff", 0, 1, "ouch")), +            (u"\\xff", 1) +        ) +        self.assertEquals( +            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u0100", 0, 1, "ouch")), +            (u"\\u0100", 1) +        ) +        self.assertEquals( +            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\uffff", 0, 1, "ouch")), +            (u"\\uffff", 1) +        ) +        if sys.maxunicode>0xffff: +            self.assertEquals( +                codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U00010000", 0, 1, "ouch")), +                (u"\\U00010000", 1) +            ) +            self.assertEquals( +                codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U0010ffff", 0, 1, "ouch")), +                (u"\\U0010ffff", 1) +            ) + +        self.assertRaises( +            TypeError, +            codecs.backslashreplace_errors, +            UnicodeError("ouch") +        ) +        self.assertRaises( +            TypeError, +            codecs.backslashreplace_errors, +            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch") +        ) +        self.assertRaises( +            TypeError, +            codecs.backslashreplace_errors, +            
UnicodeTranslateError(u"\u3042", 0, 1, "ouch") +        ) + +    def test_badhandlerresults(self): +        results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) ) +        encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15") + +        for res in results: +            codecs.register_error("test.badhandler", lambda: res) +            for enc in encs: +                self.assertRaises( +                    TypeError, +                    u"\u3042".encode, +                    enc, +                    "test.badhandler" +                ) +            for (enc, bytes) in ( +                ("ascii", "\xff"), +                ("utf-8", "\xff"), +                ("utf-7", "+x-") +            ): +                self.assertRaises( +                    TypeError, +                    bytes.decode, +                    enc, +                    "test.badhandler" +                ) + +    def test_lookup(self): +        self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict")) +        self.assertEquals(codecs.ignore_errors, codecs.lookup_error("ignore")) +        self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict")) +        self.assertEquals( +            codecs.xmlcharrefreplace_errors, +            codecs.lookup_error("xmlcharrefreplace") +        ) +        self.assertEquals( +            codecs.backslashreplace_errors, +            codecs.lookup_error("backslashreplace") +        ) + +def test_main(): +    suite = unittest.TestSuite() +    suite.addTest(unittest.makeSuite(CodecCallbackTest)) +    test.test_support.run_suite(suite) + +if __name__ == "__main__": +    test_main() @@ -57,6 +57,9 @@ Type/class unification and new-style classes  Core and builtins +- Codec error handling callbacks (PEP 293) are implemented. +  Error handling in unicode.encode or str.decode can now be customized. 
+  - A subtle change to the semantics of the built-in function intern():    interned strings are no longer immortal.  You must keep a reference    to the return value intern() around to get the benefit. diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index 1e3fc5d5b8..24fa1d5408 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -706,6 +706,32 @@ mbcs_encode(PyObject *self,  #endif /* MS_WINDOWS */  #endif /* Py_USING_UNICODE */ +/* --- Error handler registry --------------------------------------------- */ + +static PyObject *register_error(PyObject *self, PyObject *args) +{ +    const char *name; +    PyObject *handler; + +    if (!PyArg_ParseTuple(args, "sO:register_error", +			  &name, &handler)) +	return NULL; +    if (PyCodec_RegisterError(name, handler)) +        return NULL; +    Py_INCREF(Py_None); +    return Py_None; +} + +static PyObject *lookup_error(PyObject *self, PyObject *args) +{ +    const char *name; + +    if (!PyArg_ParseTuple(args, "s:lookup_error", +			  &name)) +	return NULL; +    return PyCodec_LookupError(name); +} +  /* --- Module API --------------------------------------------------------- */  static PyMethodDef _codecs_functions[] = { @@ -744,6 +770,8 @@ static PyMethodDef _codecs_functions[] = {      {"mbcs_decode", 		mbcs_decode,			METH_VARARGS},  #endif  #endif /* Py_USING_UNICODE */ +    {"register_error", 		register_error,			METH_VARARGS}, +    {"lookup_error", 		lookup_error,			METH_VARARGS},      {NULL, NULL}		/* sentinel */  }; diff --git a/Objects/stringobject.c b/Objects/stringobject.c index 8ae9407476..31f188a5b9 100644 --- a/Objects/stringobject.c +++ b/Objects/stringobject.c @@ -2468,7 +2468,9 @@ PyDoc_STRVAR(encode__doc__,  Encodes S using the codec registered for encoding. encoding defaults\n\  to the default encoding. errors may be given to set a different error\n\  handling scheme. Default is 'strict' meaning that encoding errors raise\n\ -a ValueError. 
Other possible values are 'ignore' and 'replace'."); +a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ +'xmlcharrefreplace' as well as any other name registered with\n\ +codecs.register_error that is able to handle UnicodeEncodeErrors.");  static PyObject *  string_encode(PyStringObject *self, PyObject *args) @@ -2487,7 +2489,9 @@ PyDoc_STRVAR(decode__doc__,  Decodes S using the codec registered for encoding. encoding defaults\n\  to the default encoding. errors may be given to set a different error\n\  handling scheme. Default is 'strict' meaning that encoding errors raise\n\ -a ValueError. Other possible values are 'ignore' and 'replace'."); +a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\ +as well as any other name registerd with codecs.register_error that is\n\ +able to handle UnicodeDecodeErrors.");  static PyObject *  string_decode(PyStringObject *self, PyObject *args) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 920f9ea2d8..2108d94863 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -528,8 +528,8 @@ PyObject *PyUnicode_Decode(const char *s,  			   const char *errors)  {      PyObject *buffer = NULL, *unicode; -     -    if (encoding == NULL)  + +    if (encoding == NULL)  	encoding = PyUnicode_GetDefaultEncoding();      /* Shortcuts for common default encodings */ @@ -680,6 +680,92 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)      return -1;  } +/* error handling callback helper: +   build arguments, call the callback and check the arguments, +   if no exception occured, copy the replacement to the output +   and adjust various state variables. 
+   return 0 on success, -1 on error +*/ + +static +int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, +                 const char *encoding, const char *reason, +                 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr, +                 PyObject **output, int *outpos, Py_UNICODE **outptr) +{ +    static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple"; + +    PyObject *restuple = NULL; +    PyObject *repunicode = NULL; +    int outsize = PyUnicode_GET_SIZE(*output); +    int requiredsize; +    int newpos; +    Py_UNICODE *repptr; +    int repsize; +    int res = -1; + +    if (*errorHandler == NULL) { +	*errorHandler = PyCodec_LookupError(errors); +	if (*errorHandler == NULL) +	   goto onError; +    } + +    if (*exceptionObject == NULL) { +    	*exceptionObject = PyUnicodeDecodeError_Create( +	    encoding, input, insize, *startinpos, *endinpos, reason); +	if (*exceptionObject == NULL) +	   goto onError; +    } +    else { +	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) +	    goto onError; +	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) +	    goto onError; +	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) +	    goto onError; +    } + +    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); +    if (restuple == NULL) +	goto onError; +    if (!PyTuple_Check(restuple)) { +	PyErr_Format(PyExc_TypeError, &argparse[4]); +	goto onError; +    } +    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) +	goto onError; +    if (newpos<0) +	newpos = 0; +    else if (newpos>insize) +	newpos = insize; + +    /* need more space? 
(at least enough for what we +       have+the replacement+the rest of the string (starting +       at the new input position), so we won't have to check space +       when there are no errors in the rest of the string) */ +    repptr = PyUnicode_AS_UNICODE(repunicode); +    repsize = PyUnicode_GET_SIZE(repunicode); +    requiredsize = *outpos + repsize + insize-newpos; +    if (requiredsize > outsize) { +	if (requiredsize<2*outsize) +	    requiredsize = 2*outsize; +	if (PyUnicode_Resize(output, requiredsize)) +	    goto onError; +	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos; +    } +    *endinpos = newpos; +    *inptr = input + newpos; +    Py_UNICODE_COPY(*outptr, repptr, repsize); +    *outptr += repsize; +    *outpos += repsize; +    /* we made it! */ +    res = 0; + +    onError: +    Py_XDECREF(restuple); +    return res; +} +  /* --- UTF-7 Codec -------------------------------------------------------- */  /* see RFC2152 for details */ @@ -738,40 +824,14 @@ char utf7_special[128] = {  		} \      } \ -static -int utf7_decoding_error(Py_UNICODE **dest, -                        const char *errors, -                        const char *details)  -{ -    if ((errors == NULL) || -        (strcmp(errors,"strict") == 0)) { -        PyErr_Format(PyExc_UnicodeError, -                     "UTF-7 decoding error: %.400s", -                     details); -        return -1; -    } -    else if (strcmp(errors,"ignore") == 0) { -        return 0; -    } -    else if (strcmp(errors,"replace") == 0) { -        if (dest != NULL) { -            **dest = Py_UNICODE_REPLACEMENT_CHARACTER; -            (*dest)++; -        } -        return 0; -    } -    else { -        PyErr_Format(PyExc_ValueError, -                     "UTF-7 decoding error; unknown error handling code: %.400s", -                     errors); -        return -1; -    } -} -  PyObject *PyUnicode_DecodeUTF7(const char *s,  			       int size,  			       const char *errors)  { +    const char *starts = s; +    
int startinpos; +    int endinpos; +    int outpos;      const char *e;      PyUnicodeObject *unicode;      Py_UNICODE *p; @@ -779,7 +839,9 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,      int inShift = 0;      unsigned int bitsleft = 0;      unsigned long charsleft = 0; -	int surrogate = 0; +    int surrogate = 0; +    PyObject *errorHandler = NULL; +    PyObject *exc = NULL;      unicode = _PyUnicode_New(size);      if (!unicode) @@ -791,7 +853,9 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,      e = s + size;      while (s < e) { -        Py_UNICODE ch = *s; +        Py_UNICODE ch; +        restart: +        ch = *s;          if (inShift) {              if ((ch == '-') || !B64CHAR(ch)) { @@ -836,6 +900,7 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,              }          }          else if ( ch == '+' ) { +            startinpos = s-starts;              s++;              if (s < e && *s == '-') {                  s++; @@ -857,21 +922,39 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,          }          continue;      utf7Error: -      if (utf7_decoding_error(&p, errors, errmsg)) -          goto onError; +        outpos = p-PyUnicode_AS_UNICODE(unicode); +        endinpos = s-starts; +        if (unicode_decode_call_errorhandler( +             errors, &errorHandler, +             "utf7", errmsg, +             starts, size, &startinpos, &endinpos, &exc, &s, +             (PyObject **)&unicode, &outpos, &p)) +        goto onError;      }      if (inShift) { -        if (utf7_decoding_error(&p, errors, "unterminated shift sequence")) +        outpos = p-PyUnicode_AS_UNICODE(unicode); +        endinpos = size; +        if (unicode_decode_call_errorhandler( +             errors, &errorHandler, +             "utf7", "unterminated shift sequence", +             starts, size, &startinpos, &endinpos, &exc, &s, +             (PyObject **)&unicode, &outpos, &p))              goto onError; +        if (s < e) +           goto restart;      } -    if 
(_PyUnicode_Resize(&unicode, p - unicode->str)) +    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))          goto onError; +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc);      return (PyObject *)unicode;  onError: +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc);      Py_DECREF(unicode);      return NULL;  } @@ -1001,46 +1084,21 @@ char utf8_code_length[256] = {      4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0  }; -static -int utf8_decoding_error(const char **source, -                        Py_UNICODE **dest, -                        const char *errors, -                        const char *details)  -{ -    if ((errors == NULL) || -        (strcmp(errors,"strict") == 0)) { -        PyErr_Format(PyExc_UnicodeError, -                     "UTF-8 decoding error: %.400s", -                     details); -        return -1; -    } -    else if (strcmp(errors,"ignore") == 0) { -        (*source)++; -        return 0; -    } -    else if (strcmp(errors,"replace") == 0) { -        (*source)++; -        **dest = Py_UNICODE_REPLACEMENT_CHARACTER; -        (*dest)++; -        return 0; -    } -    else { -        PyErr_Format(PyExc_ValueError, -                     "UTF-8 decoding error; unknown error handling code: %.400s", -                     errors); -        return -1; -    } -} -  PyObject *PyUnicode_DecodeUTF8(const char *s,  			       int size,  			       const char *errors)  { +    const char *starts = s;      int n; +    int startinpos; +    int endinpos; +    int outpos;      const char *e;      PyUnicodeObject *unicode;      Py_UNICODE *p;      const char *errmsg = ""; +    PyObject *errorHandler = NULL; +    PyObject *exc = NULL;      /* Note: size will always be longer than the resulting Unicode         character count */ @@ -1067,6 +1125,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,          if (s + n > e) {  	    errmsg = "unexpected end of data"; +	    startinpos = s-starts; +	    endinpos = size;  	    goto utf8Error;  	
} @@ -1074,19 +1134,27 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,          case 0:              errmsg = "unexpected code byte"; +	    startinpos = s-starts; +	    endinpos = startinpos+1;  	    goto utf8Error;          case 1:              errmsg = "internal error"; +	    startinpos = s-starts; +	    endinpos = startinpos+1;  	    goto utf8Error;          case 2:              if ((s[1] & 0xc0) != 0x80) {                  errmsg = "invalid data"; +		startinpos = s-starts; +		endinpos = startinpos+2;  		goto utf8Error;  	    }              ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);              if (ch < 0x80) { +		startinpos = s-starts; +		endinpos = startinpos+2;                  errmsg = "illegal encoding";  		goto utf8Error;  	    } @@ -1098,6 +1166,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,              if ((s[1] & 0xc0) != 0x80 ||                   (s[2] & 0xc0) != 0x80) {                  errmsg = "invalid data"; +		startinpos = s-starts; +		endinpos = startinpos+3;  		goto utf8Error;  	    }              ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); @@ -1110,6 +1180,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,  		       unit.  		
*/                  errmsg = "illegal encoding"; +		startinpos = s-starts; +		endinpos = startinpos+3;  		goto utf8Error;  	    }  	    else @@ -1121,6 +1193,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,                  (s[2] & 0xc0) != 0x80 ||                  (s[3] & 0xc0) != 0x80) {                  errmsg = "invalid data"; +		startinpos = s-starts; +		endinpos = startinpos+4;  		goto utf8Error;  	    }              ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + @@ -1132,6 +1206,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,  					 UTF-16 */  	    {                  errmsg = "illegal encoding"; +		startinpos = s-starts; +		endinpos = startinpos+4;  		goto utf8Error;  	    }  #ifdef Py_UNICODE_WIDE @@ -1153,23 +1229,34 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,          default:              /* Other sizes are only needed for UCS-4 */              errmsg = "unsupported Unicode code range"; +	    startinpos = s-starts; +	    endinpos = startinpos+n;  	    goto utf8Error;          }          s += n;  	continue;      utf8Error: -      if (utf8_decoding_error(&s, &p, errors, errmsg)) -          goto onError; +    outpos = p-PyUnicode_AS_UNICODE(unicode); +    if (unicode_decode_call_errorhandler( +	     errors, &errorHandler, +	     "utf8", errmsg, +	     starts, size, &startinpos, &endinpos, &exc, &s, +	     (PyObject **)&unicode, &outpos, &p)) +	goto onError;      }      /* Adjust length */      if (_PyUnicode_Resize(&unicode, p - unicode->str))          goto onError; +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc);      return (PyObject *)unicode;  onError: +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc);      Py_DECREF(unicode);      return NULL;  } @@ -1287,43 +1374,16 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)  /* --- UTF-16 Codec ------------------------------------------------------- */ -static -int utf16_decoding_error(Py_UNICODE **dest, -			 const char *errors, -			 const char *details)  -{ -    if ((errors == NULL) 
|| -        (strcmp(errors,"strict") == 0)) { -        PyErr_Format(PyExc_UnicodeError, -                     "UTF-16 decoding error: %.400s", -                     details); -        return -1; -    } -    else if (strcmp(errors,"ignore") == 0) { -        return 0; -    } -    else if (strcmp(errors,"replace") == 0) { -	if (dest) { -	    **dest = Py_UNICODE_REPLACEMENT_CHARACTER; -	    (*dest)++; -	} -        return 0; -    } -    else { -        PyErr_Format(PyExc_ValueError, -                     "UTF-16 decoding error; " -		     "unknown error handling code: %.400s", -                     errors); -        return -1; -    } -} -  PyObject *  PyUnicode_DecodeUTF16(const char *s,  		      int size,  		      const char *errors,  		      int *byteorder)  { +    const char *starts = s; +    int startinpos; +    int endinpos; +    int outpos;      PyUnicodeObject *unicode;      Py_UNICODE *p;      const unsigned char *q, *e; @@ -1335,13 +1395,8 @@ PyUnicode_DecodeUTF16(const char *s,  #else      int ihi = 0, ilo = 1;  #endif - -    /* size should be an even number */ -    if (size & 1) { -        if (utf16_decoding_error(NULL, errors, "truncated data")) -            return NULL; -        --size;  /* else ignore the oddball byte */ -    } +    PyObject *errorHandler = NULL; +    PyObject *exc = NULL;      /* Note: size will always be longer than the resulting Unicode         character count */ @@ -1398,7 +1453,18 @@ PyUnicode_DecodeUTF16(const char *s,      }      while (q < e) { -	Py_UNICODE ch = (q[ihi] << 8) | q[ilo]; +	Py_UNICODE ch; +	/* remaing bytes at the end? 
(size should be even) */ +	if (e-q<2) { +	    errmsg = "truncated data"; +	    startinpos = ((const char *)q)-starts; +	    endinpos = ((const char *)e)-starts; +	    goto utf16Error; +	    /* The remaining input chars are ignored if the callback +	       chooses to skip the input */ +	} +	ch = (q[ihi] << 8) | q[ilo]; +  	q += 2;  	if (ch < 0xD800 || ch > 0xDFFF) { @@ -1409,6 +1475,8 @@ PyUnicode_DecodeUTF16(const char *s,  	/* UTF-16 code pair: */  	if (q >= e) {  	    errmsg = "unexpected end of data"; +	    startinpos = (((const char *)q)-2)-starts; +	    endinpos = ((const char *)e)-starts;  	    goto utf16Error;  	}  	if (0xD800 <= ch && ch <= 0xDBFF) { @@ -1425,15 +1493,24 @@ PyUnicode_DecodeUTF16(const char *s,  	    }  	    else {                  errmsg = "illegal UTF-16 surrogate"; +		startinpos = (((const char *)q)-4)-starts; +		endinpos = startinpos+2;  		goto utf16Error;  	    }  	}  	errmsg = "illegal encoding"; +	startinpos = (((const char *)q)-2)-starts; +	endinpos = startinpos+2;  	/* Fall through to report the error */      utf16Error: -	if (utf16_decoding_error(&p, errors, errmsg)) +	outpos = p-PyUnicode_AS_UNICODE(unicode); +	if (unicode_decode_call_errorhandler( +	         errors, &errorHandler, +	         "utf16", errmsg, +	         starts, size, &startinpos, &endinpos, &exc, (const char **)&q, +	         (PyObject **)&unicode, &outpos, &p))  	    goto onError;      } @@ -1444,10 +1521,14 @@ PyUnicode_DecodeUTF16(const char *s,      if (_PyUnicode_Resize(&unicode, p - unicode->str))          goto onError; +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc);      return (PyObject *)unicode;  onError:      Py_DECREF(unicode); +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc);      return NULL;  } @@ -1528,63 +1609,43 @@ PyObject *PyUnicode_AsUTF16String(PyObject *unicode)  /* --- Unicode Escape Codec ----------------------------------------------- */ -static -int unicodeescape_decoding_error(Py_UNICODE **x, -                                 
const char *errors, -                                 const char *details)  -{ -    if ((errors == NULL) || -        (strcmp(errors,"strict") == 0)) { -        PyErr_Format(PyExc_UnicodeError, -                     "Unicode-Escape decoding error: %.400s", -                     details); -        return -1; -    } -    else if (strcmp(errors,"ignore") == 0) { -        return 0; -    } -    else if (strcmp(errors,"replace") == 0) { -        **x = Py_UNICODE_REPLACEMENT_CHARACTER; -	(*x)++; -        return 0; -    } -    else { -        PyErr_Format(PyExc_ValueError, -                     "Unicode-Escape decoding error; " -                     "unknown error handling code: %.400s", -                     errors); -        return -1; -    } -} -  static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;  PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,  					int size,  					const char *errors)  { +    const char *starts = s; +    int startinpos; +    int endinpos; +    int outpos; +    int i;      PyUnicodeObject *v; -    Py_UNICODE *p, *buf; +    Py_UNICODE *p;      const char *end;      char* message;      Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ +    PyObject *errorHandler = NULL; +    PyObject *exc = NULL;      /* Escaped strings will always be longer than the resulting         Unicode string, so we start with size here and then reduce the -       length after conversion to the true value. */ +       length after conversion to the true value. 
+       (but if the error callback returns a long replacement string +       we'll have to allocate more space) */      v = _PyUnicode_New(size);      if (v == NULL)          goto onError;      if (size == 0)          return (PyObject *)v; -    p = buf = PyUnicode_AS_UNICODE(v); +    p = PyUnicode_AS_UNICODE(v);      end = s + size;      while (s < end) {          unsigned char c;          Py_UNICODE x; -        int i, digits; +        int digits;          /* Non-escape characters are interpreted as Unicode ordinals */          if (*s != '\\') { @@ -1592,6 +1653,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,              continue;          } +        startinpos = s-starts;          /* \ - Escapes */          s++;          switch (*s++) { @@ -1640,14 +1702,28 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,              message = "truncated \\UXXXXXXXX escape";          hexescape:              chr = 0; -            for (i = 0; i < digits; i++) { +            outpos = p-PyUnicode_AS_UNICODE(v); +            if (s+digits>end) { +                endinpos = size; +                if (unicode_decode_call_errorhandler( +                    errors, &errorHandler, +                    "unicodeescape", "end of string in escape sequence", +                    starts, size, &startinpos, &endinpos, &exc, &s, +                    (PyObject **)&v, &outpos, &p)) +                    goto onError; +                goto nextByte; +            } +            for (i = 0; i < digits; ++i) {                  c = (unsigned char) s[i];                  if (!isxdigit(c)) { -                    if (unicodeescape_decoding_error(&p, errors, message)) +                    endinpos = (s+i+1)-starts; +                    if (unicode_decode_call_errorhandler( +                        errors, &errorHandler, +                        "unicodeescape", message, +                        starts, size, &startinpos, &endinpos, &exc, &s, +                        (PyObject **)&v, 
&outpos, &p))                          goto onError; -                    chr = 0xffffffff; -                    i++; -                    break; +                    goto nextByte;                  }                  chr = (chr<<4) & ~0xF;                  if (c >= '0' && c <= '9') @@ -1659,9 +1735,9 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,              }              s += i;              if (chr == 0xffffffff) -                    /* _decoding_error will have already written into the -                       target buffer. */ -                    break; +                /* _decoding_error will have already written into the +                   target buffer. */ +                break;          store:              /* when we get here, chr is a 32-bit unicode character */              if (chr <= 0xffff) @@ -1678,10 +1754,13 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,                  *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);  #endif              } else { -                if (unicodeescape_decoding_error( -                    &p, errors, -                    "illegal Unicode character") -                    ) +                endinpos = s-starts; +                outpos = p-PyUnicode_AS_UNICODE(v); +                if (unicode_decode_call_errorhandler( +                    errors, &errorHandler, +                    "unicodeescape", "illegal Unicode character", +                    starts, size, &startinpos, &endinpos, &exc, &s, +                    (PyObject **)&v, &outpos, &p))                      goto onError;              }              break; @@ -1717,13 +1796,27 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,                          goto store;                  }              } -            if (unicodeescape_decoding_error(&p, errors, message)) +            endinpos = s-starts; +            outpos = p-PyUnicode_AS_UNICODE(v); +            if (unicode_decode_call_errorhandler( +                errors, &errorHandler, + 
               "unicodeescape", message, +                starts, size, &startinpos, &endinpos, &exc, &s, +                (PyObject **)&v, &outpos, &p))                  goto onError;              break;          default:              if (s > end) { -                if (unicodeescape_decoding_error(&p, errors, "\\ at end of string")) +                message = "\\ at end of string"; +                s--; +                endinpos = s-starts; +                outpos = p-PyUnicode_AS_UNICODE(v); +                if (unicode_decode_call_errorhandler( +                    errors, &errorHandler, +                    "unicodeescape", message, +                    starts, size, &startinpos, &endinpos, &exc, &s, +                    (PyObject **)&v, &outpos, &p))                      goto onError;              }              else { @@ -1732,9 +1825,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,              }              break;          } +        nextByte: +        ;      } -    if (_PyUnicode_Resize(&v, (int)(p - buf))) -                goto onError; +    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) +        goto onError;      return (PyObject *)v;  ucnhashError: @@ -1742,10 +1837,14 @@ ucnhashError:          PyExc_UnicodeError,          "\\N escapes not supported (can't load unicodedata module)"          ); +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc);      return NULL;  onError:      Py_XDECREF(v); +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc);      return NULL;  } @@ -1909,20 +2008,27 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,  					   int size,  					   const char *errors)  { +    const char *starts = s; +    int startinpos; +    int endinpos; +    int outpos;      PyUnicodeObject *v; -    Py_UNICODE *p, *buf; +    Py_UNICODE *p;      const char *end;      const char *bs; +    PyObject *errorHandler = NULL; +    PyObject *exc = NULL;      /* Escaped strings will always be longer than the resulting       
  Unicode string, so we start with size here and then reduce the -       length after conversion to the true value. */ +       length after conversion to the true value. (But decoding error +       handler might have to resize the string) */      v = _PyUnicode_New(size);      if (v == NULL)  	goto onError;      if (size == 0)  	return (PyObject *)v; -    p = buf = PyUnicode_AS_UNICODE(v); +    p = PyUnicode_AS_UNICODE(v);      end = s + size;      while (s < end) {  	unsigned char c; @@ -1934,6 +2040,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,  	    *p++ = (unsigned char)*s++;  	    continue;  	} +	startinpos = s-starts;  	/* \u-escapes are only interpreted iff the number of leading  	   backslashes if odd */ @@ -1952,15 +2059,18 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,  	s++;  	/* \uXXXX with 4 hex digits */ -	for (x = 0, i = 0; i < 4; i++) { -	    c = (unsigned char)s[i]; +	outpos = p-PyUnicode_AS_UNICODE(v); +	for (x = 0, i = 0; i < 4; ++i, ++s) { +	    c = (unsigned char)*s;  	    if (!isxdigit(c)) { -		if (unicodeescape_decoding_error(&p, errors, -						 "truncated \\uXXXX")) +		endinpos = s-starts; +		if (unicode_decode_call_errorhandler( +		    errors, &errorHandler, +		    "rawunicodeescape", "truncated \\uXXXX", +		    starts, size, &startinpos, &endinpos, &exc, &s, +		    (PyObject **)&v, &outpos, &p))  		    goto onError; -		x = 0xffffffff; -		i++; -		break; +		goto nextByte;  	    }  	    x = (x<<4) & ~0xF;  	    if (c >= '0' && c <= '9') @@ -1970,16 +2080,20 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,  	    else  		x += 10 + c - 'A';  	} -	s += i; -	if (x != 0xffffffff) -		*p++ = x; +	*p++ = x; +	nextByte: +	;      } -    if (_PyUnicode_Resize(&v, (int)(p - buf))) +    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))  	goto onError; +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc);      return (PyObject *)v;   onError:      Py_XDECREF(v); +    Py_XDECREF(errorHandler); +    
Py_XDECREF(exc);      return NULL;  } @@ -2059,71 +2173,271 @@ PyObject *PyUnicode_DecodeLatin1(const char *s,      return NULL;  } -static -int latin1_encoding_error(const Py_UNICODE **source, -			  char **dest, -			  const char *errors, -			  const char *details)  -{ -    if ((errors == NULL) || -	(strcmp(errors,"strict") == 0)) { -	PyErr_Format(PyExc_UnicodeError, -		     "Latin-1 encoding error: %.400s", -		     details); -	return -1; -    } -    else if (strcmp(errors,"ignore") == 0) { -	return 0; -    } -    else if (strcmp(errors,"replace") == 0) { -	**dest = '?'; -	(*dest)++; -	return 0; +/* create or adjust a UnicodeEncodeError */ +static void make_encode_exception(PyObject **exceptionObject, +    const char *encoding, +    const Py_UNICODE *unicode, int size, +    int startpos, int endpos, +    const char *reason) +{ +    if (*exceptionObject == NULL) { +	*exceptionObject = PyUnicodeEncodeError_Create( +	    encoding, unicode, size, startpos, endpos, reason);      }      else { -	PyErr_Format(PyExc_ValueError, -		     "Latin-1 encoding error; " -		     "unknown error handling code: %.400s", -		     errors); -	return -1; +	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) +	    goto onError; +	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) +	    goto onError; +	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) +	    goto onError; +	return; +	onError: +	Py_DECREF(*exceptionObject); +	*exceptionObject = NULL;      }  } -PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, -				 int size, -				 const char *errors) +/* raises a UnicodeEncodeError */ +static void raise_encode_exception(PyObject **exceptionObject, +    const char *encoding, +    const Py_UNICODE *unicode, int size, +    int startpos, int endpos, +    const char *reason)  { -    PyObject *repr; -    char *s, *start; +    make_encode_exception(exceptionObject, +	encoding, unicode, size, startpos, endpos, reason); +    if (*exceptionObject != NULL) +	
PyCodec_StrictErrors(*exceptionObject); +} -    repr = PyString_FromStringAndSize(NULL, size); -    if (repr == NULL) -        return NULL; -    if (size == 0) -	return repr; +/* error handling callback helper: +   build arguments, call the callback and check the arguments, +   put the result into newpos and return the replacement string, which +   has to be freed by the caller */ +static PyObject *unicode_encode_call_errorhandler(const char *errors, +    PyObject **errorHandler, +    const char *encoding, const char *reason, +    const Py_UNICODE *unicode, int size, PyObject **exceptionObject, +    int startpos, int endpos, +    int *newpos) +{ +    static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple"; -    s = PyString_AS_STRING(repr); -    start = s; -    while (size-- > 0) { -        Py_UNICODE ch = *p++; -	if (ch >= 256) { -	    if (latin1_encoding_error(&p, &s, errors,  -				      "ordinal not in range(256)")) -		goto onError; +    PyObject *restuple; +    PyObject *resunicode; + +    if (*errorHandler == NULL) { +	*errorHandler = PyCodec_LookupError(errors); +        if (*errorHandler == NULL) +	    return NULL; +    } + +    make_encode_exception(exceptionObject, +	encoding, unicode, size, startpos, endpos, reason); +    if (*exceptionObject == NULL) +	return NULL; + +    restuple = PyObject_CallFunctionObjArgs( +	*errorHandler, *exceptionObject, NULL); +    if (restuple == NULL) +	return NULL; +    if (!PyTuple_Check(restuple)) { +	PyErr_Format(PyExc_TypeError, &argparse[4]); +	Py_DECREF(restuple); +	return NULL; +    } +    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, +	&resunicode, newpos)) { +	Py_DECREF(restuple); +	return NULL; +    } +    if (*newpos<0) +	*newpos = 0; +    else if (*newpos>size) +	*newpos = size; +    Py_INCREF(resunicode); +    Py_DECREF(restuple); +    return resunicode; +} + +static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, +				 int size, +				 const char *errors, +				 int 
limit) +{ +    /* output object */ +    PyObject *res; +    /* pointers to the beginning and end+1 of input */ +    const Py_UNICODE *startp = p; +    const Py_UNICODE *endp = p + size; +    /* pointer to the beginning of the unencodable characters */ +    /* const Py_UNICODE *badp = NULL; */ +    /* pointer into the output */ +    char *str; +    /* current output position */ +    int respos = 0; +    int ressize; +    char *encoding = (limit == 256) ? "latin-1" : "ascii"; +    char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; +    PyObject *errorHandler = NULL; +    PyObject *exc = NULL; +    /* the following variable is used for caching string comparisons +     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ +    int known_errorHandler = -1; + +    /* allocate enough for a simple encoding without +       replacements, if we need more, we'll resize */ +    res = PyString_FromStringAndSize(NULL, size); +    if (res == NULL) +        goto onError; +    if (size == 0) +	return res; +    str = PyString_AS_STRING(res); +    ressize = size; + +    while (p<endp) { +	Py_UNICODE c = *p; + +	/* can we encode this? */ +	if (c<limit) { +	    /* no overflow check, because we know that the space is enough */ +	    *str++ = (char)c; +	    ++p; +	} +	else { +	    int unicodepos = p-startp; +	    int requiredsize; +	    PyObject *repunicode; +	    int repsize; +	    int newpos; +	    int respos; +	    Py_UNICODE *uni2; +	    /* startpos for collecting unencodable chars */ +	    const Py_UNICODE *collstart = p; +	    const Py_UNICODE *collend = p; +	    /* find all unecodable characters */ +	    while ((collend < endp) && ((*collend)>=limit)) +		++collend; +	    /* cache callback name lookup (if not done yet, i.e. 
it's the first error) */ +	    if (known_errorHandler==-1) { +		if ((errors==NULL) || (!strcmp(errors, "strict"))) +		    known_errorHandler = 1; +		else if (!strcmp(errors, "replace")) +		    known_errorHandler = 2; +		else if (!strcmp(errors, "ignore")) +		    known_errorHandler = 3; +		else if (!strcmp(errors, "xmlcharrefreplace")) +		    known_errorHandler = 4; +		else +		    known_errorHandler = 0; +	    } +	    switch (known_errorHandler) { +		case 1: /* strict */ +		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); +		    goto onError; +		case 2: /* replace */ +		    while (collstart++<collend) +			*str++ = '?'; /* fall through */ +		case 3: /* ignore */ +		    p = collend; +		    break; +		case 4: /* xmlcharrefreplace */ +		    respos = str-PyString_AS_STRING(res); +		    /* determine replacement size (temporarily (mis)uses p) */ +		    for (p = collstart, repsize = 0; p < collend; ++p) { +			if (*p<10) +			    repsize += 2+1+1; +			else if (*p<100) +			    repsize += 2+2+1; +			else if (*p<1000) +			    repsize += 2+3+1; +			else if (*p<10000) +			    repsize += 2+4+1; +			else if (*p<100000) +			    repsize += 2+5+1; +			else if (*p<1000000) +			    repsize += 2+6+1; +			else +			    repsize += 2+7+1; +		    } +		    requiredsize = respos+repsize+(endp-collend); +		    if (requiredsize > ressize) { +			if (requiredsize<2*ressize) +			    requiredsize = 2*ressize; +			if (_PyString_Resize(&res, requiredsize)) +			    goto onError; +			str = PyString_AS_STRING(res) + respos; +			ressize = requiredsize; +		    } +		    /* generate replacement (temporarily (mis)uses p) */ +		    for (p = collstart; p < collend; ++p) { +			str += sprintf(str, "&#%d;", (int)*p); +		    } +		    p = collend; +		    break; +		default: +		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, +			encoding, reason, startp, size, &exc, +			collstart-startp, collend-startp, &newpos); +		    if (repunicode == NULL) +		
	goto onError; +		    /* need more space? (at least enough for what we +		       have+the replacement+the rest of the string, so +		       we won't have to check space for encodable characters) */ +		    respos = str-PyString_AS_STRING(res); +		    repsize = PyUnicode_GET_SIZE(repunicode); +		    requiredsize = respos+repsize+(endp-collend); +		    if (requiredsize > ressize) { +			if (requiredsize<2*ressize) +			    requiredsize = 2*ressize; +			if (_PyString_Resize(&res, requiredsize)) { +			    Py_DECREF(repunicode); +			    goto onError; +			} +			str = PyString_AS_STRING(res) + respos; +			ressize = requiredsize; +		    } +		    /* check if there is anything unencodable in the replacement +		       and copy it to the output */ +		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { +			c = *uni2; +			if (c >= limit) { +			    raise_encode_exception(&exc, encoding, startp, size, +				unicodepos, unicodepos+1, reason); +			    Py_DECREF(repunicode); +			    goto onError; +			} +			*str = (char)c; +		    } +		    p = startp + newpos; +		    Py_DECREF(repunicode); +	    }  	} -	else -            *s++ = (char)ch;      } -    /* Resize if error handling skipped some characters */ -    if (s - start < PyString_GET_SIZE(repr)) -	_PyString_Resize(&repr, s - start); -    return repr; +    /* Resize if we allocated to much */ +    respos = str-PyString_AS_STRING(res); +    if (respos<ressize) +       /* If this falls res will be NULL */ +	_PyString_Resize(&res, respos); +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc); +    return res; - onError: -    Py_DECREF(repr); +    onError: +    Py_XDECREF(res); +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc);      return NULL;  } +PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, +				 int size, +				 const char *errors) +{ +    return unicode_encode_ucs1(p, size, errors, 256); +} +  PyObject *PyUnicode_AsLatin1String(PyObject *unicode)  {      if (!PyUnicode_Check(unicode)) { @@ -2137,42 
+2451,19 @@ PyObject *PyUnicode_AsLatin1String(PyObject *unicode)  /* --- 7-bit ASCII Codec -------------------------------------------------- */ -static -int ascii_decoding_error(const char **source, -			 Py_UNICODE **dest, -			 const char *errors, -			 const char *details)  -{ -    if ((errors == NULL) || -	(strcmp(errors,"strict") == 0)) { -	PyErr_Format(PyExc_UnicodeError, -		     "ASCII decoding error: %.400s", -		     details); -	return -1; -    } -    else if (strcmp(errors,"ignore") == 0) { -	return 0; -    } -    else if (strcmp(errors,"replace") == 0) { -	**dest = Py_UNICODE_REPLACEMENT_CHARACTER; -	(*dest)++; -	return 0; -    } -    else { -	PyErr_Format(PyExc_ValueError, -		     "ASCII decoding error; " -		     "unknown error handling code: %.400s", -		     errors); -	return -1; -    } -} -  PyObject *PyUnicode_DecodeASCII(const char *s,  				int size,  				const char *errors)  { +    const char *starts = s;      PyUnicodeObject *v;      Py_UNICODE *p; +    int startinpos; +    int endinpos; +    int outpos; +    const char *e; +    PyObject *errorHandler = NULL; +    PyObject *exc = NULL;      /* ASCII is equivalent to the first 128 ordinals in Unicode. 
*/      if (size == 1 && *(unsigned char*)s < 128) { @@ -2186,89 +2477,44 @@ PyObject *PyUnicode_DecodeASCII(const char *s,      if (size == 0)  	return (PyObject *)v;      p = PyUnicode_AS_UNICODE(v); -    while (size-- > 0) { -	register unsigned char c; - -	c = (unsigned char)*s++; -	if (c < 128) +    e = s + size; +    while (s < e) { +	register unsigned char c = (unsigned char)*s; +	if (c < 128) {  	    *p++ = c; -	else if (ascii_decoding_error(&s, &p, errors,  -				      "ordinal not in range(128)")) +	    ++s; +	} +	else { +	    startinpos = s-starts; +	    endinpos = startinpos + 1; +	    outpos = p-PyUnicode_AS_UNICODE(v); +	    if (unicode_decode_call_errorhandler( +		 errors, &errorHandler, +		 "ascii", "ordinal not in range(128)", +		 starts, size, &startinpos, &endinpos, &exc, &s, +		 (PyObject **)&v, &outpos, &p))  		goto onError; +	}      }      if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))  	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))  	    goto onError; +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc);      return (PyObject *)v;   onError:      Py_XDECREF(v); +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc);      return NULL;  } -static -int ascii_encoding_error(const Py_UNICODE **source, -			 char **dest, -			 const char *errors, -			 const char *details)  -{ -    if ((errors == NULL) || -	(strcmp(errors,"strict") == 0)) { -	PyErr_Format(PyExc_UnicodeError, -		     "ASCII encoding error: %.400s", -		     details); -	return -1; -    } -    else if (strcmp(errors,"ignore") == 0) { -	return 0; -    } -    else if (strcmp(errors,"replace") == 0) { -	**dest = '?'; -	(*dest)++; -	return 0; -    } -    else { -	PyErr_Format(PyExc_ValueError, -		     "ASCII encoding error; " -		     "unknown error handling code: %.400s", -		     errors); -	return -1; -    } -} -  PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,  				int size,  				const char *errors)  { -    PyObject *repr; -    char *s, *start; - -    repr = 
PyString_FromStringAndSize(NULL, size); -    if (repr == NULL) -        return NULL; -    if (size == 0) -	return repr; - -    s = PyString_AS_STRING(repr); -    start = s; -    while (size-- > 0) { -        Py_UNICODE ch = *p++; -	if (ch >= 128) { -	    if (ascii_encoding_error(&p, &s, errors,  -				      "ordinal not in range(128)")) -		goto onError; -	} -	else -            *s++ = (char)ch; -    } -    /* Resize if error handling skipped some characters */ -    if (s - start < PyString_GET_SIZE(repr)) -	_PyString_Resize(&repr, s - start); -    return repr; - - onError: -    Py_DECREF(repr); -    return NULL; +    return unicode_encode_ucs1(p, size, errors, 128);  }  PyObject *PyUnicode_AsASCIIString(PyObject *unicode) @@ -2348,44 +2594,21 @@ PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,  /* --- Character Mapping Codec -------------------------------------------- */ -static -int charmap_decoding_error(const char **source, -			 Py_UNICODE **dest, -			 const char *errors, -			 const char *details)  -{ -    if ((errors == NULL) || -	(strcmp(errors,"strict") == 0)) { -	PyErr_Format(PyExc_UnicodeError, -		     "charmap decoding error: %.400s", -		     details); -	return -1; -    } -    else if (strcmp(errors,"ignore") == 0) { -	return 0; -    } -    else if (strcmp(errors,"replace") == 0) { -	**dest = Py_UNICODE_REPLACEMENT_CHARACTER; -	(*dest)++; -	return 0; -    } -    else { -	PyErr_Format(PyExc_ValueError, -		     "charmap decoding error; " -		     "unknown error handling code: %.400s", -		     errors); -	return -1; -    } -} -  PyObject *PyUnicode_DecodeCharmap(const char *s,  				  int size,  				  PyObject *mapping,  				  const char *errors)  { +    const char *starts = s; +    int startinpos; +    int endinpos; +    int outpos; +    const char *e;      PyUnicodeObject *v;      Py_UNICODE *p;      int extrachars = 0; +    PyObject *errorHandler = NULL; +    PyObject *exc = NULL;      /* Default to Latin-1 */      if (mapping == NULL) @@ -2397,8 +2620,9 @@ 
PyObject *PyUnicode_DecodeCharmap(const char *s,      if (size == 0)  	return (PyObject *)v;      p = PyUnicode_AS_UNICODE(v); -    while (size-- > 0) { -	unsigned char ch = *s++; +    e = s + size; +    while (s < e) { +	unsigned char ch = *s;  	PyObject *w, *x;  	/* Get mapping (char ordinal -> integer, Unicode char or None) */ @@ -2430,11 +2654,18 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,  	}  	else if (x == Py_None) {  	    /* undefined mapping */ -	    if (charmap_decoding_error(&s, &p, errors,  -				       "character maps to <undefined>")) { +	    outpos = p-PyUnicode_AS_UNICODE(v); +	    startinpos = s-starts; +	    endinpos = startinpos+1; +	    if (unicode_decode_call_errorhandler( +		 errors, &errorHandler, +		 "charmap", "character maps to <undefined>", +		 starts, size, &startinpos, &endinpos, &exc, &s, +		 (PyObject **)&v, &outpos, &p)) {  		Py_DECREF(x);  		goto onError;  	    } +	    continue;  	}  	else if (PyUnicode_Check(x)) {  	    int targetsize = PyUnicode_GET_SIZE(x); @@ -2474,45 +2705,233 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,  	    goto onError;  	}  	Py_DECREF(x); +	++s;      }      if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))  	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))  	    goto onError; +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc);      return (PyObject *)v;   onError: +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc);      Py_XDECREF(v);      return NULL;  } -static -int charmap_encoding_error(const Py_UNICODE **source, -			   char **dest, -			   const char *errors, -			   const char *details)  -{ -    if ((errors == NULL) || -	(strcmp(errors,"strict") == 0)) { -	PyErr_Format(PyExc_UnicodeError, -		     "charmap encoding error: %.400s", -		     details); -	return -1; +/* Lookup the character ch in the mapping. If the character +   can't be found, Py_None is returned (or NULL, if another +   error occured). 
*/ +static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) +{ +    PyObject *w = PyInt_FromLong((long)c); +    PyObject *x; + +    if (w == NULL) +	 return NULL; +    x = PyObject_GetItem(mapping, w); +    Py_DECREF(w); +    if (x == NULL) { +	if (PyErr_ExceptionMatches(PyExc_LookupError)) { +	    /* No mapping found means: mapping is undefined. */ +	    PyErr_Clear(); +	    x = Py_None; +	    Py_INCREF(x); +	    return x; +	} else +	    return NULL;      } -    else if (strcmp(errors,"ignore") == 0) { -	return 0; +    else if (PyInt_Check(x)) { +	long value = PyInt_AS_LONG(x); +	if (value < 0 || value > 255) { +	    PyErr_SetString(PyExc_TypeError, +			     "character mapping must be in range(256)"); +	    Py_DECREF(x); +	    return NULL; +	} +	return x;      } -    else if (strcmp(errors,"replace") == 0) { -	**dest = '?'; -	(*dest)++; -	return 0; +    else if (PyString_Check(x)) +	return x; +    else { +	/* wrong return value */ +	PyErr_SetString(PyExc_TypeError, +	      "character mapping must return integer, None or str"); +	Py_DECREF(x); +	return NULL;      } +} + +/* lookup the character, put the result in the output string and adjust +   various state variables. Reallocate the output string if not enough +   space is available. Return a new reference to the object that +   was put in the output buffer, or Py_None, if the mapping was undefined +   (in which case no character was written) or NULL, if a +   reallocation error ocurred. 
The called must decref the result */ +static +PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping, +    PyObject **outobj, int *outpos) +{ +    PyObject *rep = charmapencode_lookup(c, mapping); + +    if (rep==NULL) +	return NULL; +    else if (rep==Py_None) +	return rep;      else { -	PyErr_Format(PyExc_ValueError, -		     "charmap encoding error; " -		     "unknown error handling code: %.400s", -		     errors); -	return -1; +	char *outstart = PyString_AS_STRING(*outobj); +	int outsize = PyString_GET_SIZE(*outobj); +	if (PyInt_Check(rep)) { +	    int requiredsize = *outpos+1; +	    if (outsize<requiredsize) { +		/* exponentially overallocate to minimize reallocations */ +		if (requiredsize < 2*outsize) +		    requiredsize = 2*outsize; +		if (_PyString_Resize(outobj, requiredsize)) { +		    Py_DECREF(rep); +		    return NULL; +		} +		outstart = PyString_AS_STRING(*outobj); +	    } +	    outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); +	} +	else { +	    const char *repchars = PyString_AS_STRING(rep); +	    int repsize = PyString_GET_SIZE(rep); +	    int requiredsize = *outpos+repsize; +	    if (outsize<requiredsize) { +		/* exponentially overallocate to minimize reallocations */ +		if (requiredsize < 2*outsize) +		    requiredsize = 2*outsize; +		if (_PyString_Resize(outobj, requiredsize)) { +		    Py_DECREF(rep); +		    return NULL; +		} +		outstart = PyString_AS_STRING(*outobj); +	    } +	    memcpy(outstart + *outpos, repchars, repsize); +	    *outpos += repsize; +	} +    } +    return rep; +} + +/* handle an error in PyUnicode_EncodeCharmap +   Return 0 on success, -1 on error */ +static +int charmap_encoding_error( +    const Py_UNICODE *p, int size, int *inpos, PyObject *mapping, +    PyObject **exceptionObject, +    int *known_errorHandler, PyObject *errorHandler, const char *errors, +    PyObject **res, int *respos) +{ +    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ +    int repsize; +    int newpos; +    Py_UNICODE 
*uni2; +    /* startpos for collecting unencodable chars */ +    int collstartpos = *inpos; +    int collendpos = *inpos+1; +    int collpos; +    char *encoding = "charmap"; +    char *reason = "character maps to <undefined>"; + +    PyObject *x; +    /* find all unencodable characters */ +    while (collendpos < size) { +	x = charmapencode_lookup(p[collendpos], mapping); +	if (x==NULL) +	    return -1; +	else if (x!=Py_None) { +	    Py_DECREF(x); +	    break; +	} +	Py_DECREF(x); +	++collendpos; +    } +    /* cache callback name lookup +     * (if not done yet, i.e. it's the first error) */ +    if (*known_errorHandler==-1) { +	if ((errors==NULL) || (!strcmp(errors, "strict"))) +	    *known_errorHandler = 1; +	else if (!strcmp(errors, "replace")) +	    *known_errorHandler = 2; +	else if (!strcmp(errors, "ignore")) +	    *known_errorHandler = 3; +	else if (!strcmp(errors, "xmlcharrefreplace")) +	    *known_errorHandler = 4; +	else +	    *known_errorHandler = 0; +    } +    switch (*known_errorHandler) { +	case 1: /* strict */ +	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); +	    return -1; +	case 2: /* replace */ +	    for (collpos = collstartpos; collpos<collendpos; ++collpos) { +		x = charmapencode_output('?', mapping, res, respos); +		if (x==NULL) { +		    return -1; +		} +		else if (x==Py_None) { +		    Py_DECREF(x); +		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); +		    return -1; +		} +		Py_DECREF(x); +	    } +	    /* fall through */ +	case 3: /* ignore */ +	    *inpos = collendpos; +	    break; +	case 4: /* xmlcharrefreplace */ +	    /* generate replacement (temporarily (mis)uses p) */ +	    for (collpos = collstartpos; collpos < collendpos; ++collpos) { +		char buffer[2+29+1+1]; +		char *cp; +		sprintf(buffer, "&#%d;", (int)p[collpos]); +		for (cp = buffer; *cp; ++cp) { +		    x = charmapencode_output(*cp, mapping, res, respos); +		    if (x==NULL) +			
return -1; +		    else if (x==Py_None) { +			Py_DECREF(x); +			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); +			return -1; +		    } +		    Py_DECREF(x); +		} +	    } +	    *inpos = collendpos; +	    break; +	default: +	    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, +		encoding, reason, p, size, exceptionObject, +		collstartpos, collendpos, &newpos); +	    if (repunicode == NULL) +		return -1; +	    /* generate replacement  */ +	    repsize = PyUnicode_GET_SIZE(repunicode); +	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { +		x = charmapencode_output(*uni2, mapping, res, respos); +		if (x==NULL) { +		    Py_DECREF(repunicode); +		    return -1; +		} +		else if (x==Py_None) { +		    Py_DECREF(repunicode); +		    Py_DECREF(x); +		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); +		    return -1; +		} +		Py_DECREF(x); +	    } +	    *inpos = newpos; +	    Py_DECREF(repunicode);      } +    return 0;  }  PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, @@ -2520,101 +2939,62 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,  				  PyObject *mapping,  				  const char *errors)  { -    PyObject *v; -    char *s; -    int extrachars = 0; +    /* output object */ +    PyObject *res = NULL; +    /* current input position */ +    int inpos = 0; +    /* current output position */ +    int respos = 0; +    PyObject *errorHandler = NULL; +    PyObject *exc = NULL; +    /* the following variable is used for caching string comparisons +     * -1=not initialized, 0=unknown, 1=strict, 2=replace, +     * 3=ignore, 4=xmlcharrefreplace */ +    int known_errorHandler = -1;      /* Default to Latin-1 */      if (mapping == NULL)  	return PyUnicode_EncodeLatin1(p, size, errors); -    v = PyString_FromStringAndSize(NULL, size); -    if (v == NULL) -        return NULL; +    /* allocate enough for a simple encoding without +       
replacements, if we need more, we'll resize */ +    res = PyString_FromStringAndSize(NULL, size); +    if (res == NULL) +        goto onError;      if (size == 0) -	return v; -    s = PyString_AS_STRING(v); -    while (size-- > 0) { -	Py_UNICODE ch = *p++; -	PyObject *w, *x; +	return res; -	/* Get mapping (Unicode ordinal -> string char, integer or None) */ -	w = PyInt_FromLong((long)ch); -	if (w == NULL) +    while (inpos<size) { +	/* try to encode it */ +	PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos); +	if (x==NULL) /* error */  	    goto onError; -	x = PyObject_GetItem(mapping, w); -	Py_DECREF(w); -	if (x == NULL) { -	    if (PyErr_ExceptionMatches(PyExc_LookupError)) { -		/* No mapping found means: mapping is undefined. */ -		PyErr_Clear(); -		x = Py_None; -		Py_INCREF(x); -	    } else +	if (x==Py_None) { /* unencodable character */ +	    if (charmap_encoding_error(p, size, &inpos, mapping, +		&exc, +		&known_errorHandler, errorHandler, errors, +		&res, &respos))  		goto onError;  	} +	else +	    /* done with this character => adjust input position */ +	    ++inpos; +	Py_DECREF(x); +    } -	/* Apply mapping */ -	if (PyInt_Check(x)) { -	    long value = PyInt_AS_LONG(x); -	    if (value < 0 || value > 255) { -		PyErr_SetString(PyExc_TypeError, -				"character mapping must be in range(256)"); -		Py_DECREF(x); -		goto onError; -	    } -	    *s++ = (char)value; -	} -	else if (x == Py_None) { -	    /* undefined mapping */ -	    if (charmap_encoding_error(&p, &s, errors,  -				       "character maps to <undefined>")) { -		Py_DECREF(x); -		goto onError; -	    } -	} -	else if (PyString_Check(x)) { -	    int targetsize = PyString_GET_SIZE(x); - -	    if (targetsize == 1) -		/* 1-1 mapping */ -		*s++ = *PyString_AS_STRING(x); - -	    else if (targetsize > 1) { -		/* 1-n mapping */ -		if (targetsize > extrachars) { -		    /* resize first */ -		    int oldpos = (int)(s - PyString_AS_STRING(v)); -		    int needed = (targetsize - extrachars) + \ -			   
      (targetsize << 2); -		    extrachars += needed; -		    if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) { -			Py_DECREF(x); -			goto onError; -		    } -		    s = PyString_AS_STRING(v) + oldpos; -		} -		memcpy(s, PyString_AS_STRING(x), targetsize); -		s += targetsize; -		extrachars -= targetsize; -	    } -	    /* 1-0 mapping: skip the character */ -	} -	else { -	    /* wrong return value */ -	    PyErr_SetString(PyExc_TypeError, -		  "character mapping must return integer, None or unicode"); -	    Py_DECREF(x); +    /* Resize if we allocated to much */ +    if (respos<PyString_GET_SIZE(res)) { +	if (_PyString_Resize(&res, respos))  	    goto onError; -	} -	Py_DECREF(x);      } -    if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v)) -	_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))); -    return v; +    Py_XDECREF(exc); +    Py_XDECREF(errorHandler); +    return res; - onError: -    Py_XDECREF(v); +    onError: +    Py_XDECREF(res); +    Py_XDECREF(exc); +    Py_XDECREF(errorHandler);      return NULL;  } @@ -2631,115 +3011,344 @@ PyObject *PyUnicode_AsCharmapString(PyObject *unicode,  				   NULL);  } +/* create or adjust a UnicodeTranslateError */ +static void make_translate_exception(PyObject **exceptionObject, +    const Py_UNICODE *unicode, int size, +    int startpos, int endpos, +    const char *reason) +{ +    if (*exceptionObject == NULL) { +    	*exceptionObject = PyUnicodeTranslateError_Create( +	    unicode, size, startpos, endpos, reason); +    } +    else { +	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) +	    goto onError; +	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) +	    goto onError; +	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) +	    goto onError; +	return; +	onError: +	Py_DECREF(*exceptionObject); +	*exceptionObject = NULL; +    } +} + +/* raises a UnicodeTranslateError */ +static void raise_translate_exception(PyObject **exceptionObject, +    const Py_UNICODE 
*unicode, int size, +    int startpos, int endpos, +    const char *reason) +{ +    make_translate_exception(exceptionObject, +	unicode, size, startpos, endpos, reason); +    if (*exceptionObject != NULL) +	PyCodec_StrictErrors(*exceptionObject); +} + +/* error handling callback helper: +   build arguments, call the callback and check the arguments, +   put the result into newpos and return the replacement string, which +   has to be freed by the caller */ +static PyObject *unicode_translate_call_errorhandler(const char *errors, +    PyObject **errorHandler, +    const char *reason, +    const Py_UNICODE *unicode, int size, PyObject **exceptionObject, +    int startpos, int endpos, +    int *newpos) +{ +    static char *argparse = "O!i;translating error handler must return (unicode, int) tuple"; + +    PyObject *restuple; +    PyObject *resunicode; + +    if (*errorHandler == NULL) { +	*errorHandler = PyCodec_LookupError(errors); +        if (*errorHandler == NULL) +	    return NULL; +    } + +    make_translate_exception(exceptionObject, +	unicode, size, startpos, endpos, reason); +    if (*exceptionObject == NULL) +	return NULL; + +    restuple = PyObject_CallFunctionObjArgs( +	*errorHandler, *exceptionObject, NULL); +    if (restuple == NULL) +	return NULL; +    if (!PyTuple_Check(restuple)) { +	PyErr_Format(PyExc_TypeError, &argparse[4]); +	Py_DECREF(restuple); +	return NULL; +    } +    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, +	&resunicode, newpos)) { +	Py_DECREF(restuple); +	return NULL; +    } +    if (*newpos<0) +	*newpos = 0; +    else if (*newpos>size) +	*newpos = size; +    Py_INCREF(resunicode); +    Py_DECREF(restuple); +    return resunicode; +} + +/* Lookup the character ch in the mapping and put the result in result, +   which must be decrefed by the caller. 
+   Return 0 on success, -1 on error */  static -int translate_error(const Py_UNICODE **source, -		    Py_UNICODE **dest, -		    const char *errors, -		    const char *details)  -{ -    if ((errors == NULL) || -	(strcmp(errors,"strict") == 0)) { -	PyErr_Format(PyExc_UnicodeError, -		     "translate error: %.400s", -		     details); -	return -1; +int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) +{ +    PyObject *w = PyInt_FromLong((long)c); +    PyObject *x; + +    if (w == NULL) +	 return -1; +    x = PyObject_GetItem(mapping, w); +    Py_DECREF(w); +    if (x == NULL) { +	if (PyErr_ExceptionMatches(PyExc_LookupError)) { +	    /* No mapping found means: use 1:1 mapping. */ +	    PyErr_Clear(); +	    *result = NULL; +	    return 0; +	} else +	    return -1;      } -    else if (strcmp(errors,"ignore") == 0) { +    else if (x == Py_None) { +	*result = x;  	return 0;      } -    else if (strcmp(errors,"replace") == 0) { -	**dest = '?'; -	(*dest)++; +    else if (PyInt_Check(x)) { +	long value = PyInt_AS_LONG(x); +	long max = PyUnicode_GetMax(); +	if (value < 0 || value > max) { +	    PyErr_Format(PyExc_TypeError, +			     "character mapping must be in range(0x%lx)", max+1); +	    Py_DECREF(x); +	    return -1; +	} +	*result = x; +	return 0; +    } +    else if (PyUnicode_Check(x)) { +	*result = x;  	return 0;      }      else { -	PyErr_Format(PyExc_ValueError, -		     "translate error; " -		     "unknown error handling code: %.400s", -		     errors); +	/* wrong return value */ +	PyErr_SetString(PyExc_TypeError, +	      "character mapping must return integer, None or unicode"); +	return -1; +    } +} +/* ensure that *outobj is at least requiredsize characters long, +if not reallocate and adjust various state variables. 
+Return 0 on success, -1 on error */ +static +int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize, +    int requiredsize) +{ +    if (requiredsize > *outsize) { +	/* remember old output position */ +	int outpos = *outp-PyUnicode_AS_UNICODE(*outobj); +	/* exponentially overallocate to minimize reallocations */ +	if (requiredsize < 2 * *outsize) +	    requiredsize = 2 * *outsize; +	if (_PyUnicode_Resize(outobj, requiredsize)) +	    return -1; +	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos; +	*outsize = requiredsize; +    } +    return 0; +} +/* lookup the character, put the result in the output string and adjust +   various state variables. Return a new reference to the object that +   was put in the output buffer in *result, or Py_None, if the mapping was +   undefined (in which case no character was written). +   The called must decref result. +   Return 0 on success, -1 on error. */ +static +int charmaptranslate_output(Py_UNICODE c, PyObject *mapping, +    PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res) +{ +    if (charmaptranslate_lookup(c, mapping, res))  	return -1; +    if (*res==NULL) { +	/* not found => default to 1:1 mapping */ +	*(*outp)++ = (Py_UNICODE)c; +    } +    else if (*res==Py_None) +	; +    else if (PyInt_Check(*res)) { +	/* no overflow check, because we know that the space is enough */ +	*(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); +    } +    else if (PyUnicode_Check(*res)) { +	int repsize = PyUnicode_GET_SIZE(*res); +	if (repsize==1) { +	    /* no overflow check, because we know that the space is enough */ +	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res); +	} +	else if (repsize!=0) { +	    /* more than one character */ +	    int requiredsize = *outsize + repsize - 1; +	    if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize)) +		return -1; +	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); +	    *outp += repsize; +	}      } +    else +	return -1; +    
return 0;  } -PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s, +PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,  				     int size,  				     PyObject *mapping,  				     const char *errors)  { -    PyUnicodeObject *v; -    Py_UNICODE *p; -     +    /* output object */ +    PyObject *res = NULL; +    /* pointers to the beginning and end+1 of input */ +    const Py_UNICODE *startp = p; +    const Py_UNICODE *endp = p + size; +    /* pointer into the output */ +    Py_UNICODE *str; +    /* current output position */ +    int respos = 0; +    int ressize; +    char *reason = "character maps to <undefined>"; +    PyObject *errorHandler = NULL; +    PyObject *exc = NULL; +    /* the following variable is used for caching string comparisons +     * -1=not initialized, 0=unknown, 1=strict, 2=replace, +     * 3=ignore, 4=xmlcharrefreplace */ +    int known_errorHandler = -1; +      if (mapping == NULL) {  	PyErr_BadArgument();  	return NULL;      } -     -    /* Output will never be longer than input */ -    v = _PyUnicode_New(size); -    if (v == NULL) -	goto onError; -    if (size == 0) -	goto done; -    p = PyUnicode_AS_UNICODE(v); -    while (size-- > 0) { -	Py_UNICODE ch = *s++; -	PyObject *w, *x; -	/* Get mapping */ -	w = PyInt_FromLong(ch); -	if (w == NULL) -	    goto onError; -	x = PyObject_GetItem(mapping, w); -	Py_DECREF(w); -	if (x == NULL) { -	    if (PyErr_ExceptionMatches(PyExc_LookupError)) { -		/* No mapping found: default to 1-1 mapping */ -		PyErr_Clear(); -		*p++ = ch; -		continue; -	    } +    /* allocate enough for a simple 1:1 translation without +       replacements, if we need more, we'll resize */ +    res = PyUnicode_FromUnicode(NULL, size); +    if (res == NULL) +        goto onError; +    if (size == 0) +	return res; +    str = PyUnicode_AS_UNICODE(res); +    ressize = size; + +    while (p<endp) { +	/* try to encode it */ +	PyObject *x = NULL; +	if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) { +	    
Py_XDECREF(x);  	    goto onError;  	} - -	/* Apply mapping */ -	if (PyInt_Check(x)) -	    *p++ = (Py_UNICODE)PyInt_AS_LONG(x); -	else if (x == Py_None) { -	    /* undefined mapping */ -	    if (translate_error(&s, &p, errors,  -				"character maps to <undefined>")) { -		Py_DECREF(x); -		goto onError; +	if (x!=Py_None) /* it worked => adjust input pointer */ +	    ++p; +	else { /* untranslatable character */ +	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ +	    int repsize; +	    int newpos; +	    Py_UNICODE *uni2; +	    /* startpos for collecting untranslatable chars */ +	    const Py_UNICODE *collstart = p; +	    const Py_UNICODE *collend = p+1; +	    const Py_UNICODE *coll; + +	    Py_XDECREF(x); +	    /* find all untranslatable characters */ +	    while (collend < endp) { +	    	if (charmaptranslate_lookup(*collend, mapping, &x)) +		    goto onError; +		Py_XDECREF(x); +		if (x!=Py_None) +		    break; +		++collend;  	    } -	} -	else if (PyUnicode_Check(x)) { -	    if (PyUnicode_GET_SIZE(x) != 1) { -		/* 1-n mapping */ -		PyErr_SetString(PyExc_NotImplementedError, -				"1-n mappings are currently not implemented"); -		Py_DECREF(x); -		goto onError; +	    /* cache callback name lookup +	     * (if not done yet, i.e. 
it's the first error) */ +	    if (known_errorHandler==-1) { +		if ((errors==NULL) || (!strcmp(errors, "strict"))) +		    known_errorHandler = 1; +		else if (!strcmp(errors, "replace")) +		    known_errorHandler = 2; +		else if (!strcmp(errors, "ignore")) +		    known_errorHandler = 3; +		else if (!strcmp(errors, "xmlcharrefreplace")) +		    known_errorHandler = 4; +		else +		    known_errorHandler = 0; +	    } +	    switch (known_errorHandler) { +		case 1: /* strict */ +		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); +		    goto onError; +		case 2: /* replace */ +		    /* No need to check for space, this is a 1:1 replacement */ +		    for (coll = collstart; coll<collend; ++coll) +			*str++ = '?'; +		    /* fall through */ +		case 3: /* ignore */ +		    p = collend; +		    break; +		case 4: /* xmlcharrefreplace */ +		    /* generate replacement (temporarily (mis)uses p) */ +		    for (p = collstart; p < collend; ++p) { +			char buffer[2+29+1+1]; +			char *cp; +			sprintf(buffer, "&#%d;", (int)*p); +			if (charmaptranslate_makespace(&res, &str, &ressize, +			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) +			    goto onError; +			for (cp = buffer; *cp; ++cp) +			    *str++ = *cp; +		    } +		    p = collend; +		    break; +		default: +		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, +			reason, startp, size, &exc, +			collstart-startp, collend-startp, &newpos); +		    if (repunicode == NULL) +			goto onError; +		    /* generate replacement  */ +		    repsize = PyUnicode_GET_SIZE(repunicode); +		    if (charmaptranslate_makespace(&res, &str, &ressize, +			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { +			Py_DECREF(repunicode); +			goto onError; +		    } +		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) +			*str++ = *uni2; +		    p = startp + newpos; +		    Py_DECREF(repunicode);  	    } -	    *p++ = *PyUnicode_AS_UNICODE(x); -	} -	else 
{ -	    /* wrong return value */ -	    PyErr_SetString(PyExc_TypeError, -		  "translate mapping must return integer, None or unicode"); -	    Py_DECREF(x); -	    goto onError;  	} -	Py_DECREF(x);      } -    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) -	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) +    /* Resize if we allocated to much */ +    respos = str-PyUnicode_AS_UNICODE(res); +    if (respos<ressize) { +	if (_PyUnicode_Resize(&res, respos))  	    goto onError; +    } +    Py_XDECREF(exc); +    Py_XDECREF(errorHandler); +    return res; - done: -    return (PyObject *)v; -     - onError: -    Py_XDECREF(v); +    onError: +    Py_XDECREF(res); +    Py_XDECREF(exc); +    Py_XDECREF(errorHandler);      return NULL;  } @@ -2772,6 +3381,13 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,  			    const char *errors)  {      Py_UNICODE *p, *end; +    PyObject *errorHandler = NULL; +    PyObject *exc = NULL; +    const char *encoding = "decimal"; +    const char *reason = "invalid decimal Unicode string"; +    /* the following variable is used for caching string comparisons +     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ +    int known_errorHandler = -1;      if (output == NULL) {  	PyErr_BadArgument(); @@ -2781,40 +3397,110 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,      p = s;      end = s + length;      while (p < end) { -	register Py_UNICODE ch = *p++; +	register Py_UNICODE ch = *p;  	int decimal; +	PyObject *repunicode; +	int repsize; +	int newpos; +	Py_UNICODE *uni2; +	Py_UNICODE *collstart; +	Py_UNICODE *collend;  	if (Py_UNICODE_ISSPACE(ch)) {  	    *output++ = ' '; +	    ++p;  	    continue;  	}  	decimal = Py_UNICODE_TODECIMAL(ch);  	if (decimal >= 0) {  	    *output++ = '0' + decimal; +	    ++p;  	    continue;  	}  	if (0 < ch && ch < 256) {  	    *output++ = (char)ch; +	    ++p;  	    continue;  	} -	/* All other characters are considered invalid */ -	if (errors == NULL || 
strcmp(errors, "strict") == 0) { -	    PyErr_SetString(PyExc_ValueError, -			    "invalid decimal Unicode string"); -	    goto onError; +	/* All other characters are considered unencodable */ +	collstart = p; +	collend = p+1; +	while (collend < end) { +	    if ((0 < *collend && *collend < 256) || +	        !Py_UNICODE_ISSPACE(*collend) || +	        Py_UNICODE_TODECIMAL(*collend)) +		break;  	} -	else if (strcmp(errors, "ignore") == 0) -	    continue; -	else if (strcmp(errors, "replace") == 0) { -	    *output++ = '?'; -	    continue; +	/* cache callback name lookup +	 * (if not done yet, i.e. it's the first error) */ +	if (known_errorHandler==-1) { +	    if ((errors==NULL) || (!strcmp(errors, "strict"))) +		known_errorHandler = 1; +	    else if (!strcmp(errors, "replace")) +		known_errorHandler = 2; +	    else if (!strcmp(errors, "ignore")) +		known_errorHandler = 3; +	    else if (!strcmp(errors, "xmlcharrefreplace")) +		known_errorHandler = 4; +	    else +		known_errorHandler = 0; +	} +	switch (known_errorHandler) { +	    case 1: /* strict */ +		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); +		goto onError; +	    case 2: /* replace */ +		for (p = collstart; p < collend; ++p) +		    *output++ = '?'; +		/* fall through */ +	    case 3: /* ignore */ +		p = collend; +		break; +	    case 4: /* xmlcharrefreplace */ +		/* generate replacement (temporarily (mis)uses p) */ +		for (p = collstart; p < collend; ++p) +		    output += sprintf(output, "&#%d;", (int)*p); +		p = collend; +		break; +	    default: +		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, +		    encoding, reason, s, length, &exc, +		    collstart-s, collend-s, &newpos); +		if (repunicode == NULL) +		    goto onError; +		/* generate replacement  */ +		repsize = PyUnicode_GET_SIZE(repunicode); +		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { +		    Py_UNICODE ch = *uni2; +		    if (Py_UNICODE_ISSPACE(ch)) +			*output++ = ' '; +		 
   else { +			decimal = Py_UNICODE_TODECIMAL(ch); +			if (decimal >= 0) +			    *output++ = '0' + decimal; +			else if (0 < ch && ch < 256) +			    *output++ = (char)ch; +			else { +			    Py_DECREF(repunicode); +			    raise_encode_exception(&exc, encoding, +				s, length, collstart-s, collend-s, reason); +			    goto onError; +			} +		    } +		} +		p = s + newpos; +		Py_DECREF(repunicode);  	}      }      /* 0-terminate the output string */      *output++ = '\0'; +    Py_XDECREF(exc); +    Py_XDECREF(errorHandler);      return 0;   onError: +    Py_XDECREF(exc); +    Py_XDECREF(errorHandler);      return -1;  } @@ -3927,7 +4613,9 @@ PyDoc_STRVAR(encode__doc__,  Return an encoded string version of S. Default encoding is the current\n\  default string encoding. errors may be given to set a different error\n\  handling scheme. Default is 'strict' meaning that encoding errors raise\n\ -a ValueError. Other possible values are 'ignore' and 'replace'."); +a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ +'xmlcharrefreplace' as well as any other name registered with\n\ +codecs.register_error that can handle UnicodeEncodeErrors.");  static PyObject *  unicode_encode(PyUnicodeObject *self, PyObject *args) diff --git a/Python/codecs.c b/Python/codecs.c index 3e54d8f920..09cba7516c 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -422,12 +422,409 @@ PyObject *PyCodec_Decode(PyObject *object,      return NULL;  } +static PyObject *_PyCodec_ErrorRegistry; + +/* Register the error handling callback function error under the name +   name. This function will be called by the codec when it encounters +   an unencodable characters/undecodable bytes and doesn't know the +   callback name, when name is specified as the error parameter +   in the call to the encode/decode function. 
+   Return 0 on success, -1 on error */ +int PyCodec_RegisterError(const char *name, PyObject *error) +{ +    if (!PyCallable_Check(error)) { +	PyErr_SetString(PyExc_TypeError, "handler must be callable"); +	return -1; +    } +    return PyDict_SetItemString( _PyCodec_ErrorRegistry, (char *)name, error); +} + +/* Lookup the error handling callback function registered under the +   name error. As a special case NULL can be passed, in which case +   the error handling callback for strict encoding will be returned. */ +PyObject *PyCodec_LookupError(const char *name) +{ +    PyObject *handler = NULL; + +    if (name==NULL) +	name = "strict"; +    handler = PyDict_GetItemString(_PyCodec_ErrorRegistry, (char *)name); +    if (!handler) +	PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name); +    else +	Py_INCREF(handler); +    return handler; +} + +static void wrong_exception_type(PyObject *exc) +{ +    PyObject *type = PyObject_GetAttrString(exc, "__class__"); +    if (type != NULL) { +	PyObject *name = PyObject_GetAttrString(type, "__name__"); +	Py_DECREF(type); +	if (name != NULL) { +	    PyObject *string = PyObject_Str(name); +	    Py_DECREF(name); +	    PyErr_Format(PyExc_TypeError, "don't know how to handle %.400s in error callback", +		PyString_AS_STRING(string)); +	    Py_DECREF(string); +	} +    } +} + +PyObject *PyCodec_StrictErrors(PyObject *exc) +{ +    if (PyInstance_Check(exc)) +	PyErr_SetObject((PyObject*)((PyInstanceObject*)exc)->in_class, +	    exc); +    else +	PyErr_SetString(PyExc_TypeError, "codec must pass exception instance"); +    return NULL; +} + + +PyObject *PyCodec_IgnoreErrors(PyObject *exc) +{ +    int end; +    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { +	if (PyUnicodeEncodeError_GetEnd(exc, &end)) +	    return NULL; +    } +    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { +	if (PyUnicodeDecodeError_GetEnd(exc, &end)) +	    return NULL; +    } +    else if (PyObject_IsInstance(exc, 
PyExc_UnicodeTranslateError)) { +	if (PyUnicodeTranslateError_GetEnd(exc, &end)) +	    return NULL; +    } +    else { +	wrong_exception_type(exc); +	return NULL; +    } +    /* ouch: passing NULL, 0, pos gives None instead of u'' */ +    return Py_BuildValue("(u#i)", &end, 0, end); +} + + +PyObject *PyCodec_ReplaceErrors(PyObject *exc) +{ +    PyObject *restuple; +    int start; +    int end; +    int i; + +    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { +	PyObject *res; +	Py_UNICODE *p; +	if (PyUnicodeEncodeError_GetStart(exc, &start)) +	    return NULL; +	if (PyUnicodeEncodeError_GetEnd(exc, &end)) +	    return NULL; +	res = PyUnicode_FromUnicode(NULL, end-start); +	if (res == NULL) +	    return NULL; +	for (p = PyUnicode_AS_UNICODE(res), i = start; +	    i<end; ++p, ++i) +	    *p = '?'; +	restuple = Py_BuildValue("(Oi)", res, end); +	Py_DECREF(res); +	return restuple; +    } +    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { +	Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER; +	if (PyUnicodeDecodeError_GetEnd(exc, &end)) +	    return NULL; +	return Py_BuildValue("(u#i)", &res, 1, end); +    } +    else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) { +	PyObject *res; +	Py_UNICODE *p; +	if (PyUnicodeTranslateError_GetStart(exc, &start)) +	    return NULL; +	if (PyUnicodeTranslateError_GetEnd(exc, &end)) +	    return NULL; +	res = PyUnicode_FromUnicode(NULL, end-start); +	if (res == NULL) +	    return NULL; +	for (p = PyUnicode_AS_UNICODE(res), i = start; +	    i<end; ++p, ++i) +	    *p = Py_UNICODE_REPLACEMENT_CHARACTER; +	restuple = Py_BuildValue("(Oi)", res, end); +	Py_DECREF(res); +	return restuple; +    } +    else { +	wrong_exception_type(exc); +	return NULL; +    } +} + +PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) +{ +    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { +	PyObject *restuple; +	PyObject *object; +	int start; +	int end; +	PyObject *res; +	Py_UNICODE *p; +	Py_UNICODE *startp; +	
Py_UNICODE *outp; +	int ressize; +	if (PyUnicodeEncodeError_GetStart(exc, &start)) +	    return NULL; +	if (PyUnicodeEncodeError_GetEnd(exc, &end)) +	    return NULL; +	if (!(object = PyUnicodeEncodeError_GetObject(exc))) +	    return NULL; +	startp = PyUnicode_AS_UNICODE(object); +	for (p = startp+start, ressize = 0; p < startp+end; ++p) { +	    if (*p<10) +		ressize += 2+1+1; +	    else if (*p<100) +		ressize += 2+2+1; +	    else if (*p<1000) +		ressize += 2+3+1; +	    else if (*p<10000) +		ressize += 2+4+1; +	    else if (*p<100000) +		ressize += 2+5+1; +	    else if (*p<1000000) +		ressize += 2+6+1; +	    else +		ressize += 2+7+1; +	} +	/* allocate replacement */ +	res = PyUnicode_FromUnicode(NULL, ressize); +	if (res == NULL) { +	    Py_DECREF(object); +	    return NULL; +	} +	/* generate replacement */ +	for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); +	    p < startp+end; ++p) { +	    Py_UNICODE c = *p; +	    int digits; +	    int base; +	    *outp++ = '&'; +	    *outp++ = '#'; +	    if (*p<10) { +		digits = 1; +		base = 1; +	    } +	    else if (*p<100) { +		digits = 2; +		base = 10; +	    } +	    else if (*p<1000) { +		digits = 3; +		base = 100; +	    } +	    else if (*p<10000) { +		digits = 4; +		base = 1000; +	    } +	    else if (*p<100000) { +		digits = 5; +		base = 10000; +	    } +	    else if (*p<1000000) { +		digits = 6; +		base = 100000; +	    } +	    else { +		digits = 7; +		base = 1000000; +	    } +	    while (digits-->0) { +		*outp++ = '0' + c/base; +		c %= base; +		base /= 10; +	    } +	    *outp++ = ';'; +	} +	restuple = Py_BuildValue("(Oi)", res, end); +	Py_DECREF(res); +	Py_DECREF(object); +	return restuple; +    } +    else { +	wrong_exception_type(exc); +	return NULL; +    } +} + +static Py_UNICODE hexdigits[] = { +    '0', '1', '2', '3', '4', '5', '6', '7', +    '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' +}; + +PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) +{ +    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { +	
PyObject *restuple; +	PyObject *object; +	int start; +	int end; +	PyObject *res; +	Py_UNICODE *p; +	Py_UNICODE *startp; +	Py_UNICODE *outp; +	int ressize; +	if (PyUnicodeEncodeError_GetStart(exc, &start)) +	    return NULL; +	if (PyUnicodeEncodeError_GetEnd(exc, &end)) +	    return NULL; +	if (!(object = PyUnicodeEncodeError_GetObject(exc))) +	    return NULL; +	startp = PyUnicode_AS_UNICODE(object); +	for (p = startp+start, ressize = 0; p < startp+end; ++p) { +	    if (*p >= 0x00010000) +		ressize += 1+1+8; +	    else if (*p >= 0x100) { +		ressize += 1+1+4; +	    } +	    else +		ressize += 1+1+2; +	} +	res = PyUnicode_FromUnicode(NULL, ressize); +	if (res==NULL) +	    return NULL; +	for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); +	    p < startp+end; ++p) { +	    Py_UNICODE c = *p; +	    *outp++ = '\\'; +	    if (c >= 0x00010000) { +		*outp++ = 'U'; +		*outp++ = hexdigits[(c>>28)&0xf]; +		*outp++ = hexdigits[(c>>24)&0xf]; +		*outp++ = hexdigits[(c>>20)&0xf]; +		*outp++ = hexdigits[(c>>16)&0xf]; +		*outp++ = hexdigits[(c>>12)&0xf]; +		*outp++ = hexdigits[(c>>8)&0xf]; +	    } +	    else if (c >= 0x100) { +		*outp++ = 'u'; +		*outp++ = hexdigits[(c>>12)&0xf]; +		*outp++ = hexdigits[(c>>8)&0xf]; +	    } +	    else +		*outp++ = 'x'; +	    *outp++ = hexdigits[(c>>4)&0xf]; +	    *outp++ = hexdigits[c&0xf]; +	} + +	restuple = Py_BuildValue("(Oi)", res, end); +	Py_DECREF(res); +	Py_DECREF(object); +	return restuple; +    } +    else { +	wrong_exception_type(exc); +	return NULL; +    } +} + +static PyObject *strict_errors(PyObject *self, PyObject *exc) +{ +    return PyCodec_StrictErrors(exc); +} + + +static PyObject *ignore_errors(PyObject *self, PyObject *exc) +{ +    return PyCodec_IgnoreErrors(exc); +} + + +static PyObject *replace_errors(PyObject *self, PyObject *exc) +{ +    return PyCodec_ReplaceErrors(exc); +} + + +static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc) +{ +    return PyCodec_XMLCharRefReplaceErrors(exc); +} + + +static 
PyObject *backslashreplace_errors(PyObject *self, PyObject *exc) +{ +    return PyCodec_BackslashReplaceErrors(exc); +} + +  void _PyCodecRegistry_Init(void)  { +    static struct { +	char *name; +	PyMethodDef def; +    } methods[] = +    { +	{ +	    "strict", +	    { +		"strict_errors", +		strict_errors, +		METH_O +	    } +	}, +	{ +	    "ignore", +	    { +		"ignore_errors", +		ignore_errors, +		METH_O +	    } +	}, +	{ +	    "replace", +	    { +		"replace_errors", +		replace_errors, +		METH_O +	    } +	}, +	{ +	    "xmlcharrefreplace", +	    { +		"xmlcharrefreplace_errors", +		xmlcharrefreplace_errors, +		METH_O +	    } +	}, +	{ +	    "backslashreplace", +	    { +		"backslashreplace_errors", +		backslashreplace_errors, +		METH_O +	    } +	} +    };      if (_PyCodec_SearchPath == NULL)  	_PyCodec_SearchPath = PyList_New(0);      if (_PyCodec_SearchCache == NULL)  	_PyCodec_SearchCache = PyDict_New(); +    if (_PyCodec_ErrorRegistry == NULL) { +	int i; +	_PyCodec_ErrorRegistry = PyDict_New(); + +	if (_PyCodec_ErrorRegistry) { +	    for (i = 0; i < 5; ++i) { +		PyObject *func = PyCFunction_New(&methods[i].def, NULL); +		int res; +		if (!func) +		    Py_FatalError("can't initialize codec error registry"); +		res = PyCodec_RegisterError(methods[i].name, func); +		Py_DECREF(func); +		if (res) +		    Py_FatalError("can't initialize codec error registry"); +	    } +	} +    }      if (_PyCodec_SearchPath == NULL ||   	_PyCodec_SearchCache == NULL)  	Py_FatalError("can't initialize codec registry"); @@ -439,4 +836,6 @@ void _PyCodecRegistry_Fini(void)      _PyCodec_SearchPath = NULL;      Py_XDECREF(_PyCodec_SearchCache);      _PyCodec_SearchCache = NULL; +    Py_XDECREF(_PyCodec_ErrorRegistry); +    _PyCodec_ErrorRegistry = NULL;  } diff --git a/Python/exceptions.c b/Python/exceptions.c index c4bd626ded..1667cd9b66 100644 --- a/Python/exceptions.c +++ b/Python/exceptions.c @@ -100,6 +100,10 @@ Exception\n\   |    +-- ValueError\n\   |    |    |\n\   |    |    +-- 
UnicodeError\n\ + |    |        |\n\ + |    |        +-- UnicodeEncodeError\n\ + |    |        +-- UnicodeDecodeError\n\ + |    |        +-- UnicodeTranslateError\n\   |    |\n\   |    +-- ReferenceError\n\   |    +-- SystemError\n\ @@ -840,6 +844,590 @@ static PyMethodDef SyntaxError_methods[] = {  }; +static +int get_int(PyObject *exc, const char *name, int *value) +{ +    PyObject *attr = PyObject_GetAttrString(exc, (char *)name); + +    if (!attr) +	return -1; +    if (!PyInt_Check(attr)) { +	PyErr_Format(PyExc_TypeError, "%s attribute must be int", name); +	Py_DECREF(attr); +	return -1; +    } +    *value = PyInt_AS_LONG(attr); +    Py_DECREF(attr); +    return 0; +} + + +static +int set_int(PyObject *exc, const char *name, int value) +{ +    PyObject *obj = PyInt_FromLong(value); +    int result; + +    if (!obj) +	return -1; +    result = PyObject_SetAttrString(exc, (char *)name, obj); +    Py_DECREF(obj); +    return result; +} + + +static +PyObject *get_string(PyObject *exc, const char *name) +{ +    PyObject *attr = PyObject_GetAttrString(exc, (char *)name); + +    if (!attr) +	return NULL; +    if (!PyString_Check(attr)) { +	PyErr_Format(PyExc_TypeError, "%s attribute must be str", name); +	Py_DECREF(attr); +	return NULL; +    } +    return attr; +} + + +static +int set_string(PyObject *exc, const char *name, const char *value) +{ +    PyObject *obj = PyString_FromString(value); +    int result; + +    if (!obj) +	return -1; +    result = PyObject_SetAttrString(exc, (char *)name, obj); +    Py_DECREF(obj); +    return result; +} + + +static +PyObject *get_unicode(PyObject *exc, const char *name) +{ +    PyObject *attr = PyObject_GetAttrString(exc, (char *)name); + +    if (!attr) +	return NULL; +    if (!PyUnicode_Check(attr)) { +	PyErr_Format(PyExc_TypeError, "%s attribute must be unicode", name); +	Py_DECREF(attr); +	return NULL; +    } +    return attr; +} + +PyObject * PyUnicodeEncodeError_GetEncoding(PyObject *exc) +{ +    return get_string(exc, 
"encoding"); +} + +PyObject * PyUnicodeDecodeError_GetEncoding(PyObject *exc) +{ +    return get_string(exc, "encoding"); +} + +PyObject * PyUnicodeTranslateError_GetEncoding(PyObject *exc) +{ +    return get_string(exc, "encoding"); +} + +PyObject *PyUnicodeEncodeError_GetObject(PyObject *exc) +{ +    return get_unicode(exc, "object"); +} + +PyObject *PyUnicodeDecodeError_GetObject(PyObject *exc) +{ +    return get_string(exc, "object"); +} + +PyObject *PyUnicodeTranslateError_GetObject(PyObject *exc) +{ +    return get_unicode(exc, "object"); +} + +int PyUnicodeEncodeError_GetStart(PyObject *exc, int *start) +{ +    if (!get_int(exc, "start", start)) { +	PyObject *object = PyUnicodeEncodeError_GetObject(exc); +	int size; +	if (!object) +	    return -1; +	size = PyUnicode_GET_SIZE(object); +	if (*start<0) +	    *start = 0; +	if (*start>=size) +	    *start = size-1; +	Py_DECREF(object); +	return 0; +    } +    return -1; +} + + +int PyUnicodeDecodeError_GetStart(PyObject *exc, int *start) +{ +    if (!get_int(exc, "start", start)) { +	PyObject *object = PyUnicodeDecodeError_GetObject(exc); +	int size; +	if (!object) +	    return -1; +	size = PyString_GET_SIZE(object); +	if (*start<0) +	    *start = 0; +	if (*start>=size) +	    *start = size-1; +	Py_DECREF(object); +	return 0; +    } +    return -1; +} + + +int PyUnicodeTranslateError_GetStart(PyObject *exc, int *start) +{ +    return PyUnicodeEncodeError_GetStart(exc, start); +} + + +int PyUnicodeEncodeError_SetStart(PyObject *exc, int start) +{ +    return set_int(exc, "start", start); +} + + +int PyUnicodeDecodeError_SetStart(PyObject *exc, int start) +{ +    return set_int(exc, "start", start); +} + + +int PyUnicodeTranslateError_SetStart(PyObject *exc, int start) +{ +    return set_int(exc, "start", start); +} + + +int PyUnicodeEncodeError_GetEnd(PyObject *exc, int *end) +{ +    if (!get_int(exc, "end", end)) { +	PyObject *object = PyUnicodeEncodeError_GetObject(exc); +	int size; +	if (!object) +	    return -1; 
+	size = PyUnicode_GET_SIZE(object); +	if (*end<1) +	    *end = 1; +	if (*end>size) +	    *end = size; +	Py_DECREF(object); +	return 0; +    } +    return -1; +} + + +int PyUnicodeDecodeError_GetEnd(PyObject *exc, int *end) +{ +    if (!get_int(exc, "end", end)) { +	PyObject *object = PyUnicodeDecodeError_GetObject(exc); +	int size; +	if (!object) +	    return -1; +	size = PyString_GET_SIZE(object); +	if (*end<1) +	    *end = 1; +	if (*end>size) +	    *end = size; +	Py_DECREF(object); +	return 0; +    } +    return -1; +} + + +int PyUnicodeTranslateError_GetEnd(PyObject *exc, int *start) +{ +    return PyUnicodeEncodeError_GetEnd(exc, start); +} + + +int PyUnicodeEncodeError_SetEnd(PyObject *exc, int end) +{ +    return set_int(exc, "end", end); +} + + +int PyUnicodeDecodeError_SetEnd(PyObject *exc, int end) +{ +    return set_int(exc, "end", end); +} + + +int PyUnicodeTranslateError_SetEnd(PyObject *exc, int end) +{ +    return set_int(exc, "end", end); +} + + +PyObject *PyUnicodeEncodeError_GetReason(PyObject *exc) +{ +    return get_string(exc, "reason"); +} + + +PyObject *PyUnicodeDecodeError_GetReason(PyObject *exc) +{ +    return get_string(exc, "reason"); +} + + +PyObject *PyUnicodeTranslateError_GetReason(PyObject *exc) +{ +    return get_string(exc, "reason"); +} + + +int PyUnicodeEncodeError_SetReason(PyObject *exc, const char *reason) +{ +    return set_string(exc, "reason", reason); +} + + +int PyUnicodeDecodeError_SetReason(PyObject *exc, const char *reason) +{ +    return set_string(exc, "reason", reason); +} + + +int PyUnicodeTranslateError_SetReason(PyObject *exc, const char *reason) +{ +    return set_string(exc, "reason", reason); +} + + +static PyObject * +UnicodeError__init__(PyObject *self, PyObject *args, PyTypeObject *objecttype) +{ +    PyObject *rtnval = NULL; +    PyObject *encoding; +    PyObject *object; +    PyObject *start; +    PyObject *end; +    PyObject *reason; + +    if (!(self = get_self(args))) +	return NULL; + +    if (!(args 
= PySequence_GetSlice(args, 1, PySequence_Size(args)))) +	return NULL; + +    if (!PyArg_ParseTuple(args, "O!O!O!O!O!", +	&PyString_Type, &encoding, +	objecttype, &object, +	&PyInt_Type, &start, +	&PyInt_Type, &end, +	&PyString_Type, &reason)) +	return NULL; + +    if (PyObject_SetAttrString(self, "args", args)) +	goto finally; + +    if (PyObject_SetAttrString(self, "encoding", encoding)) +	goto finally; +    if (PyObject_SetAttrString(self, "object", object)) +	goto finally; +    if (PyObject_SetAttrString(self, "start", start)) +	goto finally; +    if (PyObject_SetAttrString(self, "end", end)) +	goto finally; +    if (PyObject_SetAttrString(self, "reason", reason)) +	goto finally; + +    Py_INCREF(Py_None); +    rtnval = Py_None; + +  finally: +    Py_DECREF(args); +    return rtnval; +} + + +static PyObject * +UnicodeEncodeError__init__(PyObject *self, PyObject *args) +{ +    return UnicodeError__init__(self, args, &PyUnicode_Type); +} + +static PyObject * +UnicodeEncodeError__str__(PyObject *self, PyObject *arg) +{ +    PyObject *encodingObj = NULL; +    PyObject *objectObj = NULL; +    int length; +    int start; +    int end; +    PyObject *reasonObj = NULL; +    char buffer[1000]; +    PyObject *result = NULL; + +    self = arg; + +    if (!(encodingObj = PyUnicodeEncodeError_GetEncoding(self))) +	goto error; + +    if (!(objectObj = PyUnicodeEncodeError_GetObject(self))) +	goto error; + +    length = PyUnicode_GET_SIZE(objectObj); + +    if (PyUnicodeEncodeError_GetStart(self, &start)) +	goto error; + +    if (PyUnicodeEncodeError_GetEnd(self, &end)) +	goto error; + +    if (!(reasonObj = PyUnicodeEncodeError_GetReason(self))) +	goto error; + +    if (end==start+1) { +	PyOS_snprintf(buffer, sizeof(buffer), +	    "'%.400s' codec can't encode character '\\u%x' in position %d: %.400s", +	    PyString_AS_STRING(encodingObj), +	    (int)PyUnicode_AS_UNICODE(objectObj)[start], +	    start, +	    PyString_AS_STRING(reasonObj) +	); +    } +    else { +	
PyOS_snprintf(buffer, sizeof(buffer), +	    "'%.400s' codec can't encode characters in position %d-%d: %.400s", +	    PyString_AS_STRING(encodingObj), +	    start, +	    end-1, +	    PyString_AS_STRING(reasonObj) +	); +    } +    result = PyString_FromString(buffer); + +error: +    Py_XDECREF(reasonObj); +    Py_XDECREF(objectObj); +    Py_XDECREF(encodingObj); +    return result; +} + +static PyMethodDef UnicodeEncodeError_methods[] = { +    {"__init__", UnicodeEncodeError__init__, METH_VARARGS}, +    {"__str__",  UnicodeEncodeError__str__, METH_O}, +    {NULL, NULL} +}; + + +PyObject * PyUnicodeEncodeError_Create( +	const char *encoding, const Py_UNICODE *object, int length, +	int start, int end, const char *reason) +{ +    return PyObject_CallFunction(PyExc_UnicodeEncodeError, "su#iis", +	encoding, object, length, start, end, reason); +} + + +static PyObject * +UnicodeDecodeError__init__(PyObject *self, PyObject *args) +{ +    return UnicodeError__init__(self, args, &PyString_Type); +} + +static PyObject * +UnicodeDecodeError__str__(PyObject *self, PyObject *arg) +{ +    PyObject *encodingObj = NULL; +    PyObject *objectObj = NULL; +    int length; +    int start; +    int end; +    PyObject *reasonObj = NULL; +    char buffer[1000]; +    PyObject *result = NULL; + +    self = arg; + +    if (!(encodingObj = PyUnicodeDecodeError_GetEncoding(self))) +	goto error; + +    if (!(objectObj = PyUnicodeDecodeError_GetObject(self))) +	goto error; + +    length = PyString_GET_SIZE(objectObj); + +    if (PyUnicodeDecodeError_GetStart(self, &start)) +	goto error; + +    if (PyUnicodeDecodeError_GetEnd(self, &end)) +	goto error; + +    if (!(reasonObj = PyUnicodeDecodeError_GetReason(self))) +	goto error; + +    if (end==start+1) { +	PyOS_snprintf(buffer, sizeof(buffer), +	    "'%.400s' codec can't decode byte 0x%x in position %d: %.400s", +	    PyString_AS_STRING(encodingObj), +	    ((int)PyString_AS_STRING(objectObj)[start])&0xff, +	    start, +	    
PyString_AS_STRING(reasonObj) +	); +    } +    else { +	PyOS_snprintf(buffer, sizeof(buffer), +	    "'%.400s' codec can't decode bytes in position %d-%d: %.400s", +	    PyString_AS_STRING(encodingObj), +	    start, +	    end-1, +	    PyString_AS_STRING(reasonObj) +	); +    } +    result = PyString_FromString(buffer); + +error: +    Py_XDECREF(reasonObj); +    Py_XDECREF(objectObj); +    Py_XDECREF(encodingObj); +    return result; +} + +static PyMethodDef UnicodeDecodeError_methods[] = { +    {"__init__", UnicodeDecodeError__init__, METH_VARARGS}, +    {"__str__",  UnicodeDecodeError__str__, METH_O}, +    {NULL, NULL} +}; + + +PyObject * PyUnicodeDecodeError_Create( +	const char *encoding, const char *object, int length, +	int start, int end, const char *reason) +{ +    return PyObject_CallFunction(PyExc_UnicodeDecodeError, "ss#iis", +	encoding, object, length, start, end, reason); +} + + +static PyObject * +UnicodeTranslateError__init__(PyObject *self, PyObject *args) +{ +    PyObject *rtnval = NULL; +    PyObject *object; +    PyObject *start; +    PyObject *end; +    PyObject *reason; + +    if (!(self = get_self(args))) +	return NULL; + +    if (!(args = PySequence_GetSlice(args, 1, PySequence_Size(args)))) +	return NULL; + +    if (!PyArg_ParseTuple(args, "O!O!O!O!", +	&PyUnicode_Type, &object, +	&PyInt_Type, &start, +	&PyInt_Type, &end, +	&PyString_Type, &reason)) +	goto finally; + +    if (PyObject_SetAttrString(self, "args", args)) +	goto finally; + +    if (PyObject_SetAttrString(self, "object", object)) +	goto finally; +    if (PyObject_SetAttrString(self, "start", start)) +	goto finally; +    if (PyObject_SetAttrString(self, "end", end)) +	goto finally; +    if (PyObject_SetAttrString(self, "reason", reason)) +	goto finally; + +    Py_INCREF(Py_None); +    rtnval = Py_None; + +  finally: +    Py_DECREF(args); +    return rtnval; +} + + +static PyObject * +UnicodeTranslateError__str__(PyObject *self, PyObject *arg) +{ +    PyObject *objectObj = NULL; +    
int length; +    int start; +    int end; +    PyObject *reasonObj = NULL; +    char buffer[1000]; +    PyObject *result = NULL; + +    self = arg; + +    if (!(objectObj = PyUnicodeTranslateError_GetObject(self))) +	goto error; + +    length = PyUnicode_GET_SIZE(objectObj); + +    if (PyUnicodeTranslateError_GetStart(self, &start)) +	goto error; + +    if (PyUnicodeTranslateError_GetEnd(self, &end)) +	goto error; + +    if (!(reasonObj = PyUnicodeTranslateError_GetReason(self))) +	goto error; + +    if (end==start+1) { +	PyOS_snprintf(buffer, sizeof(buffer), +	    "can't translate character '\\u%x' in position %d: %.400s", +	    (int)PyUnicode_AS_UNICODE(objectObj)[start], +	    start, +	    PyString_AS_STRING(reasonObj) +	); +    } +    else { +	PyOS_snprintf(buffer, sizeof(buffer), +	    "can't translate characters in position %d-%d: %.400s", +	    start, +	    end-1, +	    PyString_AS_STRING(reasonObj) +	); +    } +    result = PyString_FromString(buffer); + +error: +    Py_XDECREF(reasonObj); +    Py_XDECREF(objectObj); +    return result; +} + +static PyMethodDef UnicodeTranslateError_methods[] = { +    {"__init__", UnicodeTranslateError__init__, METH_VARARGS}, +    {"__str__",  UnicodeTranslateError__str__, METH_O}, +    {NULL, NULL} +}; + + +PyObject * PyUnicodeTranslateError_Create( +	const Py_UNICODE *object, int length, +	int start, int end, const char *reason) +{ +    return PyObject_CallFunction(PyExc_UnicodeTranslateError, "u#iis", +	object, length, start, end, reason); +} + +  /* Exception doc strings */ @@ -865,6 +1453,12 @@ PyDoc_STRVAR(ValueError__doc__,  PyDoc_STRVAR(UnicodeError__doc__, "Unicode related error."); +PyDoc_STRVAR(UnicodeEncodeError__doc__, "Unicode encoding error."); + +PyDoc_STRVAR(UnicodeDecodeError__doc__, "Unicode decoding error."); + +PyDoc_STRVAR(UnicodeTranslateError__doc__, "Unicode translation error."); +  PyDoc_STRVAR(SystemError__doc__,  "Internal error in the Python interpreter.\n\  \n\ @@ -949,6 +1543,9 @@ PyObject 
*PyExc_SystemError;  PyObject *PyExc_SystemExit;  PyObject *PyExc_UnboundLocalError;  PyObject *PyExc_UnicodeError; +PyObject *PyExc_UnicodeEncodeError; +PyObject *PyExc_UnicodeDecodeError; +PyObject *PyExc_UnicodeTranslateError;  PyObject *PyExc_TypeError;  PyObject *PyExc_ValueError;  PyObject *PyExc_ZeroDivisionError; @@ -1035,6 +1632,12 @@ static struct {    FloatingPointError__doc__},   {"ValueError",   &PyExc_ValueError,  0, ValueError__doc__},   {"UnicodeError", &PyExc_UnicodeError, &PyExc_ValueError, UnicodeError__doc__}, + {"UnicodeEncodeError", &PyExc_UnicodeEncodeError, &PyExc_UnicodeError, +  UnicodeEncodeError__doc__, UnicodeEncodeError_methods}, + {"UnicodeDecodeError", &PyExc_UnicodeDecodeError, &PyExc_UnicodeError, +  UnicodeDecodeError__doc__, UnicodeDecodeError_methods}, + {"UnicodeTranslateError", &PyExc_UnicodeTranslateError, &PyExc_UnicodeError, +  UnicodeTranslateError__doc__, UnicodeTranslateError_methods},   {"ReferenceError",  &PyExc_ReferenceError, 0, ReferenceError__doc__},   {"SystemError",  &PyExc_SystemError, 0, SystemError__doc__},   {"MemoryError",  &PyExc_MemoryError, 0, MemoryError__doc__},  | 
