diff options
Diffstat (limited to 'Objects/unicodeobject.c')
| -rw-r--r-- | Objects/unicodeobject.c | 478 | 
1 files changed, 246 insertions, 232 deletions
| diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index d9c131cff9..8f6cc9e0b3 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6819,28 +6819,6 @@ code_page_name(UINT code_page, PyObject **obj)      return PyBytes_AS_STRING(*obj);  } -static int -is_dbcs_lead_byte(UINT code_page, const char *s, int offset) -{ -    const char *curr = s + offset; -    const char *prev; - -    if (!IsDBCSLeadByteEx(code_page, *curr)) -        return 0; - -    prev = CharPrevExA(code_page, s, curr, 0); -    if (prev == curr) -        return 1; -    /* FIXME: This code is limited to "true" double-byte encodings, -       as it assumes an incomplete character consists of a single -       byte. */ -    if (curr - prev == 2) -        return 1; -    if (!IsDBCSLeadByteEx(code_page, *prev)) -        return 1; -    return 0; -} -  static DWORD  decode_code_page_flags(UINT code_page)  { @@ -6915,7 +6893,7 @@ static int  decode_code_page_errors(UINT code_page,                          PyObject **v,                          const char *in, const int size, -                        const char *errors) +                        const char *errors, int final)  {      const char *startin = in;      const char *endin = in + size; @@ -6942,7 +6920,7 @@ decode_code_page_errors(UINT code_page,      if (encoding == NULL)          return -1; -    if (errors == NULL || strcmp(errors, "strict") == 0) { +    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {          /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a             UnicodeDecodeError. */          make_decode_exception(&exc, encoding, in, size, 0, 0, reason); @@ -7005,6 +6983,10 @@ decode_code_page_errors(UINT code_page,          if (outsize <= 0) {              Py_ssize_t startinpos, endinpos, outpos; +            /* last character in partial decode? */ +            if (in + insize >= endin && !final) +                break; +              startinpos = in - startin;              endinpos = startinpos + 1;              outpos = out - PyUnicode_AS_UNICODE(*v); @@ -7033,7 +7015,7 @@ decode_code_page_errors(UINT code_page,      assert(outsize <= PyUnicode_WSTR_LENGTH(*v));      if (unicode_resize(v, outsize) < 0)          goto error; -    ret = size; +    ret = in - startin;  error:      Py_XDECREF(encoding_obj); @@ -7074,24 +7056,19 @@ decode_code_page_stateful(int code_page,              done = 1;          } -        /* Skip trailing lead-byte unless 'final' is set */ -        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1)) -            --chunk_size; -          if (chunk_size == 0 && done) {              if (v != NULL)                  break;              _Py_RETURN_UNICODE_EMPTY();          } -          converted = decode_code_page_strict(code_page, &v,                                              s, chunk_size);          if (converted == -2)              converted = decode_code_page_errors(code_page, &v,                                                  s, chunk_size, -                                                errors); -        assert(converted != 0); +                                                errors, final); +        assert(converted != 0 || done);          if (converted < 0) {              Py_XDECREF(v); @@ -8498,10 +8475,10 @@ charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)      }      else if (PyLong_Check(x)) {          long value = PyLong_AS_LONG(x); -        long max = PyUnicode_GetMax(); -        if (value < 0 || value > max) { -            PyErr_Format(PyExc_TypeError, -                         "character mapping must be in range(0x%x)", max+1); +        if (value < 0 || value > MAX_UNICODE) { +            PyErr_Format(PyExc_ValueError, +                         "character mapping must be in range(0x%x)", +                         MAX_UNICODE+1);              Py_DECREF(x);              return -1;          } @@ -8520,76 +8497,168 @@ charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)          return -1;      }  } -/* ensure that *outobj is at least requiredsize characters long, -   if not reallocate and adjust various state variables. -   Return 0 on success, -1 on error */ + +/* lookup the character, write the result into the writer. +   Return 1 if the result was written into the writer, return 0 if the mapping +   was undefined, raise an exception return -1 on error. */  static int -charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize, -                               Py_ssize_t requiredsize) -{ -    Py_ssize_t oldsize = *psize; -    Py_UCS4 *new_outobj; -    if (requiredsize > oldsize) { -        /* exponentially overallocate to minimize reallocations */ -        if (requiredsize < 2 * oldsize) -            requiredsize = 2 * oldsize; -        new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4)); -        if (new_outobj == 0) +charmaptranslate_output(Py_UCS4 ch, PyObject *mapping, +                        _PyUnicodeWriter *writer) +{ +    PyObject *item; + +    if (charmaptranslate_lookup(ch, mapping, &item)) +        return -1; + +    if (item == NULL) { +        /* not found => default to 1:1 mapping */ +        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {              return -1; -        *outobj = new_outobj; -        *psize = requiredsize; +        } +        return 1;      } -    return 0; + +    if (item == Py_None) { +        Py_DECREF(item); +        return 0; +    } + +    if (PyLong_Check(item)) { +        long ch = (Py_UCS4)PyLong_AS_LONG(item); +        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already +           used it */ +        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) { +            Py_DECREF(item); +            return -1; +        } +        Py_DECREF(item); +        return 1; +    } + +    if (!PyUnicode_Check(item)) { +        Py_DECREF(item); +        return -1; +    } + +    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) { +        Py_DECREF(item); +        return -1; +    } + +    Py_DECREF(item); +    return 1;  } -/* lookup the character, put the result in the output string and adjust -   various state variables. Return a new reference to the object that -   was put in the output buffer in *result, or Py_None, if the mapping was -   undefined (in which case no character was written). -   The called must decref result. -   Return 0 on success, -1 on error. */ +  static int -charmaptranslate_output(PyObject *input, Py_ssize_t ipos, -                        PyObject *mapping, Py_UCS4 **output, -                        Py_ssize_t *osize, Py_ssize_t *opos, -                        PyObject **res) +unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch, +                              Py_UCS1 *translate)  { -    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos); -    if (charmaptranslate_lookup(curinp, mapping, res)) +    PyObject *item = NULL; +    int ret = 0; + +    if (charmaptranslate_lookup(ch, mapping, &item)) {          return -1; -    if (*res==NULL) { +    } + +    if (item == Py_None) { +        /* deletion */ +        translate[ch] = 0xfe; +    } +    else if (item == NULL) {          /* not found => default to 1:1 mapping */ -        (*output)[(*opos)++] = curinp; +        translate[ch] = ch; +        return 1;      } -    else if (*res==Py_None) -        ; -    else if (PyLong_Check(*res)) { -        /* no overflow check, because we know that the space is enough */ -        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res); +    else if (PyLong_Check(item)) { +        long replace = PyLong_AS_LONG(item); +        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already +           used it */ +        if (127 < replace) { +            /* invalid character or character outside ASCII: +               skip the fast translate */ +            goto exit; +        } +        translate[ch] = (Py_UCS1)replace;      } -    else if (PyUnicode_Check(*res)) { -        Py_ssize_t repsize; -        if (PyUnicode_READY(*res) == -1) +    else if (PyUnicode_Check(item)) { +        Py_UCS4 replace; + +        if (PyUnicode_READY(item) == -1) { +            Py_DECREF(item);              return -1; -        repsize = PyUnicode_GET_LENGTH(*res); -        if (repsize==1) { -            /* no overflow check, because we know that the space is enough */ -            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);          } -        else if (repsize!=0) { -            /* more than one character */ -            Py_ssize_t requiredsize = *opos + -                (PyUnicode_GET_LENGTH(input) - ipos) + -                repsize - 1; -            Py_ssize_t i; -            if (charmaptranslate_makespace(output, osize, requiredsize)) +        if (PyUnicode_GET_LENGTH(item) != 1) +            goto exit; + +        replace = PyUnicode_READ_CHAR(item, 0); +        if (replace > 127) +            goto exit; +        translate[ch] = (Py_UCS1)replace; +    } +    else { +        /* not None, NULL, long or unicode */ +        goto exit; +    } +    ret = 1; + +  exit: +    Py_DECREF(item); +    return ret; +} + +/* Fast path for ascii => ascii translation. Return 1 if the whole string +   was translated into writer, return 0 if the input string was partially +   translated into writer, raise an exception and return -1 on error. */ +static int +unicode_fast_translate(PyObject *input, PyObject *mapping, +                       _PyUnicodeWriter *writer, int ignore) +{ +    Py_UCS1 ascii_table[128], ch, ch2; +    Py_ssize_t len; +    Py_UCS1 *in, *end, *out; +    int res = 0; + +    if (PyUnicode_READY(input) == -1) +        return -1; +    if (!PyUnicode_IS_ASCII(input)) +        return 0; +    len = PyUnicode_GET_LENGTH(input); + +    memset(ascii_table, 0xff, 128); + +    in = PyUnicode_1BYTE_DATA(input); +    end = in + len; + +    assert(PyUnicode_IS_ASCII(writer->buffer)); +    assert(PyUnicode_GET_LENGTH(writer->buffer) == len); +    out = PyUnicode_1BYTE_DATA(writer->buffer); + +    for (; in < end; in++) { +        ch = *in; +        ch2 = ascii_table[ch]; +        if (ch2 == 0xff) { +            int translate = unicode_fast_translate_lookup(mapping, ch, +                                                          ascii_table); +            if (translate < 0)                  return -1; -            for(i = 0; i < repsize; i++) -                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i); +            if (translate == 0) +                goto exit; +            ch2 = ascii_table[ch];          } +        if (ch2 == 0xfe) { +            if (ignore) +                continue; +            goto exit; +        } +        assert(ch2 < 128); +        *out = ch2; +        out++;      } -    else -        return -1; -    return 0; +    res = 1; + +exit: +    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer); +    return res;  }  PyObject * @@ -8598,22 +8667,17 @@ _PyUnicode_TranslateCharmap(PyObject *input,                              const char *errors)  {      /* input object */ -    char *idata; +    char *data;      Py_ssize_t size, i;      int kind;      /* output buffer */ -    Py_UCS4 *output = NULL; -    Py_ssize_t osize; -    PyObject *res; -    /* current output position */ -    Py_ssize_t opos; +    _PyUnicodeWriter writer; +    /* error handler */      char *reason = "character maps to <undefined>";      PyObject *errorHandler = NULL;      PyObject *exc = NULL; -    /* the following variable is used for caching string comparisons -     * -1=not initialized, 0=unknown, 1=strict, 2=replace, -     * 3=ignore, 4=xmlcharrefreplace */ -    int known_errorHandler = -1; +    int ignore; +    int res;      if (mapping == NULL) {          PyErr_BadArgument(); @@ -8622,10 +8686,9 @@ _PyUnicode_TranslateCharmap(PyObject *input,      if (PyUnicode_READY(input) == -1)          return NULL; -    idata = (char*)PyUnicode_DATA(input); +    data = (char*)PyUnicode_DATA(input);      kind = PyUnicode_KIND(input);      size = PyUnicode_GET_LENGTH(input); -    i = 0;      if (size == 0) {          Py_INCREF(input); @@ -8634,121 +8697,81 @@ _PyUnicode_TranslateCharmap(PyObject *input,      /* allocate enough for a simple 1:1 translation without         replacements, if we need more, we'll resize */ -    osize = size; -    output = PyMem_Malloc(osize * sizeof(Py_UCS4)); -    opos = 0; -    if (output == NULL) { -        PyErr_NoMemory(); +    _PyUnicodeWriter_Init(&writer); +    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)          goto onError; + +    ignore = (errors != NULL && strcmp(errors, "ignore") == 0); + +    res = unicode_fast_translate(input, mapping, &writer, ignore); +    if (res < 0) { +        _PyUnicodeWriter_Dealloc(&writer); +        return NULL;      } +    if (res == 1) +        return _PyUnicodeWriter_Finish(&writer); +    i = writer.pos;      while (i<size) {          /* try to encode it */ -        PyObject *x = NULL; -        if (charmaptranslate_output(input, i, mapping, -                                    &output, &osize, &opos, &x)) { -            Py_XDECREF(x); +        int translate; +        PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ +        Py_ssize_t newpos; +        /* startpos for collecting untranslatable chars */ +        Py_ssize_t collstart; +        Py_ssize_t collend; +        Py_UCS4 ch; + +        ch = PyUnicode_READ(kind, data, i); +        translate = charmaptranslate_output(ch, mapping, &writer); +        if (translate < 0)              goto onError; -        } -        Py_XDECREF(x); -        if (x!=Py_None) /* it worked => adjust input pointer */ + +        if (translate != 0) { +            /* it worked => adjust input pointer */              ++i; -        else { /* untranslatable character */ -            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ -            Py_ssize_t repsize; -            Py_ssize_t newpos; -            Py_ssize_t uni2; -            /* startpos for collecting untranslatable chars */ -            Py_ssize_t collstart = i; -            Py_ssize_t collend = i+1; -            Py_ssize_t coll; - -            /* find all untranslatable characters */ -            while (collend < size) { -                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x)) -                    goto onError; -                Py_XDECREF(x); -                if (x!=Py_None) -                    break; -                ++collend; -            } -            /* cache callback name lookup -             * (if not done yet, i.e. it's the first error) */ -            if (known_errorHandler==-1) { -                if ((errors==NULL) || (!strcmp(errors, "strict"))) -                    known_errorHandler = 1; -                else if (!strcmp(errors, "replace")) -                    known_errorHandler = 2; -                else if (!strcmp(errors, "ignore")) -                    known_errorHandler = 3; -                else if (!strcmp(errors, "xmlcharrefreplace")) -                    known_errorHandler = 4; -                else -                    known_errorHandler = 0; -            } -            switch (known_errorHandler) { -            case 1: /* strict */ -                make_translate_exception(&exc, -                                         input, collstart, collend, reason); -                if (exc != NULL) -                    PyCodec_StrictErrors(exc); +            continue; +        } + +        /* untranslatable character */ +        collstart = i; +        collend = i+1; + +        /* find all untranslatable characters */ +        while (collend < size) { +            PyObject *x; +            ch = PyUnicode_READ(kind, data, collend); +            if (charmaptranslate_lookup(ch, mapping, &x))                  goto onError; -            case 2: /* replace */ -                /* No need to check for space, this is a 1:1 replacement */ -                for (coll = collstart; coll<collend; coll++) -                    output[opos++] = '?'; -                /* fall through */ -            case 3: /* ignore */ -                i = collend; -                break; -            case 4: /* xmlcharrefreplace */ -                /* generate replacement (temporarily (mis)uses i) */ -                for (i = collstart; i < collend; ++i) { -                    char buffer[2+29+1+1]; -                    char *cp; -                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i)); -                    if (charmaptranslate_makespace(&output, &osize, -                                                   opos+strlen(buffer)+(size-collend))) -                        goto onError; -                    for (cp = buffer; *cp; ++cp) -                        output[opos++] = *cp; -                } -                i = collend; +            Py_XDECREF(x); +            if (x != Py_None)                  break; -            default: -                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, -                                                                 reason, input, &exc, -                                                                 collstart, collend, &newpos); -                if (repunicode == NULL) -                    goto onError; -                if (PyUnicode_READY(repunicode) == -1) { -                    Py_DECREF(repunicode); -                    goto onError; -                } -                /* generate replacement  */ -                repsize = PyUnicode_GET_LENGTH(repunicode); -                if (charmaptranslate_makespace(&output, &osize, -                                               opos+repsize+(size-collend))) { -                    Py_DECREF(repunicode); -                    goto onError; -                } -                for (uni2 = 0; repsize-->0; ++uni2) -                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2); -                i = newpos; +            ++collend; +        } + +        if (ignore) { +            i = collend; +        } +        else { +            repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, +                                                             reason, input, &exc, +                                                             collstart, collend, &newpos); +            if (repunicode == NULL) +                goto onError; +            if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {                  Py_DECREF(repunicode); +                goto onError;              } +            Py_DECREF(repunicode); +            i = newpos;          }      } -    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos); -    if (!res) -        goto onError; -    PyMem_Free(output);      Py_XDECREF(exc);      Py_XDECREF(errorHandler); -    return res; +    return _PyUnicodeWriter_Finish(&writer);    onError: -    PyMem_Free(output); +    _PyUnicodeWriter_Dealloc(&writer);      Py_XDECREF(exc);      Py_XDECREF(errorHandler);      return NULL; @@ -14011,24 +14034,14 @@ mainformatlong(PyObject *v,      if (!PyNumber_Check(v))          goto wrongtype; -    /* make sure number is a type of integer */ -    /* if not, issue deprecation warning for now */ +    /* make sure number is a type of integer for o, x, and X */      if (!PyLong_Check(v)) {          if (type == 'o' || type == 'x' || type == 'X') {              iobj = PyNumber_Index(v);              if (iobj == NULL) { -                PyErr_Clear(); -                if (PyErr_WarnEx(PyExc_DeprecationWarning, -                                 "automatic int conversions have been deprecated", -                                 1)) { -                    return -1; -                } -                iobj = PyNumber_Long(v); -                if (iobj == NULL ) { -                    if (PyErr_ExceptionMatches(PyExc_TypeError)) -                        goto wrongtype; -                    return -1; -                } +                if (PyErr_ExceptionMatches(PyExc_TypeError)) +                    goto wrongtype; +                return -1;              }          }          else { @@ -14089,10 +14102,23 @@ mainformatlong(PyObject *v,      return 0;  wrongtype: -    PyErr_Format(PyExc_TypeError, -            "%%%c format: a number is required, " -            "not %.200s", -            type, Py_TYPE(v)->tp_name); +    switch(type) +    { +        case 'o': +        case 'x': +        case 'X': +            PyErr_Format(PyExc_TypeError, +                    "%%%c format: an integer is required, " +                    "not %.200s", +                    type, Py_TYPE(v)->tp_name); +            break; +        default: +            PyErr_Format(PyExc_TypeError, +                    "%%%c format: a number is required, " +                    "not %.200s", +                    type, Py_TYPE(v)->tp_name); +            break; +    }      return -1;  } @@ -14110,22 +14136,10 @@ formatchar(PyObject *v)          PyObject *iobj;          long x;          /* make sure number is a type of integer */ -        /* if not, issue deprecation warning for now */          if (!PyLong_Check(v)) {              iobj = PyNumber_Index(v);              if (iobj == NULL) { -                PyErr_Clear(); -                if (PyErr_WarnEx(PyExc_DeprecationWarning, -                                 "automatic int conversions have been deprecated", -                                 1)) { -                    return -1; -                } -                iobj = PyNumber_Long(v); -                if (iobj == NULL ) { -                    if (PyErr_ExceptionMatches(PyExc_TypeError)) -                        goto onError; -                    return -1; -                } +                goto onError;              }              v = iobj;              Py_DECREF(iobj); | 
