diff options
Diffstat (limited to 'Objects/stringlib')
| -rw-r--r-- | Objects/stringlib/codecs.h | 201 | ||||
| -rw-r--r-- | Objects/stringlib/fastsearch.h | 150 | ||||
| -rw-r--r-- | Objects/stringlib/unicode_format.h | 2 | 
3 files changed, 197 insertions, 156 deletions
| diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index 0fc6b582d2..2beb604f11 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -263,50 +263,34 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,  #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */      Py_ssize_t i;                /* index into s of next input byte */ -    PyObject *result;            /* result string object */      char *p;                     /* next free byte in output buffer */ -    Py_ssize_t nallocated;      /* number of result bytes allocated */ -    Py_ssize_t nneeded;            /* number of result bytes needed */  #if STRINGLIB_SIZEOF_CHAR > 1 -    PyObject *errorHandler = NULL; +    PyObject *error_handler_obj = NULL;      PyObject *exc = NULL;      PyObject *rep = NULL; +    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;  #endif  #if STRINGLIB_SIZEOF_CHAR == 1      const Py_ssize_t max_char_size = 2; -    char stackbuf[MAX_SHORT_UNICHARS * 2];  #elif STRINGLIB_SIZEOF_CHAR == 2      const Py_ssize_t max_char_size = 3; -    char stackbuf[MAX_SHORT_UNICHARS * 3];  #else /*  STRINGLIB_SIZEOF_CHAR == 4 */      const Py_ssize_t max_char_size = 4; -    char stackbuf[MAX_SHORT_UNICHARS * 4];  #endif +    _PyBytesWriter writer;      assert(size >= 0); +    _PyBytesWriter_Init(&writer); -    if (size <= MAX_SHORT_UNICHARS) { -        /* Write into the stack buffer; nallocated can't overflow. -         * At the end, we'll allocate exactly as much heap space as it -         * turns out we need. -         */ -        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); -        result = NULL;   /* will allocate after we're done */ -        p = stackbuf; -    } -    else { -        if (size > PY_SSIZE_T_MAX / max_char_size) { -            /* integer overflow */ -            return PyErr_NoMemory(); -        } -        /* Overallocate on the heap, and give the excess back at the end. 
*/ -        nallocated = size * max_char_size; -        result = PyBytes_FromStringAndSize(NULL, nallocated); -        if (result == NULL) -            return NULL; -        p = PyBytes_AS_STRING(result); +    if (size > PY_SSIZE_T_MAX / max_char_size) { +        /* integer overflow */ +        return PyErr_NoMemory();      } +    p = _PyBytesWriter_Alloc(&writer, size * max_char_size); +    if (p == NULL) +        return NULL; +      for (i = 0; i < size;) {          Py_UCS4 ch = data[i++]; @@ -326,72 +310,118 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,          }  #if STRINGLIB_SIZEOF_CHAR > 1          else if (Py_UNICODE_IS_SURROGATE(ch)) { -            Py_ssize_t newpos; -            Py_ssize_t repsize, k, startpos; +            Py_ssize_t startpos, endpos, newpos; +            Py_ssize_t k; +            if (error_handler == _Py_ERROR_UNKNOWN) +                error_handler = get_error_handler(errors); +              startpos = i-1; -            rep = unicode_encode_call_errorhandler( -                  errors, &errorHandler, "utf-8", "surrogates not allowed", -                  unicode, &exc, startpos, startpos+1, &newpos); -            if (!rep) -                goto error; - -            if (PyBytes_Check(rep)) -                repsize = PyBytes_GET_SIZE(rep); -            else -                repsize = PyUnicode_GET_LENGTH(rep); +            endpos = startpos+1; + +            while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) +                endpos++; + +            /* Only overallocate the buffer if it's not the last write */ +            writer.overallocate = (endpos < size); + +            switch (error_handler) +            { +            case _Py_ERROR_REPLACE: +                memset(p, '?', endpos - startpos); +                p += (endpos - startpos); +                /* fall through the ignore handler */ +            case _Py_ERROR_IGNORE: +                i += (endpos - startpos - 1); +                break; -            if 
(repsize > max_char_size) { -                Py_ssize_t offset; +            case _Py_ERROR_SURROGATEPASS: +                for (k=startpos; k<endpos; k++) { +                    ch = data[k]; +                    *p++ = (char)(0xe0 | (ch >> 12)); +                    *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); +                    *p++ = (char)(0x80 | (ch & 0x3f)); +                } +                i += (endpos - startpos - 1); +                break; -                if (result == NULL) -                    offset = p - stackbuf; -                else -                    offset = p - PyBytes_AS_STRING(result); +            case _Py_ERROR_BACKSLASHREPLACE: +                /* subtract preallocated bytes */ +                writer.min_size -= max_char_size * (endpos - startpos); +                p = backslashreplace(&writer, p, +                                     unicode, startpos, endpos); +                if (p == NULL) +                    goto error; +                i += (endpos - startpos - 1); +                break; -                if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) { -                    /* integer overflow */ -                    PyErr_NoMemory(); +            case _Py_ERROR_XMLCHARREFREPLACE: +                /* subtract preallocated bytes */ +                writer.min_size -= max_char_size * (endpos - startpos); +                p = xmlcharrefreplace(&writer, p, +                                      unicode, startpos, endpos); +                if (p == NULL)                      goto error; +                i += (endpos - startpos - 1); +                break; + +            case _Py_ERROR_SURROGATEESCAPE: +                for (k=startpos; k<endpos; k++) { +                    ch = data[k]; +                    if (!(0xDC80 <= ch && ch <= 0xDCFF)) +                        break; +                    *p++ = (char)(ch & 0xff);                  } -                nallocated += repsize - max_char_size; -                if 
(result != NULL) { -                    if (_PyBytes_Resize(&result, nallocated) < 0) -                        goto error; -                } else { -                    result = PyBytes_FromStringAndSize(NULL, nallocated); -                    if (result == NULL) -                        goto error; -                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); +                if (k >= endpos) { +                    i += (endpos - startpos - 1); +                    break;                  } -                p = PyBytes_AS_STRING(result) + offset; -            } +                startpos = k; +                assert(startpos < endpos); +                /* fall through the default handler */ +            default: +                rep = unicode_encode_call_errorhandler( +                      errors, &error_handler_obj, "utf-8", "surrogates not allowed", +                      unicode, &exc, startpos, endpos, &newpos); +                if (!rep) +                    goto error; -            if (PyBytes_Check(rep)) { -                char *prep = PyBytes_AS_STRING(rep); -                for(k = repsize; k > 0; k--) -                    *p++ = *prep++; -            } else /* rep is unicode */ { -                enum PyUnicode_Kind repkind; -                void *repdata; +                /* subtract preallocated bytes */ +                writer.min_size -= max_char_size; -                if (PyUnicode_READY(rep) < 0) -                    goto error; -                repkind = PyUnicode_KIND(rep); -                repdata = PyUnicode_DATA(rep); +                if (PyBytes_Check(rep)) { +                    p = _PyBytesWriter_WriteBytes(&writer, p, +                                                  PyBytes_AS_STRING(rep), +                                                  PyBytes_GET_SIZE(rep)); +                } +                else { +                    /* rep is unicode */ +                    if (PyUnicode_READY(rep) < 0) +                        
goto error; -                for(k=0; k<repsize; k++) { -                    Py_UCS4 c = PyUnicode_READ(repkind, repdata, k); -                    if (0x80 <= c) { +                    if (!PyUnicode_IS_ASCII(rep)) {                          raise_encode_exception(&exc, "utf-8",                                                 unicode,                                                 i-1, i,                                                 "surrogates not allowed");                          goto error;                      } -                    *p++ = (char)c; + +                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); +                    p = _PyBytesWriter_WriteBytes(&writer, p, +                                                  PyUnicode_DATA(rep), +                                                  PyUnicode_GET_LENGTH(rep));                  } + +                if (p == NULL) +                    goto error; +                Py_CLEAR(rep); + +                i = newpos;              } -            Py_CLEAR(rep); + +            /* If overallocation was disabled, ensure that it was the last +               write. Otherwise, we missed an optimization */ +            assert(writer.overallocate || i == size);          }          else  #if STRINGLIB_SIZEOF_CHAR > 2 @@ -416,31 +446,18 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,  #endif /* STRINGLIB_SIZEOF_CHAR > 1 */      } -    if (result == NULL) { -        /* This was stack allocated. */ -        nneeded = p - stackbuf; -        assert(nneeded <= nallocated); -        result = PyBytes_FromStringAndSize(stackbuf, nneeded); -    } -    else { -        /* Cut back to size actually needed. 
*/ -        nneeded = p - PyBytes_AS_STRING(result); -        assert(nneeded <= nallocated); -        _PyBytes_Resize(&result, nneeded); -    } -  #if STRINGLIB_SIZEOF_CHAR > 1 -    Py_XDECREF(errorHandler); +    Py_XDECREF(error_handler_obj);      Py_XDECREF(exc);  #endif -    return result; +    return _PyBytesWriter_Finish(&writer, p);  #if STRINGLIB_SIZEOF_CHAR > 1   error:      Py_XDECREF(rep); -    Py_XDECREF(errorHandler); +    Py_XDECREF(error_handler_obj);      Py_XDECREF(exc); -    Py_XDECREF(result); +    _PyBytesWriter_Dealloc(&writer);      return NULL;  #endif diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index cda68e77c8..98165ad114 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -32,52 +32,98 @@  #define STRINGLIB_BLOOM(mask, ch)     \      ((mask &  (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) -  Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(fastsearch_memchr_1char)(const STRINGLIB_CHAR* s, Py_ssize_t n, -                                   STRINGLIB_CHAR ch, unsigned char needle, -                                   int mode) +STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)  { -    if (mode == FAST_SEARCH) { -        const STRINGLIB_CHAR *ptr = s; -        const STRINGLIB_CHAR *e = s + n; -        while (ptr < e) { -            void *candidate = memchr((const void *) ptr, needle, (e - ptr) * sizeof(STRINGLIB_CHAR)); -            if (candidate == NULL) -                return -1; -            ptr = (const STRINGLIB_CHAR *) _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); -            if (sizeof(STRINGLIB_CHAR) == 1 || *ptr == ch) -                return (ptr - s); -            /* False positive */ -            ptr++; -        } +    const STRINGLIB_CHAR *p, *e; + +    p = s; +    e = s + n; +    if (n > 10) { +#if STRINGLIB_SIZEOF_CHAR == 1 +        p = memchr(s, ch, n); +        if (p != NULL) +            return (p - s);          return -1; +#else +       
 /* use memchr if we can choose a needle without too many likely +           false positives */ +        unsigned char needle = ch & 0xff; +        /* If looking for a multiple of 256, we'd have too +           many false positives looking for the '\0' byte in UCS2 +           and UCS4 representations. */ +        if (needle != 0) { +            while (p < e) { +                void *candidate = memchr(p, needle, +                                         (e - p) * sizeof(STRINGLIB_CHAR)); +                if (candidate == NULL) +                    return -1; +                p = (const STRINGLIB_CHAR *) +                        _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); +                if (*p == ch) +                    return (p - s); +                /* False positive */ +                p++; +            } +            return -1; +        } +#endif      } +    while (p < e) { +        if (*p == ch) +            return (p - s); +        p++; +    } +    return -1; +} + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) +{ +    const STRINGLIB_CHAR *p;  #ifdef HAVE_MEMRCHR      /* memrchr() is a GNU extension, available since glibc 2.1.91.         
it doesn't seem as optimized as memchr(), but is still quite -       faster than our hand-written loop in FASTSEARCH below */ -    else if (mode == FAST_RSEARCH) { -        while (n > 0) { -            const STRINGLIB_CHAR *found; -            void *candidate = memrchr((const void *) s, needle, n * sizeof(STRINGLIB_CHAR)); -            if (candidate == NULL) -                return -1; -            found = (const STRINGLIB_CHAR *) _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); -            n = found - s; -            if (sizeof(STRINGLIB_CHAR) == 1 || *found == ch) -                return n; -            /* False positive */ -        } +       faster than our hand-written loop below */ + +    if (n > 10) { +#if STRINGLIB_SIZEOF_CHAR == 1 +        p = memrchr(s, ch, n); +        if (p != NULL) +            return (p - s);          return -1; -    } +#else +        /* use memrchr if we can choose a needle without too many likely +           false positives */ +        unsigned char needle = ch & 0xff; +        /* If looking for a multiple of 256, we'd have too +           many false positives looking for the '\0' byte in UCS2 +           and UCS4 representations. 
*/ +        if (needle != 0) { +            while (n > 0) { +                void *candidate = memrchr(s, needle, +                                          n * sizeof(STRINGLIB_CHAR)); +                if (candidate == NULL) +                    return -1; +                p = (const STRINGLIB_CHAR *) +                        _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); +                n = p - s; +                if (*p == ch) +                    return n; +                /* False positive */ +            } +            return -1; +        }  #endif -    else { -        assert(0); /* Should never get here */ -        return 0;      } - -#undef DO_MEMCHR +#endif  /* HAVE_MEMRCHR */ +    p = s + n; +    while (p > s) { +        p--; +        if (*p == ch) +            return (p - s); +    } +    return -1;  }  Py_LOCAL_INLINE(Py_ssize_t) @@ -99,25 +145,11 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,          if (m <= 0)              return -1;          /* use special case for 1-character strings */ -        if (n > 10 && (mode == FAST_SEARCH -#ifdef HAVE_MEMRCHR -                    || mode == FAST_RSEARCH -#endif -                    )) { -            /* use memchr if we can choose a needle without two many likely -               false positives */ -            unsigned char needle; -            needle = p[0] & 0xff; -#if STRINGLIB_SIZEOF_CHAR > 1 -            /* If looking for a multiple of 256, we'd have too -               many false positives looking for the '\0' byte in UCS2 -               and UCS4 representations. 
*/ -            if (needle != 0) -#endif -                return STRINGLIB(fastsearch_memchr_1char) -                       (s, n, p[0], needle, mode); -        } -        if (mode == FAST_COUNT) { +        if (mode == FAST_SEARCH) +            return STRINGLIB(find_char)(s, n, p[0]); +        else if (mode == FAST_RSEARCH) +            return STRINGLIB(rfind_char)(s, n, p[0]); +        else {  /* FAST_COUNT */              for (i = 0; i < n; i++)                  if (s[i] == p[0]) {                      count++; @@ -125,14 +157,6 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,                          return maxcount;                  }              return count; -        } else if (mode == FAST_SEARCH) { -            for (i = 0; i < n; i++) -                if (s[i] == p[0]) -                    return i; -        } else {    /* FAST_RSEARCH */ -            for (i = n - 1; i > -1; i--) -                if (s[i] == p[0]) -                    return i;          }          return -1;      } diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h index aec221acff..d72e47d348 100644 --- a/Objects/stringlib/unicode_format.h +++ b/Objects/stringlib/unicode_format.h @@ -67,7 +67,7 @@ SubString_new_object(SubString *str)      return PyUnicode_Substring(str->str, str->start, str->end);  } -/* return a new string.  if str->str is NULL, return None */ +/* return a new string.  if str->str is NULL, return a new empty string */  Py_LOCAL_INLINE(PyObject *)  SubString_new_object_or_empty(SubString *str)  { | 
