diff options
Diffstat (limited to 'Objects/stringlib')
| -rw-r--r-- | Objects/stringlib/codecs.h | 208 | ||||
| -rw-r--r-- | Objects/stringlib/ctype.h | 5 | ||||
| -rw-r--r-- | Objects/stringlib/fastsearch.h | 150 | ||||
| -rw-r--r-- | Objects/stringlib/find.h | 82 | ||||
| -rw-r--r-- | Objects/stringlib/find_max_char.h | 5 | ||||
| -rw-r--r-- | Objects/stringlib/join.h | 8 | ||||
| -rw-r--r-- | Objects/stringlib/localeutil.h | 4 | ||||
| -rw-r--r-- | Objects/stringlib/transmogrify.h | 615 | ||||
| -rw-r--r-- | Objects/stringlib/unicode_format.h | 2 | 
9 files changed, 738 insertions, 341 deletions
| diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index 0fc6b582d2..a9d0a349d9 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -1,6 +1,8 @@  /* stringlib: codec implementations */ -#if STRINGLIB_IS_UNICODE +#if !STRINGLIB_IS_UNICODE +# error "codecs.h is specific to Unicode" +#endif  /* Mask to quickly check whether a C 'long' contains a     non-ASCII, UTF8-encoded char. */ @@ -263,50 +265,34 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,  #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */      Py_ssize_t i;                /* index into s of next input byte */ -    PyObject *result;            /* result string object */      char *p;                     /* next free byte in output buffer */ -    Py_ssize_t nallocated;      /* number of result bytes allocated */ -    Py_ssize_t nneeded;            /* number of result bytes needed */  #if STRINGLIB_SIZEOF_CHAR > 1 -    PyObject *errorHandler = NULL; +    PyObject *error_handler_obj = NULL;      PyObject *exc = NULL;      PyObject *rep = NULL; +    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;  #endif  #if STRINGLIB_SIZEOF_CHAR == 1      const Py_ssize_t max_char_size = 2; -    char stackbuf[MAX_SHORT_UNICHARS * 2];  #elif STRINGLIB_SIZEOF_CHAR == 2      const Py_ssize_t max_char_size = 3; -    char stackbuf[MAX_SHORT_UNICHARS * 3];  #else /*  STRINGLIB_SIZEOF_CHAR == 4 */      const Py_ssize_t max_char_size = 4; -    char stackbuf[MAX_SHORT_UNICHARS * 4];  #endif +    _PyBytesWriter writer;      assert(size >= 0); +    _PyBytesWriter_Init(&writer); -    if (size <= MAX_SHORT_UNICHARS) { -        /* Write into the stack buffer; nallocated can't overflow. -         * At the end, we'll allocate exactly as much heap space as it -         * turns out we need. -         */ -        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); -        result = NULL;   /* will allocate after we're done */ -        p = stackbuf; -    } -    else { -        if (size > PY_SSIZE_T_MAX / max_char_size) { -            /* integer overflow */ -            return PyErr_NoMemory(); -        } -        /* Overallocate on the heap, and give the excess back at the end. */ -        nallocated = size * max_char_size; -        result = PyBytes_FromStringAndSize(NULL, nallocated); -        if (result == NULL) -            return NULL; -        p = PyBytes_AS_STRING(result); +    if (size > PY_SSIZE_T_MAX / max_char_size) { +        /* integer overflow */ +        return PyErr_NoMemory();      } +    p = _PyBytesWriter_Alloc(&writer, size * max_char_size); +    if (p == NULL) +        return NULL; +      for (i = 0; i < size;) {          Py_UCS4 ch = data[i++]; @@ -326,72 +312,119 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,          }  #if STRINGLIB_SIZEOF_CHAR > 1          else if (Py_UNICODE_IS_SURROGATE(ch)) { -            Py_ssize_t newpos; -            Py_ssize_t repsize, k, startpos; +            Py_ssize_t startpos, endpos, newpos; +            Py_ssize_t k; +            if (error_handler == _Py_ERROR_UNKNOWN) { +                error_handler = get_error_handler(errors); +            } +              startpos = i-1; -            rep = unicode_encode_call_errorhandler( -                  errors, &errorHandler, "utf-8", "surrogates not allowed", -                  unicode, &exc, startpos, startpos+1, &newpos); -            if (!rep) -                goto error; - -            if (PyBytes_Check(rep)) -                repsize = PyBytes_GET_SIZE(rep); -            else -                repsize = PyUnicode_GET_LENGTH(rep); +            endpos = startpos+1; + +            while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos])) +                endpos++; + +            /* Only overallocate the buffer if it's not the last write */ +            writer.overallocate = (endpos < size); + +            switch (error_handler) +            { +            case _Py_ERROR_REPLACE: +                memset(p, '?', endpos - startpos); +                p += (endpos - startpos); +                /* fall through the ignore handler */ +            case _Py_ERROR_IGNORE: +                i += (endpos - startpos - 1); +                break; -            if (repsize > max_char_size) { -                Py_ssize_t offset; +            case _Py_ERROR_SURROGATEPASS: +                for (k=startpos; k<endpos; k++) { +                    ch = data[k]; +                    *p++ = (char)(0xe0 | (ch >> 12)); +                    *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); +                    *p++ = (char)(0x80 | (ch & 0x3f)); +                } +                i += (endpos - startpos - 1); +                break; -                if (result == NULL) -                    offset = p - stackbuf; -                else -                    offset = p - PyBytes_AS_STRING(result); +            case _Py_ERROR_BACKSLASHREPLACE: +                /* subtract preallocated bytes */ +                writer.min_size -= max_char_size * (endpos - startpos); +                p = backslashreplace(&writer, p, +                                     unicode, startpos, endpos); +                if (p == NULL) +                    goto error; +                i += (endpos - startpos - 1); +                break; -                if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) { -                    /* integer overflow */ -                    PyErr_NoMemory(); +            case _Py_ERROR_XMLCHARREFREPLACE: +                /* subtract preallocated bytes */ +                writer.min_size -= max_char_size * (endpos - startpos); +                p = xmlcharrefreplace(&writer, p, +                                      unicode, startpos, endpos); +                if (p == NULL)                      goto error; +                i += (endpos - startpos - 1); +                break; + +            case _Py_ERROR_SURROGATEESCAPE: +                for (k=startpos; k<endpos; k++) { +                    ch = data[k]; +                    if (!(0xDC80 <= ch && ch <= 0xDCFF)) +                        break; +                    *p++ = (char)(ch & 0xff);                  } -                nallocated += repsize - max_char_size; -                if (result != NULL) { -                    if (_PyBytes_Resize(&result, nallocated) < 0) -                        goto error; -                } else { -                    result = PyBytes_FromStringAndSize(NULL, nallocated); -                    if (result == NULL) -                        goto error; -                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset); +                if (k >= endpos) { +                    i += (endpos - startpos - 1); +                    break;                  } -                p = PyBytes_AS_STRING(result) + offset; -            } +                startpos = k; +                assert(startpos < endpos); +                /* fall through the default handler */ +            default: +                rep = unicode_encode_call_errorhandler( +                      errors, &error_handler_obj, "utf-8", "surrogates not allowed", +                      unicode, &exc, startpos, endpos, &newpos); +                if (!rep) +                    goto error; -            if (PyBytes_Check(rep)) { -                char *prep = PyBytes_AS_STRING(rep); -                for(k = repsize; k > 0; k--) -                    *p++ = *prep++; -            } else /* rep is unicode */ { -                enum PyUnicode_Kind repkind; -                void *repdata; +                /* subtract preallocated bytes */ +                writer.min_size -= max_char_size; -                if (PyUnicode_READY(rep) < 0) -                    goto error; -                repkind = PyUnicode_KIND(rep); -                repdata = PyUnicode_DATA(rep); +                if (PyBytes_Check(rep)) { +                    p = _PyBytesWriter_WriteBytes(&writer, p, +                                                  PyBytes_AS_STRING(rep), +                                                  PyBytes_GET_SIZE(rep)); +                } +                else { +                    /* rep is unicode */ +                    if (PyUnicode_READY(rep) < 0) +                        goto error; -                for(k=0; k<repsize; k++) { -                    Py_UCS4 c = PyUnicode_READ(repkind, repdata, k); -                    if (0x80 <= c) { +                    if (!PyUnicode_IS_ASCII(rep)) {                          raise_encode_exception(&exc, "utf-8",                                                 unicode,                                                 i-1, i,                                                 "surrogates not allowed");                          goto error;                      } -                    *p++ = (char)c; + +                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND); +                    p = _PyBytesWriter_WriteBytes(&writer, p, +                                                  PyUnicode_DATA(rep), +                                                  PyUnicode_GET_LENGTH(rep));                  } + +                if (p == NULL) +                    goto error; +                Py_CLEAR(rep); + +                i = newpos;              } -            Py_CLEAR(rep); + +            /* If overallocation was disabled, ensure that it was the last +               write. Otherwise, we missed an optimization */ +            assert(writer.overallocate || i == size);          }          else  #if STRINGLIB_SIZEOF_CHAR > 2 @@ -416,31 +449,18 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,  #endif /* STRINGLIB_SIZEOF_CHAR > 1 */      } -    if (result == NULL) { -        /* This was stack allocated. */ -        nneeded = p - stackbuf; -        assert(nneeded <= nallocated); -        result = PyBytes_FromStringAndSize(stackbuf, nneeded); -    } -    else { -        /* Cut back to size actually needed. */ -        nneeded = p - PyBytes_AS_STRING(result); -        assert(nneeded <= nallocated); -        _PyBytes_Resize(&result, nneeded); -    } -  #if STRINGLIB_SIZEOF_CHAR > 1 -    Py_XDECREF(errorHandler); +    Py_XDECREF(error_handler_obj);      Py_XDECREF(exc);  #endif -    return result; +    return _PyBytesWriter_Finish(&writer, p);  #if STRINGLIB_SIZEOF_CHAR > 1   error:      Py_XDECREF(rep); -    Py_XDECREF(errorHandler); +    Py_XDECREF(error_handler_obj);      Py_XDECREF(exc); -    Py_XDECREF(result); +    _PyBytesWriter_Dealloc(&writer);      return NULL;  #endif @@ -806,5 +826,3 @@ STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,  #undef SWAB4  #endif - -#endif /* STRINGLIB_IS_UNICODE */ diff --git a/Objects/stringlib/ctype.h b/Objects/stringlib/ctype.h index 739cf3d9eb..f0546256ed 100644 --- a/Objects/stringlib/ctype.h +++ b/Objects/stringlib/ctype.h @@ -1,5 +1,6 @@ -/* NOTE: this API is -ONLY- for use with single byte character strings. */ -/* Do not use it with Unicode. */ +#if STRINGLIB_IS_UNICODE +# error "ctype.h only compatible with byte-wise strings" +#endif  #include "bytes_methods.h" diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index cda68e77c8..98165ad114 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -32,52 +32,98 @@  #define STRINGLIB_BLOOM(mask, ch)     \      ((mask &  (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) -  Py_LOCAL_INLINE(Py_ssize_t) -STRINGLIB(fastsearch_memchr_1char)(const STRINGLIB_CHAR* s, Py_ssize_t n, -                                   STRINGLIB_CHAR ch, unsigned char needle, -                                   int mode) +STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)  { -    if (mode == FAST_SEARCH) { -        const STRINGLIB_CHAR *ptr = s; -        const STRINGLIB_CHAR *e = s + n; -        while (ptr < e) { -            void *candidate = memchr((const void *) ptr, needle, (e - ptr) * sizeof(STRINGLIB_CHAR)); -            if (candidate == NULL) -                return -1; -            ptr = (const STRINGLIB_CHAR *) _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); -            if (sizeof(STRINGLIB_CHAR) == 1 || *ptr == ch) -                return (ptr - s); -            /* False positive */ -            ptr++; -        } +    const STRINGLIB_CHAR *p, *e; + +    p = s; +    e = s + n; +    if (n > 10) { +#if STRINGLIB_SIZEOF_CHAR == 1 +        p = memchr(s, ch, n); +        if (p != NULL) +            return (p - s);          return -1; +#else +        /* use memchr if we can choose a needle without two many likely +           false positives */ +        unsigned char needle = ch & 0xff; +        /* If looking for a multiple of 256, we'd have too +           many false positives looking for the '\0' byte in UCS2 +           and UCS4 representations. */ +        if (needle != 0) { +            while (p < e) { +                void *candidate = memchr(p, needle, +                                         (e - p) * sizeof(STRINGLIB_CHAR)); +                if (candidate == NULL) +                    return -1; +                p = (const STRINGLIB_CHAR *) +                        _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); +                if (*p == ch) +                    return (p - s); +                /* False positive */ +                p++; +            } +            return -1; +        } +#endif      } +    while (p < e) { +        if (*p == ch) +            return (p - s); +        p++; +    } +    return -1; +} + +Py_LOCAL_INLINE(Py_ssize_t) +STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) +{ +    const STRINGLIB_CHAR *p;  #ifdef HAVE_MEMRCHR      /* memrchr() is a GNU extension, available since glibc 2.1.91.         it doesn't seem as optimized as memchr(), but is still quite -       faster than our hand-written loop in FASTSEARCH below */ -    else if (mode == FAST_RSEARCH) { -        while (n > 0) { -            const STRINGLIB_CHAR *found; -            void *candidate = memrchr((const void *) s, needle, n * sizeof(STRINGLIB_CHAR)); -            if (candidate == NULL) -                return -1; -            found = (const STRINGLIB_CHAR *) _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); -            n = found - s; -            if (sizeof(STRINGLIB_CHAR) == 1 || *found == ch) -                return n; -            /* False positive */ -        } +       faster than our hand-written loop below */ + +    if (n > 10) { +#if STRINGLIB_SIZEOF_CHAR == 1 +        p = memrchr(s, ch, n); +        if (p != NULL) +            return (p - s);          return -1; -    } +#else +        /* use memrchr if we can choose a needle without two many likely +           false positives */ +        unsigned char needle = ch & 0xff; +        /* If looking for a multiple of 256, we'd have too +           many false positives looking for the '\0' byte in UCS2 +           and UCS4 representations. */ +        if (needle != 0) { +            while (n > 0) { +                void *candidate = memrchr(s, needle, +                                          n * sizeof(STRINGLIB_CHAR)); +                if (candidate == NULL) +                    return -1; +                p = (const STRINGLIB_CHAR *) +                        _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR)); +                n = p - s; +                if (*p == ch) +                    return n; +                /* False positive */ +            } +            return -1; +        }  #endif -    else { -        assert(0); /* Should never get here */ -        return 0;      } - -#undef DO_MEMCHR +#endif  /* HAVE_MEMRCHR */ +    p = s + n; +    while (p > s) { +        p--; +        if (*p == ch) +            return (p - s); +    } +    return -1;  }  Py_LOCAL_INLINE(Py_ssize_t) @@ -99,25 +145,11 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,          if (m <= 0)              return -1;          /* use special case for 1-character strings */ -        if (n > 10 && (mode == FAST_SEARCH -#ifdef HAVE_MEMRCHR -                    || mode == FAST_RSEARCH -#endif -                    )) { -            /* use memchr if we can choose a needle without two many likely -               false positives */ -            unsigned char needle; -            needle = p[0] & 0xff; -#if STRINGLIB_SIZEOF_CHAR > 1 -            /* If looking for a multiple of 256, we'd have too -               many false positives looking for the '\0' byte in UCS2 -               and UCS4 representations. */ -            if (needle != 0) -#endif -                return STRINGLIB(fastsearch_memchr_1char) -                       (s, n, p[0], needle, mode); -        } -        if (mode == FAST_COUNT) { +        if (mode == FAST_SEARCH) +            return STRINGLIB(find_char)(s, n, p[0]); +        else if (mode == FAST_RSEARCH) +            return STRINGLIB(rfind_char)(s, n, p[0]); +        else {  /* FAST_COUNT */              for (i = 0; i < n; i++)                  if (s[i] == p[0]) {                      count++; @@ -125,14 +157,6 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,                          return maxcount;                  }              return count; -        } else if (mode == FAST_SEARCH) { -            for (i = 0; i < n; i++) -                if (s[i] == p[0]) -                    return i; -        } else {    /* FAST_RSEARCH */ -            for (i = n - 1; i > -1; i--) -                if (s[i] == p[0]) -                    return i;          }          return -1;      } diff --git a/Objects/stringlib/find.h b/Objects/stringlib/find.h index 14815f6e62..509b929739 100644 --- a/Objects/stringlib/find.h +++ b/Objects/stringlib/find.h @@ -117,85 +117,3 @@ STRINGLIB(parse_args_finds)(const char * function_name, PyObject *args,  }  #undef FORMAT_BUFFER_SIZE - -#if STRINGLIB_IS_UNICODE - -/* -Wraps stringlib_parse_args_finds() and additionally ensures that the -first argument is a unicode object. - -Note that we receive a pointer to the pointer of the substring object, -so when we create that object in this function we don't DECREF it, -because it continues living in the caller functions (those functions, -after finishing using the substring, must DECREF it). -*/ - -Py_LOCAL_INLINE(int) -STRINGLIB(parse_args_finds_unicode)(const char * function_name, PyObject *args, -                                   PyObject **substring, -                                   Py_ssize_t *start, Py_ssize_t *end) -{ -    PyObject *tmp_substring; - -    if(STRINGLIB(parse_args_finds)(function_name, args, &tmp_substring, -                                  start, end)) { -        tmp_substring = PyUnicode_FromObject(tmp_substring); -        if (!tmp_substring) -            return 0; -        *substring = tmp_substring; -        return 1; -    } -    return 0; -} - -#else /* !STRINGLIB_IS_UNICODE */ - -/* -Wraps stringlib_parse_args_finds() and additionally checks whether the -first argument is an integer in range(0, 256). - -If this is the case, writes the integer value to the byte parameter -and sets subobj to NULL. Otherwise, sets the first argument to subobj -and doesn't touch byte. The other parameters are similar to those of -stringlib_parse_args_finds(). -*/ - -Py_LOCAL_INLINE(int) -STRINGLIB(parse_args_finds_byte)(const char *function_name, PyObject *args, -                                 PyObject **subobj, char *byte, -                                 Py_ssize_t *start, Py_ssize_t *end) -{ -    PyObject *tmp_subobj; -    Py_ssize_t ival; -    PyObject *err; - -    if(!STRINGLIB(parse_args_finds)(function_name, args, &tmp_subobj, -                                    start, end)) -        return 0; - -    if (!PyNumber_Check(tmp_subobj)) { -        *subobj = tmp_subobj; -        return 1; -    } - -    ival = PyNumber_AsSsize_t(tmp_subobj, PyExc_OverflowError); -    if (ival == -1) { -        err = PyErr_Occurred(); -        if (err && !PyErr_GivenExceptionMatches(err, PyExc_OverflowError)) { -            PyErr_Clear(); -            *subobj = tmp_subobj; -            return 1; -        } -    } - -    if (ival < 0 || ival > 255) { -        PyErr_SetString(PyExc_ValueError, "byte must be in range(0, 256)"); -        return 0; -    } - -    *subobj = NULL; -    *byte = (char)ival; -    return 1; -} - -#endif /* STRINGLIB_IS_UNICODE */ diff --git a/Objects/stringlib/find_max_char.h b/Objects/stringlib/find_max_char.h index eb3fe886e2..8ccbc30944 100644 --- a/Objects/stringlib/find_max_char.h +++ b/Objects/stringlib/find_max_char.h @@ -1,6 +1,8 @@  /* Finding the optimal width of unicode characters in a buffer */ -#if STRINGLIB_IS_UNICODE +#if !STRINGLIB_IS_UNICODE +# error "find_max_char.h is specific to Unicode" +#endif  /* Mask to quickly check whether a C 'long' contains a     non-ASCII, UTF8-encoded char. */ @@ -129,5 +131,4 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)  #undef MAX_CHAR_UCS4  #endif /* STRINGLIB_SIZEOF_CHAR == 1 */ -#endif /* STRINGLIB_IS_UNICODE */ diff --git a/Objects/stringlib/join.h b/Objects/stringlib/join.h index cbf81be170..6f314e1524 100644 --- a/Objects/stringlib/join.h +++ b/Objects/stringlib/join.h @@ -1,6 +1,6 @@  /* stringlib: bytes joining implementation */ -#if STRINGLIB_SIZEOF_CHAR != 1 +#if STRINGLIB_IS_UNICODE  #error join.h only compatible with byte-wise strings  #endif @@ -107,7 +107,7 @@ STRINGLIB(bytes_join)(PyObject *sep, PyObject *iterable)          for (i = 0; i < nbufs; i++) {              Py_ssize_t n = buffers[i].len;              char *q = buffers[i].buf; -            Py_MEMCPY(p, q, n); +            memcpy(p, q, n);              p += n;          }          goto done; @@ -116,12 +116,12 @@ STRINGLIB(bytes_join)(PyObject *sep, PyObject *iterable)          Py_ssize_t n;          char *q;          if (i) { -            Py_MEMCPY(p, sepstr, seplen); +            memcpy(p, sepstr, seplen);              p += seplen;          }          n = buffers[i].len;          q = buffers[i].buf; -        Py_MEMCPY(p, q, n); +        memcpy(p, q, n);          p += n;      }      goto done; diff --git a/Objects/stringlib/localeutil.h b/Objects/stringlib/localeutil.h index 6e2f07342c..df501ed05c 100644 --- a/Objects/stringlib/localeutil.h +++ b/Objects/stringlib/localeutil.h @@ -2,8 +2,8 @@  #include <locale.h> -#ifndef STRINGLIB_IS_UNICODE -#   error "localeutil is specific to Unicode" +#if !STRINGLIB_IS_UNICODE +#   error "localeutil.h is specific to Unicode"  #endif  typedef struct { diff --git a/Objects/stringlib/transmogrify.h b/Objects/stringlib/transmogrify.h index b559b5356b..a314572a72 100644 --- a/Objects/stringlib/transmogrify.h +++ b/Objects/stringlib/transmogrify.h @@ -1,14 +1,21 @@ -/* NOTE: this API is -ONLY- for use with single byte character strings. */ -/* Do not use it with Unicode. */ +#if STRINGLIB_IS_UNICODE +# error "transmogrify.h only compatible with byte-wise strings" +#endif  /* the more complicated methods.  parts of these should be pulled out into the     shared code in bytes_methods.c to cut down on duplicate code bloat.  */ -PyDoc_STRVAR(expandtabs__doc__, -"B.expandtabs(tabsize=8) -> copy of B\n\ -\n\ -Return a copy of B where all tab characters are expanded using spaces.\n\ -If tabsize is not given, a tab size of 8 characters is assumed."); +static inline PyObject * +return_self(PyObject *self) +{ +#if !STRINGLIB_MUTABLE +    if (STRINGLIB_CHECK_EXACT(self)) { +        Py_INCREF(self); +        return self; +    } +#endif +    return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); +}  static PyObject*  stringlib_expandtabs(PyObject *self, PyObject *args, PyObject *kwds) @@ -83,7 +90,7 @@ stringlib_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)      return NULL;  } -Py_LOCAL_INLINE(PyObject *) +static inline PyObject *  pad(PyObject *self, Py_ssize_t left, Py_ssize_t right, char fill)  {      PyObject *u; @@ -93,39 +100,25 @@ pad(PyObject *self, Py_ssize_t left, Py_ssize_t right, char fill)      if (right < 0)          right = 0; -    if (left == 0 && right == 0 && STRINGLIB_CHECK_EXACT(self)) { -#if STRINGLIB_MUTABLE -        /* We're defined as returning a copy;  If the object is mutable -         * that means we must make an identical copy. */ -        return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -#else -        Py_INCREF(self); -        return (PyObject *)self; -#endif /* STRINGLIB_MUTABLE */ +    if (left == 0 && right == 0) { +        return return_self(self);      } -    u = STRINGLIB_NEW(NULL, -				   left + STRINGLIB_LEN(self) + right); +    u = STRINGLIB_NEW(NULL, left + STRINGLIB_LEN(self) + right);      if (u) {          if (left)              memset(STRINGLIB_STR(u), fill, left); -        Py_MEMCPY(STRINGLIB_STR(u) + left, -	       STRINGLIB_STR(self), -	       STRINGLIB_LEN(self)); +        memcpy(STRINGLIB_STR(u) + left, +               STRINGLIB_STR(self), +               STRINGLIB_LEN(self));          if (right)              memset(STRINGLIB_STR(u) + left + STRINGLIB_LEN(self), -		   fill, right); +                   fill, right);      }      return u;  } -PyDoc_STRVAR(ljust__doc__, -"B.ljust(width[, fillchar]) -> copy of B\n" -"\n" -"Return B left justified in a string of length width. Padding is\n" -"done using the specified fill character (default is a space)."); -  static PyObject *  stringlib_ljust(PyObject *self, PyObject *args)  { @@ -135,27 +128,14 @@ stringlib_ljust(PyObject *self, PyObject *args)      if (!PyArg_ParseTuple(args, "n|c:ljust", &width, &fillchar))          return NULL; -    if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) { -#if STRINGLIB_MUTABLE -        /* We're defined as returning a copy;  If the object is mutable -         * that means we must make an identical copy. */ -        return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -#else -        Py_INCREF(self); -        return (PyObject*) self; -#endif +    if (STRINGLIB_LEN(self) >= width) { +        return return_self(self);      }      return pad(self, 0, width - STRINGLIB_LEN(self), fillchar);  } -PyDoc_STRVAR(rjust__doc__, -"B.rjust(width[, fillchar]) -> copy of B\n" -"\n" -"Return B right justified in a string of length width. Padding is\n" -"done using the specified fill character (default is a space)"); -  static PyObject *  stringlib_rjust(PyObject *self, PyObject *args)  { @@ -165,27 +145,14 @@ stringlib_rjust(PyObject *self, PyObject *args)      if (!PyArg_ParseTuple(args, "n|c:rjust", &width, &fillchar))          return NULL; -    if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) { -#if STRINGLIB_MUTABLE -        /* We're defined as returning a copy;  If the object is mutable -         * that means we must make an identical copy. */ -        return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -#else -        Py_INCREF(self); -        return (PyObject*) self; -#endif +    if (STRINGLIB_LEN(self) >= width) { +        return return_self(self);      }      return pad(self, width - STRINGLIB_LEN(self), 0, fillchar);  } -PyDoc_STRVAR(center__doc__, -"B.center(width[, fillchar]) -> copy of B\n" -"\n" -"Return B centered in a string of length width.  Padding is\n" -"done using the specified fill character (default is a space)."); -  static PyObject *  stringlib_center(PyObject *self, PyObject *args)  { @@ -196,15 +163,8 @@ stringlib_center(PyObject *self, PyObject *args)      if (!PyArg_ParseTuple(args, "n|c:center", &width, &fillchar))          return NULL; -    if (STRINGLIB_LEN(self) >= width && STRINGLIB_CHECK_EXACT(self)) { -#if STRINGLIB_MUTABLE -        /* We're defined as returning a copy;  If the object is mutable -         * that means we must make an identical copy. */ -        return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -#else -        Py_INCREF(self); -        return (PyObject*) self; -#endif +    if (STRINGLIB_LEN(self) >= width) { +        return return_self(self);      }      marg = width - STRINGLIB_LEN(self); @@ -213,12 +173,6 @@ stringlib_center(PyObject *self, PyObject *args)      return pad(self, left, marg - left, fillchar);  } -PyDoc_STRVAR(zfill__doc__, -"B.zfill(width) -> copy of B\n" -"\n" -"Pad a numeric string B with zeros on the left, to fill a field\n" -"of the specified width.  B is never truncated."); -  static PyObject *  stringlib_zfill(PyObject *self, PyObject *args)  { @@ -231,21 +185,7 @@ stringlib_zfill(PyObject *self, PyObject *args)          return NULL;      if (STRINGLIB_LEN(self) >= width) { -        if (STRINGLIB_CHECK_EXACT(self)) { -#if STRINGLIB_MUTABLE -            /* We're defined as returning a copy;  If the object is mutable -             * that means we must make an identical copy. */ -            return STRINGLIB_NEW(STRINGLIB_STR(self), STRINGLIB_LEN(self)); -#else -            Py_INCREF(self); -            return (PyObject*) self; -#endif -        } -        else -            return STRINGLIB_NEW( -                STRINGLIB_STR(self), -                STRINGLIB_LEN(self) -            ); +        return return_self(self);      }      fill = width - STRINGLIB_LEN(self); @@ -262,5 +202,500 @@ stringlib_zfill(PyObject *self, PyObject *args)          p[fill] = '0';      } -    return (PyObject*) s; +    return s; +} + + +/* find and count characters and substrings */ + +#define findchar(target, target_len, c)                         \ +  ((char *)memchr((const void *)(target), c, target_len)) + + +static Py_ssize_t +countchar(const char *target, Py_ssize_t target_len, char c, +          Py_ssize_t maxcount) +{ +    Py_ssize_t count = 0; +    const char *start = target; +    const char *end = target + target_len; + +    while ((start = findchar(start, end - start, c)) != NULL) { +        count++; +        if (count >= maxcount) +            break; +        start += 1; +    } +    return count; +} + + +/* Algorithms for different cases of string replacement */ + +/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */ +static PyObject * +stringlib_replace_interleave(PyObject *self, +                             const char *to_s, Py_ssize_t to_len, +                             Py_ssize_t maxcount) +{ +    const char *self_s; +    char *result_s; +    Py_ssize_t self_len, result_len; +    Py_ssize_t count, i; +    PyObject *result; + +    self_len = STRINGLIB_LEN(self); + +    /* 1 at the end plus 1 after every character; +       count = min(maxcount, self_len + 1) */ +    if (maxcount <= self_len) { +        count = maxcount; +    } +    else { +        /* Can't overflow: self_len + 1 <= maxcount <= PY_SSIZE_T_MAX. */ +        count = self_len + 1; +    } + +    /* Check for overflow */ +    /*   result_len = count * to_len + self_len; */ +    assert(count > 0); +    if (to_len > (PY_SSIZE_T_MAX - self_len) / count) { +        PyErr_SetString(PyExc_OverflowError, +                        "replace bytes are too long"); +        return NULL; +    } +    result_len = count * to_len + self_len; +    result = STRINGLIB_NEW(NULL, result_len); +    if (result == NULL) { +        return NULL; +    } + +    self_s = STRINGLIB_STR(self); +    result_s = STRINGLIB_STR(result); + +    if (to_len > 1) { +        /* Lay the first one down (guaranteed this will occur) */ +        memcpy(result_s, to_s, to_len); +        result_s += to_len; +        count -= 1; + +        for (i = 0; i < count; i++) { +            *result_s++ = *self_s++; +            memcpy(result_s, to_s, to_len); +            result_s += to_len; +        } +    } +    else { +        result_s[0] = to_s[0]; +        result_s += to_len; +        count -= 1; +        for (i = 0; i < count; i++) { +            *result_s++ = *self_s++; +            result_s[0] = to_s[0]; +            result_s += to_len; +        } +    } + +    /* Copy the rest of the original string */ +    memcpy(result_s, self_s, self_len - i); + +    return result; +} + +/* Special case for deleting a single character */ +/* len(self)>=1, len(from)==1, to="", maxcount>=1 */ +static PyObject * +stringlib_replace_delete_single_character(PyObject *self, +                                          char from_c, Py_ssize_t maxcount) +{ +    const char *self_s, *start, *next, *end; +    char *result_s; +    Py_ssize_t self_len, result_len; +    Py_ssize_t count; +    PyObject *result; + +    self_len = STRINGLIB_LEN(self); +    self_s = STRINGLIB_STR(self); + +    count = countchar(self_s, self_len, from_c, maxcount); +    if (count == 0) { +        return return_self(self); +    } + +    result_len = self_len - count;  /* from_len == 1 */ +    assert(result_len>=0); + +    result = STRINGLIB_NEW(NULL, result_len); +    if (result == NULL) { +        return NULL; +    } +    result_s = STRINGLIB_STR(result); + +    start = self_s; +    end = self_s + self_len; +    while (count-- > 0) { +        next = findchar(start, end - start, from_c); +        if (next == NULL) +            break; +        memcpy(result_s, start, next - start); +        result_s += (next - start); +        start = next + 1; +    } +    memcpy(result_s, start, end - start); + +    return result; +} + +/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */ + +static PyObject * +stringlib_replace_delete_substring(PyObject *self, +                                   const char *from_s, Py_ssize_t from_len, +                                   Py_ssize_t maxcount) +{ +    const char *self_s, *start, *next, *end; +    char *result_s; +    Py_ssize_t self_len, result_len; +    Py_ssize_t count, offset; +    PyObject *result; + +    self_len = STRINGLIB_LEN(self); +    self_s = STRINGLIB_STR(self); + +    count = stringlib_count(self_s, self_len, +                            from_s, from_len, +                            maxcount); + +    if (count == 0) { +        /* no matches */ +        return return_self(self); +    } + +    result_len = self_len - (count * from_len); +    assert (result_len>=0); + +    result = STRINGLIB_NEW(NULL, result_len); +    if (result == NULL) { +        return NULL; +    } +    result_s = STRINGLIB_STR(result); + +    start = self_s; +    end = self_s + self_len; +    while (count-- > 0) { +        offset = stringlib_find(start, end - start, +                                from_s, from_len, +                                0); +        if (offset == -1) +            break; +        next = start + offset; + +        memcpy(result_s, start, next - start); + +        result_s += (next - start); +        start = next + from_len; +    } +    memcpy(result_s, start, end - start); +    return result; +} + +/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */ +static PyObject * +stringlib_replace_single_character_in_place(PyObject *self, +                                            char from_c, char to_c, +                                            Py_ssize_t maxcount) +{ +    const char *self_s, *end; +    char *result_s, *start, *next; +    Py_ssize_t self_len; +    PyObject *result; + +    /* The result string will be the same size */ +    self_s = STRINGLIB_STR(self); +    self_len = STRINGLIB_LEN(self); + +    next = findchar(self_s, self_len, from_c); + +    if (next == NULL) { +        /* No matches; return the original bytes */ +        return return_self(self); +    } + +    /* Need to make a new bytes */ +    result = STRINGLIB_NEW(NULL, self_len); +    if (result == NULL) { +        return NULL; +    } +    result_s = STRINGLIB_STR(result); +    memcpy(result_s, self_s, self_len); + +    /* change everything in-place, starting with this one */ +    start =  result_s + (next - self_s); +    *start = to_c; +    start++; +    end = result_s + self_len; + +    while (--maxcount > 0) { +        next = findchar(start, end - start, from_c); +        if (next == NULL) +            break; +        *next = to_c; +        start = next + 1; +    } + +    return result;  } + +/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */ +static PyObject * +stringlib_replace_substring_in_place(PyObject *self, +                                     const char *from_s, Py_ssize_t from_len, +                                     const char *to_s, Py_ssize_t to_len, +                                     Py_ssize_t maxcount) +{ +    const char *self_s, *end; +    char *result_s, *start; +    Py_ssize_t self_len, offset; +    PyObject *result; + +    /* The result bytes will be the same size */ + +    self_s = STRINGLIB_STR(self); +    self_len = STRINGLIB_LEN(self); + +    offset = stringlib_find(self_s, self_len, +                            from_s, from_len, +                            0); +    if (offset == -1) { +        /* No matches; return the original bytes */ +        return return_self(self); +    } + +    /* Need to make a new bytes */ +    result = STRINGLIB_NEW(NULL, self_len); +    if (result == NULL) { +        return NULL; +    } +    result_s = STRINGLIB_STR(result); +    memcpy(result_s, self_s, self_len); + +    /* change everything in-place, starting with this one */ +    start =  result_s + offset; +    memcpy(start, to_s, from_len); +    start += from_len; +    end = result_s + self_len; + +    while ( --maxcount > 0) { +        offset = stringlib_find(start, end - start, +                                from_s, from_len, +                                0); +        if (offset == -1) +            break; +        memcpy(start + offset, to_s, from_len); +        start += offset + from_len; +    } + +    return result; +} + +/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */ +static PyObject * +stringlib_replace_single_character(PyObject *self, +                                   char from_c, +                                   const char *to_s, Py_ssize_t to_len, +                                   Py_ssize_t maxcount) +{ +    const char *self_s, *start, *next, *end; +    char *result_s; +    Py_ssize_t self_len, result_len; +    Py_ssize_t count; +    PyObject *result; + +    self_s = STRINGLIB_STR(self); +    self_len = STRINGLIB_LEN(self); + +    count = countchar(self_s, self_len, from_c, maxcount); +    if (count == 0) { +        /* no matches, return unchanged */ +        return return_self(self); +    } + +    /* use the difference between current and new, hence the "-1" */ +    /*   result_len = self_len + count * (to_len-1)  */ +    assert(count > 0); +    if (to_len - 1 > (PY_SSIZE_T_MAX - self_len) / count) { +        PyErr_SetString(PyExc_OverflowError, "replace bytes is too long"); +        return NULL; +    } +    result_len = self_len + count * (to_len - 1); + +    result = STRINGLIB_NEW(NULL, result_len); +    if (result == NULL) { +        return NULL; +    } +    result_s = STRINGLIB_STR(result); + +    start = self_s; +    end = self_s + self_len; +    while (count-- > 0) { +        next = findchar(start, end - start, from_c); +        if (next == NULL) +            break; + +        if (next == start) { +            /* replace with the 'to' */ +            memcpy(result_s, to_s, to_len); +            result_s += to_len; +            start += 1; +        } else { +            /* copy the unchanged old then the 'to' */ +            memcpy(result_s, start, next - start); +            result_s += (next - start); +            memcpy(result_s, to_s, to_len); +            result_s += to_len; +            start = next + 1; +        } +    } +    /* Copy the remainder of the remaining bytes */ +    memcpy(result_s, start, end - start); + +    return result; +} + +/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */ +static PyObject * +stringlib_replace_substring(PyObject *self, +                            const char *from_s, Py_ssize_t from_len, +                            const char *to_s, Py_ssize_t to_len, +                            Py_ssize_t maxcount) +{ +    const char *self_s, *start, *next, *end; +    char *result_s; +    Py_ssize_t self_len, result_len; +    Py_ssize_t count, offset; +    PyObject *result; + +    self_s = STRINGLIB_STR(self); +    self_len = STRINGLIB_LEN(self); + +    count = stringlib_count(self_s, self_len, +                            from_s, from_len, +                            maxcount); + +    if (count == 0) { +        /* no matches, return unchanged */ +        return return_self(self); +    } + +    /* Check for overflow */ +    /*    result_len = self_len + count * (to_len-from_len) */ +    assert(count > 0); +    if (to_len - from_len > (PY_SSIZE_T_MAX - self_len) / count) { +        PyErr_SetString(PyExc_OverflowError, "replace bytes is too long"); +        return NULL; +    } +    result_len = self_len + count * (to_len - from_len); + +    result = STRINGLIB_NEW(NULL, result_len); +    if (result == NULL) { +        return NULL; +    } +    result_s = STRINGLIB_STR(result); + +    start = self_s; +    end = self_s + self_len; +    while (count-- > 0) { +        offset = stringlib_find(start, end - start, +                                from_s, from_len, +                                0); +        if (offset == -1) +            break; +        next = start + offset; +        if (next == start) { +            /* replace with the 'to' */ +            memcpy(result_s, to_s, to_len); +            result_s += to_len; +            start += from_len; +        } else { +            /* copy the unchanged old then the 'to' */ +            memcpy(result_s, start, next - start); +            result_s += (next - start); +            memcpy(result_s, to_s, to_len); +            result_s += to_len; +            start = next + from_len; +        } +    } +    /* Copy the remainder of the remaining bytes */ +    memcpy(result_s, start, end - start); + +    return result; +} + + +static PyObject * +stringlib_replace(PyObject *self, +                  const char *from_s, Py_ssize_t from_len, +                  const char *to_s, Py_ssize_t to_len, +                  Py_ssize_t maxcount) +{ +    if (maxcount < 0) { +        maxcount = PY_SSIZE_T_MAX; +    } else if (maxcount == 0 || STRINGLIB_LEN(self) == 0) { +        /* nothing to do; return the original bytes */ +        return return_self(self); +    } + +    /* Handle zero-length special cases */ +    if (from_len == 0) { +        if (to_len == 0) { +            /* nothing to do; return the original bytes */ +            return return_self(self); +        } +        /* insert the 'to' bytes everywhere.    */ +        /*    >>> b"Python".replace(b"", b".")  */ +        /*    b'.P.y.t.h.o.n.'                  */ +        return stringlib_replace_interleave(self, to_s, to_len, maxcount); +    } + +    /* Except for b"".replace(b"", b"A") == b"A" there is no way beyond this */ +    /* point for an empty self bytes to generate a non-empty bytes */ +    /* Special case so the remaining code always gets a non-empty bytes */ +    if (STRINGLIB_LEN(self) == 0) { +        return return_self(self); +    } + +    if (to_len == 0) { +        /* delete all occurrences of 'from' bytes */ +        if (from_len == 1) { +            return stringlib_replace_delete_single_character( +                self, from_s[0], maxcount); +        } else { +            return stringlib_replace_delete_substring( +                self, from_s, from_len, maxcount); +        } +    } + +    /* Handle special case where both bytes have the same length */ + +    if (from_len == to_len) { +        if (from_len == 1) { +            return stringlib_replace_single_character_in_place( +                self, from_s[0], to_s[0], maxcount); +        } else { +            return stringlib_replace_substring_in_place( +                self, from_s, from_len, to_s, to_len, maxcount); +        } +    } + +    /* Otherwise use the more generic algorithms */ +    if (from_len == 1) { +        return stringlib_replace_single_character( +            self, from_s[0], to_s, to_len, maxcount); +    } else { +        /* len('from')>=2, len('to')>=1 */ +        return stringlib_replace_substring( +            self, from_s, from_len, to_s, to_len, maxcount); +    } +} + +#undef findchar diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h index be09b5f6fa..14fa28ea54 100644 --- a/Objects/stringlib/unicode_format.h +++ b/Objects/stringlib/unicode_format.h @@ -67,7 +67,7 @@ SubString_new_object(SubString *str)      return PyUnicode_Substring(str->str, str->start, str->end);  } -/* return a new string.  if str->str is NULL, return None */ +/* return a new string.  if str->str is NULL, return a new empty string */  Py_LOCAL_INLINE(PyObject *)  SubString_new_object_or_empty(SubString *str)  { | 
