diff options
Diffstat (limited to 'Objects/stringobject.c')
-rw-r--r-- | Objects/stringobject.c | 1408 |
1 files changed, 970 insertions, 438 deletions
diff --git a/Objects/stringobject.c b/Objects/stringobject.c index b34dcb2110..110c38ee01 100644 --- a/Objects/stringobject.c +++ b/Objects/stringobject.c @@ -1,6 +1,7 @@ /* String object implementation */ #define PY_SSIZE_T_CLEAN + #include "Python.h" #include <ctype.h> @@ -176,14 +177,11 @@ PyString_FromFormatV(const char *format, va_list vargs) while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f))) ; - /* skip the 'l' in %ld, since it doesn't change the - width. although only %d is supported (see - "expand" section below), others can be easily - added */ - if (*f == 'l' && *(f+1) == 'd') - ++f; - /* likewise for %zd */ - if (*f == 'z' && *(f+1) == 'd') + /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since + * they don't affect the amount of space we reserve. + */ + if ((*f == 'l' || *f == 'z') && + (f[1] == 'd' || f[1] == 'u')) ++f; switch (*f) { @@ -193,7 +191,7 @@ PyString_FromFormatV(const char *format, va_list vargs) case '%': n++; break; - case 'd': case 'i': case 'x': + case 'd': case 'u': case 'i': case 'x': (void) va_arg(count, int); /* 20 bytes is enough to hold a 64-bit integer. Decimal takes the most space. @@ -255,14 +253,14 @@ PyString_FromFormatV(const char *format, va_list vargs) } while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f))) f++; - /* handle the long flag, but only for %ld. others - can be added when necessary. */ - if (*f == 'l' && *(f+1) == 'd') { + /* handle the long flag, but only for %ld and %lu. + others can be added when necessary. */ + if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { longflag = 1; ++f; } /* handle the size_t flag. */ - if (*f == 'z' && *(f+1) == 'd') { + if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { size_tflag = 1; ++f; } @@ -275,10 +273,22 @@ PyString_FromFormatV(const char *format, va_list vargs) if (longflag) sprintf(s, "%ld", va_arg(vargs, long)); else if (size_tflag) + sprintf(s, "%" PY_FORMAT_SIZE_T "d", + va_arg(vargs, Py_ssize_t)); + else + sprintf(s, "%d", va_arg(vargs, int)); + s += strlen(s); + break; + case 'u': + if (longflag) + sprintf(s, "%lu", + va_arg(vargs, unsigned long)); + else if (size_tflag) sprintf(s, "%" PY_FORMAT_SIZE_T "u", va_arg(vargs, size_t)); else - sprintf(s, "%d", va_arg(vargs, int)); + sprintf(s, "%u", + va_arg(vargs, unsigned int)); s += strlen(s); break; case 'i': @@ -680,6 +690,9 @@ PyObject *PyString_DecodeEscape(const char *s, return NULL; } +/* -------------------------------------------------------------------- */ +/* object api */ + static Py_ssize_t string_getsize(register PyObject *op) { @@ -754,8 +767,25 @@ PyString_AsStringAndSize(register PyObject *obj, return 0; } +/* -------------------------------------------------------------------- */ /* Methods */ +#define STRINGLIB_CHAR char + +#define STRINGLIB_CMP memcmp +#define STRINGLIB_LEN PyString_GET_SIZE +#define STRINGLIB_NEW PyString_FromStringAndSize +#define STRINGLIB_STR PyString_AS_STRING + +#define STRINGLIB_EMPTY nullstring + +#include "stringlib/fastsearch.h" + +#include "stringlib/count.h" +#include "stringlib/find.h" +#include "stringlib/partition.h" + + static int string_print(PyStringObject *op, FILE *fp, int flags) { @@ -900,7 +930,7 @@ string_length(PyStringObject *a) static PyObject * string_concat(register PyStringObject *a, register PyObject *bb) { - register size_t size; + register Py_ssize_t size; register PyStringObject *op; if (!PyString_Check(bb)) { #ifdef Py_USING_UNICODE @@ -924,7 +954,12 @@ string_concat(register PyStringObject *a, register PyObject *bb) return (PyObject *)a; } size = a->ob_size + b->ob_size; - /* XXX check overflow */ + if (size < 0) { + PyErr_SetString(PyExc_OverflowError, + "strings are too large to concat"); + return NULL; + } + /* Inline PyObject_NewVar */ op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size); if (op == NULL) @@ -1017,65 +1052,36 @@ string_slice(register PyStringObject *a, register Py_ssize_t i, } static int -string_contains(PyObject *a, PyObject *el) +string_contains(PyObject *str_obj, PyObject *sub_obj) { - char *s = PyString_AS_STRING(a); - const char *sub = PyString_AS_STRING(el); - char *last; - Py_ssize_t len_sub = PyString_GET_SIZE(el); - Py_ssize_t shortsub; - char firstchar, lastchar; - - if (!PyString_CheckExact(el)) { + if (!PyString_CheckExact(sub_obj)) { #ifdef Py_USING_UNICODE - if (PyUnicode_Check(el)) - return PyUnicode_Contains(a, el); + if (PyUnicode_Check(sub_obj)) + return PyUnicode_Contains(str_obj, sub_obj); #endif - if (!PyString_Check(el)) { + if (!PyString_Check(sub_obj)) { PyErr_SetString(PyExc_TypeError, "'in <string>' requires string as left operand"); return -1; } } - if (len_sub == 0) - return 1; - /* last points to one char beyond the start of the rightmost - substring. When s<last, there is still room for a possible match - and s[0] through s[len_sub-1] will be in bounds. - shortsub is len_sub minus the last character which is checked - separately just before the memcmp(). That check helps prevent - false starts and saves the setup time for memcmp(). - */ - firstchar = sub[0]; - shortsub = len_sub - 1; - lastchar = sub[shortsub]; - last = s + PyString_GET_SIZE(a) - len_sub + 1; - while (s < last) { - s = (char *)memchr(s, firstchar, last-s); - if (s == NULL) - return 0; - assert(s < last); - if (s[shortsub] == lastchar && memcmp(s, sub, shortsub) == 0) - return 1; - s++; - } - return 0; + return stringlib_contains_obj(str_obj, sub_obj); } static PyObject * string_item(PyStringObject *a, register Py_ssize_t i) { + char pchar; PyObject *v; - char *pchar; if (i < 0 || i >= a->ob_size) { PyErr_SetString(PyExc_IndexError, "string index out of range"); return NULL; } - pchar = a->ob_sval + i; - v = (PyObject *)characters[*pchar & UCHAR_MAX]; + pchar = a->ob_sval[i]; + v = (PyObject *)characters[pchar & UCHAR_MAX]; if (v == NULL) - v = PyString_FromStringAndSize(pchar, 1); + v = PyString_FromStringAndSize(&pchar, 1); else { #ifdef COUNT_ALLOCS one_strings++; @@ -1151,9 +1157,8 @@ string_richcompare(PyStringObject *a, PyStringObject *b, int op) int _PyString_Eq(PyObject *o1, PyObject *o2) { - PyStringObject *a, *b; - a = (PyStringObject*)o1; - b = (PyStringObject*)o2; + PyStringObject *a = (PyStringObject*) o1; + PyStringObject *b = (PyStringObject*) o2; return a->ob_size == b->ob_size && *a->ob_sval == *b->ob_sval && memcmp(a->ob_sval, b->ob_sval, a->ob_size) == 0; @@ -1308,6 +1313,27 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; #define STRIPNAME(i) (stripformat[i]+3) + +/* Don't call if length < 2 */ +#define Py_STRING_MATCH(target, offset, pattern, length) \ + (target[offset] == pattern[0] && \ + target[offset+length-1] == pattern[length-1] && \ + !memcmp(target+offset+1, pattern+1, length-2) ) + + +/* Overallocate the initial list to reduce the number of reallocs for small + split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three + resizes, to sizes 4, 8, then 16. Most observed string splits are for human + text (roughly 11 words per line) and field delimited data (usually 1-10 + fields). For large strings the split algorithms are bandwidth limited + so increasing the preallocation likely will not improve things.*/ + +#define MAX_PREALLOC 12 + +/* 5 splits gives 6 elements */ +#define PREALLOC_SIZE(maxsplit) \ + (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) + #define SPLIT_APPEND(data, left, right) \ str = PyString_FromStringAndSize((data) + (left), \ (right) - (left)); \ @@ -1320,74 +1346,90 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; else \ Py_DECREF(str); -#define SPLIT_INSERT(data, left, right) \ +#define SPLIT_ADD(data, left, right) { \ str = PyString_FromStringAndSize((data) + (left), \ (right) - (left)); \ if (str == NULL) \ goto onError; \ - if (PyList_Insert(list, 0, str)) { \ - Py_DECREF(str); \ - goto onError; \ + if (count < MAX_PREALLOC) { \ + PyList_SET_ITEM(list, count, str); \ + } else { \ + if (PyList_Append(list, str)) { \ + Py_DECREF(str); \ + goto onError; \ + } \ + else \ + Py_DECREF(str); \ } \ - else \ - Py_DECREF(str); + count++; } -static PyObject * +/* Always force the list to the expected size. */ +#define FIX_PREALLOC_SIZE(list) ((PyListObject *)list)->ob_size = count; + +#define SKIP_SPACE(s, i, len) { while (i<len && isspace(Py_CHARMASK(s[i]))) i++; } +#define SKIP_NONSPACE(s, i, len) { while (i<len && !isspace(Py_CHARMASK(s[i]))) i++; } +#define RSKIP_SPACE(s, i) { while (i>=0 && isspace(Py_CHARMASK(s[i]))) i--; } +#define RSKIP_NONSPACE(s, i) { while (i>=0 && !isspace(Py_CHARMASK(s[i]))) i--; } + +Py_LOCAL_INLINE(PyObject *) split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxsplit) { - Py_ssize_t i, j; + Py_ssize_t i, j, count=0; PyObject *str; - PyObject *list = PyList_New(0); + PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit)); if (list == NULL) return NULL; - for (i = j = 0; i < len; ) { - while (i < len && isspace(Py_CHARMASK(s[i]))) - i++; - j = i; - while (i < len && !isspace(Py_CHARMASK(s[i]))) - i++; - if (j < i) { - if (maxsplit-- <= 0) - break; - SPLIT_APPEND(s, j, i); - while (i < len && isspace(Py_CHARMASK(s[i]))) - i++; - j = i; - } + i = j = 0; + + while (maxsplit-- > 0) { + SKIP_SPACE(s, i, len); + if (i==len) break; + j = i; i++; + SKIP_NONSPACE(s, i, len); + SPLIT_ADD(s, j, i); } - if (j < len) { - SPLIT_APPEND(s, j, len); + + if (i < len) { + /* Only occurs when maxsplit was reached */ + /* Skip any remaining whitespace and copy to end of string */ + SKIP_SPACE(s, i, len); + if (i != len) + SPLIT_ADD(s, i, len); } + FIX_PREALLOC_SIZE(list); return list; onError: Py_DECREF(list); return NULL; } -static PyObject * +Py_LOCAL_INLINE(PyObject *) split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount) { - register Py_ssize_t i, j; + register Py_ssize_t i, j, count=0; PyObject *str; - PyObject *list = PyList_New(0); + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; - for (i = j = 0; i < len; ) { - if (s[i] == ch) { - if (maxcount-- <= 0) + i = j = 0; + while ((j < len) && (maxcount-- > 0)) { + for(; j<len; j++) { + /* I found that using memchr makes no difference */ + if (s[j] == ch) { + SPLIT_ADD(s, i, j); + i = j = j + 1; break; - SPLIT_APPEND(s, j, i); - i = j = i + 1; - } else - i++; + } + } } - if (j <= len) { - SPLIT_APPEND(s, j, len); + if (i <= len) { + SPLIT_ADD(s, i, len); } + FIX_PREALLOC_SIZE(list); return list; onError: @@ -1407,10 +1449,12 @@ static PyObject * string_split(PyStringObject *self, PyObject *args) { Py_ssize_t len = PyString_GET_SIZE(self), n, i, j; - int err; - Py_ssize_t maxsplit = -1; + Py_ssize_t maxsplit = -1, count=0; const char *s = PyString_AS_STRING(self), *sub; - PyObject *list, *item, *subobj = Py_None; + PyObject *list, *str, *subobj = Py_None; +#ifdef USE_FAST + Py_ssize_t pos; +#endif if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit)) return NULL; @@ -1436,98 +1480,166 @@ string_split(PyStringObject *self, PyObject *args) else if (n == 1) return split_char(s, len, sub[0], maxsplit); - list = PyList_New(0); + list = PyList_New(PREALLOC_SIZE(maxsplit)); if (list == NULL) return NULL; +#ifdef USE_FAST i = j = 0; - while (i+n <= len) { - if (s[i] == sub[0] && memcmp(s+i, sub, n) == 0) { - if (maxsplit-- <= 0) + while (maxsplit-- > 0) { + pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH); + if (pos < 0) + break; + j = i+pos; + SPLIT_ADD(s, i, j); + i = j + n; + + } +#else + i = j = 0; + while ((j+n <= len) && (maxsplit-- > 0)) { + for (; j+n <= len; j++) { + if (Py_STRING_MATCH(s, j, sub, n)) { + SPLIT_ADD(s, i, j); + i = j = j + n; break; - item = PyString_FromStringAndSize(s+j, i-j); - if (item == NULL) - goto fail; - err = PyList_Append(list, item); - Py_DECREF(item); - if (err < 0) - goto fail; - i = j = i + n; + } } - else - i++; } - item = PyString_FromStringAndSize(s+j, len-j); - if (item == NULL) - goto fail; - err = PyList_Append(list, item); - Py_DECREF(item); - if (err < 0) - goto fail; - +#endif + SPLIT_ADD(s, i, len); + FIX_PREALLOC_SIZE(list); return list; - fail: + onError: Py_DECREF(list); return NULL; } +PyDoc_STRVAR(partition__doc__, +"S.partition(sep) -> (head, sep, tail)\n\ +\n\ +Searches for the separator sep in S, and returns the part before it,\n\ +the separator itself, and the part after it. If the separator is not\n\ +found, returns S and two empty strings."); + +static PyObject * +string_partition(PyStringObject *self, PyObject *sep_obj) +{ + const char *sep; + Py_ssize_t sep_len; + + if (PyString_Check(sep_obj)) { + sep = PyString_AS_STRING(sep_obj); + sep_len = PyString_GET_SIZE(sep_obj); + } +#ifdef Py_USING_UNICODE + else if (PyUnicode_Check(sep_obj)) + return PyUnicode_Partition((PyObject *) self, sep_obj); +#endif + else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len)) + return NULL; + + return stringlib_partition( + (PyObject*) self, + PyString_AS_STRING(self), PyString_GET_SIZE(self), + sep_obj, sep, sep_len + ); +} + +PyDoc_STRVAR(rpartition__doc__, +"S.rpartition(sep) -> (head, sep, tail)\n\ +\n\ +Searches for the separator sep in S, starting at the end of S, and returns\n\ +the part before it, the separator itself, and the part after it. If the\n\ +separator is not found, returns S and two empty strings."); + static PyObject * +string_rpartition(PyStringObject *self, PyObject *sep_obj) +{ + const char *sep; + Py_ssize_t sep_len; + + if (PyString_Check(sep_obj)) { + sep = PyString_AS_STRING(sep_obj); + sep_len = PyString_GET_SIZE(sep_obj); + } +#ifdef Py_USING_UNICODE + else if (PyUnicode_Check(sep_obj)) + return PyUnicode_Partition((PyObject *) self, sep_obj); +#endif + else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len)) + return NULL; + + return stringlib_rpartition( + (PyObject*) self, + PyString_AS_STRING(self), PyString_GET_SIZE(self), + sep_obj, sep, sep_len + ); +} + +Py_LOCAL_INLINE(PyObject *) rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxsplit) { - Py_ssize_t i, j; + Py_ssize_t i, j, count=0; PyObject *str; - PyObject *list = PyList_New(0); + PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit)); if (list == NULL) return NULL; - for (i = j = len - 1; i >= 0; ) { - while (i >= 0 && isspace(Py_CHARMASK(s[i]))) - i--; - j = i; - while (i >= 0 && !isspace(Py_CHARMASK(s[i]))) - i--; - if (j > i) { - if (maxsplit-- <= 0) - break; - SPLIT_INSERT(s, i + 1, j + 1); - while (i >= 0 && isspace(Py_CHARMASK(s[i]))) - i--; - j = i; - } - } - if (j >= 0) { - SPLIT_INSERT(s, 0, j + 1); - } + i = j = len-1; + + while (maxsplit-- > 0) { + RSKIP_SPACE(s, i); + if (i<0) break; + j = i; i--; + RSKIP_NONSPACE(s, i); + SPLIT_ADD(s, i + 1, j + 1); + } + if (i >= 0) { + /* Only occurs when maxsplit was reached */ + /* Skip any remaining whitespace and copy to beginning of string */ + RSKIP_SPACE(s, i); + if (i >= 0) + SPLIT_ADD(s, 0, i + 1); + + } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; return list; onError: Py_DECREF(list); return NULL; } -static PyObject * +Py_LOCAL_INLINE(PyObject *) rsplit_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount) { - register Py_ssize_t i, j; + register Py_ssize_t i, j, count=0; PyObject *str; - PyObject *list = PyList_New(0); + PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) return NULL; - for (i = j = len - 1; i >= 0; ) { - if (s[i] == ch) { - if (maxcount-- <= 0) + i = j = len - 1; + while ((i >= 0) && (maxcount-- > 0)) { + for (; i >= 0; i--) { + if (s[i] == ch) { + SPLIT_ADD(s, i + 1, j + 1); + j = i = i - 1; break; - SPLIT_INSERT(s, i + 1, j + 1); - j = i = i - 1; - } else - i--; + } + } } if (j >= -1) { - SPLIT_INSERT(s, 0, j + 1); + SPLIT_ADD(s, 0, j + 1); } + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; return list; onError: @@ -1548,10 +1660,9 @@ static PyObject * string_rsplit(PyStringObject *self, PyObject *args) { Py_ssize_t len = PyString_GET_SIZE(self), n, i, j; - int err; - Py_ssize_t maxsplit = -1; + Py_ssize_t maxsplit = -1, count=0; const char *s = PyString_AS_STRING(self), *sub; - PyObject *list, *item, *subobj = Py_None; + PyObject *list, *str, *subobj = Py_None; if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit)) return NULL; @@ -1577,40 +1688,30 @@ string_rsplit(PyStringObject *self, PyObject *args) else if (n == 1) return rsplit_char(s, len, sub[0], maxsplit); - list = PyList_New(0); + list = PyList_New(PREALLOC_SIZE(maxsplit)); if (list == NULL) return NULL; j = len; i = j - n; - while (i >= 0) { - if (s[i] == sub[0] && memcmp(s+i, sub, n) == 0) { - if (maxsplit-- <= 0) + + while ( (i >= 0) && (maxsplit-- > 0) ) { + for (; i>=0; i--) { + if (Py_STRING_MATCH(s, i, sub, n)) { + SPLIT_ADD(s, i + n, j); + j = i; + i -= n; break; - item = PyString_FromStringAndSize(s+i+n, j-i-n); - if (item == NULL) - goto fail; - err = PyList_Insert(list, 0, item); - Py_DECREF(item); - if (err < 0) - goto fail; - j = i; - i -= n; + } } - else - i--; } - item = PyString_FromStringAndSize(s, j); - if (item == NULL) - goto fail; - err = PyList_Insert(list, 0, item); - Py_DECREF(item); - if (err < 0) - goto fail; - + SPLIT_ADD(s, 0, j); + FIX_PREALLOC_SIZE(list); + if (PyList_Reverse(list) < 0) + goto onError; return list; - fail: +onError: Py_DECREF(list); return NULL; } @@ -1727,7 +1828,7 @@ _PyString_Join(PyObject *sep, PyObject *x) return string_join((PyStringObject *)sep, x); } -static void +Py_LOCAL_INLINE(void) string_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len) { if (*end > len) @@ -1742,50 +1843,38 @@ string_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len) *start = 0; } -static Py_ssize_t +Py_LOCAL_INLINE(Py_ssize_t) string_find_internal(PyStringObject *self, PyObject *args, int dir) { - const char *s = PyString_AS_STRING(self), *sub; - Py_ssize_t len = PyString_GET_SIZE(self); - Py_ssize_t n, i = 0, last = PY_SSIZE_T_MAX; PyObject *subobj; + const char *sub; + Py_ssize_t sub_len; + Py_ssize_t start=0, end=PY_SSIZE_T_MAX; /* XXX ssize_t i */ - if (!PyArg_ParseTuple(args, "O|O&O&:find/rfind/index/rindex", - &subobj, _PyEval_SliceIndex, &i, _PyEval_SliceIndex, &last)) + if (!PyArg_ParseTuple(args, "O|O&O&:find/rfind/index/rindex", &subobj, + _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) return -2; if (PyString_Check(subobj)) { sub = PyString_AS_STRING(subobj); - n = PyString_GET_SIZE(subobj); + sub_len = PyString_GET_SIZE(subobj); } #ifdef Py_USING_UNICODE else if (PyUnicode_Check(subobj)) - return PyUnicode_Find((PyObject *)self, subobj, i, last, dir); + return PyUnicode_Find( + (PyObject *)self, subobj, start, end, dir); #endif - else if (PyObject_AsCharBuffer(subobj, &sub, &n)) + else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len)) return -2; - string_adjust_indices(&i, &last, len); - - if (dir > 0) { - if (n == 0 && i <= last) - return (long)i; - last -= n; - for (; i <= last; ++i) - if (s[i] == sub[0] && memcmp(&s[i], sub, n) == 0) - return (long)i; - } - else { - Py_ssize_t j; - - if (n == 0 && i <= last) - return last; - for (j = last-n; j >= i; --j) - if (s[j] == sub[0] && memcmp(&s[j], sub, n) == 0) - return j; - } - - return -1; + if (dir > 0) + return stringlib_find_slice( + PyString_AS_STRING(self), PyString_GET_SIZE(self), + sub, sub_len, start, end); + else + return stringlib_rfind_slice( + PyString_AS_STRING(self), PyString_GET_SIZE(self), + sub, sub_len, start, end); } @@ -1867,7 +1956,7 @@ string_rindex(PyStringObject *self, PyObject *args) } -static PyObject * +Py_LOCAL_INLINE(PyObject *) do_xstrip(PyStringObject *self, int striptype, PyObject *sepobj) { char *s = PyString_AS_STRING(self); @@ -1900,7 +1989,7 @@ do_xstrip(PyStringObject *self, int striptype, PyObject *sepobj) } -static PyObject * +Py_LOCAL_INLINE(PyObject *) do_strip(PyStringObject *self, int striptype) { char *s = PyString_AS_STRING(self); @@ -1930,7 +2019,7 @@ do_strip(PyStringObject *self, int striptype) } -static PyObject * +Py_LOCAL_INLINE(PyObject *) do_argstrip(PyStringObject *self, int striptype, PyObject *args) { PyObject *sep = NULL; @@ -2024,57 +2113,68 @@ PyDoc_STRVAR(lower__doc__, \n\ Return a copy of the string S converted to lowercase."); +/* _tolower and _toupper are defined by SUSv2, but they're not ISO C */ +#ifndef _tolower +#define _tolower tolower +#endif + static PyObject * string_lower(PyStringObject *self) { - char *s = PyString_AS_STRING(self), *s_new; + char *s; Py_ssize_t i, n = PyString_GET_SIZE(self); PyObject *newobj; newobj = PyString_FromStringAndSize(NULL, n); - if (newobj == NULL) + if (!newobj) return NULL; - s_new = PyString_AsString(newobj); + + s = PyString_AS_STRING(newobj); + + memcpy(s, PyString_AS_STRING(self), n); + for (i = 0; i < n; i++) { - int c = Py_CHARMASK(*s++); - if (isupper(c)) { - *s_new = tolower(c); - } else - *s_new = c; - s_new++; + int c = Py_CHARMASK(s[i]); + if (isupper(c)) + s[i] = _tolower(c); } + return newobj; } - PyDoc_STRVAR(upper__doc__, "S.upper() -> string\n\ \n\ Return a copy of the string S converted to uppercase."); +#ifndef _toupper +#define _toupper toupper +#endif + static PyObject * string_upper(PyStringObject *self) { - char *s = PyString_AS_STRING(self), *s_new; + char *s; Py_ssize_t i, n = PyString_GET_SIZE(self); PyObject *newobj; newobj = PyString_FromStringAndSize(NULL, n); - if (newobj == NULL) + if (!newobj) return NULL; - s_new = PyString_AsString(newobj); + + s = PyString_AS_STRING(newobj); + + memcpy(s, PyString_AS_STRING(self), n); + for (i = 0; i < n; i++) { - int c = Py_CHARMASK(*s++); - if (islower(c)) { - *s_new = toupper(c); - } else - *s_new = c; - s_new++; + int c = Py_CHARMASK(s[i]); + if (islower(c)) + s[i] = _toupper(c); } + return newobj; } - PyDoc_STRVAR(title__doc__, "S.title() -> string\n\ \n\ @@ -2150,62 +2250,44 @@ string_capitalize(PyStringObject *self) PyDoc_STRVAR(count__doc__, "S.count(sub[, start[, end]]) -> int\n\ \n\ -Return the number of occurrences of substring sub in string\n\ -S[start:end]. Optional arguments start and end are\n\ -interpreted as in slice notation."); +Return the number of non-overlapping occurrences of substring sub in\n\ +string S[start:end]. Optional arguments start and end are interpreted\n\ +as in slice notation."); static PyObject * string_count(PyStringObject *self, PyObject *args) { - const char *s = PyString_AS_STRING(self), *sub, *t; - Py_ssize_t len = PyString_GET_SIZE(self), n; - Py_ssize_t i = 0, last = PY_SSIZE_T_MAX; - Py_ssize_t m, r; - PyObject *subobj; + PyObject *sub_obj; + const char *str = PyString_AS_STRING(self), *sub; + Py_ssize_t sub_len; + Py_ssize_t start = 0, end = PY_SSIZE_T_MAX; - if (!PyArg_ParseTuple(args, "O|O&O&:count", &subobj, - _PyEval_SliceIndex, &i, _PyEval_SliceIndex, &last)) + if (!PyArg_ParseTuple(args, "O|O&O&:count", &sub_obj, + _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) return NULL; - if (PyString_Check(subobj)) { - sub = PyString_AS_STRING(subobj); - n = PyString_GET_SIZE(subobj); + if (PyString_Check(sub_obj)) { + sub = PyString_AS_STRING(sub_obj); + sub_len = PyString_GET_SIZE(sub_obj); } #ifdef Py_USING_UNICODE - else if (PyUnicode_Check(subobj)) { + else if (PyUnicode_Check(sub_obj)) { Py_ssize_t count; - count = PyUnicode_Count((PyObject *)self, subobj, i, last); + count = PyUnicode_Count((PyObject *)self, sub_obj, start, end); if (count == -1) return NULL; else - return PyInt_FromLong((long) count); + return PyInt_FromSsize_t(count); } #endif - else if (PyObject_AsCharBuffer(subobj, &sub, &n)) + else if (PyObject_AsCharBuffer(sub_obj, &sub, &sub_len)) return NULL; - string_adjust_indices(&i, &last, len); - - m = last + 1 - n; - if (n == 0) - return PyInt_FromSsize_t(m-i); + string_adjust_indices(&start, &end, PyString_GET_SIZE(self)); - r = 0; - while (i < m) { - if (!memcmp(s+i, sub, n)) { - r++; - i += n; - } else { - i++; - } - if (i >= m) - break; - t = (const char *)memchr(s+i, sub[0], m-i); - if (t == NULL) - break; - i = t - s; - } - return PyInt_FromSsize_t(r); + return PyInt_FromSsize_t( + stringlib_count(str + start, end - start, sub, sub_len) + ); } PyDoc_STRVAR(swapcase__doc__, @@ -2359,156 +2441,616 @@ string_translate(PyStringObject *self, PyObject *args) } -/* What follows is used for implementing replace(). Perry Stoll. */ +#define FORWARD 1 +#define REVERSE -1 -/* - mymemfind +/* find and count characters and substrings */ - strstr replacement for arbitrary blocks of memory. +#define findchar(target, target_len, c) \ + ((char *)memchr((const void *)(target), c, target_len)) - Locates the first occurrence in the memory pointed to by MEM of the - contents of memory pointed to by PAT. Returns the index into MEM if - found, or -1 if not found. If len of PAT is greater than length of - MEM, the function returns -1. -*/ -static Py_ssize_t -mymemfind(const char *mem, Py_ssize_t len, const char *pat, Py_ssize_t pat_len) +/* String ops must return a string. */ +/* If the object is subclass of string, create a copy */ +Py_LOCAL(PyStringObject *) +return_self(PyStringObject *self) { - register Py_ssize_t ii; + if (PyString_CheckExact(self)) { + Py_INCREF(self); + return self; + } + return (PyStringObject *)PyString_FromStringAndSize( + PyString_AS_STRING(self), + PyString_GET_SIZE(self)); +} - /* pattern can not occur in the last pat_len-1 chars */ - len -= pat_len; +Py_LOCAL_INLINE(Py_ssize_t) +countchar(char *target, int target_len, char c, Py_ssize_t maxcount) +{ + Py_ssize_t count=0; + char *start=target; + char *end=target+target_len; - for (ii = 0; ii <= len; ii++) { - if (mem[ii] == pat[0] && memcmp(&mem[ii], pat, pat_len) == 0) { - return ii; - } + while ( (start=findchar(start, end-start, c)) != NULL ) { + count++; + if (count >= maxcount) + break; + start += 1; + } + return count; +} + +Py_LOCAL(Py_ssize_t) +findstring(char *target, Py_ssize_t target_len, + char *pattern, Py_ssize_t pattern_len, + Py_ssize_t start, + Py_ssize_t end, + int direction) +{ + if (start < 0) { + start += target_len; + if (start < 0) + start = 0; + } + if (end > target_len) { + end = target_len; + } else if (end < 0) { + end += target_len; + if (end < 0) + end = 0; + } + + /* zero-length substrings always match at the first attempt */ + if (pattern_len == 0) + return (direction > 0) ? start : end; + + end -= pattern_len; + + if (direction < 0) { + for (; end >= start; end--) + if (Py_STRING_MATCH(target, end, pattern, pattern_len)) + return end; + } else { + for (; start <= end; start++) + if (Py_STRING_MATCH(target, start, pattern, pattern_len)) + return start; } return -1; } -/* - mymemcnt +Py_LOCAL_INLINE(Py_ssize_t) +countstring(char *target, Py_ssize_t target_len, + char *pattern, Py_ssize_t pattern_len, + Py_ssize_t start, + Py_ssize_t end, + int direction, Py_ssize_t maxcount) +{ + Py_ssize_t count=0; + + if (start < 0) { + start += target_len; + if (start < 0) + start = 0; + } + if (end > target_len) { + end = target_len; + } else if (end < 0) { + end += target_len; + if (end < 0) + end = 0; + } + + /* zero-length substrings match everywhere */ + if (pattern_len == 0 || maxcount == 0) { + if (target_len+1 < maxcount) + return target_len+1; + return maxcount; + } + + end -= pattern_len; + if (direction < 0) { + for (; (end >= start); end--) + if (Py_STRING_MATCH(target, end, pattern, pattern_len)) { + count++; + if (--maxcount <= 0) break; + end -= pattern_len-1; + } + } else { + for (; (start <= end); start++) + if (Py_STRING_MATCH(target, start, pattern, pattern_len)) { + count++; + if (--maxcount <= 0) + break; + start += pattern_len-1; + } + } + return count; +} - Return the number of distinct times PAT is found in MEM. - meaning mem=1111 and pat==11 returns 2. - mem=11111 and pat==11 also return 2. - */ -static Py_ssize_t -mymemcnt(const char *mem, Py_ssize_t len, const char *pat, Py_ssize_t pat_len) + +/* Algorithms for different cases of string replacement */ + +/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */ +Py_LOCAL(PyStringObject *) +replace_interleave(PyStringObject *self, + PyStringObject *to, + Py_ssize_t maxcount) { - register Py_ssize_t offset = 0; - Py_ssize_t nfound = 0; + char *self_s, *to_s, *result_s; + Py_ssize_t self_len, to_len, result_len; + Py_ssize_t count, i, product; + PyStringObject *result; + + self_len = PyString_GET_SIZE(self); + to_len = PyString_GET_SIZE(to); + + /* 1 at the end plus 1 after every character */ + count = self_len+1; + if (maxcount < count) + count = maxcount; + + /* Check for overflow */ + /* result_len = count * to_len + self_len; */ + product = count * to_len; + if (product / to_len != count) { + PyErr_SetString(PyExc_OverflowError, + "replace string is too long"); + return NULL; + } + result_len = product + self_len; + if (result_len < 0) { + PyErr_SetString(PyExc_OverflowError, + "replace string is too long"); + return NULL; + } + + if (! (result = (PyStringObject *) + PyString_FromStringAndSize(NULL, result_len)) ) + return NULL; - while (len >= 0) { - offset = mymemfind(mem, len, pat, pat_len); - if (offset == -1) - break; - mem += offset + pat_len; - len -= offset + pat_len; - nfound++; + self_s = PyString_AS_STRING(self); + to_s = PyString_AS_STRING(to); + to_len = PyString_GET_SIZE(to); + result_s = PyString_AS_STRING(result); + + /* TODO: special case single character, which doesn't need memcpy */ + + /* Lay the first one down (guaranteed this will occur) */ + memcpy(result_s, to_s, to_len); + result_s += to_len; + count -= 1; + + for (i=0; i<count; i++) { + *result_s++ = *self_s++; + memcpy(result_s, to_s, to_len); + result_s += to_len; } - return nfound; + + /* Copy the rest of the original string */ + memcpy(result_s, self_s, self_len-i); + + return result; } -/* - mymemreplace +/* Special case for deleting a single character */ +/* len(self)>=1, len(from)==1, to="", maxcount>=1 */ +Py_LOCAL(PyStringObject *) +replace_delete_single_character(PyStringObject *self, + char from_c, Py_ssize_t maxcount) +{ + char *self_s, *result_s; + char *start, *next, *end; + Py_ssize_t self_len, result_len; + Py_ssize_t count; + PyStringObject *result; - Return a string in which all occurrences of PAT in memory STR are - replaced with SUB. + self_len = PyString_GET_SIZE(self); + self_s = PyString_AS_STRING(self); - If length of PAT is less than length of STR or there are no occurrences - of PAT in STR, then the original string is returned. Otherwise, a new - string is allocated here and returned. + count = countchar(self_s, self_len, from_c, maxcount); + if (count == 0) { + return return_self(self); + } + + result_len = self_len - count; /* from_len == 1 */ + assert(result_len>=0); - on return, out_len is: - the length of output string, or - -1 if the input string is returned, or - unchanged if an error occurs (no memory). + if ( (result = (PyStringObject *) + PyString_FromStringAndSize(NULL, result_len)) == NULL) + return NULL; + result_s = PyString_AS_STRING(result); - return value is: - the new string allocated locally, or - NULL if an error occurred. -*/ -static char * -mymemreplace(const char *str, Py_ssize_t len, /* input string */ - const char *pat, Py_ssize_t pat_len, /* pattern string to find */ - const char *sub, Py_ssize_t sub_len, /* substitution string */ - Py_ssize_t count, /* number of replacements */ - Py_ssize_t *out_len) + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + next = findchar(start, end-start, from_c); + if (next == NULL) + break; + memcpy(result_s, start, next-start); + result_s += (next-start); + start = next+1; + } + memcpy(result_s, start, end-start); + + return result; +} + +/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */ + +Py_LOCAL(PyStringObject *) +replace_delete_substring(PyStringObject *self, PyStringObject *from, + Py_ssize_t maxcount) { + char *self_s, *from_s, *result_s; + char *start, *next, *end; + Py_ssize_t self_len, from_len, result_len; + Py_ssize_t count, offset; + PyStringObject *result; + + self_len = PyString_GET_SIZE(self); + self_s = PyString_AS_STRING(self); + from_len = PyString_GET_SIZE(from); + from_s = PyString_AS_STRING(from); + + count = countstring(self_s, self_len, + from_s, from_len, + 0, self_len, 1, + maxcount); + + if (count == 0) { + /* no matches */ + return return_self(self); + } + + result_len = self_len - (count * from_len); + assert (result_len>=0); + + if ( (result = (PyStringObject *) + PyString_FromStringAndSize(NULL, result_len)) == NULL ) + return NULL; + + result_s = PyString_AS_STRING(result); + + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + offset = findstring(start, end-start, + from_s, from_len, + 0, end-start, FORWARD); + if (offset == -1) + break; + next = start + offset; + + memcpy(result_s, start, next-start); + + result_s += (next-start); + start = next+from_len; + } + memcpy(result_s, start, end-start); + return result; +} + +/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */ +Py_LOCAL(PyStringObject *) +replace_single_character_in_place(PyStringObject *self, + char from_c, char to_c, + Py_ssize_t maxcount) { - char *out_s; - char *new_s; - Py_ssize_t nfound, offset, new_len; - - if (len == 0 || (pat_len == 0 && sub_len == 0) || pat_len > len) - goto return_same; - - /* find length of output string */ - nfound = (pat_len > 0) ? mymemcnt(str, len, pat, pat_len) : len + 1; - if (count < 0) - count = PY_SSIZE_T_MAX; - else if (nfound > count) - nfound = count; - if (nfound == 0) - goto return_same; - - new_len = len + nfound*(sub_len - pat_len); - if (new_len == 0) { - /* Have to allocate something for the caller to free(). */ - out_s = (char *)PyMem_MALLOC(1); - if (out_s == NULL) - return NULL; - out_s[0] = '\0'; + char *self_s, *result_s, *start, *end, *next; + Py_ssize_t self_len; + PyStringObject *result; + + /* The result string will be the same size */ + self_s = PyString_AS_STRING(self); + self_len = PyString_GET_SIZE(self); + + next = findchar(self_s, self_len, from_c); + + if (next == NULL) { + /* No matches; return the original string */ + return return_self(self); + } + + /* Need to make a new string */ + result = (PyStringObject *) PyString_FromStringAndSize(NULL, self_len); + if (result == NULL) + return NULL; + result_s = PyString_AS_STRING(result); + memcpy(result_s, self_s, self_len); + + /* change everything in-place, starting with this one */ + start = result_s + (next-self_s); + *start = to_c; + start++; + end = result_s + self_len; + + while (--maxcount > 0) { + next = findchar(start, end-start, from_c); + if (next == NULL) + break; + *next = to_c; + start = next+1; } - else { - assert(new_len > 0); - new_s = (char *)PyMem_MALLOC(new_len); - if (new_s == NULL) - return NULL; - out_s = new_s; + + return result; +} - if (pat_len > 0) { - for (; nfound > 0; --nfound) { - /* find index of next instance of pattern */ - offset = mymemfind(str, len, pat, pat_len); - if (offset == -1) - break; +/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */ +Py_LOCAL(PyStringObject *) +replace_substring_in_place(PyStringObject *self, + PyStringObject *from, + PyStringObject *to, + Py_ssize_t maxcount) +{ + char *result_s, *start, *end; + char *self_s, *from_s, *to_s; + Py_ssize_t self_len, from_len, offset; + PyStringObject *result; + + /* The result string will be the same size */ + + self_s = PyString_AS_STRING(self); + self_len = PyString_GET_SIZE(self); + + from_s = PyString_AS_STRING(from); + from_len = PyString_GET_SIZE(from); + to_s = PyString_AS_STRING(to); + + offset = findstring(self_s, self_len, + from_s, from_len, + 0, self_len, FORWARD); + + if (offset == -1) { + /* No matches; return the original string */ + return return_self(self); + } + + /* Need to make a new string */ + result = (PyStringObject *) PyString_FromStringAndSize(NULL, self_len); + if (result == NULL) + return NULL; + result_s = PyString_AS_STRING(result); + memcpy(result_s, self_s, self_len); + + + /* change everything in-place, starting with this one */ + start = result_s + offset; + memcpy(start, to_s, from_len); + start += from_len; + end = result_s + self_len; + + while ( --maxcount > 0) { + offset = findstring(start, end-start, + from_s, from_len, + 0, end-start, FORWARD); + if (offset==-1) + break; + memcpy(start+offset, to_s, from_len); + start += offset+from_len; + } + + return result; +} - /* copy non matching part of input string */ - memcpy(new_s, str, offset); - str += offset + pat_len; - len -= offset + pat_len; +/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */ +Py_LOCAL(PyStringObject *) +replace_single_character(PyStringObject *self, + char from_c, + PyStringObject *to, + Py_ssize_t maxcount) +{ + char *self_s, *to_s, *result_s; + char *start, *next, *end; + Py_ssize_t self_len, to_len, result_len; + Py_ssize_t count, product; + PyStringObject *result; + + self_s = PyString_AS_STRING(self); + self_len = PyString_GET_SIZE(self); + + count = countchar(self_s, self_len, from_c, maxcount); + + if (count == 0) { + /* no matches, return unchanged */ + return return_self(self); + } + + to_s = PyString_AS_STRING(to); + to_len = PyString_GET_SIZE(to); + + /* use the difference between current and new, hence the "-1" */ + /* result_len = self_len + count * (to_len-1) */ + product = count * (to_len-1); + if (product / (to_len-1) != count) { + PyErr_SetString(PyExc_OverflowError, "replace string is too long"); + return NULL; + } + result_len = self_len + product; + if (result_len < 0) { + PyErr_SetString(PyExc_OverflowError, "replace string is too long"); + return NULL; + } + + if ( (result = (PyStringObject *) + PyString_FromStringAndSize(NULL, result_len)) == NULL) + return NULL; + result_s = PyString_AS_STRING(result); + + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + next = findchar(start, end-start, from_c); + if (next == NULL) + break; + + if (next == start) { + /* replace with the 'to' */ + memcpy(result_s, to_s, to_len); + result_s += to_len; + start += 1; + } else { + /* copy the unchanged old then the 'to' */ + memcpy(result_s, start, next-start); + result_s += (next-start); + memcpy(result_s, to_s, to_len); + result_s += to_len; + start = next+1; + } + } + /* Copy the remainder of the remaining string */ + memcpy(result_s, start, end-start); + + return result; +} - /* copy substitute into the output string */ - new_s += offset; - memcpy(new_s, sub, sub_len); - new_s += sub_len; - } - /* copy any remaining values into output string */ - if (len > 0) - memcpy(new_s, str, len); +/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */ +Py_LOCAL(PyStringObject *) +replace_substring(PyStringObject *self, + PyStringObject *from, + PyStringObject *to, + Py_ssize_t maxcount) { + char *self_s, *from_s, *to_s, *result_s; + char *start, *next, *end; + Py_ssize_t self_len, from_len, to_len, result_len; + Py_ssize_t count, offset, product; + PyStringObject *result; + + self_s = PyString_AS_STRING(self); + self_len = PyString_GET_SIZE(self); + from_s = PyString_AS_STRING(from); + from_len = PyString_GET_SIZE(from); + + count = countstring(self_s, self_len, + from_s, from_len, + 0, self_len, FORWARD, maxcount); + if (count == 0) { + /* no matches, return unchanged */ + return return_self(self); + } + + to_s = PyString_AS_STRING(to); + to_len = PyString_GET_SIZE(to); + + /* Check for overflow */ + /* result_len = self_len + count * (to_len-from_len) */ + product = count * (to_len-from_len); + if (product / (to_len-from_len) != count) { + PyErr_SetString(PyExc_OverflowError, "replace string is too long"); + return NULL; + } + result_len = self_len + product; + if (result_len < 0) { + PyErr_SetString(PyExc_OverflowError, "replace string is too long"); + return NULL; + } + + if ( (result = (PyStringObject *) + PyString_FromStringAndSize(NULL, result_len)) == NULL) + return NULL; + result_s = PyString_AS_STRING(result); + + start = self_s; + end = self_s + self_len; + while (count-- > 0) { + offset = findstring(start, end-start, + from_s, from_len, + 0, end-start, FORWARD); + if (offset == -1) + break; + next = start+offset; + if (next == start) { + /* replace with the 'to' */ + memcpy(result_s, to_s, to_len); + result_s += to_len; + start += from_len; + } else { + /* copy the unchanged old then the 'to' */ + memcpy(result_s, start, next-start); + result_s += (next-start); + memcpy(result_s, to_s, to_len); + result_s += to_len; + start = next+from_len; } - else { - for (;;++str, --len) { - memcpy(new_s, sub, sub_len); - new_s += sub_len; - if (--nfound <= 0) { - memcpy(new_s, str, len); - break; - } - *new_s++ = *str; - } + } + /* Copy the remainder of the remaining string */ + memcpy(result_s, start, end-start); + + return result; +} + + +Py_LOCAL(PyStringObject *) +replace(PyStringObject *self, + PyStringObject *from, + PyStringObject *to, + Py_ssize_t maxcount) +{ + Py_ssize_t from_len, to_len; + + if (maxcount < 0) { + maxcount = PY_SSIZE_T_MAX; + } else if (maxcount == 0 || PyString_GET_SIZE(self) == 0) { + /* nothing to do; return the original string */ + return return_self(self); + } + + from_len = PyString_GET_SIZE(from); + to_len = PyString_GET_SIZE(to); + + if (maxcount == 0 || + (from_len == 0 && to_len == 0)) { + /* nothing to do; return the original string */ + return return_self(self); + } + + /* Handle zero-length special cases */ + + if (from_len == 0) { + /* insert the 'to' string everywhere. */ + /* >>> "Python".replace("", ".") */ + /* '.P.y.t.h.o.n.' */ + return replace_interleave(self, to, maxcount); + } + + /* Except for "".replace("", "A") == "A" there is no way beyond this */ + /* point for an empty self string to generate a non-empty string */ + /* Special case so the remaining code always gets a non-empty string */ + if (PyString_GET_SIZE(self) == 0) { + return return_self(self); + } + + if (to_len == 0) { + /* delete all occurances of 'from' string */ + if (from_len == 1) { + return replace_delete_single_character( + self, PyString_AS_STRING(from)[0], maxcount); + } else { + return replace_delete_substring(self, from, maxcount); } } - *out_len = new_len; - return out_s; - return_same: - *out_len = -1; - return (char *)str; /* cast away const */ -} + /* Handle special case where both strings have the same length */ + + if (from_len == to_len) { + if (from_len == 1) { + return replace_single_character_in_place( + self, + PyString_AS_STRING(from)[0], + PyString_AS_STRING(to)[0], + maxcount); + } else { + return replace_substring_in_place( + self, from, to, maxcount); + } + } + /* Otherwise use the more generic algorithms */ + if (from_len == 1) { + return replace_single_character(self, PyString_AS_STRING(from)[0], + to, maxcount); + } else { + /* len('from')>=2, len('to')>=1 */ + return replace_substring(self, from, to, maxcount); + } +} PyDoc_STRVAR(replace__doc__, "S.replace (old, new[, count]) -> string\n\ @@ -2520,66 +3062,42 @@ given, only the first count occurrences are replaced."); static PyObject * string_replace(PyStringObject *self, PyObject *args) { - const char *str = PyString_AS_STRING(self), *sub, *repl; - char *new_s; - const Py_ssize_t len = PyString_GET_SIZE(self); - Py_ssize_t sub_len, repl_len, out_len; Py_ssize_t count = -1; - PyObject *newobj; - PyObject *subobj, *replobj; + PyObject *from, *to; + const char *tmp_s; + Py_ssize_t tmp_len; - if (!PyArg_ParseTuple(args, "OO|n:replace", - &subobj, &replobj, &count)) + if (!PyArg_ParseTuple(args, "OO|n:replace", &from, &to, &count)) return NULL; - if (PyString_Check(subobj)) { - sub = PyString_AS_STRING(subobj); - sub_len = PyString_GET_SIZE(subobj); + if (PyString_Check(from)) { + /* Can this be made a '!check' after the Unicode check? */ } #ifdef Py_USING_UNICODE - else if (PyUnicode_Check(subobj)) + if (PyUnicode_Check(from)) return PyUnicode_Replace((PyObject *)self, - subobj, replobj, count); + from, to, count); #endif - else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len)) + else if (PyObject_AsCharBuffer(from, &tmp_s, &tmp_len)) return NULL; - if (PyString_Check(replobj)) { - repl = PyString_AS_STRING(replobj); - repl_len = PyString_GET_SIZE(replobj); + if (PyString_Check(to)) { + /* Can this be made a '!check' after the Unicode check? */ } #ifdef Py_USING_UNICODE - else if (PyUnicode_Check(replobj)) + else if (PyUnicode_Check(to)) return PyUnicode_Replace((PyObject *)self, - subobj, replobj, count); + from, to, count); #endif - else if (PyObject_AsCharBuffer(replobj, &repl, &repl_len)) + else if (PyObject_AsCharBuffer(to, &tmp_s, &tmp_len)) return NULL; - new_s = mymemreplace(str,len,sub,sub_len,repl,repl_len,count,&out_len); - if (new_s == NULL) { - PyErr_NoMemory(); - return NULL; - } - if (out_len == -1) { - if (PyString_CheckExact(self)) { - /* we're returning another reference to self */ - newobj = (PyObject*)self; - Py_INCREF(newobj); - } - else { - newobj = PyString_FromStringAndSize(str, len); - if (newobj == NULL) - return NULL; - } - } - else { - newobj = PyString_FromStringAndSize(new_s, out_len); - PyMem_FREE(new_s); - } - return newobj; + return (PyObject *)replace((PyStringObject *) self, + (PyStringObject *) from, + (PyStringObject *) to, count); } +/** End DALKE **/ PyDoc_STRVAR(startswith__doc__, "S.startswith(prefix[, start[, end]]) -> bool\n\ @@ -2820,7 +3338,7 @@ string_expandtabs(PyStringObject *self, PyObject *args) return u; } -static PyObject * +Py_LOCAL_INLINE(PyObject *) pad(PyStringObject *self, Py_ssize_t left, Py_ssize_t right, char fill) { PyObject *u; @@ -3237,6 +3755,14 @@ string_splitlines(PyStringObject *self, PyObject *args) data = PyString_AS_STRING(self); len = PyString_GET_SIZE(self); + /* This does not use the preallocated list because splitlines is + usually run with hundreds of newlines. The overhead of + switching between PyList_SET_ITEM and append causes about a + 2-3% slowdown for that common case. A smarter implementation + could move the if check out, so the SET_ITEMs are done first + and the appends only done when the prealloc buffer is full. + That's too much work for little gain.*/ + list = PyList_New(0); if (!list) goto onError; @@ -3274,6 +3800,9 @@ string_splitlines(PyStringObject *self, PyObject *args) } #undef SPLIT_APPEND +#undef SPLIT_ADD +#undef MAX_PREALLOC +#undef PREALLOC_SIZE static PyObject * string_getnewargs(PyStringObject *v) @@ -3303,6 +3832,7 @@ string_methods[] = { {"count", (PyCFunction)string_count, METH_VARARGS, count__doc__}, {"endswith", (PyCFunction)string_endswith, METH_VARARGS, endswith__doc__}, + {"partition", (PyCFunction)string_partition, METH_O, partition__doc__}, {"find", (PyCFunction)string_find, METH_VARARGS, find__doc__}, {"index", (PyCFunction)string_index, METH_VARARGS, index__doc__}, {"lstrip", (PyCFunction)string_lstrip, METH_VARARGS, lstrip__doc__}, @@ -3310,6 +3840,8 @@ string_methods[] = { {"rfind", (PyCFunction)string_rfind, METH_VARARGS, rfind__doc__}, {"rindex", (PyCFunction)string_rindex, METH_VARARGS, rindex__doc__}, {"rstrip", (PyCFunction)string_rstrip, METH_VARARGS, rstrip__doc__}, + {"rpartition", (PyCFunction)string_rpartition, METH_O, + rpartition__doc__}, {"startswith", (PyCFunction)string_startswith, METH_VARARGS, startswith__doc__}, {"strip", (PyCFunction)string_strip, METH_VARARGS, strip__doc__}, @@ -3566,7 +4098,7 @@ _PyString_Resize(PyObject **pv, Py_ssize_t newsize) /* Helpers for formatstring */ -static PyObject * +Py_LOCAL_INLINE(PyObject *) getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) { Py_ssize_t argidx = *p_argidx; @@ -3595,7 +4127,7 @@ getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) #define F_ALT (1<<3) #define F_ZERO (1<<4) -static int +Py_LOCAL_INLINE(int) formatfloat(char *buf, size_t buflen, int flags, int prec, int type, PyObject *v) { @@ -3782,7 +4314,7 @@ _PyString_FormatLong(PyObject *val, int flags, int prec, int type, return result; } -static int +Py_LOCAL_INLINE(int) formatint(char *buf, size_t buflen, int flags, int prec, int type, PyObject *v) { @@ -3854,7 +4386,7 @@ formatint(char *buf, size_t buflen, int flags, return (int)strlen(buf); } -static int +Py_LOCAL_INLINE(int) formatchar(char *buf, size_t buflen, PyObject *v) { /* presume that the buffer is at least 2 characters long */ |