diff options
Diffstat (limited to 'Objects/stringobject.c')
-rw-r--r-- | Objects/stringobject.c | 820 |
1 files changed, 202 insertions, 618 deletions
diff --git a/Objects/stringobject.c b/Objects/stringobject.c index 9e2673d4fa..49d18645e6 100644 --- a/Objects/stringobject.c +++ b/Objects/stringobject.c @@ -4,9 +4,10 @@ #include "Python.h" #include <ctype.h> +#include <stddef.h> #ifdef COUNT_ALLOCS -int null_strings, one_strings; +Py_ssize_t null_strings, one_strings; #endif static PyStringObject *characters[UCHAR_MAX + 1]; @@ -22,11 +23,15 @@ static PyStringObject *nullstring; */ static PyObject *interned; -/* - For both PyString_FromString() and PyString_FromStringAndSize(), the - parameter `size' denotes number of characters to allocate, not counting any - null terminating character. +/* PyStringObject_SIZE gives the basic size of a string; any memory allocation + for a string of length n should request PyStringObject_SIZE + n bytes. + + Using PyStringObject_SIZE instead of sizeof(PyStringObject) saves + 3 bytes per string allocation on a typical system. +*/ +#define PyStringObject_SIZE (offsetof(PyStringObject, ob_sval) + 1) +/* For PyString_FromString(), the parameter `str' points to a null-terminated string containing exactly `size' bytes. @@ -43,8 +48,8 @@ static PyObject *interned; The PyObject member `op->ob_size', which denotes the number of "extra items" in a variable-size object, will contain the number of bytes - allocated for string data, not counting the null terminating character. It - is therefore equal to the equal to the `size' parameter (for + allocated for string data, not counting the null terminating character. + It is therefore equal to the `size' parameter (for PyString_FromStringAndSize()) or the length of the string in the `str' parameter (for PyString_FromString()). */ @@ -74,13 +79,13 @@ PyString_FromStringAndSize(const char *str, Py_ssize_t size) return (PyObject *)op; } - if (size > PY_SSIZE_T_MAX - sizeof(PyStringObject)) { + if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) { PyErr_SetString(PyExc_OverflowError, "string is too large"); return NULL; } /* Inline PyObject_NewVar */ - op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size); + op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size); if (op == NULL) return PyErr_NoMemory(); PyObject_INIT_VAR(op, &PyString_Type, size); @@ -114,7 +119,7 @@ PyString_FromString(const char *str) assert(str != NULL); size = strlen(str); - if (size > PY_SSIZE_T_MAX - sizeof(PyStringObject)) { + if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) { PyErr_SetString(PyExc_OverflowError, "string is too long for a Python string"); return NULL; @@ -135,7 +140,7 @@ PyString_FromString(const char *str) } /* Inline PyObject_NewVar */ - op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size); + op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size); if (op == NULL) return PyErr_NoMemory(); PyObject_INIT_VAR(op, &PyString_Type, size); @@ -180,6 +185,9 @@ PyString_FromFormatV(const char *format, va_list vargs) /* step 1: figure out how large a buffer we need */ for (f = format; *f; f++) { if (*f == '%') { +#ifdef HAVE_LONG_LONG + int longlongflag = 0; +#endif const char* p = f; while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f))) ; @@ -187,9 +195,21 @@ PyString_FromFormatV(const char *format, va_list vargs) /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since * they don't affect the amount of space we reserve. */ - if ((*f == 'l' || *f == 'z') && - (f[1] == 'd' || f[1] == 'u')) + if (*f == 'l') { + if (f[1] == 'd' || f[1] == 'u') { + ++f; + } +#ifdef HAVE_LONG_LONG + else if (f[1] == 'l' && + (f[2] == 'd' || f[2] == 'u')) { + longlongflag = 1; + f += 2; + } +#endif + } + else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { ++f; + } switch (*f) { case 'c': @@ -200,10 +220,21 @@ PyString_FromFormatV(const char *format, va_list vargs) break; case 'd': case 'u': case 'i': case 'x': (void) va_arg(count, int); - /* 20 bytes is enough to hold a 64-bit - integer. Decimal takes the most space. - This isn't enough for octal. */ - n += 20; +#ifdef HAVE_LONG_LONG + /* Need at most + ceil(log10(256)*SIZEOF_LONG_LONG) digits, + plus 1 for the sign. 53/22 is an upper + bound for log10(256). */ + if (longlongflag) + n += 2 + (SIZEOF_LONG_LONG*53-1) / 22; + else +#endif + /* 20 bytes is enough to hold a 64-bit + integer. Decimal takes the most + space. This isn't enough for + octal. */ + n += 20; + break; case 's': s = va_arg(count, char*); @@ -246,6 +277,9 @@ PyString_FromFormatV(const char *format, va_list vargs) const char* p = f++; Py_ssize_t i; int longflag = 0; +#ifdef HAVE_LONG_LONG + int longlongflag = 0; +#endif int size_tflag = 0; /* parse the width.precision part (we're only interested in the precision value, if any) */ @@ -260,14 +294,22 @@ PyString_FromFormatV(const char *format, va_list vargs) } while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f))) f++; - /* handle the long flag, but only for %ld and %lu. - others can be added when necessary. */ - if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { - longflag = 1; - ++f; + /* Handle %ld, %lu, %lld and %llu. */ + if (*f == 'l') { + if (f[1] == 'd' || f[1] == 'u') { + longflag = 1; + ++f; + } +#ifdef HAVE_LONG_LONG + else if (f[1] == 'l' && + (f[2] == 'd' || f[2] == 'u')) { + longlongflag = 1; + f += 2; + } +#endif } /* handle the size_t flag. */ - if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { + else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { size_tflag = 1; ++f; } @@ -279,6 +321,11 @@ PyString_FromFormatV(const char *format, va_list vargs) case 'd': if (longflag) sprintf(s, "%ld", va_arg(vargs, long)); +#ifdef HAVE_LONG_LONG + else if (longlongflag) + sprintf(s, "%" PY_FORMAT_LONG_LONG "d", + va_arg(vargs, PY_LONG_LONG)); +#endif else if (size_tflag) sprintf(s, "%" PY_FORMAT_SIZE_T "d", va_arg(vargs, Py_ssize_t)); @@ -290,6 +337,11 @@ PyString_FromFormatV(const char *format, va_list vargs) if (longflag) sprintf(s, "%lu", va_arg(vargs, unsigned long)); +#ifdef HAVE_LONG_LONG + else if (longlongflag) + sprintf(s, "%" PY_FORMAT_LONG_LONG "u", + va_arg(vargs, PY_LONG_LONG)); +#endif else if (size_tflag) sprintf(s, "%" PY_FORMAT_SIZE_T "u", va_arg(vargs, size_t)); @@ -339,7 +391,8 @@ PyString_FromFormatV(const char *format, va_list vargs) } end: - _PyString_Resize(&string, s - PyString_AS_STRING(string)); + if (_PyString_Resize(&string, s - PyString_AS_STRING(string))) + return NULL; return string; } @@ -687,12 +740,12 @@ PyObject *PyString_DecodeEscape(const char *s, default: *p++ = '\\'; s--; - goto non_esc; /* an arbitry number of unescaped + goto non_esc; /* an arbitrary number of unescaped UTF-8 bytes may follow. */ } } - if (p-buf < newlen) - _PyString_Resize(&v, p - buf); + if (p-buf < newlen && _PyString_Resize(&v, p - buf)) + goto failed; return v; failed: Py_DECREF(v); @@ -785,6 +838,7 @@ PyString_AsStringAndSize(register PyObject *obj, #include "stringlib/count.h" #include "stringlib/find.h" #include "stringlib/partition.h" +#include "stringlib/split.h" #define _Py_InsertThousandsGrouping _PyString_InsertThousandsGrouping #include "stringlib/localeutil.h" @@ -920,8 +974,8 @@ PyString_Repr(PyObject *obj, int smartquotes) assert(newsize - (p - PyString_AS_STRING(v)) >= 1); *p++ = quote; *p = '\0'; - _PyString_Resize( - &v, (p - PyString_AS_STRING(v))); + if (_PyString_Resize(&v, (p - PyString_AS_STRING(v)))) + return NULL; return v; } } @@ -994,12 +1048,12 @@ string_concat(register PyStringObject *a, register PyObject *bb) } /* Inline PyObject_NewVar */ - if (size > PY_SSIZE_T_MAX - sizeof(PyStringObject)) { + if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) { PyErr_SetString(PyExc_OverflowError, "strings are too large to concat"); return NULL; } - op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size); + op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size); if (op == NULL) return PyErr_NoMemory(); PyObject_INIT_VAR(op, &PyString_Type, size); @@ -1036,13 +1090,12 @@ string_repeat(register PyStringObject *a, register Py_ssize_t n) return (PyObject *)a; } nbytes = (size_t)size; - if (nbytes + sizeof(PyStringObject) <= nbytes) { + if (nbytes + PyStringObject_SIZE <= nbytes) { PyErr_SetString(PyExc_OverflowError, "repeated string is too long"); return NULL; } - op = (PyStringObject *) - PyObject_MALLOC(sizeof(PyStringObject) + nbytes); + op = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + nbytes); if (op == NULL) return PyErr_NoMemory(); PyObject_INIT_VAR(op, &PyString_Type, size); @@ -1380,145 +1433,6 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; #define STRIPNAME(i) (stripformat[i]+3) - -/* Don't call if length < 2 */ -#define Py_STRING_MATCH(target, offset, pattern, length) \ - (target[offset] == pattern[0] && \ - target[offset+length-1] == pattern[length-1] && \ - !memcmp(target+offset+1, pattern+1, length-2) ) - - -/* Overallocate the initial list to reduce the number of reallocs for small - split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three - resizes, to sizes 4, 8, then 16. Most observed string splits are for human - text (roughly 11 words per line) and field delimited data (usually 1-10 - fields). For large strings the split algorithms are bandwidth limited - so increasing the preallocation likely will not improve things.*/ - -#define MAX_PREALLOC 12 - -/* 5 splits gives 6 elements */ -#define PREALLOC_SIZE(maxsplit) \ - (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1) - -#define SPLIT_APPEND(data, left, right) \ - str = PyString_FromStringAndSize((data) + (left), \ - (right) - (left)); \ - if (str == NULL) \ - goto onError; \ - if (PyList_Append(list, str)) { \ - Py_DECREF(str); \ - goto onError; \ - } \ - else \ - Py_DECREF(str); - -#define SPLIT_ADD(data, left, right) { \ - str = PyString_FromStringAndSize((data) + (left), \ - (right) - (left)); \ - if (str == NULL) \ - goto onError; \ - if (count < MAX_PREALLOC) { \ - PyList_SET_ITEM(list, count, str); \ - } else { \ - if (PyList_Append(list, str)) { \ - Py_DECREF(str); \ - goto onError; \ - } \ - else \ - Py_DECREF(str); \ - } \ - count++; } - -/* Always force the list to the expected size. */ -#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count - -#define SKIP_SPACE(s, i, len) { while (i<len && isspace(Py_CHARMASK(s[i]))) i++; } -#define SKIP_NONSPACE(s, i, len) { while (i<len && !isspace(Py_CHARMASK(s[i]))) i++; } -#define RSKIP_SPACE(s, i) { while (i>=0 && isspace(Py_CHARMASK(s[i]))) i--; } -#define RSKIP_NONSPACE(s, i) { while (i>=0 && !isspace(Py_CHARMASK(s[i]))) i--; } - -Py_LOCAL_INLINE(PyObject *) -split_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit) -{ - const char *s = PyString_AS_STRING(self); - Py_ssize_t i, j, count=0; - PyObject *str; - PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit)); - - if (list == NULL) - return NULL; - - i = j = 0; - - while (maxsplit-- > 0) { - SKIP_SPACE(s, i, len); - if (i==len) break; - j = i; i++; - SKIP_NONSPACE(s, i, len); - if (j == 0 && i == len && PyString_CheckExact(self)) { - /* No whitespace in self, so just use it as list[0] */ - Py_INCREF(self); - PyList_SET_ITEM(list, 0, (PyObject *)self); - count++; - break; - } - SPLIT_ADD(s, j, i); - } - - if (i < len) { - /* Only occurs when maxsplit was reached */ - /* Skip any remaining whitespace and copy to end of string */ - SKIP_SPACE(s, i, len); - if (i != len) - SPLIT_ADD(s, i, len); - } - FIX_PREALLOC_SIZE(list); - return list; - onError: - Py_DECREF(list); - return NULL; -} - -Py_LOCAL_INLINE(PyObject *) -split_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount) -{ - const char *s = PyString_AS_STRING(self); - register Py_ssize_t i, j, count=0; - PyObject *str; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); - - if (list == NULL) - return NULL; - - i = j = 0; - while ((j < len) && (maxcount-- > 0)) { - for(; j<len; j++) { - /* I found that using memchr makes no difference */ - if (s[j] == ch) { - SPLIT_ADD(s, i, j); - i = j = j + 1; - break; - } - } - } - if (i == 0 && count == 0 && PyString_CheckExact(self)) { - /* ch not in self, so just use self as list[0] */ - Py_INCREF(self); - PyList_SET_ITEM(list, 0, (PyObject *)self); - count++; - } - else if (i <= len) { - SPLIT_ADD(s, i, len); - } - FIX_PREALLOC_SIZE(list); - return list; - - onError: - Py_DECREF(list); - return NULL; -} - PyDoc_STRVAR(split__doc__, "S.split([sep [,maxsplit]]) -> list of strings\n\ \n\ @@ -1531,20 +1445,17 @@ from the result."); static PyObject * string_split(PyStringObject *self, PyObject *args) { - Py_ssize_t len = PyString_GET_SIZE(self), n, i, j; - Py_ssize_t maxsplit = -1, count=0; + Py_ssize_t len = PyString_GET_SIZE(self), n; + Py_ssize_t maxsplit = -1; const char *s = PyString_AS_STRING(self), *sub; - PyObject *list, *str, *subobj = Py_None; -#ifdef USE_FAST - Py_ssize_t pos; -#endif + PyObject *subobj = Py_None; if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit)) return NULL; if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (subobj == Py_None) - return split_whitespace(self, len, maxsplit); + return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit); if (PyString_Check(subobj)) { sub = PyString_AS_STRING(subobj); n = PyString_GET_SIZE(subobj); @@ -1556,46 +1467,7 @@ string_split(PyStringObject *self, PyObject *args) else if (PyObject_AsCharBuffer(subobj, &sub, &n)) return NULL; - if (n == 0) { - PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; - } - else if (n == 1) - return split_char(self, len, sub[0], maxsplit); - - list = PyList_New(PREALLOC_SIZE(maxsplit)); - if (list == NULL) - return NULL; - -#ifdef USE_FAST - i = j = 0; - while (maxsplit-- > 0) { - pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH); - if (pos < 0) - break; - j = i+pos; - SPLIT_ADD(s, i, j); - i = j + n; - } -#else - i = j = 0; - while ((j+n <= len) && (maxsplit-- > 0)) { - for (; j+n <= len; j++) { - if (Py_STRING_MATCH(s, j, sub, n)) { - SPLIT_ADD(s, i, j); - i = j = j + n; - break; - } - } - } -#endif - SPLIT_ADD(s, i, len); - FIX_PREALLOC_SIZE(list); - return list; - - onError: - Py_DECREF(list); - return NULL; + return stringlib_split((PyObject*) self, s, len, sub, n, maxsplit); } PyDoc_STRVAR(partition__doc__, @@ -1660,90 +1532,6 @@ string_rpartition(PyStringObject *self, PyObject *sep_obj) ); } -Py_LOCAL_INLINE(PyObject *) -rsplit_whitespace(PyStringObject *self, Py_ssize_t len, Py_ssize_t maxsplit) -{ - const char *s = PyString_AS_STRING(self); - Py_ssize_t i, j, count=0; - PyObject *str; - PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit)); - - if (list == NULL) - return NULL; - - i = j = len-1; - - while (maxsplit-- > 0) { - RSKIP_SPACE(s, i); - if (i<0) break; - j = i; i--; - RSKIP_NONSPACE(s, i); - if (j == len-1 && i < 0 && PyString_CheckExact(self)) { - /* No whitespace in self, so just use it as list[0] */ - Py_INCREF(self); - PyList_SET_ITEM(list, 0, (PyObject *)self); - count++; - break; - } - SPLIT_ADD(s, i + 1, j + 1); - } - if (i >= 0) { - /* Only occurs when maxsplit was reached */ - /* Skip any remaining whitespace and copy to beginning of string */ - RSKIP_SPACE(s, i); - if (i >= 0) - SPLIT_ADD(s, 0, i + 1); - - } - FIX_PREALLOC_SIZE(list); - if (PyList_Reverse(list) < 0) - goto onError; - return list; - onError: - Py_DECREF(list); - return NULL; -} - -Py_LOCAL_INLINE(PyObject *) -rsplit_char(PyStringObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount) -{ - const char *s = PyString_AS_STRING(self); - register Py_ssize_t i, j, count=0; - PyObject *str; - PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); - - if (list == NULL) - return NULL; - - i = j = len - 1; - while ((i >= 0) && (maxcount-- > 0)) { - for (; i >= 0; i--) { - if (s[i] == ch) { - SPLIT_ADD(s, i + 1, j + 1); - j = i = i - 1; - break; - } - } - } - if (i < 0 && count == 0 && PyString_CheckExact(self)) { - /* ch not in self, so just use self as list[0] */ - Py_INCREF(self); - PyList_SET_ITEM(list, 0, (PyObject *)self); - count++; - } - else if (j >= -1) { - SPLIT_ADD(s, 0, j + 1); - } - FIX_PREALLOC_SIZE(list); - if (PyList_Reverse(list) < 0) - goto onError; - return list; - - onError: - Py_DECREF(list); - return NULL; -} - PyDoc_STRVAR(rsplit__doc__, "S.rsplit([sep [,maxsplit]]) -> list of strings\n\ \n\ @@ -1756,17 +1544,17 @@ is a separator."); static PyObject * string_rsplit(PyStringObject *self, PyObject *args) { - Py_ssize_t len = PyString_GET_SIZE(self), n, i, j; - Py_ssize_t maxsplit = -1, count=0; - const char *s, *sub; - PyObject *list, *str, *subobj = Py_None; + Py_ssize_t len = PyString_GET_SIZE(self), n; + Py_ssize_t maxsplit = -1; + const char *s = PyString_AS_STRING(self), *sub; + PyObject *subobj = Py_None; if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit)) return NULL; if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (subobj == Py_None) - return rsplit_whitespace(self, len, maxsplit); + return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit); if (PyString_Check(subobj)) { sub = PyString_AS_STRING(subobj); n = PyString_GET_SIZE(subobj); @@ -1778,40 +1566,7 @@ string_rsplit(PyStringObject *self, PyObject *args) else if (PyObject_AsCharBuffer(subobj, &sub, &n)) return NULL; - if (n == 0) { - PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; - } - else if (n == 1) - return rsplit_char(self, len, sub[0], maxsplit); - - list = PyList_New(PREALLOC_SIZE(maxsplit)); - if (list == NULL) - return NULL; - - j = len; - i = j - n; - - s = PyString_AS_STRING(self); - while ( (i >= 0) && (maxsplit-- > 0) ) { - for (; i>=0; i--) { - if (Py_STRING_MATCH(s, i, sub, n)) { - SPLIT_ADD(s, i + n, j); - j = i; - i -= n; - break; - } - } - } - SPLIT_ADD(s, 0, j); - FIX_PREALLOC_SIZE(list); - if (PyList_Reverse(list) < 0) - goto onError; - return list; - -onError: - Py_DECREF(list); - return NULL; + return stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit); } @@ -1926,20 +1681,20 @@ _PyString_Join(PyObject *sep, PyObject *x) return string_join((PyStringObject *)sep, x); } -Py_LOCAL_INLINE(void) -string_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len) -{ - if (*end > len) - *end = len; - else if (*end < 0) - *end += len; - if (*end < 0) - *end = 0; - if (*start < 0) - *start += len; - if (*start < 0) - *start = 0; -} +/* helper macro to fixup start/end slice values */ +#define ADJUST_INDICES(start, end, len) \ + if (end > len) \ + end = len; \ + else if (end < 0) { \ + end += len; \ + if (end < 0) \ + end = 0; \ + } \ + if (start < 0) { \ + start += len; \ + if (start < 0) \ + start = 0; \ + } Py_LOCAL_INLINE(Py_ssize_t) string_find_internal(PyStringObject *self, PyObject *args, int dir) @@ -1948,19 +1703,9 @@ string_find_internal(PyStringObject *self, PyObject *args, int dir) const char *sub; Py_ssize_t sub_len; Py_ssize_t start=0, end=PY_SSIZE_T_MAX; - PyObject *obj_start=Py_None, *obj_end=Py_None; - if (!PyArg_ParseTuple(args, "O|OO:find/rfind/index/rindex", &subobj, - &obj_start, &obj_end)) - return -2; - /* To support None in "start" and "end" arguments, meaning - the same as if they were not passed. - */ - if (obj_start != Py_None) - if (!_PyEval_SliceIndex(obj_start, &start)) - return -2; - if (obj_end != Py_None) - if (!_PyEval_SliceIndex(obj_end, &end)) + if (!stringlib_parse_args_finds("find/rfind/index/rindex", + args, &subobj, &start, &end)) return -2; if (PyString_Check(subobj)) { @@ -1992,7 +1737,7 @@ PyDoc_STRVAR(find__doc__, "S.find(sub [,start [,end]]) -> int\n\ \n\ Return the lowest index in S where substring sub is found,\n\ -such that sub is contained within s[start:end]. Optional\n\ +such that sub is contained within S[start:end]. Optional\n\ arguments start and end are interpreted as in slice notation.\n\ \n\ Return -1 on failure."); @@ -2031,7 +1776,7 @@ PyDoc_STRVAR(rfind__doc__, "S.rfind(sub [,start [,end]]) -> int\n\ \n\ Return the highest index in S where substring sub is found,\n\ -such that sub is contained within s[start:end]. Optional\n\ +such that sub is contained within S[start:end]. Optional\n\ arguments start and end are interpreted as in slice notation.\n\ \n\ Return -1 on failure."); @@ -2372,8 +2117,7 @@ string_count(PyStringObject *self, PyObject *args) Py_ssize_t sub_len; Py_ssize_t start = 0, end = PY_SSIZE_T_MAX; - if (!PyArg_ParseTuple(args, "O|O&O&:count", &sub_obj, - _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) + if (!stringlib_parse_args_finds("count", args, &sub_obj, &start, &end)) return NULL; if (PyString_Check(sub_obj)) { @@ -2393,10 +2137,10 @@ string_count(PyStringObject *self, PyObject *args) else if (PyObject_AsCharBuffer(sub_obj, &sub, &sub_len)) return NULL; - string_adjust_indices(&start, &end, PyString_GET_SIZE(self)); + ADJUST_INDICES(start, end, PyString_GET_SIZE(self)); return PyInt_FromSsize_t( - stringlib_count(str + start, end - start, sub, sub_len) + stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX) ); } @@ -2439,7 +2183,9 @@ PyDoc_STRVAR(translate__doc__, Return a copy of the string S, where all characters occurring\n\ in the optional argument deletechars are removed, and the\n\ remaining characters have been mapped through the given\n\ -translation table, which must be a string of length 256."); +translation table, which must be a string of length 256 or None.\n\ +If the table argument is None, no translation is applied and\n\ +the operation simply removes the characters in deletechars."); static PyObject * string_translate(PyStringObject *self, PyObject *args) @@ -2553,15 +2299,12 @@ string_translate(PyStringObject *self, PyObject *args) return input_obj; } /* Fix the size of the resulting string */ - if (inlen > 0) - _PyString_Resize(&result, output - output_start); + if (inlen > 0 && _PyString_Resize(&result, output - output_start)) + return NULL; return result; } -#define FORWARD 1 -#define REVERSE -1 - /* find and count characters and substrings */ #define findchar(target, target_len, c) \ @@ -2597,93 +2340,6 @@ countchar(const char *target, int target_len, char c, Py_ssize_t maxcount) return count; } -Py_LOCAL(Py_ssize_t) -findstring(const char *target, Py_ssize_t target_len, - const char *pattern, Py_ssize_t pattern_len, - Py_ssize_t start, - Py_ssize_t end, - int direction) -{ - if (start < 0) { - start += target_len; - if (start < 0) - start = 0; - } - if (end > target_len) { - end = target_len; - } else if (end < 0) { - end += target_len; - if (end < 0) - end = 0; - } - - /* zero-length substrings always match at the first attempt */ - if (pattern_len == 0) - return (direction > 0) ? start : end; - - end -= pattern_len; - - if (direction < 0) { - for (; end >= start; end--) - if (Py_STRING_MATCH(target, end, pattern, pattern_len)) - return end; - } else { - for (; start <= end; start++) - if (Py_STRING_MATCH(target, start, pattern, pattern_len)) - return start; - } - return -1; -} - -Py_LOCAL_INLINE(Py_ssize_t) -countstring(const char *target, Py_ssize_t target_len, - const char *pattern, Py_ssize_t pattern_len, - Py_ssize_t start, - Py_ssize_t end, - int direction, Py_ssize_t maxcount) -{ - Py_ssize_t count=0; - - if (start < 0) { - start += target_len; - if (start < 0) - start = 0; - } - if (end > target_len) { - end = target_len; - } else if (end < 0) { - end += target_len; - if (end < 0) - end = 0; - } - - /* zero-length substrings match everywhere */ - if (pattern_len == 0 || maxcount == 0) { - if (target_len+1 < maxcount) - return target_len+1; - return maxcount; - } - - end -= pattern_len; - if (direction < 0) { - for (; (end >= start); end--) - if (Py_STRING_MATCH(target, end, pattern, pattern_len)) { - count++; - if (--maxcount <= 0) break; - end -= pattern_len-1; - } - } else { - for (; (start <= end); start++) - if (Py_STRING_MATCH(target, start, pattern, pattern_len)) { - count++; - if (--maxcount <= 0) - break; - start += pattern_len-1; - } - } - return count; -} - /* Algorithms for different cases of string replacement */ @@ -2804,10 +2460,9 @@ replace_delete_substring(PyStringObject *self, self_len = PyString_GET_SIZE(self); self_s = PyString_AS_STRING(self); - count = countstring(self_s, self_len, - from_s, from_len, - 0, self_len, 1, - maxcount); + count = stringlib_count(self_s, self_len, + from_s, from_len, + maxcount); if (count == 0) { /* no matches */ @@ -2826,9 +2481,9 @@ replace_delete_substring(PyStringObject *self, start = self_s; end = self_s + self_len; while (count-- > 0) { - offset = findstring(start, end-start, - from_s, from_len, - 0, end-start, FORWARD); + offset = stringlib_find(start, end-start, + from_s, from_len, + 0); if (offset == -1) break; next = start + offset; @@ -2904,9 +2559,9 @@ replace_substring_in_place(PyStringObject *self, self_s = PyString_AS_STRING(self); self_len = PyString_GET_SIZE(self); - offset = findstring(self_s, self_len, - from_s, from_len, - 0, self_len, FORWARD); + offset = stringlib_find(self_s, self_len, + from_s, from_len, + 0); if (offset == -1) { /* No matches; return the original string */ return return_self(self); @@ -2926,9 +2581,9 @@ replace_substring_in_place(PyStringObject *self, end = result_s + self_len; while ( --maxcount > 0) { - offset = findstring(start, end-start, - from_s, from_len, - 0, end-start, FORWARD); + offset = stringlib_find(start, end-start, + from_s, from_len, + 0); if (offset==-1) break; Py_MEMCPY(start+offset, to_s, from_len); @@ -3020,9 +2675,10 @@ replace_substring(PyStringObject *self, self_s = PyString_AS_STRING(self); self_len = PyString_GET_SIZE(self); - count = countstring(self_s, self_len, - from_s, from_len, - 0, self_len, FORWARD, maxcount); + count = stringlib_count(self_s, self_len, + from_s, from_len, + maxcount); + if (count == 0) { /* no matches, return unchanged */ return return_self(self); @@ -3049,9 +2705,9 @@ replace_substring(PyStringObject *self, start = self_s; end = self_s + self_len; while (count-- > 0) { - offset = findstring(start, end-start, - from_s, from_len, - 0, end-start, FORWARD); + offset = stringlib_find(start, end-start, + from_s, from_len, + 0); if (offset == -1) break; next = start+offset; @@ -3221,7 +2877,7 @@ _string_tailmatch(PyStringObject *self, PyObject *substr, Py_ssize_t start, return -1; str = PyString_AS_STRING(self); - string_adjust_indices(&start, &end, len); + ADJUST_INDICES(start, end, len); if (direction < 0) { /* startswith */ @@ -3257,8 +2913,7 @@ string_startswith(PyStringObject *self, PyObject *args) PyObject *subobj; int result; - if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, - _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) + if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end)) return NULL; if (PyTuple_Check(subobj)) { Py_ssize_t i; @@ -3275,8 +2930,12 @@ string_startswith(PyStringObject *self, PyObject *args) Py_RETURN_FALSE; } result = _string_tailmatch(self, subobj, start, end, -1); - if (result == -1) + if (result == -1) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) + PyErr_Format(PyExc_TypeError, "startswith first arg must be str, " + "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name); return NULL; + } else return PyBool_FromLong(result); } @@ -3298,8 +2957,7 @@ string_endswith(PyStringObject *self, PyObject *args) PyObject *subobj; int result; - if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, - _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) + if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end)) return NULL; if (PyTuple_Check(subobj)) { Py_ssize_t i; @@ -3316,8 +2974,12 @@ string_endswith(PyStringObject *self, PyObject *args) Py_RETURN_FALSE; } result = _string_tailmatch(self, subobj, start, end, +1); - if (result == -1) + if (result == -1) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) + PyErr_Format(PyExc_TypeError, "endswith first arg must be str, " + "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name); return NULL; + } else return PyBool_FromLong(result); } @@ -3334,13 +2996,15 @@ a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ codecs.register_error that is able to handle UnicodeEncodeErrors."); static PyObject * -string_encode(PyStringObject *self, PyObject *args) +string_encode(PyStringObject *self, PyObject *args, PyObject *kwargs) { + static char *kwlist[] = {"encoding", "errors", 0}; char *encoding = NULL; char *errors = NULL; PyObject *v; - if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", + kwlist, &encoding, &errors)) return NULL; v = PyString_AsEncodedObject((PyObject *)self, encoding, errors); if (v == NULL) @@ -3371,13 +3035,15 @@ as well as any other name registered with codecs.register_error that is\n\ able to handle UnicodeDecodeErrors."); static PyObject * -string_decode(PyStringObject *self, PyObject *args) +string_decode(PyStringObject *self, PyObject *args, PyObject *kwargs) { + static char *kwlist[] = {"encoding", "errors", 0}; char *encoding = NULL; char *errors = NULL; PyObject *v; - if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors)) + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode", + kwlist, &encoding, &errors)) return NULL; v = PyString_AsDecodedObject((PyObject *)self, encoding, errors); if (v == NULL) @@ -3885,62 +3551,15 @@ is given and true."); static PyObject* string_splitlines(PyStringObject *self, PyObject *args) { - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len; int keepends = 0; - PyObject *list; - PyObject *str; - char *data; if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) return NULL; - data = PyString_AS_STRING(self); - len = PyString_GET_SIZE(self); - - /* This does not use the preallocated list because splitlines is - usually run with hundreds of newlines. The overhead of - switching between PyList_SET_ITEM and append causes about a - 2-3% slowdown for that common case. A smarter implementation - could move the if check out, so the SET_ITEMs are done first - and the appends only done when the prealloc buffer is full. - That's too much work for little gain.*/ - - list = PyList_New(0); - if (!list) - goto onError; - - for (i = j = 0; i < len; ) { - Py_ssize_t eol; - - /* Find a line and append it */ - while (i < len && data[i] != '\n' && data[i] != '\r') - i++; - - /* Skip the line break reading CRLF as one line break */ - eol = i; - if (i < len) { - if (data[i] == '\r' && i + 1 < len && - data[i+1] == '\n') - i += 2; - else - i++; - if (keepends) - eol = i; - } - SPLIT_APPEND(data, j, eol); - j = i; - } - if (j < len) { - SPLIT_APPEND(data, j, len); - } - - return list; - - onError: - Py_XDECREF(list); - return NULL; + return stringlib_splitlines( + (PyObject*) self, PyString_AS_STRING(self), PyString_GET_SIZE(self), + keepends + ); } PyDoc_STRVAR(sizeof__doc__, @@ -3950,15 +3569,10 @@ static PyObject * string_sizeof(PyStringObject *v) { Py_ssize_t res; - res = sizeof(PyStringObject) + v->ob_size * v->ob_type->tp_itemsize; + res = PyStringObject_SIZE + PyString_GET_SIZE(v) * Py_TYPE(v)->tp_itemsize; return PyInt_FromSsize_t(res); } -#undef SPLIT_APPEND -#undef SPLIT_ADD -#undef MAX_PREALLOC -#undef PREALLOC_SIZE - static PyObject * string_getnewargs(PyStringObject *v) { @@ -3971,7 +3585,8 @@ string_getnewargs(PyStringObject *v) PyDoc_STRVAR(format__doc__, "S.format(*args, **kwargs) -> string\n\ \n\ -"); +Return a formatted version of S, using substitutions from args and kwargs.\n\ +The substitutions are identified by braces ('{' and '}')."); static PyObject * string__format__(PyObject* self, PyObject* args) @@ -4005,7 +3620,7 @@ done: PyDoc_STRVAR(p_format__doc__, "S.__format__(format_spec) -> string\n\ \n\ -"); +Return a formatted version of S as described by format_spec."); static PyMethodDef @@ -4055,8 +3670,8 @@ string_methods[] = { {"__format__", (PyCFunction) string__format__, METH_VARARGS, p_format__doc__}, {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS}, {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS}, - {"encode", (PyCFunction)string_encode, METH_VARARGS, encode__doc__}, - {"decode", (PyCFunction)string_decode, METH_VARARGS, decode__doc__}, + {"encode", (PyCFunction)string_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, + {"decode", (PyCFunction)string_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__}, {"expandtabs", (PyCFunction)string_expandtabs, METH_VARARGS, expandtabs__doc__}, {"splitlines", (PyCFunction)string_splitlines, METH_VARARGS, @@ -4189,7 +3804,7 @@ If the argument is a string, the return value is the same object."); PyTypeObject PyString_Type = { PyVarObject_HEAD_INIT(&PyType_Type, 0) "str", - sizeof(PyStringObject), + PyStringObject_SIZE, sizeof(char), string_dealloc, /* tp_dealloc */ (printfunc)string_print, /* tp_print */ @@ -4285,7 +3900,7 @@ _PyString_Resize(PyObject **pv, Py_ssize_t newsize) _Py_DEC_REFTOTAL; _Py_ForgetReference(v); *pv = (PyObject *) - PyObject_REALLOC((char *)v, sizeof(PyStringObject) + newsize); + PyObject_REALLOC((char *)v, PyStringObject_SIZE + newsize); if (*pv == NULL) { PyObject_Del(v); PyErr_NoMemory(); @@ -4330,63 +3945,33 @@ getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) #define F_ALT (1<<3) #define F_ZERO (1<<4) -Py_LOCAL_INLINE(int) -formatfloat(char *buf, size_t buflen, int flags, - int prec, int type, PyObject *v) +/* Returns a new reference to a PyString object, or NULL on failure. */ + +static PyObject * +formatfloat(PyObject *v, int flags, int prec, int type) { - /* fmt = '%#.' + `prec` + `type` - worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ - char fmt[20]; + char *p; + PyObject *result; double x; + x = PyFloat_AsDouble(v); if (x == -1.0 && PyErr_Occurred()) { PyErr_Format(PyExc_TypeError, "float argument required, " "not %.200s", Py_TYPE(v)->tp_name); - return -1; + return NULL; } + if (prec < 0) prec = 6; -#if SIZEOF_INT > 4 - /* make sure that the decimal representation of precision really does - need at most 10 digits: platforms with sizeof(int) == 8 exist! */ - if (prec > 0x7fffffff) { - PyErr_SetString(PyExc_OverflowError, - "outrageously large precision " - "for formatted float"); - return -1; - } -#endif - - if (type == 'f' && fabs(x) >= 1e50) - type = 'g'; - /* Worst case length calc to ensure no buffer overrun: - - 'g' formats: - fmt = %#.<prec>g - buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp - for any double rep.) - len = 1 + prec + 1 + 2 + 5 = 9 + prec - - 'f' formats: - buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) - len = 1 + 50 + 1 + prec = 52 + prec - If prec=0 the effective precision is 1 (the leading digit is - always given), therefore increase the length by one. + p = PyOS_double_to_string(x, type, prec, + (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL); - */ - if (((type == 'g' || type == 'G') && - buflen <= (size_t)10 + (size_t)prec) || - (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { - PyErr_SetString(PyExc_OverflowError, - "formatted float is too long (precision too large?)"); - return -1; - } - PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", - (flags&F_ALT) ? "#" : "", - prec, type); - PyOS_ascii_formatd(buf, buflen, fmt, x); - return (int)strlen(buf); + if (p == NULL) + return NULL; + result = PyString_FromStringAndSize(p, strlen(p)); + PyMem_Free(p); + return result; } /* _PyString_FormatLong emulates the format codes d, u, o, x and X, and @@ -4626,7 +4211,7 @@ formatchar(char *buf, size_t buflen, PyObject *v) /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) - FORMATBUFLEN is the length of the buffer in which the floats, ints, & + FORMATBUFLEN is the length of the buffer in which the ints & chars are formatted. XXX This is a magic number. Each formatting routine does bounds checking to ensure no overflow, but a better solution may be to malloc a buffer of appropriate size for each @@ -4674,7 +4259,7 @@ PyString_Format(PyObject *format, PyObject *args) if (--rescnt < 0) { rescnt = fmtcnt + 100; reslen += rescnt; - if (_PyString_Resize(&result, reslen) < 0) + if (_PyString_Resize(&result, reslen)) return NULL; res = PyString_AS_STRING(result) + reslen - rescnt; @@ -4696,7 +4281,7 @@ PyString_Format(PyObject *format, PyObject *args) int sign; Py_ssize_t len; char formatbuf[FORMATBUFLEN]; - /* For format{float,int,char}() */ + /* For format{int,char}() */ #ifdef Py_USING_UNICODE char *fmt_start = fmt; Py_ssize_t argidx_start = argidx; @@ -4947,13 +4532,11 @@ PyString_Format(PyObject *format, PyObject *args) case 'F': case 'g': case 'G': - if (c == 'F') - c = 'f'; - pbuf = formatbuf; - len = formatfloat(pbuf, sizeof(formatbuf), - flags, prec, c, v); - if (len < 0) + temp = formatfloat(v, flags, prec, c); + if (temp == NULL) goto error; + pbuf = PyString_AS_STRING(temp); + len = PyString_GET_SIZE(temp); sign = 1; if (flags & F_ZERO) fill = '0'; @@ -5003,7 +4586,7 @@ PyString_Format(PyObject *format, PyObject *args) Py_XDECREF(temp); return PyErr_NoMemory(); } - if (_PyString_Resize(&result, reslen) < 0) { + if (_PyString_Resize(&result, reslen)) { Py_XDECREF(temp); return NULL; } @@ -5071,7 +4654,8 @@ PyString_Format(PyObject *format, PyObject *args) if (args_owned) { Py_DECREF(args); } - _PyString_Resize(&result, reslen - rescnt); + if (_PyString_Resize(&result, reslen - rescnt)) + return NULL; return result; #ifdef Py_USING_UNICODE |