summaryrefslogtreecommitdiff
path: root/Objects/stringobject.c
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/stringobject.c')
-rw-r--r--Objects/stringobject.c1408
1 files changed, 970 insertions, 438 deletions
diff --git a/Objects/stringobject.c b/Objects/stringobject.c
index b34dcb2110..110c38ee01 100644
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@@ -1,6 +1,7 @@
/* String object implementation */
#define PY_SSIZE_T_CLEAN
+
#include "Python.h"
#include <ctype.h>
@@ -176,14 +177,11 @@ PyString_FromFormatV(const char *format, va_list vargs)
while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
;
- /* skip the 'l' in %ld, since it doesn't change the
- width. although only %d is supported (see
- "expand" section below), others can be easily
- added */
- if (*f == 'l' && *(f+1) == 'd')
- ++f;
- /* likewise for %zd */
- if (*f == 'z' && *(f+1) == 'd')
+ /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
+ * they don't affect the amount of space we reserve.
+ */
+ if ((*f == 'l' || *f == 'z') &&
+ (f[1] == 'd' || f[1] == 'u'))
++f;
switch (*f) {
@@ -193,7 +191,7 @@ PyString_FromFormatV(const char *format, va_list vargs)
case '%':
n++;
break;
- case 'd': case 'i': case 'x':
+ case 'd': case 'u': case 'i': case 'x':
(void) va_arg(count, int);
/* 20 bytes is enough to hold a 64-bit
integer. Decimal takes the most space.
@@ -255,14 +253,14 @@ PyString_FromFormatV(const char *format, va_list vargs)
}
while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
f++;
- /* handle the long flag, but only for %ld. others
- can be added when necessary. */
- if (*f == 'l' && *(f+1) == 'd') {
+ /* handle the long flag, but only for %ld and %lu.
+ others can be added when necessary. */
+ if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
longflag = 1;
++f;
}
/* handle the size_t flag. */
- if (*f == 'z' && *(f+1) == 'd') {
+ if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
size_tflag = 1;
++f;
}
@@ -275,10 +273,22 @@ PyString_FromFormatV(const char *format, va_list vargs)
if (longflag)
sprintf(s, "%ld", va_arg(vargs, long));
else if (size_tflag)
+ sprintf(s, "%" PY_FORMAT_SIZE_T "d",
+ va_arg(vargs, Py_ssize_t));
+ else
+ sprintf(s, "%d", va_arg(vargs, int));
+ s += strlen(s);
+ break;
+ case 'u':
+ if (longflag)
+ sprintf(s, "%lu",
+ va_arg(vargs, unsigned long));
+ else if (size_tflag)
sprintf(s, "%" PY_FORMAT_SIZE_T "u",
va_arg(vargs, size_t));
else
- sprintf(s, "%d", va_arg(vargs, int));
+ sprintf(s, "%u",
+ va_arg(vargs, unsigned int));
s += strlen(s);
break;
case 'i':
@@ -680,6 +690,9 @@ PyObject *PyString_DecodeEscape(const char *s,
return NULL;
}
+/* -------------------------------------------------------------------- */
+/* object api */
+
static Py_ssize_t
string_getsize(register PyObject *op)
{
@@ -754,8 +767,25 @@ PyString_AsStringAndSize(register PyObject *obj,
return 0;
}
+/* -------------------------------------------------------------------- */
/* Methods */
+#define STRINGLIB_CHAR char
+
+#define STRINGLIB_CMP memcmp
+#define STRINGLIB_LEN PyString_GET_SIZE
+#define STRINGLIB_NEW PyString_FromStringAndSize
+#define STRINGLIB_STR PyString_AS_STRING
+
+#define STRINGLIB_EMPTY nullstring
+
+#include "stringlib/fastsearch.h"
+
+#include "stringlib/count.h"
+#include "stringlib/find.h"
+#include "stringlib/partition.h"
+
+
static int
string_print(PyStringObject *op, FILE *fp, int flags)
{
@@ -900,7 +930,7 @@ string_length(PyStringObject *a)
static PyObject *
string_concat(register PyStringObject *a, register PyObject *bb)
{
- register size_t size;
+ register Py_ssize_t size;
register PyStringObject *op;
if (!PyString_Check(bb)) {
#ifdef Py_USING_UNICODE
@@ -924,7 +954,12 @@ string_concat(register PyStringObject *a, register PyObject *bb)
return (PyObject *)a;
}
size = a->ob_size + b->ob_size;
- /* XXX check overflow */
+ if (size < 0) {
+ PyErr_SetString(PyExc_OverflowError,
+ "strings are too large to concat");
+ return NULL;
+ }
+
/* Inline PyObject_NewVar */
op = (PyStringObject *)PyObject_MALLOC(sizeof(PyStringObject) + size);
if (op == NULL)
@@ -1017,65 +1052,36 @@ string_slice(register PyStringObject *a, register Py_ssize_t i,
}
static int
-string_contains(PyObject *a, PyObject *el)
+string_contains(PyObject *str_obj, PyObject *sub_obj)
{
- char *s = PyString_AS_STRING(a);
- const char *sub = PyString_AS_STRING(el);
- char *last;
- Py_ssize_t len_sub = PyString_GET_SIZE(el);
- Py_ssize_t shortsub;
- char firstchar, lastchar;
-
- if (!PyString_CheckExact(el)) {
+ if (!PyString_CheckExact(sub_obj)) {
#ifdef Py_USING_UNICODE
- if (PyUnicode_Check(el))
- return PyUnicode_Contains(a, el);
+ if (PyUnicode_Check(sub_obj))
+ return PyUnicode_Contains(str_obj, sub_obj);
#endif
- if (!PyString_Check(el)) {
+ if (!PyString_Check(sub_obj)) {
PyErr_SetString(PyExc_TypeError,
"'in <string>' requires string as left operand");
return -1;
}
}
- if (len_sub == 0)
- return 1;
- /* last points to one char beyond the start of the rightmost
- substring. When s<last, there is still room for a possible match
- and s[0] through s[len_sub-1] will be in bounds.
- shortsub is len_sub minus the last character which is checked
- separately just before the memcmp(). That check helps prevent
- false starts and saves the setup time for memcmp().
- */
- firstchar = sub[0];
- shortsub = len_sub - 1;
- lastchar = sub[shortsub];
- last = s + PyString_GET_SIZE(a) - len_sub + 1;
- while (s < last) {
- s = (char *)memchr(s, firstchar, last-s);
- if (s == NULL)
- return 0;
- assert(s < last);
- if (s[shortsub] == lastchar && memcmp(s, sub, shortsub) == 0)
- return 1;
- s++;
- }
- return 0;
+ return stringlib_contains_obj(str_obj, sub_obj);
}
static PyObject *
string_item(PyStringObject *a, register Py_ssize_t i)
{
+ char pchar;
PyObject *v;
- char *pchar;
if (i < 0 || i >= a->ob_size) {
PyErr_SetString(PyExc_IndexError, "string index out of range");
return NULL;
}
- pchar = a->ob_sval + i;
- v = (PyObject *)characters[*pchar & UCHAR_MAX];
+ pchar = a->ob_sval[i];
+ v = (PyObject *)characters[pchar & UCHAR_MAX];
if (v == NULL)
- v = PyString_FromStringAndSize(pchar, 1);
+ v = PyString_FromStringAndSize(&pchar, 1);
else {
#ifdef COUNT_ALLOCS
one_strings++;
@@ -1151,9 +1157,8 @@ string_richcompare(PyStringObject *a, PyStringObject *b, int op)
int
_PyString_Eq(PyObject *o1, PyObject *o2)
{
- PyStringObject *a, *b;
- a = (PyStringObject*)o1;
- b = (PyStringObject*)o2;
+ PyStringObject *a = (PyStringObject*) o1;
+ PyStringObject *b = (PyStringObject*) o2;
return a->ob_size == b->ob_size
&& *a->ob_sval == *b->ob_sval
&& memcmp(a->ob_sval, b->ob_sval, a->ob_size) == 0;
@@ -1308,6 +1313,27 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
#define STRIPNAME(i) (stripformat[i]+3)
+
+/* Don't call if length < 2 */
+#define Py_STRING_MATCH(target, offset, pattern, length) \
+ (target[offset] == pattern[0] && \
+ target[offset+length-1] == pattern[length-1] && \
+ !memcmp(target+offset+1, pattern+1, length-2) )
+
+
+/* Overallocate the initial list to reduce the number of reallocs for small
+ split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three
+ resizes, to sizes 4, 8, then 16. Most observed string splits are for human
+ text (roughly 11 words per line) and field delimited data (usually 1-10
+ fields). For large strings the split algorithms are bandwidth limited
+ so increasing the preallocation likely will not improve things.*/
+
+#define MAX_PREALLOC 12
+
+/* 5 splits gives 6 elements */
+#define PREALLOC_SIZE(maxsplit) \
+ (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
+
#define SPLIT_APPEND(data, left, right) \
str = PyString_FromStringAndSize((data) + (left), \
(right) - (left)); \
@@ -1320,74 +1346,90 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
else \
Py_DECREF(str);
-#define SPLIT_INSERT(data, left, right) \
+#define SPLIT_ADD(data, left, right) { \
str = PyString_FromStringAndSize((data) + (left), \
(right) - (left)); \
if (str == NULL) \
goto onError; \
- if (PyList_Insert(list, 0, str)) { \
- Py_DECREF(str); \
- goto onError; \
+ if (count < MAX_PREALLOC) { \
+ PyList_SET_ITEM(list, count, str); \
+ } else { \
+ if (PyList_Append(list, str)) { \
+ Py_DECREF(str); \
+ goto onError; \
+ } \
+ else \
+ Py_DECREF(str); \
} \
- else \
- Py_DECREF(str);
+ count++; }
-static PyObject *
+/* Always force the list to the expected size. */
+#define FIX_PREALLOC_SIZE(list) ((PyListObject *)list)->ob_size = count;
+
+#define SKIP_SPACE(s, i, len) { while (i<len && isspace(Py_CHARMASK(s[i]))) i++; }
+#define SKIP_NONSPACE(s, i, len) { while (i<len && !isspace(Py_CHARMASK(s[i]))) i++; }
+#define RSKIP_SPACE(s, i) { while (i>=0 && isspace(Py_CHARMASK(s[i]))) i--; }
+#define RSKIP_NONSPACE(s, i) { while (i>=0 && !isspace(Py_CHARMASK(s[i]))) i--; }
+
+Py_LOCAL_INLINE(PyObject *)
split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxsplit)
{
- Py_ssize_t i, j;
+ Py_ssize_t i, j, count=0;
PyObject *str;
- PyObject *list = PyList_New(0);
+ PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL)
return NULL;
- for (i = j = 0; i < len; ) {
- while (i < len && isspace(Py_CHARMASK(s[i])))
- i++;
- j = i;
- while (i < len && !isspace(Py_CHARMASK(s[i])))
- i++;
- if (j < i) {
- if (maxsplit-- <= 0)
- break;
- SPLIT_APPEND(s, j, i);
- while (i < len && isspace(Py_CHARMASK(s[i])))
- i++;
- j = i;
- }
+ i = j = 0;
+
+ while (maxsplit-- > 0) {
+ SKIP_SPACE(s, i, len);
+ if (i==len) break;
+ j = i; i++;
+ SKIP_NONSPACE(s, i, len);
+ SPLIT_ADD(s, j, i);
}
- if (j < len) {
- SPLIT_APPEND(s, j, len);
+
+ if (i < len) {
+ /* Only occurs when maxsplit was reached */
+ /* Skip any remaining whitespace and copy to end of string */
+ SKIP_SPACE(s, i, len);
+ if (i != len)
+ SPLIT_ADD(s, i, len);
}
+ FIX_PREALLOC_SIZE(list);
return list;
onError:
Py_DECREF(list);
return NULL;
}
-static PyObject *
+Py_LOCAL_INLINE(PyObject *)
split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
- register Py_ssize_t i, j;
+ register Py_ssize_t i, j, count=0;
PyObject *str;
- PyObject *list = PyList_New(0);
+ PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
if (list == NULL)
return NULL;
- for (i = j = 0; i < len; ) {
- if (s[i] == ch) {
- if (maxcount-- <= 0)
+ i = j = 0;
+ while ((j < len) && (maxcount-- > 0)) {
+ for(; j<len; j++) {
+ /* I found that using memchr makes no difference */
+ if (s[j] == ch) {
+ SPLIT_ADD(s, i, j);
+ i = j = j + 1;
break;
- SPLIT_APPEND(s, j, i);
- i = j = i + 1;
- } else
- i++;
+ }
+ }
}
- if (j <= len) {
- SPLIT_APPEND(s, j, len);
+ if (i <= len) {
+ SPLIT_ADD(s, i, len);
}
+ FIX_PREALLOC_SIZE(list);
return list;
onError:
@@ -1407,10 +1449,12 @@ static PyObject *
string_split(PyStringObject *self, PyObject *args)
{
Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
- int err;
- Py_ssize_t maxsplit = -1;
+ Py_ssize_t maxsplit = -1, count=0;
const char *s = PyString_AS_STRING(self), *sub;
- PyObject *list, *item, *subobj = Py_None;
+ PyObject *list, *str, *subobj = Py_None;
+#ifdef USE_FAST
+ Py_ssize_t pos;
+#endif
if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
return NULL;
@@ -1436,98 +1480,166 @@ string_split(PyStringObject *self, PyObject *args)
else if (n == 1)
return split_char(s, len, sub[0], maxsplit);
- list = PyList_New(0);
+ list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL)
return NULL;
+#ifdef USE_FAST
i = j = 0;
- while (i+n <= len) {
- if (s[i] == sub[0] && memcmp(s+i, sub, n) == 0) {
- if (maxsplit-- <= 0)
+ while (maxsplit-- > 0) {
+ pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
+ if (pos < 0)
+ break;
+ j = i+pos;
+ SPLIT_ADD(s, i, j);
+ i = j + n;
+
+ }
+#else
+ i = j = 0;
+ while ((j+n <= len) && (maxsplit-- > 0)) {
+ for (; j+n <= len; j++) {
+ if (Py_STRING_MATCH(s, j, sub, n)) {
+ SPLIT_ADD(s, i, j);
+ i = j = j + n;
break;
- item = PyString_FromStringAndSize(s+j, i-j);
- if (item == NULL)
- goto fail;
- err = PyList_Append(list, item);
- Py_DECREF(item);
- if (err < 0)
- goto fail;
- i = j = i + n;
+ }
}
- else
- i++;
}
- item = PyString_FromStringAndSize(s+j, len-j);
- if (item == NULL)
- goto fail;
- err = PyList_Append(list, item);
- Py_DECREF(item);
- if (err < 0)
- goto fail;
-
+#endif
+ SPLIT_ADD(s, i, len);
+ FIX_PREALLOC_SIZE(list);
return list;
- fail:
+ onError:
Py_DECREF(list);
return NULL;
}
+PyDoc_STRVAR(partition__doc__,
+"S.partition(sep) -> (head, sep, tail)\n\
+\n\
+Searches for the separator sep in S, and returns the part before it,\n\
+the separator itself, and the part after it. If the separator is not\n\
+found, returns S and two empty strings.");
+
+static PyObject *
+string_partition(PyStringObject *self, PyObject *sep_obj)
+{
+ const char *sep;
+ Py_ssize_t sep_len;
+
+ if (PyString_Check(sep_obj)) {
+ sep = PyString_AS_STRING(sep_obj);
+ sep_len = PyString_GET_SIZE(sep_obj);
+ }
+#ifdef Py_USING_UNICODE
+ else if (PyUnicode_Check(sep_obj))
+ return PyUnicode_Partition((PyObject *) self, sep_obj);
+#endif
+ else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
+ return NULL;
+
+ return stringlib_partition(
+ (PyObject*) self,
+ PyString_AS_STRING(self), PyString_GET_SIZE(self),
+ sep_obj, sep, sep_len
+ );
+}
+
+PyDoc_STRVAR(rpartition__doc__,
+"S.rpartition(sep) -> (head, sep, tail)\n\
+\n\
+Searches for the separator sep in S, starting at the end of S, and returns\n\
+the part before it, the separator itself, and the part after it. If the\n\
+separator is not found, returns S and two empty strings.");
+
static PyObject *
+string_rpartition(PyStringObject *self, PyObject *sep_obj)
+{
+ const char *sep;
+ Py_ssize_t sep_len;
+
+ if (PyString_Check(sep_obj)) {
+ sep = PyString_AS_STRING(sep_obj);
+ sep_len = PyString_GET_SIZE(sep_obj);
+ }
+#ifdef Py_USING_UNICODE
+ else if (PyUnicode_Check(sep_obj))
+ return PyUnicode_Partition((PyObject *) self, sep_obj);
+#endif
+ else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
+ return NULL;
+
+ return stringlib_rpartition(
+ (PyObject*) self,
+ PyString_AS_STRING(self), PyString_GET_SIZE(self),
+ sep_obj, sep, sep_len
+ );
+}
+
+Py_LOCAL_INLINE(PyObject *)
rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxsplit)
{
- Py_ssize_t i, j;
+ Py_ssize_t i, j, count=0;
PyObject *str;
- PyObject *list = PyList_New(0);
+ PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL)
return NULL;
- for (i = j = len - 1; i >= 0; ) {
- while (i >= 0 && isspace(Py_CHARMASK(s[i])))
- i--;
- j = i;
- while (i >= 0 && !isspace(Py_CHARMASK(s[i])))
- i--;
- if (j > i) {
- if (maxsplit-- <= 0)
- break;
- SPLIT_INSERT(s, i + 1, j + 1);
- while (i >= 0 && isspace(Py_CHARMASK(s[i])))
- i--;
- j = i;
- }
- }
- if (j >= 0) {
- SPLIT_INSERT(s, 0, j + 1);
- }
+ i = j = len-1;
+
+ while (maxsplit-- > 0) {
+ RSKIP_SPACE(s, i);
+ if (i<0) break;
+ j = i; i--;
+ RSKIP_NONSPACE(s, i);
+ SPLIT_ADD(s, i + 1, j + 1);
+ }
+ if (i >= 0) {
+ /* Only occurs when maxsplit was reached */
+ /* Skip any remaining whitespace and copy to beginning of string */
+ RSKIP_SPACE(s, i);
+ if (i >= 0)
+ SPLIT_ADD(s, 0, i + 1);
+
+ }
+ FIX_PREALLOC_SIZE(list);
+ if (PyList_Reverse(list) < 0)
+ goto onError;
return list;
onError:
Py_DECREF(list);
return NULL;
}
-static PyObject *
+Py_LOCAL_INLINE(PyObject *)
rsplit_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
- register Py_ssize_t i, j;
+ register Py_ssize_t i, j, count=0;
PyObject *str;
- PyObject *list = PyList_New(0);
+ PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
if (list == NULL)
return NULL;
- for (i = j = len - 1; i >= 0; ) {
- if (s[i] == ch) {
- if (maxcount-- <= 0)
+ i = j = len - 1;
+ while ((i >= 0) && (maxcount-- > 0)) {
+ for (; i >= 0; i--) {
+ if (s[i] == ch) {
+ SPLIT_ADD(s, i + 1, j + 1);
+ j = i = i - 1;
break;
- SPLIT_INSERT(s, i + 1, j + 1);
- j = i = i - 1;
- } else
- i--;
+ }
+ }
}
if (j >= -1) {
- SPLIT_INSERT(s, 0, j + 1);
+ SPLIT_ADD(s, 0, j + 1);
}
+ FIX_PREALLOC_SIZE(list);
+ if (PyList_Reverse(list) < 0)
+ goto onError;
return list;
onError:
@@ -1548,10 +1660,9 @@ static PyObject *
string_rsplit(PyStringObject *self, PyObject *args)
{
Py_ssize_t len = PyString_GET_SIZE(self), n, i, j;
- int err;
- Py_ssize_t maxsplit = -1;
+ Py_ssize_t maxsplit = -1, count=0;
const char *s = PyString_AS_STRING(self), *sub;
- PyObject *list, *item, *subobj = Py_None;
+ PyObject *list, *str, *subobj = Py_None;
if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
return NULL;
@@ -1577,40 +1688,30 @@ string_rsplit(PyStringObject *self, PyObject *args)
else if (n == 1)
return rsplit_char(s, len, sub[0], maxsplit);
- list = PyList_New(0);
+ list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL)
return NULL;
j = len;
i = j - n;
- while (i >= 0) {
- if (s[i] == sub[0] && memcmp(s+i, sub, n) == 0) {
- if (maxsplit-- <= 0)
+
+ while ( (i >= 0) && (maxsplit-- > 0) ) {
+ for (; i>=0; i--) {
+ if (Py_STRING_MATCH(s, i, sub, n)) {
+ SPLIT_ADD(s, i + n, j);
+ j = i;
+ i -= n;
break;
- item = PyString_FromStringAndSize(s+i+n, j-i-n);
- if (item == NULL)
- goto fail;
- err = PyList_Insert(list, 0, item);
- Py_DECREF(item);
- if (err < 0)
- goto fail;
- j = i;
- i -= n;
+ }
}
- else
- i--;
}
- item = PyString_FromStringAndSize(s, j);
- if (item == NULL)
- goto fail;
- err = PyList_Insert(list, 0, item);
- Py_DECREF(item);
- if (err < 0)
- goto fail;
-
+ SPLIT_ADD(s, 0, j);
+ FIX_PREALLOC_SIZE(list);
+ if (PyList_Reverse(list) < 0)
+ goto onError;
return list;
- fail:
+onError:
Py_DECREF(list);
return NULL;
}
@@ -1727,7 +1828,7 @@ _PyString_Join(PyObject *sep, PyObject *x)
return string_join((PyStringObject *)sep, x);
}
-static void
+Py_LOCAL_INLINE(void)
string_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len)
{
if (*end > len)
@@ -1742,50 +1843,38 @@ string_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len)
*start = 0;
}
-static Py_ssize_t
+Py_LOCAL_INLINE(Py_ssize_t)
string_find_internal(PyStringObject *self, PyObject *args, int dir)
{
- const char *s = PyString_AS_STRING(self), *sub;
- Py_ssize_t len = PyString_GET_SIZE(self);
- Py_ssize_t n, i = 0, last = PY_SSIZE_T_MAX;
PyObject *subobj;
+ const char *sub;
+ Py_ssize_t sub_len;
+ Py_ssize_t start=0, end=PY_SSIZE_T_MAX;
/* XXX ssize_t i */
- if (!PyArg_ParseTuple(args, "O|O&O&:find/rfind/index/rindex",
- &subobj, _PyEval_SliceIndex, &i, _PyEval_SliceIndex, &last))
+ if (!PyArg_ParseTuple(args, "O|O&O&:find/rfind/index/rindex", &subobj,
+ _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
return -2;
if (PyString_Check(subobj)) {
sub = PyString_AS_STRING(subobj);
- n = PyString_GET_SIZE(subobj);
+ sub_len = PyString_GET_SIZE(subobj);
}
#ifdef Py_USING_UNICODE
else if (PyUnicode_Check(subobj))
- return PyUnicode_Find((PyObject *)self, subobj, i, last, dir);
+ return PyUnicode_Find(
+ (PyObject *)self, subobj, start, end, dir);
#endif
- else if (PyObject_AsCharBuffer(subobj, &sub, &n))
+ else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len))
return -2;
- string_adjust_indices(&i, &last, len);
-
- if (dir > 0) {
- if (n == 0 && i <= last)
- return (long)i;
- last -= n;
- for (; i <= last; ++i)
- if (s[i] == sub[0] && memcmp(&s[i], sub, n) == 0)
- return (long)i;
- }
- else {
- Py_ssize_t j;
-
- if (n == 0 && i <= last)
- return last;
- for (j = last-n; j >= i; --j)
- if (s[j] == sub[0] && memcmp(&s[j], sub, n) == 0)
- return j;
- }
-
- return -1;
+ if (dir > 0)
+ return stringlib_find_slice(
+ PyString_AS_STRING(self), PyString_GET_SIZE(self),
+ sub, sub_len, start, end);
+ else
+ return stringlib_rfind_slice(
+ PyString_AS_STRING(self), PyString_GET_SIZE(self),
+ sub, sub_len, start, end);
}
@@ -1867,7 +1956,7 @@ string_rindex(PyStringObject *self, PyObject *args)
}
-static PyObject *
+Py_LOCAL_INLINE(PyObject *)
do_xstrip(PyStringObject *self, int striptype, PyObject *sepobj)
{
char *s = PyString_AS_STRING(self);
@@ -1900,7 +1989,7 @@ do_xstrip(PyStringObject *self, int striptype, PyObject *sepobj)
}
-static PyObject *
+Py_LOCAL_INLINE(PyObject *)
do_strip(PyStringObject *self, int striptype)
{
char *s = PyString_AS_STRING(self);
@@ -1930,7 +2019,7 @@ do_strip(PyStringObject *self, int striptype)
}
-static PyObject *
+Py_LOCAL_INLINE(PyObject *)
do_argstrip(PyStringObject *self, int striptype, PyObject *args)
{
PyObject *sep = NULL;
@@ -2024,57 +2113,68 @@ PyDoc_STRVAR(lower__doc__,
\n\
Return a copy of the string S converted to lowercase.");
+/* _tolower and _toupper are defined by SUSv2, but they're not ISO C */
+#ifndef _tolower
+#define _tolower tolower
+#endif
+
static PyObject *
string_lower(PyStringObject *self)
{
- char *s = PyString_AS_STRING(self), *s_new;
+ char *s;
Py_ssize_t i, n = PyString_GET_SIZE(self);
PyObject *newobj;
newobj = PyString_FromStringAndSize(NULL, n);
- if (newobj == NULL)
+ if (!newobj)
return NULL;
- s_new = PyString_AsString(newobj);
+
+ s = PyString_AS_STRING(newobj);
+
+ memcpy(s, PyString_AS_STRING(self), n);
+
for (i = 0; i < n; i++) {
- int c = Py_CHARMASK(*s++);
- if (isupper(c)) {
- *s_new = tolower(c);
- } else
- *s_new = c;
- s_new++;
+ int c = Py_CHARMASK(s[i]);
+ if (isupper(c))
+ s[i] = _tolower(c);
}
+
return newobj;
}
-
PyDoc_STRVAR(upper__doc__,
"S.upper() -> string\n\
\n\
Return a copy of the string S converted to uppercase.");
+#ifndef _toupper
+#define _toupper toupper
+#endif
+
static PyObject *
string_upper(PyStringObject *self)
{
- char *s = PyString_AS_STRING(self), *s_new;
+ char *s;
Py_ssize_t i, n = PyString_GET_SIZE(self);
PyObject *newobj;
newobj = PyString_FromStringAndSize(NULL, n);
- if (newobj == NULL)
+ if (!newobj)
return NULL;
- s_new = PyString_AsString(newobj);
+
+ s = PyString_AS_STRING(newobj);
+
+ memcpy(s, PyString_AS_STRING(self), n);
+
for (i = 0; i < n; i++) {
- int c = Py_CHARMASK(*s++);
- if (islower(c)) {
- *s_new = toupper(c);
- } else
- *s_new = c;
- s_new++;
+ int c = Py_CHARMASK(s[i]);
+ if (islower(c))
+ s[i] = _toupper(c);
}
+
return newobj;
}
-
PyDoc_STRVAR(title__doc__,
"S.title() -> string\n\
\n\
@@ -2150,62 +2250,44 @@ string_capitalize(PyStringObject *self)
PyDoc_STRVAR(count__doc__,
"S.count(sub[, start[, end]]) -> int\n\
\n\
-Return the number of occurrences of substring sub in string\n\
-S[start:end]. Optional arguments start and end are\n\
-interpreted as in slice notation.");
+Return the number of non-overlapping occurrences of substring sub in\n\
+string S[start:end]. Optional arguments start and end are interpreted\n\
+as in slice notation.");
static PyObject *
string_count(PyStringObject *self, PyObject *args)
{
- const char *s = PyString_AS_STRING(self), *sub, *t;
- Py_ssize_t len = PyString_GET_SIZE(self), n;
- Py_ssize_t i = 0, last = PY_SSIZE_T_MAX;
- Py_ssize_t m, r;
- PyObject *subobj;
+ PyObject *sub_obj;
+ const char *str = PyString_AS_STRING(self), *sub;
+ Py_ssize_t sub_len;
+ Py_ssize_t start = 0, end = PY_SSIZE_T_MAX;
- if (!PyArg_ParseTuple(args, "O|O&O&:count", &subobj,
- _PyEval_SliceIndex, &i, _PyEval_SliceIndex, &last))
+ if (!PyArg_ParseTuple(args, "O|O&O&:count", &sub_obj,
+ _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
return NULL;
- if (PyString_Check(subobj)) {
- sub = PyString_AS_STRING(subobj);
- n = PyString_GET_SIZE(subobj);
+ if (PyString_Check(sub_obj)) {
+ sub = PyString_AS_STRING(sub_obj);
+ sub_len = PyString_GET_SIZE(sub_obj);
}
#ifdef Py_USING_UNICODE
- else if (PyUnicode_Check(subobj)) {
+ else if (PyUnicode_Check(sub_obj)) {
Py_ssize_t count;
- count = PyUnicode_Count((PyObject *)self, subobj, i, last);
+ count = PyUnicode_Count((PyObject *)self, sub_obj, start, end);
if (count == -1)
return NULL;
else
- return PyInt_FromLong((long) count);
+ return PyInt_FromSsize_t(count);
}
#endif
- else if (PyObject_AsCharBuffer(subobj, &sub, &n))
+ else if (PyObject_AsCharBuffer(sub_obj, &sub, &sub_len))
return NULL;
- string_adjust_indices(&i, &last, len);
-
- m = last + 1 - n;
- if (n == 0)
- return PyInt_FromSsize_t(m-i);
+ string_adjust_indices(&start, &end, PyString_GET_SIZE(self));
- r = 0;
- while (i < m) {
- if (!memcmp(s+i, sub, n)) {
- r++;
- i += n;
- } else {
- i++;
- }
- if (i >= m)
- break;
- t = (const char *)memchr(s+i, sub[0], m-i);
- if (t == NULL)
- break;
- i = t - s;
- }
- return PyInt_FromSsize_t(r);
+ return PyInt_FromSsize_t(
+ stringlib_count(str + start, end - start, sub, sub_len)
+ );
}
PyDoc_STRVAR(swapcase__doc__,
@@ -2359,156 +2441,616 @@ string_translate(PyStringObject *self, PyObject *args)
}
-/* What follows is used for implementing replace(). Perry Stoll. */
+#define FORWARD 1
+#define REVERSE -1
-/*
- mymemfind
+/* find and count characters and substrings */
- strstr replacement for arbitrary blocks of memory.
+#define findchar(target, target_len, c) \
+ ((char *)memchr((const void *)(target), c, target_len))
- Locates the first occurrence in the memory pointed to by MEM of the
- contents of memory pointed to by PAT. Returns the index into MEM if
- found, or -1 if not found. If len of PAT is greater than length of
- MEM, the function returns -1.
-*/
-static Py_ssize_t
-mymemfind(const char *mem, Py_ssize_t len, const char *pat, Py_ssize_t pat_len)
+/* String ops must return a string. */
+/* If the object is subclass of string, create a copy */
+Py_LOCAL(PyStringObject *)
+return_self(PyStringObject *self)
{
- register Py_ssize_t ii;
+ if (PyString_CheckExact(self)) {
+ Py_INCREF(self);
+ return self;
+ }
+ return (PyStringObject *)PyString_FromStringAndSize(
+ PyString_AS_STRING(self),
+ PyString_GET_SIZE(self));
+}
- /* pattern can not occur in the last pat_len-1 chars */
- len -= pat_len;
+Py_LOCAL_INLINE(Py_ssize_t)
+countchar(char *target, int target_len, char c, Py_ssize_t maxcount)
+{
+ Py_ssize_t count=0;
+ char *start=target;
+ char *end=target+target_len;
- for (ii = 0; ii <= len; ii++) {
- if (mem[ii] == pat[0] && memcmp(&mem[ii], pat, pat_len) == 0) {
- return ii;
- }
+ while ( (start=findchar(start, end-start, c)) != NULL ) {
+ count++;
+ if (count >= maxcount)
+ break;
+ start += 1;
+ }
+ return count;
+}
+
+Py_LOCAL(Py_ssize_t)
+findstring(char *target, Py_ssize_t target_len,
+ char *pattern, Py_ssize_t pattern_len,
+ Py_ssize_t start,
+ Py_ssize_t end,
+ int direction)
+{
+ if (start < 0) {
+ start += target_len;
+ if (start < 0)
+ start = 0;
+ }
+ if (end > target_len) {
+ end = target_len;
+ } else if (end < 0) {
+ end += target_len;
+ if (end < 0)
+ end = 0;
+ }
+
+ /* zero-length substrings always match at the first attempt */
+ if (pattern_len == 0)
+ return (direction > 0) ? start : end;
+
+ end -= pattern_len;
+
+ if (direction < 0) {
+ for (; end >= start; end--)
+ if (Py_STRING_MATCH(target, end, pattern, pattern_len))
+ return end;
+ } else {
+ for (; start <= end; start++)
+ if (Py_STRING_MATCH(target, start, pattern, pattern_len))
+ return start;
}
return -1;
}
-/*
- mymemcnt
+Py_LOCAL_INLINE(Py_ssize_t)
+countstring(char *target, Py_ssize_t target_len,
+ char *pattern, Py_ssize_t pattern_len,
+ Py_ssize_t start,
+ Py_ssize_t end,
+ int direction, Py_ssize_t maxcount)
+{
+ Py_ssize_t count=0;
+
+ if (start < 0) {
+ start += target_len;
+ if (start < 0)
+ start = 0;
+ }
+ if (end > target_len) {
+ end = target_len;
+ } else if (end < 0) {
+ end += target_len;
+ if (end < 0)
+ end = 0;
+ }
+
+ /* zero-length substrings match everywhere */
+ if (pattern_len == 0 || maxcount == 0) {
+ if (target_len+1 < maxcount)
+ return target_len+1;
+ return maxcount;
+ }
+
+ end -= pattern_len;
+ if (direction < 0) {
+ for (; (end >= start); end--)
+ if (Py_STRING_MATCH(target, end, pattern, pattern_len)) {
+ count++;
+ if (--maxcount <= 0) break;
+ end -= pattern_len-1;
+ }
+ } else {
+ for (; (start <= end); start++)
+ if (Py_STRING_MATCH(target, start, pattern, pattern_len)) {
+ count++;
+ if (--maxcount <= 0)
+ break;
+ start += pattern_len-1;
+ }
+ }
+ return count;
+}
- Return the number of distinct times PAT is found in MEM.
- meaning mem=1111 and pat==11 returns 2.
- mem=11111 and pat==11 also return 2.
- */
-static Py_ssize_t
-mymemcnt(const char *mem, Py_ssize_t len, const char *pat, Py_ssize_t pat_len)
+
+/* Algorithms for different cases of string replacement */
+
+/* len(self)>=1, from="", len(to)>=1, maxcount>=1 */
+Py_LOCAL(PyStringObject *)
+replace_interleave(PyStringObject *self,
+ PyStringObject *to,
+ Py_ssize_t maxcount)
{
- register Py_ssize_t offset = 0;
- Py_ssize_t nfound = 0;
+ char *self_s, *to_s, *result_s;
+ Py_ssize_t self_len, to_len, result_len;
+ Py_ssize_t count, i, product;
+ PyStringObject *result;
+
+ self_len = PyString_GET_SIZE(self);
+ to_len = PyString_GET_SIZE(to);
+
+ /* 1 at the end plus 1 after every character */
+ count = self_len+1;
+ if (maxcount < count)
+ count = maxcount;
+
+ /* Check for overflow */
+ /* result_len = count * to_len + self_len; */
+ product = count * to_len;
+ if (product / to_len != count) {
+ PyErr_SetString(PyExc_OverflowError,
+ "replace string is too long");
+ return NULL;
+ }
+ result_len = product + self_len;
+ if (result_len < 0) {
+ PyErr_SetString(PyExc_OverflowError,
+ "replace string is too long");
+ return NULL;
+ }
+
+ if (! (result = (PyStringObject *)
+ PyString_FromStringAndSize(NULL, result_len)) )
+ return NULL;
- while (len >= 0) {
- offset = mymemfind(mem, len, pat, pat_len);
- if (offset == -1)
- break;
- mem += offset + pat_len;
- len -= offset + pat_len;
- nfound++;
+ self_s = PyString_AS_STRING(self);
+ to_s = PyString_AS_STRING(to);
+ to_len = PyString_GET_SIZE(to);
+ result_s = PyString_AS_STRING(result);
+
+ /* TODO: special case single character, which doesn't need memcpy */
+
+ /* Lay the first one down (guaranteed this will occur) */
+ memcpy(result_s, to_s, to_len);
+ result_s += to_len;
+ count -= 1;
+
+ for (i=0; i<count; i++) {
+ *result_s++ = *self_s++;
+ memcpy(result_s, to_s, to_len);
+ result_s += to_len;
}
- return nfound;
+
+ /* Copy the rest of the original string */
+ memcpy(result_s, self_s, self_len-i);
+
+ return result;
}
-/*
- mymemreplace
+/* Special case for deleting a single character */
+/* len(self)>=1, len(from)==1, to="", maxcount>=1 */
+Py_LOCAL(PyStringObject *)
+replace_delete_single_character(PyStringObject *self,
+ char from_c, Py_ssize_t maxcount)
+{
+ char *self_s, *result_s;
+ char *start, *next, *end;
+ Py_ssize_t self_len, result_len;
+ Py_ssize_t count;
+ PyStringObject *result;
- Return a string in which all occurrences of PAT in memory STR are
- replaced with SUB.
+ self_len = PyString_GET_SIZE(self);
+ self_s = PyString_AS_STRING(self);
- If length of PAT is less than length of STR or there are no occurrences
- of PAT in STR, then the original string is returned. Otherwise, a new
- string is allocated here and returned.
+ count = countchar(self_s, self_len, from_c, maxcount);
+ if (count == 0) {
+ return return_self(self);
+ }
+
+ result_len = self_len - count; /* from_len == 1 */
+ assert(result_len>=0);
- on return, out_len is:
- the length of output string, or
- -1 if the input string is returned, or
- unchanged if an error occurs (no memory).
+ if ( (result = (PyStringObject *)
+ PyString_FromStringAndSize(NULL, result_len)) == NULL)
+ return NULL;
+ result_s = PyString_AS_STRING(result);
- return value is:
- the new string allocated locally, or
- NULL if an error occurred.
-*/
-static char *
-mymemreplace(const char *str, Py_ssize_t len, /* input string */
- const char *pat, Py_ssize_t pat_len, /* pattern string to find */
- const char *sub, Py_ssize_t sub_len, /* substitution string */
- Py_ssize_t count, /* number of replacements */
- Py_ssize_t *out_len)
+ start = self_s;
+ end = self_s + self_len;
+ while (count-- > 0) {
+ next = findchar(start, end-start, from_c);
+ if (next == NULL)
+ break;
+ memcpy(result_s, start, next-start);
+ result_s += (next-start);
+ start = next+1;
+ }
+ memcpy(result_s, start, end-start);
+
+ return result;
+}
+
+/* len(self)>=1, len(from)>=2, to="", maxcount>=1 */
+
+Py_LOCAL(PyStringObject *)
+replace_delete_substring(PyStringObject *self, PyStringObject *from,
+ Py_ssize_t maxcount) {
+ char *self_s, *from_s, *result_s;
+ char *start, *next, *end;
+ Py_ssize_t self_len, from_len, result_len;
+ Py_ssize_t count, offset;
+ PyStringObject *result;
+
+ self_len = PyString_GET_SIZE(self);
+ self_s = PyString_AS_STRING(self);
+ from_len = PyString_GET_SIZE(from);
+ from_s = PyString_AS_STRING(from);
+
+ count = countstring(self_s, self_len,
+ from_s, from_len,
+ 0, self_len, 1,
+ maxcount);
+
+ if (count == 0) {
+ /* no matches */
+ return return_self(self);
+ }
+
+ result_len = self_len - (count * from_len);
+ assert (result_len>=0);
+
+ if ( (result = (PyStringObject *)
+ PyString_FromStringAndSize(NULL, result_len)) == NULL )
+ return NULL;
+
+ result_s = PyString_AS_STRING(result);
+
+ start = self_s;
+ end = self_s + self_len;
+ while (count-- > 0) {
+ offset = findstring(start, end-start,
+ from_s, from_len,
+ 0, end-start, FORWARD);
+ if (offset == -1)
+ break;
+ next = start + offset;
+
+ memcpy(result_s, start, next-start);
+
+ result_s += (next-start);
+ start = next+from_len;
+ }
+ memcpy(result_s, start, end-start);
+ return result;
+}
+
+/* len(self)>=1, len(from)==len(to)==1, maxcount>=1 */
+Py_LOCAL(PyStringObject *)
+replace_single_character_in_place(PyStringObject *self,
+ char from_c, char to_c,
+ Py_ssize_t maxcount)
{
- char *out_s;
- char *new_s;
- Py_ssize_t nfound, offset, new_len;
-
- if (len == 0 || (pat_len == 0 && sub_len == 0) || pat_len > len)
- goto return_same;
-
- /* find length of output string */
- nfound = (pat_len > 0) ? mymemcnt(str, len, pat, pat_len) : len + 1;
- if (count < 0)
- count = PY_SSIZE_T_MAX;
- else if (nfound > count)
- nfound = count;
- if (nfound == 0)
- goto return_same;
-
- new_len = len + nfound*(sub_len - pat_len);
- if (new_len == 0) {
- /* Have to allocate something for the caller to free(). */
- out_s = (char *)PyMem_MALLOC(1);
- if (out_s == NULL)
- return NULL;
- out_s[0] = '\0';
+ char *self_s, *result_s, *start, *end, *next;
+ Py_ssize_t self_len;
+ PyStringObject *result;
+
+ /* The result string will be the same size */
+ self_s = PyString_AS_STRING(self);
+ self_len = PyString_GET_SIZE(self);
+
+ next = findchar(self_s, self_len, from_c);
+
+ if (next == NULL) {
+ /* No matches; return the original string */
+ return return_self(self);
+ }
+
+ /* Need to make a new string */
+ result = (PyStringObject *) PyString_FromStringAndSize(NULL, self_len);
+ if (result == NULL)
+ return NULL;
+ result_s = PyString_AS_STRING(result);
+ memcpy(result_s, self_s, self_len);
+
+ /* change everything in-place, starting with this one */
+ start = result_s + (next-self_s);
+ *start = to_c;
+ start++;
+ end = result_s + self_len;
+
+ while (--maxcount > 0) {
+ next = findchar(start, end-start, from_c);
+ if (next == NULL)
+ break;
+ *next = to_c;
+ start = next+1;
}
- else {
- assert(new_len > 0);
- new_s = (char *)PyMem_MALLOC(new_len);
- if (new_s == NULL)
- return NULL;
- out_s = new_s;
+
+ return result;
+}
- if (pat_len > 0) {
- for (; nfound > 0; --nfound) {
- /* find index of next instance of pattern */
- offset = mymemfind(str, len, pat, pat_len);
- if (offset == -1)
- break;
+/* len(self)>=1, len(from)==len(to)>=2, maxcount>=1 */
+Py_LOCAL(PyStringObject *)
+replace_substring_in_place(PyStringObject *self,
+ PyStringObject *from,
+ PyStringObject *to,
+ Py_ssize_t maxcount)
+{
+ char *result_s, *start, *end;
+ char *self_s, *from_s, *to_s;
+ Py_ssize_t self_len, from_len, offset;
+ PyStringObject *result;
+
+ /* The result string will be the same size */
+
+ self_s = PyString_AS_STRING(self);
+ self_len = PyString_GET_SIZE(self);
+
+ from_s = PyString_AS_STRING(from);
+ from_len = PyString_GET_SIZE(from);
+ to_s = PyString_AS_STRING(to);
+
+ offset = findstring(self_s, self_len,
+ from_s, from_len,
+ 0, self_len, FORWARD);
+
+ if (offset == -1) {
+ /* No matches; return the original string */
+ return return_self(self);
+ }
+
+ /* Need to make a new string */
+ result = (PyStringObject *) PyString_FromStringAndSize(NULL, self_len);
+ if (result == NULL)
+ return NULL;
+ result_s = PyString_AS_STRING(result);
+ memcpy(result_s, self_s, self_len);
+
+
+ /* change everything in-place, starting with this one */
+ start = result_s + offset;
+ memcpy(start, to_s, from_len);
+ start += from_len;
+ end = result_s + self_len;
+
+ while ( --maxcount > 0) {
+ offset = findstring(start, end-start,
+ from_s, from_len,
+ 0, end-start, FORWARD);
+ if (offset==-1)
+ break;
+ memcpy(start+offset, to_s, from_len);
+ start += offset+from_len;
+ }
+
+ return result;
+}
- /* copy non matching part of input string */
- memcpy(new_s, str, offset);
- str += offset + pat_len;
- len -= offset + pat_len;
+/* len(self)>=1, len(from)==1, len(to)>=2, maxcount>=1 */
+Py_LOCAL(PyStringObject *)
+replace_single_character(PyStringObject *self,
+ char from_c,
+ PyStringObject *to,
+ Py_ssize_t maxcount)
+{
+ char *self_s, *to_s, *result_s;
+ char *start, *next, *end;
+ Py_ssize_t self_len, to_len, result_len;
+ Py_ssize_t count, product;
+ PyStringObject *result;
+
+ self_s = PyString_AS_STRING(self);
+ self_len = PyString_GET_SIZE(self);
+
+ count = countchar(self_s, self_len, from_c, maxcount);
+
+ if (count == 0) {
+ /* no matches, return unchanged */
+ return return_self(self);
+ }
+
+ to_s = PyString_AS_STRING(to);
+ to_len = PyString_GET_SIZE(to);
+
+ /* use the difference between current and new, hence the "-1" */
+ /* result_len = self_len + count * (to_len-1) */
+ product = count * (to_len-1);
+ if (product / (to_len-1) != count) {
+ PyErr_SetString(PyExc_OverflowError, "replace string is too long");
+ return NULL;
+ }
+ result_len = self_len + product;
+ if (result_len < 0) {
+ PyErr_SetString(PyExc_OverflowError, "replace string is too long");
+ return NULL;
+ }
+
+ if ( (result = (PyStringObject *)
+ PyString_FromStringAndSize(NULL, result_len)) == NULL)
+ return NULL;
+ result_s = PyString_AS_STRING(result);
+
+ start = self_s;
+ end = self_s + self_len;
+ while (count-- > 0) {
+ next = findchar(start, end-start, from_c);
+ if (next == NULL)
+ break;
+
+ if (next == start) {
+ /* replace with the 'to' */
+ memcpy(result_s, to_s, to_len);
+ result_s += to_len;
+ start += 1;
+ } else {
+ /* copy the unchanged old then the 'to' */
+ memcpy(result_s, start, next-start);
+ result_s += (next-start);
+ memcpy(result_s, to_s, to_len);
+ result_s += to_len;
+ start = next+1;
+ }
+ }
+ /* Copy the remainder of the remaining string */
+ memcpy(result_s, start, end-start);
+
+ return result;
+}
- /* copy substitute into the output string */
- new_s += offset;
- memcpy(new_s, sub, sub_len);
- new_s += sub_len;
- }
- /* copy any remaining values into output string */
- if (len > 0)
- memcpy(new_s, str, len);
+/* len(self)>=1, len(from)>=2, len(to)>=2, maxcount>=1 */
+Py_LOCAL(PyStringObject *)
+replace_substring(PyStringObject *self,
+ PyStringObject *from,
+ PyStringObject *to,
+ Py_ssize_t maxcount) {
+ char *self_s, *from_s, *to_s, *result_s;
+ char *start, *next, *end;
+ Py_ssize_t self_len, from_len, to_len, result_len;
+ Py_ssize_t count, offset, product;
+ PyStringObject *result;
+
+ self_s = PyString_AS_STRING(self);
+ self_len = PyString_GET_SIZE(self);
+ from_s = PyString_AS_STRING(from);
+ from_len = PyString_GET_SIZE(from);
+
+ count = countstring(self_s, self_len,
+ from_s, from_len,
+ 0, self_len, FORWARD, maxcount);
+ if (count == 0) {
+ /* no matches, return unchanged */
+ return return_self(self);
+ }
+
+ to_s = PyString_AS_STRING(to);
+ to_len = PyString_GET_SIZE(to);
+
+ /* Check for overflow */
+ /* result_len = self_len + count * (to_len-from_len) */
+ product = count * (to_len-from_len);
+ if (product / (to_len-from_len) != count) {
+ PyErr_SetString(PyExc_OverflowError, "replace string is too long");
+ return NULL;
+ }
+ result_len = self_len + product;
+ if (result_len < 0) {
+ PyErr_SetString(PyExc_OverflowError, "replace string is too long");
+ return NULL;
+ }
+
+ if ( (result = (PyStringObject *)
+ PyString_FromStringAndSize(NULL, result_len)) == NULL)
+ return NULL;
+ result_s = PyString_AS_STRING(result);
+
+ start = self_s;
+ end = self_s + self_len;
+ while (count-- > 0) {
+ offset = findstring(start, end-start,
+ from_s, from_len,
+ 0, end-start, FORWARD);
+ if (offset == -1)
+ break;
+ next = start+offset;
+ if (next == start) {
+ /* replace with the 'to' */
+ memcpy(result_s, to_s, to_len);
+ result_s += to_len;
+ start += from_len;
+ } else {
+ /* copy the unchanged old then the 'to' */
+ memcpy(result_s, start, next-start);
+ result_s += (next-start);
+ memcpy(result_s, to_s, to_len);
+ result_s += to_len;
+ start = next+from_len;
}
- else {
- for (;;++str, --len) {
- memcpy(new_s, sub, sub_len);
- new_s += sub_len;
- if (--nfound <= 0) {
- memcpy(new_s, str, len);
- break;
- }
- *new_s++ = *str;
- }
+ }
+ /* Copy the remainder of the remaining string */
+ memcpy(result_s, start, end-start);
+
+ return result;
+}
+
+
+Py_LOCAL(PyStringObject *)
+replace(PyStringObject *self,
+ PyStringObject *from,
+ PyStringObject *to,
+ Py_ssize_t maxcount)
+{
+ Py_ssize_t from_len, to_len;
+
+ if (maxcount < 0) {
+ maxcount = PY_SSIZE_T_MAX;
+ } else if (maxcount == 0 || PyString_GET_SIZE(self) == 0) {
+ /* nothing to do; return the original string */
+ return return_self(self);
+ }
+
+ from_len = PyString_GET_SIZE(from);
+ to_len = PyString_GET_SIZE(to);
+
+ if (maxcount == 0 ||
+ (from_len == 0 && to_len == 0)) {
+ /* nothing to do; return the original string */
+ return return_self(self);
+ }
+
+ /* Handle zero-length special cases */
+
+ if (from_len == 0) {
+ /* insert the 'to' string everywhere. */
+ /* >>> "Python".replace("", ".") */
+ /* '.P.y.t.h.o.n.' */
+ return replace_interleave(self, to, maxcount);
+ }
+
+ /* Except for "".replace("", "A") == "A" there is no way beyond this */
+ /* point for an empty self string to generate a non-empty string */
+ /* Special case so the remaining code always gets a non-empty string */
+ if (PyString_GET_SIZE(self) == 0) {
+ return return_self(self);
+ }
+
+ if (to_len == 0) {
+ /* delete all occurances of 'from' string */
+ if (from_len == 1) {
+ return replace_delete_single_character(
+ self, PyString_AS_STRING(from)[0], maxcount);
+ } else {
+ return replace_delete_substring(self, from, maxcount);
}
}
- *out_len = new_len;
- return out_s;
- return_same:
- *out_len = -1;
- return (char *)str; /* cast away const */
-}
+ /* Handle special case where both strings have the same length */
+
+ if (from_len == to_len) {
+ if (from_len == 1) {
+ return replace_single_character_in_place(
+ self,
+ PyString_AS_STRING(from)[0],
+ PyString_AS_STRING(to)[0],
+ maxcount);
+ } else {
+ return replace_substring_in_place(
+ self, from, to, maxcount);
+ }
+ }
+ /* Otherwise use the more generic algorithms */
+ if (from_len == 1) {
+ return replace_single_character(self, PyString_AS_STRING(from)[0],
+ to, maxcount);
+ } else {
+ /* len('from')>=2, len('to')>=1 */
+ return replace_substring(self, from, to, maxcount);
+ }
+}
PyDoc_STRVAR(replace__doc__,
"S.replace (old, new[, count]) -> string\n\
@@ -2520,66 +3062,42 @@ given, only the first count occurrences are replaced.");
static PyObject *
string_replace(PyStringObject *self, PyObject *args)
{
- const char *str = PyString_AS_STRING(self), *sub, *repl;
- char *new_s;
- const Py_ssize_t len = PyString_GET_SIZE(self);
- Py_ssize_t sub_len, repl_len, out_len;
Py_ssize_t count = -1;
- PyObject *newobj;
- PyObject *subobj, *replobj;
+ PyObject *from, *to;
+ const char *tmp_s;
+ Py_ssize_t tmp_len;
- if (!PyArg_ParseTuple(args, "OO|n:replace",
- &subobj, &replobj, &count))
+ if (!PyArg_ParseTuple(args, "OO|n:replace", &from, &to, &count))
return NULL;
- if (PyString_Check(subobj)) {
- sub = PyString_AS_STRING(subobj);
- sub_len = PyString_GET_SIZE(subobj);
+ if (PyString_Check(from)) {
+ /* Can this be made a '!check' after the Unicode check? */
}
#ifdef Py_USING_UNICODE
- else if (PyUnicode_Check(subobj))
+ if (PyUnicode_Check(from))
return PyUnicode_Replace((PyObject *)self,
- subobj, replobj, count);
+ from, to, count);
#endif
- else if (PyObject_AsCharBuffer(subobj, &sub, &sub_len))
+ else if (PyObject_AsCharBuffer(from, &tmp_s, &tmp_len))
return NULL;
- if (PyString_Check(replobj)) {
- repl = PyString_AS_STRING(replobj);
- repl_len = PyString_GET_SIZE(replobj);
+ if (PyString_Check(to)) {
+ /* Can this be made a '!check' after the Unicode check? */
}
#ifdef Py_USING_UNICODE
- else if (PyUnicode_Check(replobj))
+ else if (PyUnicode_Check(to))
return PyUnicode_Replace((PyObject *)self,
- subobj, replobj, count);
+ from, to, count);
#endif
- else if (PyObject_AsCharBuffer(replobj, &repl, &repl_len))
+ else if (PyObject_AsCharBuffer(to, &tmp_s, &tmp_len))
return NULL;
- new_s = mymemreplace(str,len,sub,sub_len,repl,repl_len,count,&out_len);
- if (new_s == NULL) {
- PyErr_NoMemory();
- return NULL;
- }
- if (out_len == -1) {
- if (PyString_CheckExact(self)) {
- /* we're returning another reference to self */
- newobj = (PyObject*)self;
- Py_INCREF(newobj);
- }
- else {
- newobj = PyString_FromStringAndSize(str, len);
- if (newobj == NULL)
- return NULL;
- }
- }
- else {
- newobj = PyString_FromStringAndSize(new_s, out_len);
- PyMem_FREE(new_s);
- }
- return newobj;
+ return (PyObject *)replace((PyStringObject *) self,
+ (PyStringObject *) from,
+ (PyStringObject *) to, count);
}
+/** End DALKE **/
PyDoc_STRVAR(startswith__doc__,
"S.startswith(prefix[, start[, end]]) -> bool\n\
@@ -2820,7 +3338,7 @@ string_expandtabs(PyStringObject *self, PyObject *args)
return u;
}
-static PyObject *
+Py_LOCAL_INLINE(PyObject *)
pad(PyStringObject *self, Py_ssize_t left, Py_ssize_t right, char fill)
{
PyObject *u;
@@ -3237,6 +3755,14 @@ string_splitlines(PyStringObject *self, PyObject *args)
data = PyString_AS_STRING(self);
len = PyString_GET_SIZE(self);
+ /* This does not use the preallocated list because splitlines is
+ usually run with hundreds of newlines. The overhead of
+ switching between PyList_SET_ITEM and append causes about a
+ 2-3% slowdown for that common case. A smarter implementation
+ could move the if check out, so the SET_ITEMs are done first
+ and the appends only done when the prealloc buffer is full.
+ That's too much work for little gain.*/
+
list = PyList_New(0);
if (!list)
goto onError;
@@ -3274,6 +3800,9 @@ string_splitlines(PyStringObject *self, PyObject *args)
}
#undef SPLIT_APPEND
+#undef SPLIT_ADD
+#undef MAX_PREALLOC
+#undef PREALLOC_SIZE
static PyObject *
string_getnewargs(PyStringObject *v)
@@ -3303,6 +3832,7 @@ string_methods[] = {
{"count", (PyCFunction)string_count, METH_VARARGS, count__doc__},
{"endswith", (PyCFunction)string_endswith, METH_VARARGS,
endswith__doc__},
+ {"partition", (PyCFunction)string_partition, METH_O, partition__doc__},
{"find", (PyCFunction)string_find, METH_VARARGS, find__doc__},
{"index", (PyCFunction)string_index, METH_VARARGS, index__doc__},
{"lstrip", (PyCFunction)string_lstrip, METH_VARARGS, lstrip__doc__},
@@ -3310,6 +3840,8 @@ string_methods[] = {
{"rfind", (PyCFunction)string_rfind, METH_VARARGS, rfind__doc__},
{"rindex", (PyCFunction)string_rindex, METH_VARARGS, rindex__doc__},
{"rstrip", (PyCFunction)string_rstrip, METH_VARARGS, rstrip__doc__},
+ {"rpartition", (PyCFunction)string_rpartition, METH_O,
+ rpartition__doc__},
{"startswith", (PyCFunction)string_startswith, METH_VARARGS,
startswith__doc__},
{"strip", (PyCFunction)string_strip, METH_VARARGS, strip__doc__},
@@ -3566,7 +4098,7 @@ _PyString_Resize(PyObject **pv, Py_ssize_t newsize)
/* Helpers for formatstring */
-static PyObject *
+Py_LOCAL_INLINE(PyObject *)
getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
{
Py_ssize_t argidx = *p_argidx;
@@ -3595,7 +4127,7 @@ getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
#define F_ALT (1<<3)
#define F_ZERO (1<<4)
-static int
+Py_LOCAL_INLINE(int)
formatfloat(char *buf, size_t buflen, int flags,
int prec, int type, PyObject *v)
{
@@ -3782,7 +4314,7 @@ _PyString_FormatLong(PyObject *val, int flags, int prec, int type,
return result;
}
-static int
+Py_LOCAL_INLINE(int)
formatint(char *buf, size_t buflen, int flags,
int prec, int type, PyObject *v)
{
@@ -3854,7 +4386,7 @@ formatint(char *buf, size_t buflen, int flags,
return (int)strlen(buf);
}
-static int
+Py_LOCAL_INLINE(int)
formatchar(char *buf, size_t buflen, PyObject *v)
{
/* presume that the buffer is at least 2 characters long */