summaryrefslogtreecommitdiff
path: root/Objects/stringlib
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/stringlib')
-rw-r--r--Objects/stringlib/asciilib.h1
-rw-r--r--Objects/stringlib/codecs.h6
-rw-r--r--Objects/stringlib/fastsearch.h8
-rw-r--r--Objects/stringlib/join.h133
-rw-r--r--Objects/stringlib/replace.h53
-rw-r--r--Objects/stringlib/stringdefs.h1
-rw-r--r--Objects/stringlib/ucs1lib.h1
-rw-r--r--Objects/stringlib/ucs2lib.h1
-rw-r--r--Objects/stringlib/ucs4lib.h1
-rw-r--r--Objects/stringlib/undef.h1
-rw-r--r--Objects/stringlib/unicode_format.h139
-rw-r--r--Objects/stringlib/unicodedefs.h1
12 files changed, 256 insertions, 90 deletions
diff --git a/Objects/stringlib/asciilib.h b/Objects/stringlib/asciilib.h
index f62813d2fd..d0fc18d22f 100644
--- a/Objects/stringlib/asciilib.h
+++ b/Objects/stringlib/asciilib.h
@@ -19,7 +19,6 @@
#define STRINGLIB_STR PyUnicode_1BYTE_DATA
#define STRINGLIB_LEN PyUnicode_GET_LENGTH
#define STRINGLIB_NEW(STR,LEN) _PyUnicode_FromASCII((char*)(STR),(LEN))
-#define STRINGLIB_RESIZE not_supported
#define STRINGLIB_CHECK PyUnicode_Check
#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index f353367013..f855003308 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -47,7 +47,7 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
unsigned long value = *(unsigned long *) _s;
if (value & ASCII_CHAR_MASK)
break;
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+#if PY_LITTLE_ENDIAN
_p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
_p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
_p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
@@ -486,7 +486,7 @@ STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
const unsigned char *q = *inptr;
STRINGLIB_CHAR *p = dest + *outpos;
/* Offsets from q for retrieving byte pairs in the right order. */
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+#if PY_LITTLE_ENDIAN
int ihi = !!native_ordering, ilo = !native_ordering;
#else
int ihi = !native_ordering, ilo = !!native_ordering;
@@ -517,7 +517,7 @@ STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
block = SWAB(block);
#endif
}
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+#if PY_LITTLE_ENDIAN
# if SIZEOF_LONG == 4
p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
p[1] = (STRINGLIB_CHAR)(block >> 16);
diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h
index 55ac77dd70..cd7cac40fa 100644
--- a/Objects/stringlib/fastsearch.h
+++ b/Objects/stringlib/fastsearch.h
@@ -142,6 +142,8 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
mask = 0;
if (mode != FAST_RSEARCH) {
+ const STRINGLIB_CHAR *ss = s + m - 1;
+ const STRINGLIB_CHAR *pp = p + m - 1;
/* create compressed boyer-moore delta 1 table */
@@ -156,7 +158,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
for (i = 0; i <= w; i++) {
/* note: using mlast in the skip path slows things down on x86 */
- if (s[i+m-1] == p[m-1]) {
+ if (ss[i] == pp[0]) {
/* candidate match */
for (j = 0; j < mlast; j++)
if (s[i+j] != p[j])
@@ -172,13 +174,13 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
continue;
}
/* miss: check if next character is part of pattern */
- if (!STRINGLIB_BLOOM(mask, s[i+m]))
+ if (!STRINGLIB_BLOOM(mask, ss[i+1]))
i = i + m;
else
i = i + skip;
} else {
/* skip: check if next character is part of pattern */
- if (!STRINGLIB_BLOOM(mask, s[i+m]))
+ if (!STRINGLIB_BLOOM(mask, ss[i+1]))
i = i + m;
}
}
diff --git a/Objects/stringlib/join.h b/Objects/stringlib/join.h
new file mode 100644
index 0000000000..5568b31dab
--- /dev/null
+++ b/Objects/stringlib/join.h
@@ -0,0 +1,133 @@
+/* stringlib: bytes joining implementation */
+
+#if STRINGLIB_SIZEOF_CHAR != 1
+#error join.h only compatible with byte-wise strings
+#endif
+
+Py_LOCAL_INLINE(PyObject *)
+STRINGLIB(bytes_join)(PyObject *sep, PyObject *iterable)
+{
+ char *sepstr = STRINGLIB_STR(sep);
+ const Py_ssize_t seplen = STRINGLIB_LEN(sep);
+ PyObject *res = NULL;
+ char *p;
+ Py_ssize_t seqlen = 0;
+ Py_ssize_t sz = 0;
+ Py_ssize_t i, nbufs;
+ PyObject *seq, *item;
+ Py_buffer *buffers = NULL;
+#define NB_STATIC_BUFFERS 10
+ Py_buffer static_buffers[NB_STATIC_BUFFERS];
+
+ seq = PySequence_Fast(iterable, "can only join an iterable");
+ if (seq == NULL) {
+ return NULL;
+ }
+
+ seqlen = PySequence_Fast_GET_SIZE(seq);
+ if (seqlen == 0) {
+ Py_DECREF(seq);
+ return STRINGLIB_NEW(NULL, 0);
+ }
+#ifndef STRINGLIB_MUTABLE
+ if (seqlen == 1) {
+ item = PySequence_Fast_GET_ITEM(seq, 0);
+ if (STRINGLIB_CHECK_EXACT(item)) {
+ Py_INCREF(item);
+ Py_DECREF(seq);
+ return item;
+ }
+ }
+#endif
+ if (seqlen > NB_STATIC_BUFFERS) {
+ buffers = PyMem_NEW(Py_buffer, seqlen);
+ if (buffers == NULL) {
+ Py_DECREF(seq);
+ PyErr_NoMemory();
+ return NULL;
+ }
+ }
+ else {
+ buffers = static_buffers;
+ }
+
+ /* Here is the general case. Do a pre-pass to figure out the total
+ * amount of space we'll need (sz), and see whether all arguments are
+ * buffer-compatible.
+ */
+ for (i = 0, nbufs = 0; i < seqlen; i++) {
+ Py_ssize_t itemlen;
+ item = PySequence_Fast_GET_ITEM(seq, i);
+ if (_getbuffer(item, &buffers[i]) < 0) {
+ PyErr_Format(PyExc_TypeError,
+ "sequence item %zd: expected bytes, bytearray, "
+ "or an object with the buffer interface, %.80s found",
+ i, Py_TYPE(item)->tp_name);
+ goto error;
+ }
+ nbufs = i + 1; /* for error cleanup */
+ itemlen = buffers[i].len;
+ if (itemlen > PY_SSIZE_T_MAX - sz) {
+ PyErr_SetString(PyExc_OverflowError,
+ "join() result is too long");
+ goto error;
+ }
+ sz += itemlen;
+ if (i != 0) {
+ if (seplen > PY_SSIZE_T_MAX - sz) {
+ PyErr_SetString(PyExc_OverflowError,
+ "join() result is too long");
+ goto error;
+ }
+ sz += seplen;
+ }
+ if (seqlen != PySequence_Fast_GET_SIZE(seq)) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "sequence changed size during iteration");
+ goto error;
+ }
+ }
+
+ /* Allocate result space. */
+ res = STRINGLIB_NEW(NULL, sz);
+ if (res == NULL)
+ goto error;
+
+ /* Catenate everything. */
+ p = STRINGLIB_STR(res);
+ if (!seplen) {
+ /* fast path */
+ for (i = 0; i < nbufs; i++) {
+ Py_ssize_t n = buffers[i].len;
+ char *q = buffers[i].buf;
+ Py_MEMCPY(p, q, n);
+ p += n;
+ }
+ goto done;
+ }
+ for (i = 0; i < nbufs; i++) {
+ Py_ssize_t n;
+ char *q;
+ if (i) {
+ Py_MEMCPY(p, sepstr, seplen);
+ p += seplen;
+ }
+ n = buffers[i].len;
+ q = buffers[i].buf;
+ Py_MEMCPY(p, q, n);
+ p += n;
+ }
+ goto done;
+
+error:
+ res = NULL;
+done:
+ Py_DECREF(seq);
+ for (i = 0; i < nbufs; i++)
+ PyBuffer_Release(&buffers[i]);
+ if (buffers != static_buffers)
+ PyMem_FREE(buffers);
+ return res;
+}
+
+#undef NB_STATIC_BUFFERS
diff --git a/Objects/stringlib/replace.h b/Objects/stringlib/replace.h
new file mode 100644
index 0000000000..ef318ed6dd
--- /dev/null
+++ b/Objects/stringlib/replace.h
@@ -0,0 +1,53 @@
+/* stringlib: replace implementation */
+
+#ifndef STRINGLIB_FASTSEARCH_H
+#error must include "stringlib/fastsearch.h" before including this module
+#endif
+
+Py_LOCAL_INLINE(void)
+STRINGLIB(replace_1char_inplace)(STRINGLIB_CHAR* s, STRINGLIB_CHAR* end,
+ Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
+{
+ *s = u2;
+ while (--maxcount && ++s != end) {
+ /* Find the next character to be replaced.
+
+ If it occurs often, it is faster to scan for it using an inline
+ loop. If it occurs seldom, it is faster to scan for it using a
+ function call; the overhead of the function call is amortized
+ across the many characters that call covers. We start with an
+ inline loop and use a heuristic to determine whether to fall back
+ to a function call. */
+ if (*s != u1) {
+ int attempts = 10;
+ /* search u1 in a dummy loop */
+ while (1) {
+ if (++s == end)
+ return;
+ if (*s == u1)
+ break;
+ if (!--attempts) {
+ /* if u1 was not found for attempts iterations,
+ use FASTSEARCH() or memchr() */
+#if STRINGLIB_SIZEOF_CHAR == 1
+ s++;
+ s = memchr(s, u1, end - s);
+ if (s == NULL)
+ return;
+#else
+ Py_ssize_t i;
+ STRINGLIB_CHAR ch1 = (STRINGLIB_CHAR) u1;
+ s++;
+ i = FASTSEARCH(s, end - s, &ch1, 1, 0, FAST_SEARCH);
+ if (i < 0)
+ return;
+ s += i;
+#endif
+ /* restart the dummy loop */
+ break;
+ }
+ }
+ }
+ *s = u2;
+ }
+}
diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h
index 7bb91a7a5b..ce27f3e408 100644
--- a/Objects/stringlib/stringdefs.h
+++ b/Objects/stringlib/stringdefs.h
@@ -21,7 +21,6 @@
#define STRINGLIB_STR PyBytes_AS_STRING
#define STRINGLIB_LEN PyBytes_GET_SIZE
#define STRINGLIB_NEW PyBytes_FromStringAndSize
-#define STRINGLIB_RESIZE _PyBytes_Resize
#define STRINGLIB_CHECK PyBytes_Check
#define STRINGLIB_CHECK_EXACT PyBytes_CheckExact
#define STRINGLIB_TOSTR PyObject_Str
diff --git a/Objects/stringlib/ucs1lib.h b/Objects/stringlib/ucs1lib.h
index e8c6fcb85f..ce1eb57f0d 100644
--- a/Objects/stringlib/ucs1lib.h
+++ b/Objects/stringlib/ucs1lib.h
@@ -19,7 +19,6 @@
#define STRINGLIB_STR PyUnicode_1BYTE_DATA
#define STRINGLIB_LEN PyUnicode_GET_LENGTH
#define STRINGLIB_NEW _PyUnicode_FromUCS1
-#define STRINGLIB_RESIZE not_supported
#define STRINGLIB_CHECK PyUnicode_Check
#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact
diff --git a/Objects/stringlib/ucs2lib.h b/Objects/stringlib/ucs2lib.h
index 45e572963d..f900cb65f8 100644
--- a/Objects/stringlib/ucs2lib.h
+++ b/Objects/stringlib/ucs2lib.h
@@ -19,7 +19,6 @@
#define STRINGLIB_STR PyUnicode_2BYTE_DATA
#define STRINGLIB_LEN PyUnicode_GET_LENGTH
#define STRINGLIB_NEW _PyUnicode_FromUCS2
-#define STRINGLIB_RESIZE not_supported
#define STRINGLIB_CHECK PyUnicode_Check
#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact
diff --git a/Objects/stringlib/ucs4lib.h b/Objects/stringlib/ucs4lib.h
index 647a27e233..86a480f1e3 100644
--- a/Objects/stringlib/ucs4lib.h
+++ b/Objects/stringlib/ucs4lib.h
@@ -19,7 +19,6 @@
#define STRINGLIB_STR PyUnicode_4BYTE_DATA
#define STRINGLIB_LEN PyUnicode_GET_LENGTH
#define STRINGLIB_NEW _PyUnicode_FromUCS4
-#define STRINGLIB_RESIZE not_supported
#define STRINGLIB_CHECK PyUnicode_Check
#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact
diff --git a/Objects/stringlib/undef.h b/Objects/stringlib/undef.h
index 03117ec443..f9d3f1d332 100644
--- a/Objects/stringlib/undef.h
+++ b/Objects/stringlib/undef.h
@@ -6,7 +6,6 @@
#undef STRINGLIB_STR
#undef STRINGLIB_LEN
#undef STRINGLIB_NEW
-#undef STRINGLIB_RESIZE
#undef _Py_InsertThousandsGrouping
#undef STRINGLIB_IS_UNICODE
diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h
index c1c2cf3781..aec221acff 100644
--- a/Objects/stringlib/unicode_format.h
+++ b/Objects/stringlib/unicode_format.h
@@ -543,7 +543,7 @@ done:
static int
parse_field(SubString *str, SubString *field_name, SubString *format_spec,
- Py_UCS4 *conversion)
+ int *format_spec_needs_expanding, Py_UCS4 *conversion)
{
/* Note this function works if the field name is zero length,
which is good. Zero length field names are handled later, in
@@ -561,6 +561,15 @@ parse_field(SubString *str, SubString *field_name, SubString *format_spec,
field_name->start = str->start;
while (str->start < str->end) {
switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
+ case '{':
+ PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name");
+ return 0;
+ case '[':
+ for (; str->start < str->end; str->start++)
+ if (PyUnicode_READ_CHAR(str->str, str->start) == ']')
+ break;
+ continue;
+ case '}':
case ':':
case '!':
break;
@@ -570,41 +579,62 @@ parse_field(SubString *str, SubString *field_name, SubString *format_spec,
break;
}
+ field_name->end = str->start - 1;
if (c == '!' || c == ':') {
+ Py_ssize_t count;
/* we have a format specifier and/or a conversion */
/* don't include the last character */
- field_name->end = str->start-1;
-
- /* the format specifier is the rest of the string */
- format_spec->str = str->str;
- format_spec->start = str->start;
- format_spec->end = str->end;
/* see if there's a conversion specifier */
if (c == '!') {
/* there must be another character present */
- if (format_spec->start >= format_spec->end) {
+ if (str->start >= str->end) {
PyErr_SetString(PyExc_ValueError,
- "end of format while looking for conversion "
+ "end of string while looking for conversion "
"specifier");
return 0;
}
- *conversion = PyUnicode_READ_CHAR(format_spec->str, format_spec->start++);
+ *conversion = PyUnicode_READ_CHAR(str->str, str->start++);
- /* if there is another character, it must be a colon */
- if (format_spec->start < format_spec->end) {
- c = PyUnicode_READ_CHAR(format_spec->str, format_spec->start++);
+ if (str->start < str->end) {
+ c = PyUnicode_READ_CHAR(str->str, str->start++);
+ if (c == '}')
+ return 1;
if (c != ':') {
PyErr_SetString(PyExc_ValueError,
- "expected ':' after format specifier");
+ "expected ':' after conversion specifier");
return 0;
}
}
}
+ format_spec->str = str->str;
+ format_spec->start = str->start;
+ count = 1;
+ while (str->start < str->end) {
+ switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
+ case '{':
+ *format_spec_needs_expanding = 1;
+ count++;
+ break;
+ case '}':
+ count--;
+ if (count == 0) {
+ format_spec->end = str->start - 1;
+ return 1;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec");
+ return 0;
+ }
+ else if (c != '}') {
+ PyErr_SetString(PyExc_ValueError, "expected '}' before end of string");
+ return 0;
}
- else
- /* end of string, there's no format_spec or conversion */
- field_name->end = str->start;
return 1;
}
@@ -638,10 +668,9 @@ MarkupIterator_next(MarkupIterator *self, SubString *literal,
SubString *format_spec, Py_UCS4 *conversion,
int *format_spec_needs_expanding)
{
- int at_end, hit_format_spec;
+ int at_end;
Py_UCS4 c = 0;
Py_ssize_t start;
- int count;
Py_ssize_t len;
int markup_follows = 0;
@@ -713,50 +742,12 @@ MarkupIterator_next(MarkupIterator *self, SubString *literal,
if (!markup_follows)
return 2;
- /* this is markup, find the end of the string by counting nested
- braces. note that this prohibits escaped braces, so that
- format_specs cannot have braces in them. */
+ /* this is markup; parse the field */
*field_present = 1;
- count = 1;
-
- start = self->str.start;
-
- /* we know we can't have a zero length string, so don't worry
- about that case */
- hit_format_spec = 0;
- while (self->str.start < self->str.end) {
- switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) {
- case ':':
- hit_format_spec = 1;
- count = 1;
- break;
- case '{':
- /* the format spec needs to be recursively expanded.
- this is an optimization, and not strictly needed */
- if (hit_format_spec)
- *format_spec_needs_expanding = 1;
- count++;
- break;
- case '}':
- count--;
- if (count <= 0) {
- /* we're done. parse and get out */
- SubString s;
-
- SubString_init(&s, self->str.str, start, self->str.start - 1);
- if (parse_field(&s, field_name, format_spec, conversion) == 0)
- return 0;
-
- /* success */
- return 2;
- }
- break;
- }
- }
-
- /* end of string while searching for matching '}' */
- PyErr_SetString(PyExc_ValueError, "unmatched '{' in format");
- return 0;
+ if (!parse_field(&self->str, field_name, format_spec,
+ format_spec_needs_expanding, conversion))
+ return 0;
+ return 2;
}
@@ -875,25 +866,19 @@ do_markup(SubString *input, PyObject *args, PyObject *kwargs,
SubString literal;
SubString field_name;
SubString format_spec;
- Py_UCS4 conversion, maxchar;
- Py_ssize_t sublen;
- int err;
+ Py_UCS4 conversion;
MarkupIterator_init(&iter, input->str, input->start, input->end);
while ((result = MarkupIterator_next(&iter, &literal, &field_present,
&field_name, &format_spec,
&conversion,
&format_spec_needs_expanding)) == 2) {
- sublen = literal.end - literal.start;
- if (sublen) {
- maxchar = _PyUnicode_FindMaxChar(literal.str,
- literal.start, literal.end);
- err = _PyUnicodeWriter_Prepare(writer, sublen, maxchar);
- if (err == -1)
+ if (literal.end != literal.start) {
+ if (!field_present && iter.str.start == iter.str.end)
+ writer->overallocate = 0;
+ if (_PyUnicodeWriter_WriteSubstring(writer, literal.str,
+ literal.start, literal.end) < 0)
return 0;
- _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
- literal.str, literal.start, sublen);
- writer->pos += sublen;
}
if (field_present) {
@@ -918,7 +903,6 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs,
int recursion_depth, AutoNumber *auto_number)
{
_PyUnicodeWriter writer;
- Py_ssize_t minlen;
/* check the recursion level */
if (recursion_depth <= 0) {
@@ -927,8 +911,9 @@ build_string(SubString *input, PyObject *args, PyObject *kwargs,
return NULL;
}
- minlen = PyUnicode_GET_LENGTH(input->str) + 100;
- _PyUnicodeWriter_Init(&writer, minlen);
+ _PyUnicodeWriter_Init(&writer);
+ writer.overallocate = 1;
+ writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
if (!do_markup(input, args, kwargs, &writer, recursion_depth,
auto_number)) {
diff --git a/Objects/stringlib/unicodedefs.h b/Objects/stringlib/unicodedefs.h
index f16f21e60c..48d00eccd0 100644
--- a/Objects/stringlib/unicodedefs.h
+++ b/Objects/stringlib/unicodedefs.h
@@ -21,7 +21,6 @@
#define STRINGLIB_STR PyUnicode_AS_UNICODE
#define STRINGLIB_LEN PyUnicode_GET_SIZE
#define STRINGLIB_NEW PyUnicode_FromUnicode
-#define STRINGLIB_RESIZE PyUnicode_Resize
#define STRINGLIB_CHECK PyUnicode_Check
#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact