summaryrefslogtreecommitdiff
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r--Objects/unicodeobject.c1732
1 files changed, 1082 insertions, 650 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 2ec878f916..b70666106d 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -41,9 +41,6 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#define PY_SSIZE_T_CLEAN
#include "Python.h"
-#include "bytes_methods.h"
-
-#include "unicodeobject.h"
#include "ucnhash.h"
#ifdef MS_WINDOWS
@@ -114,21 +111,12 @@ static PyUnicodeObject *unicode_empty;
shared as well. */
static PyUnicodeObject *unicode_latin1[256];
-/* Default encoding to use and assume when NULL is passed as encoding
- parameter; it is fixed to "utf-8". Always use the
- PyUnicode_GetDefaultEncoding() API to access this global.
-
- Don't forget to alter Py_FileSystemDefaultEncoding if you change the
- hard coded default!
-*/
-static const char unicode_default_encoding[] = "utf-8";
-
/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
0, 0, 0, 0, 0, 0, 0, 0,
-/* case 0x0009: * HORIZONTAL TABULATION */
+/* case 0x0009: * CHARACTER TABULATION */
/* case 0x000A: * LINE FEED */
-/* case 0x000B: * VERTICAL TABULATION */
+/* case 0x000B: * LINE TABULATION */
/* case 0x000C: * FORM FEED */
/* case 0x000D: * CARRIAGE RETURN */
0, 1, 1, 1, 1, 1, 0, 0,
@@ -169,8 +157,10 @@ static void raise_encode_exception(PyObject **exceptionObject,
static unsigned char ascii_linebreak[] = {
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x000A, * LINE FEED */
+/* 0x000B, * LINE TABULATION */
+/* 0x000C, * FORM FEED */
/* 0x000D, * CARRIAGE RETURN */
- 0, 0, 1, 0, 0, 1, 0, 0,
+ 0, 0, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x001C, * FILE SEPARATOR */
/* 0x001D, * GROUP SEPARATOR */
@@ -212,11 +202,22 @@ PyUnicode_GetMax(void)
/* the linebreak mask is set up by Unicode_Init below */
+#if LONG_BIT >= 128
+#define BLOOM_WIDTH 128
+#elif LONG_BIT >= 64
+#define BLOOM_WIDTH 64
+#elif LONG_BIT >= 32
+#define BLOOM_WIDTH 32
+#else
+#error "LONG_BIT is smaller than 32"
+#endif
+
#define BLOOM_MASK unsigned long
static BLOOM_MASK bloom_linebreak;
-#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
+#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
+#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM_LINEBREAK(ch) \
((ch) < 128U ? ascii_linebreak[(ch)] : \
@@ -226,12 +227,12 @@ Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
{
/* calculate simple bloom-style bitmask for a given unicode string */
- long mask;
+ BLOOM_MASK mask;
Py_ssize_t i;
mask = 0;
for (i = 0; i < len; i++)
- mask |= (1 << (ptr[i] & 0x1F));
+ BLOOM_ADD(mask, ptr[i]);
return mask;
}
@@ -651,7 +652,7 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
return NULL;
/* Copy the wchar_t data into the new object */
-#ifdef HAVE_USABLE_WCHAR_T
+#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
memcpy(unicode->str, w, size * sizeof(wchar_t));
#else
{
@@ -671,7 +672,8 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
#undef CONVERT_WCHAR_TO_SURROGATES
static void
-makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
+makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
+ int zeropad, int width, int precision, char c)
{
*fmt++ = '%';
if (width) {
@@ -683,6 +685,19 @@ makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int pre
fmt += sprintf(fmt, ".%d", precision);
if (longflag)
*fmt++ = 'l';
+ else if (longlongflag) {
+ /* longlongflag should only ever be nonzero on machines with
+ HAVE_LONG_LONG defined */
+#ifdef HAVE_LONG_LONG
+ char *f = PY_FORMAT_LONG_LONG;
+ while (*f)
+ *fmt++ = *f++;
+#else
+ /* we shouldn't ever get here */
+ assert(0);
+ *fmt++ = 'l';
+#endif
+ }
else if (size_tflag) {
char *f = PY_FORMAT_SIZE_T;
while (*f)
@@ -694,6 +709,16 @@ makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int pre
#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
+/* size of fixed-size buffer for formatting single arguments */
+#define ITEM_BUFFER_LEN 21
+/* maximum number of characters required for output of %ld. 21 characters
+ allows for 64-bit integers (in decimal) and an optional sign. */
+#define MAX_LONG_CHARS 21
+/* maximum number of characters required for output of %lld.
+ We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
+ plus 1 for the sign. 53/22 is an upper bound for log10(256). */
+#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
+
PyObject *
PyUnicode_FromFormatV(const char *format, va_list vargs)
{
@@ -709,24 +734,16 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
Py_UNICODE *s;
PyObject *string;
/* used by sprintf */
- char buffer[21];
+ char buffer[ITEM_BUFFER_LEN+1];
/* use abuffer instead of buffer, if we need more space
* (which can happen if there's a format specifier with width). */
char *abuffer = NULL;
char *realbuffer;
Py_ssize_t abuffersize = 0;
- char fmt[60]; /* should be enough for %0width.precisionld */
+ char fmt[61]; /* should be enough for %0width.precisionlld */
const char *copy;
-#ifdef VA_LIST_IS_ARRAY
- Py_MEMCPY(count, vargs, sizeof(va_list));
-#else
-#ifdef __va_copy
- __va_copy(count, vargs);
-#else
- count = vargs;
-#endif
-#endif
+ Py_VA_COPY(count, vargs);
/* step 1: count the number of %S/%R/%A/%s format specifications
* (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
* PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
@@ -735,15 +752,22 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
if (*f == '%') {
if (*(f+1)=='%')
continue;
- if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
+ if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
++callcount;
- while (ISDIGIT((unsigned)*f))
+ while (Py_ISDIGIT((unsigned)*f))
width = (width*10) + *f++ - '0';
- while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
+ while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
;
if (*f == 's')
++callcount;
}
+ else if (128 <= (unsigned char)*f) {
+ PyErr_Format(PyExc_ValueError,
+ "PyUnicode_FromFormatV() expects an ASCII-encoded format "
+ "string, got a non-ASCII byte: 0x%02x",
+ (unsigned char)*f);
+ return NULL;
+ }
}
/* step 2: allocate memory for the results of
* PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
@@ -758,37 +782,70 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
/* step 3: figure out how large a buffer we need */
for (f = format; *f; f++) {
if (*f == '%') {
+#ifdef HAVE_LONG_LONG
+ int longlongflag = 0;
+#endif
const char* p = f;
width = 0;
- while (ISDIGIT((unsigned)*f))
+ while (Py_ISDIGIT((unsigned)*f))
width = (width*10) + *f++ - '0';
- while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
+ while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
;
/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
* they don't affect the amount of space we reserve.
*/
- if ((*f == 'l' || *f == 'z') &&
- (f[1] == 'd' || f[1] == 'u'))
+ if (*f == 'l') {
+ if (f[1] == 'd' || f[1] == 'u') {
+ ++f;
+ }
+#ifdef HAVE_LONG_LONG
+ else if (f[1] == 'l' &&
+ (f[2] == 'd' || f[2] == 'u')) {
+ longlongflag = 1;
+ f += 2;
+ }
+#endif
+ }
+ else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
++f;
+ }
switch (*f) {
case 'c':
+ {
+#ifndef Py_UNICODE_WIDE
+ int ordinal = va_arg(count, int);
+ if (ordinal > 0xffff)
+ n += 2;
+ else
+ n++;
+#else
(void)va_arg(count, int);
- /* fall through... */
+ n++;
+#endif
+ break;
+ }
case '%':
n++;
break;
case 'd': case 'u': case 'i': case 'x':
(void) va_arg(count, int);
- /* 20 bytes is enough to hold a 64-bit
- integer. Decimal takes the most space.
- This isn't enough for octal.
- If a width is specified we need more
- (which we allocate later). */
- if (width < 20)
- width = 20;
+#ifdef HAVE_LONG_LONG
+ if (longlongflag) {
+ if (width < MAX_LONG_LONG_CHARS)
+ width = MAX_LONG_LONG_CHARS;
+ }
+ else
+#endif
+ /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
+ including sign. Decimal takes the most space. This
+ isn't enough for octal. If a width is specified we
+ need more (which we allocate later). */
+ if (width < MAX_LONG_CHARS)
+ width = MAX_LONG_CHARS;
n += width;
+ /* XXX should allow for large precision here too. */
if (abuffersize < width)
abuffersize = width;
break;
@@ -815,12 +872,20 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
{
PyObject *obj = va_arg(count, PyObject *);
const char *str = va_arg(count, const char *);
+ PyObject *str_obj;
assert(obj || str);
assert(!obj || PyUnicode_Check(obj));
- if (obj)
+ if (obj) {
n += PyUnicode_GET_SIZE(obj);
- else
- n += strlen(str);
+ *callresult++ = NULL;
+ }
+ else {
+ str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
+ if (!str_obj)
+ goto fail;
+ n += PyUnicode_GET_SIZE(str_obj);
+ *callresult++ = str_obj;
+ }
break;
}
case 'S':
@@ -885,8 +950,9 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
n++;
}
expand:
- if (abuffersize > 20) {
- abuffer = PyObject_Malloc(abuffersize);
+ if (abuffersize > ITEM_BUFFER_LEN) {
+ /* add 1 for sprintf's trailing null byte */
+ abuffer = PyObject_Malloc(abuffersize + 1);
if (!abuffer) {
PyErr_NoMemory();
goto fail;
@@ -910,23 +976,32 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
if (*f == '%') {
const char* p = f++;
int longflag = 0;
+ int longlongflag = 0;
int size_tflag = 0;
zeropad = (*f == '0');
/* parse the width.precision part */
width = 0;
- while (ISDIGIT((unsigned)*f))
+ while (Py_ISDIGIT((unsigned)*f))
width = (width*10) + *f++ - '0';
precision = 0;
if (*f == '.') {
f++;
- while (ISDIGIT((unsigned)*f))
+ while (Py_ISDIGIT((unsigned)*f))
precision = (precision*10) + *f++ - '0';
}
- /* handle the long flag, but only for %ld and %lu.
- others can be added when necessary. */
- if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
- longflag = 1;
- ++f;
+ /* Handle %ld, %lu, %lld and %llu. */
+ if (*f == 'l') {
+ if (f[1] == 'd' || f[1] == 'u') {
+ longflag = 1;
+ ++f;
+ }
+#ifdef HAVE_LONG_LONG
+ else if (f[1] == 'l' &&
+ (f[2] == 'd' || f[2] == 'u')) {
+ longlongflag = 1;
+ f += 2;
+ }
+#endif
}
/* handle the size_t flag. */
if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
@@ -936,12 +1011,27 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
switch (*f) {
case 'c':
- *s++ = va_arg(vargs, int);
+ {
+ int ordinal = va_arg(vargs, int);
+#ifndef Py_UNICODE_WIDE
+ if (ordinal > 0xffff) {
+ ordinal -= 0x10000;
+ *s++ = 0xD800 | (ordinal >> 10);
+ *s++ = 0xDC00 | (ordinal & 0x3FF);
+ } else
+#endif
+ *s++ = ordinal;
break;
+ }
case 'd':
- makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
+ makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
+ width, precision, 'd');
if (longflag)
sprintf(realbuffer, fmt, va_arg(vargs, long));
+#ifdef HAVE_LONG_LONG
+ else if (longlongflag)
+ sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
+#endif
else if (size_tflag)
sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
else
@@ -949,9 +1039,15 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
appendstring(realbuffer);
break;
case 'u':
- makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
+ makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
+ width, precision, 'u');
if (longflag)
sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
+#ifdef HAVE_LONG_LONG
+ else if (longlongflag)
+ sprintf(realbuffer, fmt, va_arg(vargs,
+ unsigned PY_LONG_LONG));
+#endif
else if (size_tflag)
sprintf(realbuffer, fmt, va_arg(vargs, size_t));
else
@@ -959,12 +1055,12 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
appendstring(realbuffer);
break;
case 'i':
- makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
+ makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
sprintf(realbuffer, fmt, va_arg(vargs, int));
appendstring(realbuffer);
break;
case 'x':
- makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
+ makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
sprintf(realbuffer, fmt, va_arg(vargs, int));
appendstring(realbuffer);
break;
@@ -992,18 +1088,23 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
case 'V':
{
PyObject *obj = va_arg(vargs, PyObject *);
- const char *str = va_arg(vargs, const char *);
+ va_arg(vargs, const char *);
if (obj) {
Py_ssize_t size = PyUnicode_GET_SIZE(obj);
Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
s += size;
} else {
- appendstring(str);
+ Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
+ PyUnicode_GET_SIZE(*callresult));
+ s += PyUnicode_GET_SIZE(*callresult);
+ Py_DECREF(*callresult);
}
+ ++callresult;
break;
}
case 'S':
case 'R':
+ case 'A':
{
Py_UNICODE *ucopy;
Py_ssize_t usize;
@@ -1039,7 +1140,8 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
appendstring(p);
goto end;
}
- } else
+ }
+ else
*s++ = *f;
}
@@ -1054,7 +1156,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
if (callresults) {
PyObject **callresult2 = callresults;
while (callresult2 < callresult) {
- Py_DECREF(*callresult2);
+ Py_XDECREF(*callresult2);
++callresult2;
}
PyObject_Free(callresults);
@@ -1082,35 +1184,154 @@ PyUnicode_FromFormat(const char *format, ...)
return ret;
}
-Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
- wchar_t *w,
- Py_ssize_t size)
+/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
+   convert a Unicode object to a wide character string.
+
+   - If w is NULL: return the number of wide characters (including the null
+     character) required to convert the unicode object. Ignore size argument.
+
+   - Otherwise: return the number of wide characters (excluding the null
+     character) written into w. Write at most size wide characters (including
+     the null character). The output is null-terminated only when a free
+     slot remains after the converted characters.
+
+   Three build configurations are handled:
+     * Py_UNICODE and wchar_t have the same size: plain memcpy();
+     * 2-byte Py_UNICODE, 4-byte wchar_t: surrogate pairs are joined into a
+       single wide character;
+     * 4-byte Py_UNICODE, 2-byte wchar_t: code points above 0xFFFF are split
+       into a surrogate pair.  A pair is never split across the end of the
+       output buffer: if only one slot is left, conversion stops there. */
+static Py_ssize_t
+unicode_aswidechar(PyUnicodeObject *unicode,
+                   wchar_t *w,
+                   Py_ssize_t size)
+{
+#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
+    Py_ssize_t res;
+    if (w != NULL) {
+        res = PyUnicode_GET_SIZE(unicode);
+        /* If there is room, copy the trailing null character as well;
+           otherwise truncate to the caller's buffer size. */
+        if (size > res)
+            size = res + 1;
+        else
+            res = size;
+        memcpy(w, unicode->str, size * sizeof(wchar_t));
+        return res;
+    }
+    else
+        return PyUnicode_GET_SIZE(unicode) + 1;
+#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
+    register const Py_UNICODE *u;
+    const Py_UNICODE *uend;
+    const wchar_t *worig, *wend;
+    Py_ssize_t nchar;
+
+    u = PyUnicode_AS_UNICODE(unicode);
+    uend = u + PyUnicode_GET_SIZE(unicode);
+    if (w != NULL) {
+        worig = w;
+        wend = w + size;
+        while (u != uend && w != wend) {
+            /* NOTE(review): u[1] is read even when u is the last code unit;
+               this presumably relies on the Py_UNICODE buffer being
+               null-terminated -- confirm. */
+            if (0xD800 <= u[0] && u[0] <= 0xDBFF
+                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
+            {
+                /* join the surrogate pair into one code point */
+                *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
+                u += 2;
+            }
+            else {
+                *w = *u;
+                u++;
+            }
+            w++;
+        }
+        if (w != wend)
+            *w = L'\0';
+        return w - worig;
+    }
+    else {
+        nchar = 1; /* null character at the end */
+        while (u != uend) {
+            if (0xD800 <= u[0] && u[0] <= 0xDBFF
+                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
+                u += 2;
+            else
+                u++;
+            nchar++;
+        }
+    }
+    return nchar;
+#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
+    register Py_UNICODE *u, *uend, ordinal;
+    wchar_t *worig, *wend;
+    Py_ssize_t nchar;
+
+    u = PyUnicode_AS_UNICODE(unicode);
+    /* The length must be taken from the object, not from its buffer. */
+    uend = u + PyUnicode_GET_SIZE(unicode);
+    if (w != NULL) {
+        worig = w;
+        wend = w + size;
+        while (u != uend && w != wend) {
+            ordinal = *u;
+            if (ordinal > 0xffff) {
+                /* A surrogate pair needs two output slots.  Writing it with
+                   only one slot left would overrun the buffer and step w
+                   past wend, so the loop would never terminate on wend. */
+                if (wend - w < 2)
+                    break;
+                ordinal -= 0x10000;
+                *w++ = 0xD800 | (ordinal >> 10);
+                *w++ = 0xDC00 | (ordinal & 0x3FF);
+            }
+            else
+                *w++ = ordinal;
+            u++;
+        }
+        if (w != wend)
+            *w = 0;
+        return w - worig;
+    }
+    else {
+        nchar = 1; /* null character */
+        while (u != uend) {
+            if (*u > 0xffff)
+                nchar += 2;
+            else
+                nchar++;
+            u++;
+        }
+        return nchar;
+    }
+#else
+#  error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
+#endif
+}
+
+Py_ssize_t
+PyUnicode_AsWideChar(PyObject *unicode,
+ wchar_t *w,
+ Py_ssize_t size)
{
if (unicode == NULL) {
PyErr_BadInternalCall();
return -1;
}
+ return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
+}
- /* If possible, try to copy the 0-termination as well */
- if (size > PyUnicode_GET_SIZE(unicode))
- size = PyUnicode_GET_SIZE(unicode) + 1;
+wchar_t*
+PyUnicode_AsWideCharString(PyObject *unicode,
+ Py_ssize_t *size)
+{
+ wchar_t* buffer;
+ Py_ssize_t buflen;
-#ifdef HAVE_USABLE_WCHAR_T
- memcpy(w, unicode->str, size * sizeof(wchar_t));
-#else
- {
- register Py_UNICODE *u;
- register Py_ssize_t i;
- u = PyUnicode_AS_UNICODE(unicode);
- for (i = size; i > 0; i--)
- *w++ = *u++;
+ if (unicode == NULL) {
+ PyErr_BadInternalCall();
+ return NULL;
}
-#endif
- if (size > PyUnicode_GET_SIZE(unicode))
- return PyUnicode_GET_SIZE(unicode);
- else
- return size;
+ buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
+ if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+
+ buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
+ if (buffer == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
+ if (size != NULL)
+ *size = buflen;
+ return buffer;
}
#endif
@@ -1210,27 +1431,26 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
return v;
}
-PyObject *PyUnicode_Decode(const char *s,
- Py_ssize_t size,
- const char *encoding,
- const char *errors)
+/* Convert encoding to lower case and replace '_' with '-' in order to
+ catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1),
+ 1 on success. */
+static int
+normalize_encoding(const char *encoding,
+ char *lower,
+ size_t lower_len)
{
- PyObject *buffer = NULL, *unicode;
- Py_buffer info;
- char lower[20]; /* Enough for any encoding name we recognize */
- char *l;
const char *e;
+ char *l;
+ char *l_end;
- if (encoding == NULL)
- encoding = PyUnicode_GetDefaultEncoding();
-
- /* Convert encoding to lower case and replace '_' with '-' in order to
- catch e.g. UTF_8 */
e = encoding;
l = lower;
- while (*e && l < &lower[(sizeof lower) - 2]) {
- if (ISUPPER(*e)) {
- *l++ = TOLOWER(*e++);
+ l_end = &lower[lower_len - 1];
+ while (*e) {
+ if (l == l_end)
+ return 0;
+ if (Py_ISUPPER(*e)) {
+ *l++ = Py_TOLOWER(*e++);
}
else if (*e == '_') {
*l++ = '-';
@@ -1241,23 +1461,39 @@ PyObject *PyUnicode_Decode(const char *s,
}
}
*l = '\0';
+ return 1;
+}
+
+PyObject *PyUnicode_Decode(const char *s,
+ Py_ssize_t size,
+ const char *encoding,
+ const char *errors)
+{
+ PyObject *buffer = NULL, *unicode;
+ Py_buffer info;
+ char lower[11]; /* Enough for any encoding shortcut */
+
+ if (encoding == NULL)
+ encoding = PyUnicode_GetDefaultEncoding();
/* Shortcuts for common default encodings */
- if (strcmp(lower, "utf-8") == 0)
- return PyUnicode_DecodeUTF8(s, size, errors);
- else if ((strcmp(lower, "latin-1") == 0) ||
- (strcmp(lower, "iso-8859-1") == 0))
- return PyUnicode_DecodeLatin1(s, size, errors);
+ if (normalize_encoding(encoding, lower, sizeof(lower))) {
+ if (strcmp(lower, "utf-8") == 0)
+ return PyUnicode_DecodeUTF8(s, size, errors);
+ else if ((strcmp(lower, "latin-1") == 0) ||
+ (strcmp(lower, "iso-8859-1") == 0))
+ return PyUnicode_DecodeLatin1(s, size, errors);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
- else if (strcmp(lower, "mbcs") == 0)
- return PyUnicode_DecodeMBCS(s, size, errors);
+ else if (strcmp(lower, "mbcs") == 0)
+ return PyUnicode_DecodeMBCS(s, size, errors);
#endif
- else if (strcmp(lower, "ascii") == 0)
- return PyUnicode_DecodeASCII(s, size, errors);
- else if (strcmp(lower, "utf-16") == 0)
- return PyUnicode_DecodeUTF16(s, size, errors, 0);
- else if (strcmp(lower, "utf-32") == 0)
- return PyUnicode_DecodeUTF32(s, size, errors, 0);
+ else if (strcmp(lower, "ascii") == 0)
+ return PyUnicode_DecodeASCII(s, size, errors);
+ else if (strcmp(lower, "utf-16") == 0)
+ return PyUnicode_DecodeUTF16(s, size, errors, 0);
+ else if (strcmp(lower, "utf-32") == 0)
+ return PyUnicode_DecodeUTF32(s, size, errors, 0);
+ }
/* Decode via the codec registry */
buffer = NULL;
@@ -1378,11 +1614,77 @@ PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
return NULL;
}
+/* Encode a unicode object to bytes for use as a filename.
+
+   - Windows builds with a usable wchar_t: encode with MBCS, passing NULL as
+     the error handler.
+   - Mac OS X: encode with UTF-8 and the "surrogateescape" error handler.
+   - Elsewhere: use the codec named by Py_FileSystemDefaultEncoding once the
+     interpreter's filesystem codec is initialized; before that, fall back to
+     the C locale encoder (_Py_wchar2char).
+
+   Returns a new bytes object, or NULL with an exception set. */
+PyObject *
+PyUnicode_EncodeFSDefault(PyObject *unicode)
+{
+#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
+    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
+                                PyUnicode_GET_SIZE(unicode),
+                                NULL);
+#elif defined(__APPLE__)
+    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
+                                PyUnicode_GET_SIZE(unicode),
+                                "surrogateescape");
+#else
+    PyInterpreterState *interp = PyThreadState_GET()->interp;
+    wchar_t *wide;
+    char *encoded;
+    PyObject *result;
+    size_t error_pos;
+
+    /* Bootstrap check: a filesystem codec implemented in Python cannot be
+       used to encode or decode filenames before it has been loaded, and
+       loading it requires encoding at least its own filename.  Until the
+       codec registry is initialized and the Python codec is loaded, the C
+       version of the locale codec is used instead.
+
+       Py_FileSystemDefaultEncoding is shared between all interpreters, so
+       it cannot be relied on alone: interp->fscodec_initialized is checked
+       as well, for the sake of subinterpreters. */
+    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized)
+        return PyUnicode_AsEncodedString(unicode,
+                                         Py_FileSystemDefaultEncoding,
+                                         "surrogateescape");
+
+    /* Fallback: locale encoding with surrogateescape. */
+    wide = PyUnicode_AsWideCharString(unicode, NULL);
+    if (wide == NULL)
+        return NULL;
+
+    encoded = _Py_wchar2char(wide, &error_pos);
+    if (encoded != NULL) {
+        /* Success: wrap the encoded C string in a bytes object. */
+        PyMem_Free(wide);
+        result = PyBytes_FromString(encoded);
+        PyMem_Free(encoded);
+        return result;
+    }
+
+    if (error_pos == (size_t)-1) {
+        /* No failing position was reported: treated as out-of-memory. */
+        PyErr_NoMemory();
+    }
+    else {
+        /* A character at error_pos could not be encoded: report it as an
+           encode error covering that single character. */
+        char *errmsg = strerror(errno);
+        PyObject *exc = NULL;
+        if (errmsg == NULL)
+            errmsg = "Py_wchar2char() failed";
+        raise_encode_exception(&exc,
+                               "filesystemencoding",
+                               PyUnicode_AS_UNICODE(unicode),
+                               PyUnicode_GET_SIZE(unicode),
+                               error_pos, error_pos+1,
+                               errmsg);
+        Py_XDECREF(exc);
+    }
+    PyMem_Free(wide);
+    return NULL;
+#endif
+}
+
PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
const char *encoding,
const char *errors)
{
PyObject *v;
+ char lower[11]; /* Enough for any encoding shortcut */
if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
@@ -1393,31 +1695,42 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
encoding = PyUnicode_GetDefaultEncoding();
/* Shortcuts for common default encodings */
- if (errors == NULL) {
- if (strcmp(encoding, "utf-8") == 0)
- return PyUnicode_AsUTF8String(unicode);
- else if (strcmp(encoding, "latin-1") == 0)
- return PyUnicode_AsLatin1String(unicode);
+ if (normalize_encoding(encoding, lower, sizeof(lower))) {
+ if (strcmp(lower, "utf-8") == 0)
+ return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
+ PyUnicode_GET_SIZE(unicode),
+ errors);
+ else if ((strcmp(lower, "latin-1") == 0) ||
+ (strcmp(lower, "iso-8859-1") == 0))
+ return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
+ PyUnicode_GET_SIZE(unicode),
+ errors);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
- else if (strcmp(encoding, "mbcs") == 0)
- return PyUnicode_AsMBCSString(unicode);
+ else if (strcmp(lower, "mbcs") == 0)
+ return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
+ PyUnicode_GET_SIZE(unicode),
+ errors);
#endif
- else if (strcmp(encoding, "ascii") == 0)
- return PyUnicode_AsASCIIString(unicode);
- /* During bootstrap, we may need to find the encodings
- package, to load the file system encoding, and require the
- file system encoding in order to load the encodings
- package.
-
- Break out of this dependency by assuming that the path to
- the encodings module is ASCII-only. XXX could try wcstombs
- instead, if the file system encoding is the locale's
- encoding. */
- else if (Py_FileSystemDefaultEncoding &&
- strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
- !PyThreadState_GET()->interp->codecs_initialized)
- return PyUnicode_AsASCIIString(unicode);
- }
+ else if (strcmp(lower, "ascii") == 0)
+ return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
+ PyUnicode_GET_SIZE(unicode),
+ errors);
+ }
+ /* During bootstrap, we may need to find the encodings
+ package, to load the file system encoding, and require the
+ file system encoding in order to load the encodings
+ package.
+
+ Break out of this dependency by assuming that the path to
+ the encodings module is ASCII-only. XXX could try wcstombs
+ instead, if the file system encoding is the locale's
+ encoding. */
+ if (Py_FileSystemDefaultEncoding &&
+ strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
+ !PyThreadState_GET()->interp->codecs_initialized)
+ return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
+ PyUnicode_GET_SIZE(unicode),
+ errors);
/* Encode via the codec registry */
v = PyCodec_Encode(unicode, encoding, errors);
@@ -1430,12 +1743,13 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
/* If the codec returns a buffer, raise a warning and convert to bytes */
if (PyByteArray_Check(v)) {
- char msg[100];
+ int error;
PyObject *b;
- PyOS_snprintf(msg, sizeof(msg),
- "encoder %s returned buffer instead of bytes",
- encoding);
- if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
+
+ error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
+ "encoder %s returned bytearray instead of bytes",
+ encoding);
+ if (error) {
Py_DECREF(v);
return NULL;
}
@@ -1509,32 +1823,61 @@ PyUnicode_DecodeFSDefault(const char *s) {
PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
{
- /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
- can be undefined. If it is case, decode using UTF-8. The following assumes
- that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
- bootstrapping process where the codecs aren't ready yet.
- */
- if (Py_FileSystemDefaultEncoding) {
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
- if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
- return PyUnicode_DecodeMBCS(s, size, "replace");
- }
+ return PyUnicode_DecodeMBCS(s, size, NULL);
#elif defined(__APPLE__)
- if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
- return PyUnicode_DecodeUTF8(s, size, "replace");
- }
-#endif
+ return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
+#else
+ PyInterpreterState *interp = PyThreadState_GET()->interp;
+ /* Bootstrap check: if the filesystem codec is implemented in Python, we
+ cannot use it to encode and decode filenames before it is loaded. Load
+ the Python codec requires to encode at least its own filename. Use the C
+ version of the locale codec until the codec registry is initialized and
+ the Python codec is loaded.
+
+ Py_FileSystemDefaultEncoding is shared between all interpreters, we
+ cannot only rely on it: check also interp->fscodec_initialized for
+ subinterpreters. */
+ if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
return PyUnicode_Decode(s, size,
Py_FileSystemDefaultEncoding,
- "replace");
+ "surrogateescape");
}
else {
- return PyUnicode_DecodeUTF8(s, size, "replace");
+ /* locale encoding with surrogateescape */
+ wchar_t *wchar;
+ PyObject *unicode;
+ size_t len;
+
+ if (s[size] != '\0' || size != strlen(s)) {
+ PyErr_SetString(PyExc_TypeError, "embedded NUL character");
+ return NULL;
+ }
+
+ wchar = _Py_char2wchar(s, &len);
+ if (wchar == NULL)
+ return PyErr_NoMemory();
+
+ unicode = PyUnicode_FromWideChar(wchar, len);
+ PyMem_Free(wchar);
+ return unicode;
}
+#endif
+}
+
+
+/* Check whether the unicode object s contains a NUL character.
+   Returns the result of PyUnicode_Contains(s, "\0"), or -1 if the
+   one-character NUL string used for the search could not be created. */
+int
+_PyUnicode_HasNULChars(PyObject* s)
+{
+    /* one-character "\0" string, created on first use and then kept */
+    static PyObject *nul_string = NULL;
+
+    if (nul_string == NULL) {
+        nul_string = PyUnicode_FromStringAndSize("\0", 1);
+        if (nul_string == NULL)
+            return -1;
+    }
+    return PyUnicode_Contains(s, nul_string);
+}
-/* Convert the argument to a bytes object, according to the file
- system encoding */
int
PyUnicode_FSConverter(PyObject* arg, void* addr)
@@ -1546,7 +1889,7 @@ PyUnicode_FSConverter(PyObject* arg, void* addr)
Py_DECREF(*(PyObject**)addr);
return 1;
}
- if (PyBytes_Check(arg) || PyByteArray_Check(arg)) {
+ if (PyBytes_Check(arg)) {
output = arg;
Py_INCREF(output);
}
@@ -1554,9 +1897,7 @@ PyUnicode_FSConverter(PyObject* arg, void* addr)
arg = PyUnicode_FromObject(arg);
if (!arg)
return 0;
- output = PyUnicode_AsEncodedObject(arg,
- Py_FileSystemDefaultEncoding,
- "surrogateescape");
+ output = PyUnicode_EncodeFSDefault(arg);
Py_DECREF(arg);
if (!output)
return 0;
@@ -1566,15 +1907,50 @@ PyUnicode_FSConverter(PyObject* arg, void* addr)
return 0;
}
}
- if (PyBytes_Check(output)) {
- size = PyBytes_GET_SIZE(output);
- data = PyBytes_AS_STRING(output);
+ size = PyBytes_GET_SIZE(output);
+ data = PyBytes_AS_STRING(output);
+ if (size != strlen(data)) {
+ PyErr_SetString(PyExc_TypeError, "embedded NUL character");
+ Py_DECREF(output);
+ return 0;
+ }
+ *(PyObject**)addr = output;
+ return Py_CLEANUP_SUPPORTED;
+}
+
+
+int
+PyUnicode_FSDecoder(PyObject* arg, void* addr)
+{
+ PyObject *output = NULL;
+ Py_ssize_t size;
+ void *data;
+ if (arg == NULL) {
+ Py_DECREF(*(PyObject**)addr);
+ return 1;
+ }
+ if (PyUnicode_Check(arg)) {
+ output = arg;
+ Py_INCREF(output);
}
else {
- size = PyByteArray_GET_SIZE(output);
- data = PyByteArray_AS_STRING(output);
+ arg = PyBytes_FromObject(arg);
+ if (!arg)
+ return 0;
+ output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
+ PyBytes_GET_SIZE(arg));
+ Py_DECREF(arg);
+ if (!output)
+ return 0;
+ if (!PyUnicode_Check(output)) {
+ Py_DECREF(output);
+ PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
+ return 0;
+ }
}
- if (size != strlen(data)) {
+ size = PyUnicode_GET_SIZE(output);
+ data = PyUnicode_AS_UNICODE(output);
+ if (size != Py_UNICODE_strlen(data)) {
PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Py_DECREF(output);
return 0;
@@ -1632,18 +2008,34 @@ Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
const char *PyUnicode_GetDefaultEncoding(void)
{
- return unicode_default_encoding;
+ return "utf-8";
}
-int PyUnicode_SetDefaultEncoding(const char *encoding)
+/* create or adjust a UnicodeDecodeError */
+static void
+make_decode_exception(PyObject **exceptionObject,
+ const char *encoding,
+ const char *input, Py_ssize_t length,
+ Py_ssize_t startpos, Py_ssize_t endpos,
+ const char *reason)
{
- if (strcmp(encoding, unicode_default_encoding) != 0) {
- PyErr_Format(PyExc_ValueError,
- "Can only set default encoding to %s",
- unicode_default_encoding);
- return -1;
+ if (*exceptionObject == NULL) {
+ *exceptionObject = PyUnicodeDecodeError_Create(
+ encoding, input, length, startpos, endpos, reason);
}
- return 0;
+ else {
+ if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
+ goto onError;
+ if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
+ goto onError;
+ if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
+ goto onError;
+ }
+ return;
+
+onError:
+ Py_DECREF(*exceptionObject);
+ *exceptionObject = NULL;
}
/* error handling callback helper:
@@ -1679,20 +2071,13 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
goto onError;
}
- if (*exceptionObject == NULL) {
- *exceptionObject = PyUnicodeDecodeError_Create(
- encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
- if (*exceptionObject == NULL)
- goto onError;
- }
- else {
- if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
- goto onError;
- if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
- goto onError;
- if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
- goto onError;
- }
+ make_decode_exception(exceptionObject,
+ encoding,
+ *input, *inend - *input,
+ *startinpos, *endinpos,
+ reason);
+ if (*exceptionObject == NULL)
+ goto onError;
restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
if (restuple == NULL)
@@ -1910,21 +2295,17 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
*p++ = outCh;
#endif
surrogate = 0;
+ continue;
}
else {
+ *p++ = surrogate;
surrogate = 0;
- errmsg = "second surrogate missing";
- goto utf7Error;
}
}
- else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
+ if (outCh >= 0xD800 && outCh <= 0xDBFF) {
/* first surrogate */
surrogate = outCh;
}
- else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
- errmsg = "unexpected second surrogate";
- goto utf7Error;
- }
else {
*p++ = outCh;
}
@@ -1934,8 +2315,8 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
inShift = 0;
s++;
if (surrogate) {
- errmsg = "second surrogate missing at end of shift sequence";
- goto utf7Error;
+ *p++ = surrogate;
+ surrogate = 0;
}
if (base64bits > 0) { /* left-over bits */
if (base64bits >= 6) {
@@ -2382,7 +2763,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
outpos = p-PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
- "utf8", errmsg,
+ "utf-8", errmsg,
&starts, &e, &startinpos, &endinpos, &exc, &s,
&unicode, &outpos, &p))
goto onError;
@@ -2408,6 +2789,120 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
#undef ASCII_CHAR_MASK
+#ifdef __APPLE__
+
+/* Simplified UTF-8 decoder using surrogateescape error handler,
+   used to decode the command line arguments on Mac OS X.
+
+   Decodes the 'size' bytes starting at 's' into a buffer obtained from
+   PyMem_Malloc() and terminated with L'\0'.  Every byte that does not
+   start a well-formed UTF-8 sequence is stored as the lone surrogate
+   0xDC00 + byte and decoding resumes at the following byte.
+
+   Returns the new buffer, or NULL if the size check or the allocation
+   fails.  NOTE(review): only the size check sets an exception; the
+   allocation-failure path returns NULL without one. */
+
+wchar_t*
+_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
+{
+    int n;
+    const char *e;
+    wchar_t *unicode, *p;
+
+    /* Note: every input sequence produces at most as many wchar_t units
+       as it has bytes, so size + 1 units (with the terminator) suffice.
+       The bound is tested without forming size + 1, which would overflow
+       for size == PY_SSIZE_T_MAX; negative sizes are rejected as well. */
+    if (size < 0 || (size_t)size >= PY_SSIZE_T_MAX / sizeof(wchar_t)) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
+    if (!unicode)
+        return NULL;
+
+    /* Unpack UTF-8 encoded data */
+    p = unicode;
+    e = s + size;
+    while (s < e) {
+        /* ch holds the lead byte until a sequence has been validated;
+           the surrogateescape label relies on that. */
+        Py_UCS4 ch = (unsigned char)*s;
+
+        if (ch < 0x80) {
+            *p++ = (wchar_t)ch;
+            s++;
+            continue;
+        }
+
+        n = utf8_code_length[ch];
+        /* truncated sequence at the end of the input */
+        if (s + n > e) {
+            goto surrogateescape;
+        }
+
+        switch (n) {
+        case 0:
+        case 1:
+            goto surrogateescape;
+
+        case 2:
+            if ((s[1] & 0xc0) != 0x80)
+                goto surrogateescape;
+            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
+            assert ((ch > 0x007F) && (ch <= 0x07FF));
+            *p++ = (wchar_t)ch;
+            break;
+
+        case 3:
+            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
+               will result in surrogates in range d800-dfff. Surrogates are
+               not valid UTF-8 so they are rejected.
+               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
+               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                ((unsigned char)s[0] == 0xE0 &&
+                 (unsigned char)s[1] < 0xA0) ||
+                ((unsigned char)s[0] == 0xED &&
+                 (unsigned char)s[1] > 0x9F)) {
+
+                goto surrogateescape;
+            }
+            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
+            /* the output buffer is wchar_t, not Py_UNICODE */
+            *p++ = (wchar_t)ch;
+            break;
+
+        case 4:
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                (s[3] & 0xc0) != 0x80 ||
+                ((unsigned char)s[0] == 0xF0 &&
+                 (unsigned char)s[1] < 0x90) ||
+                ((unsigned char)s[0] == 0xF4 &&
+                 (unsigned char)s[1] > 0x8F)) {
+                goto surrogateescape;
+            }
+            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
+                ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
+            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
+
+#if SIZEOF_WCHAR_T == 4
+            *p++ = (wchar_t)ch;
+#else
+            /*  compute and append the two surrogates: */
+
+            /*  translate from 10000..10FFFF to 0..FFFF */
+            ch -= 0x10000;
+
+            /*  high surrogate = top 10 bits added to D800 */
+            *p++ = (wchar_t)(0xD800 + (ch >> 10));
+
+            /*  low surrogate = bottom 10 bits added to DC00 */
+            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
+#endif
+            break;
+        }
+        s += n;
+        continue;
+
+      surrogateescape:
+        /* escape the offending byte and resume with the next one */
+        *p++ = (wchar_t)(0xDC00 + ch);
+        s++;
+    }
+    *p = L'\0';
+    return unicode;
+}
+
+#endif /* __APPLE__ */
/* Allocation strategy: if the string is short, convert into a stack buffer
and allocate exactly as much space needed at the end. Else allocate the
@@ -3366,7 +3861,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
}
for (i = 0; i < digits; ++i) {
c = (unsigned char) s[i];
- if (!ISXDIGIT(c)) {
+ if (!Py_ISXDIGIT(c)) {
endinpos = (s+i+1)-starts;
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
@@ -3732,7 +4227,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
outpos = p-PyUnicode_AS_UNICODE(v);
for (x = 0, i = 0; i < count; ++i, ++s) {
c = (unsigned char)*s;
- if (!ISXDIGIT(c)) {
+ if (!Py_ISXDIGIT(c)) {
endinpos = s-starts;
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
@@ -4431,32 +4926,46 @@ static int is_dbcs_lead_byte(const char *s, int offset)
static int decode_mbcs(PyUnicodeObject **v,
const char *s, /* MBCS string */
int size, /* sizeof MBCS string */
- int final)
+ int final,
+ const char *errors)
{
Py_UNICODE *p;
- Py_ssize_t n = 0;
- int usize = 0;
+ Py_ssize_t n;
+ DWORD usize;
+ DWORD flags;
assert(size >= 0);
+ /* check and handle 'errors' arg */
+ if (errors==NULL || strcmp(errors, "strict")==0)
+ flags = MB_ERR_INVALID_CHARS;
+ else if (strcmp(errors, "ignore")==0)
+ flags = 0;
+ else {
+ PyErr_Format(PyExc_ValueError,
+ "mbcs encoding does not support errors='%s'",
+ errors);
+ return -1;
+ }
+
/* Skip trailing lead-byte unless 'final' is set */
if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
--size;
/* First get the size of the result */
if (size > 0) {
- usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
- if (usize == 0) {
- PyErr_SetFromWindowsErrWithFilename(0, NULL);
- return -1;
- }
- }
+ usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
+ if (usize==0)
+ goto mbcs_decode_error;
+ } else
+ usize = 0;
if (*v == NULL) {
/* Create unicode object */
*v = _PyUnicode_New(usize);
if (*v == NULL)
return -1;
+ n = 0;
}
else {
/* Extend unicode object */
@@ -4466,15 +4975,35 @@ static int decode_mbcs(PyUnicodeObject **v,
}
/* Do the conversion */
- if (size > 0) {
+ if (usize > 0) {
p = PyUnicode_AS_UNICODE(*v) + n;
- if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
- PyErr_SetFromWindowsErrWithFilename(0, NULL);
- return -1;
+ if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
+ goto mbcs_decode_error;
}
}
-
return size;
+
+mbcs_decode_error:
+ /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
+ we raise a UnicodeDecodeError - else it is a 'generic'
+ windows error
+ */
+ if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
+ /* Ideally, we should get reason from FormatMessage - this
+ is the Windows 2000 English version of the message
+ */
+ PyObject *exc = NULL;
+ const char *reason = "No mapping for the Unicode character exists "
+ "in the target multi-byte code page.";
+ make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
+ if (exc != NULL) {
+ PyCodec_StrictErrors(exc);
+ Py_DECREF(exc);
+ }
+ } else {
+ PyErr_SetFromWindowsErrWithFilename(0, NULL);
+ }
+ return -1;
}
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
@@ -4491,10 +5020,10 @@ PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
#ifdef NEED_RETRY
retry:
if (size > INT_MAX)
- done = decode_mbcs(&v, s, INT_MAX, 0);
+ done = decode_mbcs(&v, s, INT_MAX, 0, errors);
else
#endif
- done = decode_mbcs(&v, s, (int)size, !consumed);
+ done = decode_mbcs(&v, s, (int)size, !consumed, errors);
if (done < 0) {
Py_XDECREF(v);
@@ -4528,20 +5057,45 @@ PyObject *PyUnicode_DecodeMBCS(const char *s,
*/
static int encode_mbcs(PyObject **repr,
const Py_UNICODE *p, /* unicode */
- int size) /* size of unicode */
+ int size, /* size of unicode */
+ const char* errors)
{
- int mbcssize = 0;
- Py_ssize_t n = 0;
+ BOOL usedDefaultChar = FALSE;
+ BOOL *pusedDefaultChar;
+ int mbcssize;
+ Py_ssize_t n;
+ PyObject *exc = NULL;
+ DWORD flags;
assert(size >= 0);
+ /* check and handle 'errors' arg */
+ if (errors==NULL || strcmp(errors, "strict")==0) {
+ flags = WC_NO_BEST_FIT_CHARS;
+ pusedDefaultChar = &usedDefaultChar;
+ } else if (strcmp(errors, "replace")==0) {
+ flags = 0;
+ pusedDefaultChar = NULL;
+ } else {
+ PyErr_Format(PyExc_ValueError,
+ "mbcs encoding does not support errors='%s'",
+ errors);
+ return -1;
+ }
+
/* First get the size of the result */
if (size > 0) {
- mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
+ mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
+ NULL, pusedDefaultChar);
if (mbcssize == 0) {
PyErr_SetFromWindowsErrWithFilename(0, NULL);
return -1;
}
+ /* If we used a default char, then we failed! */
+ if (pusedDefaultChar && *pusedDefaultChar)
+ goto mbcs_encode_error;
+ } else {
+ mbcssize = 0;
}
if (*repr == NULL) {
@@ -4549,6 +5103,7 @@ static int encode_mbcs(PyObject **repr,
*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
if (*repr == NULL)
return -1;
+ n = 0;
}
else {
/* Extend string object */
@@ -4560,13 +5115,20 @@ static int encode_mbcs(PyObject **repr,
/* Do the conversion */
if (size > 0) {
char *s = PyBytes_AS_STRING(*repr) + n;
- if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
+ if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
+ NULL, pusedDefaultChar)) {
PyErr_SetFromWindowsErrWithFilename(0, NULL);
return -1;
}
+ if (pusedDefaultChar && *pusedDefaultChar)
+ goto mbcs_encode_error;
}
-
return 0;
+
+mbcs_encode_error:
+ raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
+ Py_XDECREF(exc);
+ return -1;
}
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
@@ -4579,10 +5141,10 @@ PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
#ifdef NEED_RETRY
retry:
if (size > INT_MAX)
- ret = encode_mbcs(&repr, p, INT_MAX);
+ ret = encode_mbcs(&repr, p, INT_MAX, errors);
else
#endif
- ret = encode_mbcs(&repr, p, (int)size);
+ ret = encode_mbcs(&repr, p, (int)size, errors);
if (ret < 0) {
Py_XDECREF(repr);
@@ -5699,6 +6261,30 @@ PyObject *PyUnicode_Translate(PyObject *str,
return NULL;
}
+/* Return a new unicode object holding a copy of the 'length' code units
+   at 's' in which every character above 127 that Py_UNICODE_TODECIMAL
+   maps to a decimal value is replaced by the ASCII digit '0' + value.
+   All other characters are copied unchanged.
+   Returns NULL if _PyUnicode_New() fails. */
+PyObject *
+PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
+                                  Py_ssize_t length)
+{
+    PyObject *result;
+    Py_UNICODE *p; /* write pointer into result */
+    Py_ssize_t i;
+    /* Allocate first and bail out before the buffer is touched: the copy
+       below would otherwise dereference a NULL result. */
+    result = (PyObject *)_PyUnicode_New(length);
+    if (result == NULL)
+        return NULL;
+    /* Copy to a new string */
+    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
+    p = PyUnicode_AS_UNICODE(result);
+    /* Iterate over code points */
+    for (i = 0; i < length; i++) {
+        Py_UNICODE ch = s[i];
+        /* ASCII characters are already in their final form */
+        if (ch > 127) {
+            int decimal = Py_UNICODE_TODECIMAL(ch);
+            if (decimal >= 0)
+                p[i] = '0' + decimal;
+        }
+    }
+    return result;
+}
/* --- Decimal Encoder ---------------------------------------------------- */
int PyUnicode_EncodeDecimal(Py_UNICODE *s,
@@ -5750,11 +6336,10 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
}
/* All other characters are considered unencodable */
collstart = p;
- collend = p+1;
- while (collend < end) {
+ for (collend = p+1; collend < end; collend++) {
if ((0 < *collend && *collend < 256) ||
- !Py_UNICODE_ISSPACE(*collend) ||
- Py_UNICODE_TODECIMAL(*collend))
+ Py_UNICODE_ISSPACE(*collend) ||
+ 0 <= Py_UNICODE_TODECIMAL(*collend))
break;
}
/* cache callback name lookup
@@ -5840,28 +6425,61 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
#include "stringlib/unicodedefs.h"
#include "stringlib/fastsearch.h"
+
#include "stringlib/count.h"
-/* Include _ParseTupleFinds from find.h */
-#define FROM_UNICODE
#include "stringlib/find.h"
#include "stringlib/partition.h"
+#include "stringlib/split.h"
#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
#include "stringlib/localeutil.h"
/* helper macro to fixup start/end slice values */
-#define FIX_START_END(obj) \
- if (start < 0) \
- start += (obj)->length; \
- if (start < 0) \
- start = 0; \
- if (end > (obj)->length) \
- end = (obj)->length; \
- if (end < 0) \
- end += (obj)->length; \
- if (end < 0) \
- end = 0;
+/* Clamp the slice bounds 'start' and 'end' against 'len' in place:
+   an 'end' greater than 'len' becomes 'len'; a negative bound has 'len'
+   added to it and is set to 0 if it is still negative.
+   'start' and 'end' must be lvalues; all three arguments are evaluated
+   more than once, so they must be free of side effects.
+   Wrapped in do { } while (0) so the macro behaves as one statement,
+   e.g. under an unbraced if/else. */
+#define ADJUST_INDICES(start, end, len)         \
+    do {                                        \
+        if ((end) > (len))                      \
+            (end) = (len);                      \
+        else if ((end) < 0) {                   \
+            (end) += (len);                     \
+            if ((end) < 0)                      \
+                (end) = 0;                      \
+        }                                       \
+        if ((start) < 0) {                      \
+            (start) += (len);                   \
+            if ((start) < 0)                    \
+                (start) = 0;                    \
+        }                                       \
+    } while (0)
+
+/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
+ * by 'ptr', possibly combining surrogate pairs on narrow builds.
+ * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
+ * that should be returned and 'end' pointing to the end of the buffer.
+ * ('end' is used on narrow builds to detect a lone surrogate at the
+ * end of the buffer that should be returned unchanged.)
+ * The ptr and end arguments should be side-effect free and ptr must be an lvalue.
+ * The type of the returned char is always Py_UCS4.
+ *
+ * Note: the macro advances ptr to next char, so it might have side-effects
+ * (especially if used with other macros).
+ */
+
+/* helper macros used by _Py_UNICODE_NEXT; each evaluates 'ch' twice */
+#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
+#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
+/* Join two surrogate characters and return a single Py_UCS4 value. */
+#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
+    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
+      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
+
+/* Wide builds: one code unit is one character, 'end' is unused.
+   Narrow builds: a high surrogate is combined with the following low
+   surrogate only when that second unit lies before 'end'; the bound is
+   tested first so that (ptr)[1] is never read at or past 'end' and a
+   lone surrogate in the last position is returned unchanged. */
+#ifdef Py_UNICODE_WIDE
+#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
+#else
+#define _Py_UNICODE_NEXT(ptr, end)                                      \
+    (((ptr) + 1 < (end) &&                                              \
+      _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                          \
+      _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                         \
+     ((ptr) += 2, _Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) :  \
+     (Py_UCS4)*(ptr)++)
+#endif
Py_ssize_t PyUnicode_Count(PyObject *str,
PyObject *substr,
@@ -5881,10 +6499,10 @@ Py_ssize_t PyUnicode_Count(PyObject *str,
return -1;
}
- FIX_START_END(str_obj);
-
+ ADJUST_INDICES(start, end, str_obj->length);
result = stringlib_count(
- str_obj->str + start, end - start, sub_obj->str, sub_obj->length
+ str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
+ PY_SSIZE_T_MAX
);
Py_DECREF(sub_obj);
@@ -5939,8 +6557,7 @@ int tailmatch(PyUnicodeObject *self,
if (substring->length == 0)
return 1;
- FIX_START_END(self);
-
+ ADJUST_INDICES(start, end, self->length);
end -= substring->length;
if (end < start)
return 0;
@@ -6080,13 +6697,13 @@ int fixcapitalize(PyUnicodeObject *self)
if (len == 0)
return 0;
- if (Py_UNICODE_ISLOWER(*s)) {
+ if (!Py_UNICODE_ISUPPER(*s)) {
*s = Py_UNICODE_TOUPPER(*s);
status = 1;
}
s++;
while (--len > 0) {
- if (Py_UNICODE_ISUPPER(*s)) {
+ if (!Py_UNICODE_ISLOWER(*s)) {
*s = Py_UNICODE_TOLOWER(*s);
status = 1;
}
@@ -6281,305 +6898,40 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
return u;
}
-#define SPLIT_APPEND(data, left, right) \
- str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
- if (!str) \
- goto onError; \
- if (PyList_Append(list, str)) { \
- Py_DECREF(str); \
- goto onError; \
- } \
- else \
- Py_DECREF(str);
-
-static
-PyObject *split_whitespace(PyUnicodeObject *self,
- PyObject *list,
- Py_ssize_t maxcount)
+PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- PyObject *str;
- register const Py_UNICODE *buf = self->str;
-
- for (i = j = 0; i < len; ) {
- /* find a token */
- while (i < len && Py_UNICODE_ISSPACE(buf[i]))
- i++;
- j = i;
- while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
- i++;
- if (j < i) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(buf, j, i);
- while (i < len && Py_UNICODE_ISSPACE(buf[i]))
- i++;
- j = i;
- }
- }
- if (j < len) {
- SPLIT_APPEND(buf, j, len);
- }
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-PyObject *PyUnicode_Splitlines(PyObject *string,
- int keepends)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len;
PyObject *list;
- PyObject *str;
- Py_UNICODE *data;
string = PyUnicode_FromObject(string);
if (string == NULL)
return NULL;
- data = PyUnicode_AS_UNICODE(string);
- len = PyUnicode_GET_SIZE(string);
-
- list = PyList_New(0);
- if (!list)
- goto onError;
-
- for (i = j = 0; i < len; ) {
- Py_ssize_t eol;
-
- /* Find a line and append it */
- while (i < len && !BLOOM_LINEBREAK(data[i]))
- i++;
- /* Skip the line break reading CRLF as one line break */
- eol = i;
- if (i < len) {
- if (data[i] == '\r' && i + 1 < len &&
- data[i+1] == '\n')
- i += 2;
- else
- i++;
- if (keepends)
- eol = i;
- }
- SPLIT_APPEND(data, j, eol);
- j = i;
- }
- if (j < len) {
- SPLIT_APPEND(data, j, len);
- }
+ list = stringlib_splitlines(
+ (PyObject*) string, PyUnicode_AS_UNICODE(string),
+ PyUnicode_GET_SIZE(string), keepends);
Py_DECREF(string);
return list;
-
- onError:
- Py_XDECREF(list);
- Py_DECREF(string);
- return NULL;
}
static
-PyObject *split_char(PyUnicodeObject *self,
- PyObject *list,
- Py_UNICODE ch,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- PyObject *str;
- register const Py_UNICODE *buf = self->str;
-
- for (i = j = 0; i < len; ) {
- if (buf[i] == ch) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(buf, j, i);
- i = j = i + 1;
- } else
- i++;
- }
- if (j <= len) {
- SPLIT_APPEND(buf, j, len);
- }
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-static
-PyObject *split_substring(PyUnicodeObject *self,
- PyObject *list,
- PyUnicodeObject *substring,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- Py_ssize_t sublen = substring->length;
- PyObject *str;
-
- for (i = j = 0; i <= len - sublen; ) {
- if (Py_UNICODE_MATCH(self, i, substring)) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(self->str, j, i);
- i = j = i + sublen;
- } else
- i++;
- }
- if (j <= len) {
- SPLIT_APPEND(self->str, j, len);
- }
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-static
-PyObject *rsplit_whitespace(PyUnicodeObject *self,
- PyObject *list,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- PyObject *str;
- register const Py_UNICODE *buf = self->str;
-
- for (i = j = len - 1; i >= 0; ) {
- /* find a token */
- while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
- i--;
- j = i;
- while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
- i--;
- if (j > i) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(buf, i + 1, j + 1);
- while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
- i--;
- j = i;
- }
- }
- if (j >= 0) {
- SPLIT_APPEND(buf, 0, j + 1);
- }
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-static
-PyObject *rsplit_char(PyUnicodeObject *self,
- PyObject *list,
- Py_UNICODE ch,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- PyObject *str;
- register const Py_UNICODE *buf = self->str;
-
- for (i = j = len - 1; i >= 0; ) {
- if (buf[i] == ch) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(buf, i + 1, j + 1);
- j = i = i - 1;
- } else
- i--;
- }
- if (j >= -1) {
- SPLIT_APPEND(buf, 0, j + 1);
- }
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-static
-PyObject *rsplit_substring(PyUnicodeObject *self,
- PyObject *list,
- PyUnicodeObject *substring,
- Py_ssize_t maxcount)
-{
- register Py_ssize_t i;
- register Py_ssize_t j;
- Py_ssize_t len = self->length;
- Py_ssize_t sublen = substring->length;
- PyObject *str;
-
- for (i = len - sublen, j = len; i >= 0; ) {
- if (Py_UNICODE_MATCH(self, i, substring)) {
- if (maxcount-- <= 0)
- break;
- SPLIT_APPEND(self->str, i + sublen, j);
- j = i;
- i -= sublen;
- } else
- i--;
- }
- if (j >= 0) {
- SPLIT_APPEND(self->str, 0, j);
- }
- if (PyList_Reverse(list) < 0)
- goto onError;
- return list;
-
- onError:
- Py_DECREF(list);
- return NULL;
-}
-
-#undef SPLIT_APPEND
-
-static
PyObject *split(PyUnicodeObject *self,
PyUnicodeObject *substring,
Py_ssize_t maxcount)
{
- PyObject *list;
-
if (maxcount < 0)
maxcount = PY_SSIZE_T_MAX;
- list = PyList_New(0);
- if (!list)
- return NULL;
-
if (substring == NULL)
- return split_whitespace(self,list,maxcount);
-
- else if (substring->length == 1)
- return split_char(self,list,substring->str[0],maxcount);
+ return stringlib_split_whitespace(
+ (PyObject*) self, self->str, self->length, maxcount
+ );
- else if (substring->length == 0) {
- Py_DECREF(list);
- PyErr_SetString(PyExc_ValueError, "empty separator");
- return NULL;
- }
- else
- return split_substring(self,list,substring,maxcount);
+ return stringlib_split(
+ (PyObject*) self, self->str, self->length,
+ substring->str, substring->length,
+ maxcount
+ );
}
static
@@ -6587,28 +6939,19 @@ PyObject *rsplit(PyUnicodeObject *self,
PyUnicodeObject *substring,
Py_ssize_t maxcount)
{
- PyObject *list;
-
if (maxcount < 0)
maxcount = PY_SSIZE_T_MAX;
- list = PyList_New(0);
- if (!list)
- return NULL;
-
if (substring == NULL)
- return rsplit_whitespace(self,list,maxcount);
-
- else if (substring->length == 1)
- return rsplit_char(self,list,substring->str[0],maxcount);
+ return stringlib_rsplit_whitespace(
+ (PyObject*) self, self->str, self->length, maxcount
+ );
- else if (substring->length == 0) {
- Py_DECREF(list);
- PyErr_SetString(PyExc_ValueError, "empty separator");
- return NULL;
- }
- else
- return rsplit_substring(self,list,substring,maxcount);
+ return stringlib_rsplit(
+ (PyObject*) self, self->str, self->length,
+ substring->str, substring->length,
+ maxcount
+ );
}
static
@@ -6621,10 +6964,14 @@ PyObject *replace(PyUnicodeObject *self,
if (maxcount < 0)
maxcount = PY_SSIZE_T_MAX;
+ else if (maxcount == 0 || self->length == 0)
+ goto nothing;
if (str1->length == str2->length) {
- /* same length */
Py_ssize_t i;
+ /* same length */
+ if (str1->length == 0)
+ goto nothing;
if (str1->length == 1) {
/* replace characters */
Py_UNICODE u1, u2;
@@ -6643,8 +6990,8 @@ PyObject *replace(PyUnicodeObject *self,
u->str[i] = u2;
}
} else {
- i = fastsearch(
- self->str, self->length, str1->str, str1->length, FAST_SEARCH
+ i = stringlib_find(
+ self->str, self->length, str1->str, str1->length, 0
);
if (i < 0)
goto nothing;
@@ -6652,25 +6999,30 @@ PyObject *replace(PyUnicodeObject *self,
if (!u)
return NULL;
Py_UNICODE_COPY(u->str, self->str, self->length);
- while (i <= self->length - str1->length)
- if (Py_UNICODE_MATCH(self, i, str1)) {
- if (--maxcount < 0)
- break;
- Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
- i += str1->length;
- } else
- i++;
+
+ /* change everything in-place, starting with this one */
+ Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
+ i += str1->length;
+
+ while ( --maxcount > 0) {
+ i = stringlib_find(self->str+i, self->length-i,
+ str1->str, str1->length,
+ i);
+ if (i == -1)
+ break;
+ Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
+ i += str1->length;
+ }
}
} else {
- Py_ssize_t n, i, j, e;
+ Py_ssize_t n, i, j;
Py_ssize_t product, new_size, delta;
Py_UNICODE *p;
/* replace strings */
- n = stringlib_count(self->str, self->length, str1->str, str1->length);
- if (n > maxcount)
- n = maxcount;
+ n = stringlib_count(self->str, self->length, str1->str, str1->length,
+ maxcount);
if (n == 0)
goto nothing;
/* new_size = self->length + n * (str2->length - str1->length)); */
@@ -6696,19 +7048,15 @@ PyObject *replace(PyUnicodeObject *self,
return NULL;
i = 0;
p = u->str;
- e = self->length - str1->length;
if (str1->length > 0) {
while (n-- > 0) {
/* look for next match */
- j = i;
- while (j <= e) {
- if (Py_UNICODE_MATCH(self, j, str1))
- break;
- j++;
- }
- if (j > i) {
- if (j > e)
- break;
+ j = stringlib_find(self->str+i, self->length-i,
+ str1->str, str1->length,
+ i);
+ if (j == -1)
+ break;
+ else if (j > i) {
/* copy unchanged part [i:j] */
Py_UNICODE_COPY(p, self->str+i, j-i);
p += j - i;
@@ -6970,10 +7318,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
return ((int)id[i] < (int)str[i]) ? -1 : 1;
/* This check keeps Python strings that end in '\0' from comparing equal
to C strings identical up to that point. */
- if (PyUnicode_GET_SIZE(uni) != i)
- /* We'll say the Python string is longer. */
- return 1;
- if (id[i])
+ if (PyUnicode_GET_SIZE(uni) != i || id[i])
return 1; /* uni is longer */
if (str[i])
return -1; /* str is longer */
@@ -7154,11 +7499,11 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
&start, &end))
return NULL;
- FIX_START_END(self);
-
+ ADJUST_INDICES(start, end, self->length);
result = PyLong_FromSsize_t(
stringlib_count(self->str + start, end - start,
- substring->str, substring->length)
+ substring->str, substring->length,
+ PY_SSIZE_T_MAX)
);
Py_DECREF(substring);
@@ -7167,39 +7512,26 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
}
PyDoc_STRVAR(encode__doc__,
- "S.encode([encoding[, errors]]) -> bytes\n\
+ "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
\n\
-Encode S using the codec registered for encoding. encoding defaults\n\
-to the default encoding. errors may be given to set a different error\n\
+Encode S using the codec registered for encoding. Default encoding\n\
+is 'utf-8'. errors may be given to set a different error\n\
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
'xmlcharrefreplace' as well as any other name registered with\n\
codecs.register_error that can handle UnicodeEncodeErrors.");
static PyObject *
-unicode_encode(PyUnicodeObject *self, PyObject *args)
+unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
{
+ static char *kwlist[] = {"encoding", "errors", 0};
char *encoding = NULL;
char *errors = NULL;
- PyObject *v;
- if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
+ kwlist, &encoding, &errors))
return NULL;
- v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
- if (v == NULL)
- goto onError;
- if (!PyBytes_Check(v)) {
- PyErr_Format(PyExc_TypeError,
- "encoder did not return a bytes object "
- "(type=%.400s)",
- Py_TYPE(v)->tp_name);
- Py_DECREF(v);
- return NULL;
- }
- return v;
-
- onError:
- return NULL;
+ return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
}
PyDoc_STRVAR(expandtabs__doc__,
@@ -7293,7 +7625,7 @@ PyDoc_STRVAR(find__doc__,
"S.find(sub[, start[, end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found,\n\
-such that sub is contained within s[start:end]. Optional\n\
+such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.");
@@ -7334,12 +7666,12 @@ unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
/* Believe it or not, this produces the same value for ASCII strings
as string_hash(). */
-static long
+static Py_hash_t
unicode_hash(PyUnicodeObject *self)
{
Py_ssize_t len;
Py_UNICODE *p;
- long x;
+ Py_hash_t x;
assert(_Py_HashSecret_Initialized);
if (self->hash != -1)
@@ -7357,7 +7689,7 @@ unicode_hash(PyUnicodeObject *self)
x = _Py_HashSecret.prefix;
x ^= *p << 7;
while (--len >= 0)
- x = (1000003*x) ^ *p++;
+ x = (_PyHASH_MULTIPLIER*x) ^ *p++;
x ^= Py_SIZE(self);
x ^= _Py_HashSecret.suffix;
if (x == -1)
@@ -7422,8 +7754,8 @@ unicode_islower(PyUnicodeObject *self)
e = p + PyUnicode_GET_SIZE(self);
cased = 0;
- for (; p < e; p++) {
- register const Py_UNICODE ch = *p;
+ while (p < e) {
+ const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
return PyBool_FromLong(0);
@@ -7456,8 +7788,8 @@ unicode_isupper(PyUnicodeObject *self)
e = p + PyUnicode_GET_SIZE(self);
cased = 0;
- for (; p < e; p++) {
- register const Py_UNICODE ch = *p;
+ while (p < e) {
+ const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
return PyBool_FromLong(0);
@@ -7494,8 +7826,8 @@ unicode_istitle(PyUnicodeObject *self)
e = p + PyUnicode_GET_SIZE(self);
cased = 0;
previous_is_cased = 0;
- for (; p < e; p++) {
- register const Py_UNICODE ch = *p;
+ while (p < e) {
+ const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
if (previous_is_cased)
@@ -7537,8 +7869,9 @@ unicode_isspace(PyUnicodeObject *self)
return PyBool_FromLong(0);
e = p + PyUnicode_GET_SIZE(self);
- for (; p < e; p++) {
- if (!Py_UNICODE_ISSPACE(*p))
+ while (p < e) {
+ const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
+ if (!Py_UNICODE_ISSPACE(ch))
return PyBool_FromLong(0);
}
return PyBool_FromLong(1);
@@ -7566,8 +7899,8 @@ unicode_isalpha(PyUnicodeObject *self)
return PyBool_FromLong(0);
e = p + PyUnicode_GET_SIZE(self);
- for (; p < e; p++) {
- if (!Py_UNICODE_ISALPHA(*p))
+ while (p < e) {
+ if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e)))
return PyBool_FromLong(0);
}
return PyBool_FromLong(1);
@@ -7595,8 +7928,9 @@ unicode_isalnum(PyUnicodeObject *self)
return PyBool_FromLong(0);
e = p + PyUnicode_GET_SIZE(self);
- for (; p < e; p++) {
- if (!Py_UNICODE_ISALNUM(*p))
+ while (p < e) {
+ const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
+ if (!Py_UNICODE_ISALNUM(ch))
return PyBool_FromLong(0);
}
return PyBool_FromLong(1);
@@ -7624,8 +7958,8 @@ unicode_isdecimal(PyUnicodeObject *self)
return PyBool_FromLong(0);
e = p + PyUnicode_GET_SIZE(self);
- for (; p < e; p++) {
- if (!Py_UNICODE_ISDECIMAL(*p))
+ while (p < e) {
+ if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e)))
return PyBool_FromLong(0);
}
return PyBool_FromLong(1);
@@ -7653,8 +7987,8 @@ unicode_isdigit(PyUnicodeObject *self)
return PyBool_FromLong(0);
e = p + PyUnicode_GET_SIZE(self);
- for (; p < e; p++) {
- if (!Py_UNICODE_ISDIGIT(*p))
+ while (p < e) {
+ if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e)))
return PyBool_FromLong(0);
}
return PyBool_FromLong(1);
@@ -7682,8 +8016,8 @@ unicode_isnumeric(PyUnicodeObject *self)
return PyBool_FromLong(0);
e = p + PyUnicode_GET_SIZE(self);
- for (; p < e; p++) {
- if (!Py_UNICODE_ISNUMERIC(*p))
+ while (p < e) {
+ if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e)))
return PyBool_FromLong(0);
}
return PyBool_FromLong(1);
@@ -7692,8 +8026,9 @@ unicode_isnumeric(PyUnicodeObject *self)
int
PyUnicode_IsIdentifier(PyObject *self)
{
- register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
- register const Py_UNICODE *e;
+ const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
+ const Py_UNICODE *e;
+ Py_UCS4 first;
/* Special case for empty strings */
if (PyUnicode_GET_SIZE(self) == 0)
@@ -7707,14 +8042,14 @@ PyUnicode_IsIdentifier(PyObject *self)
definition of XID_Start and XID_Continue, it is sufficient
to check just for these, except that _ must be allowed
as starting an identifier. */
- if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
+ e = p + PyUnicode_GET_SIZE(self);
+ first = _Py_UNICODE_NEXT(p, e);
+ if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
return 0;
- e = p + PyUnicode_GET_SIZE(self);
- for (p++; p < e; p++) {
- if (!_PyUnicode_IsXidContinue(*p))
+ while (p < e)
+ if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e)))
return 0;
- }
return 1;
}
@@ -7748,8 +8083,8 @@ unicode_isprintable(PyObject *self)
}
e = p + PyUnicode_GET_SIZE(self);
- for (; p < e; p++) {
- if (!Py_UNICODE_ISPRINTABLE(*p)) {
+ while (p < e) {
+ if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) {
Py_RETURN_FALSE;
}
}
@@ -8230,7 +8565,7 @@ PyDoc_STRVAR(rfind__doc__,
"S.rfind(sub[, start[, end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
-such that sub is contained within s[start:end]. Optional\n\
+such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.");
@@ -8577,9 +8912,13 @@ unicode_maketrans(PyUnicodeObject *null, PyObject *args)
/* create entries for translating chars in x to those in y */
for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
+ if (!key)
+ goto err;
value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
- if (!key || !value)
+ if (!value) {
+ Py_DECREF(key);
goto err;
+ }
res = PyDict_SetItem(new, key, value);
Py_DECREF(key);
Py_DECREF(value);
@@ -8715,6 +9054,13 @@ unicode_freelistsize(PyUnicodeObject *self)
{
return PyLong_FromLong(numfree);
}
+
+/* Method wrapper: pass this string's Py_UNICODE buffer and its size to
+   PyUnicode_TransformDecimalToASCII() and return that call's result. */
+static PyObject *
+unicode__decimal2ascii(PyObject *self)
+{
+    Py_UNICODE *buffer = PyUnicode_AS_UNICODE(self);
+    Py_ssize_t size = PyUnicode_GET_SIZE(self);
+
+    return PyUnicode_TransformDecimalToASCII(buffer, size);
+}
#endif
PyDoc_STRVAR(startswith__doc__,
@@ -8821,6 +9167,12 @@ PyDoc_STRVAR(format__doc__,
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");
+PyDoc_STRVAR(format_map__doc__, /* docstring attached to the "format_map" entry of unicode_methods */
+ "S.format_map(mapping) -> str\n\
+\n\
+Return a formatted version of S, using substitutions from mapping.\n\
+The substitutions are identified by braces ('{' and '}').");
+
static PyObject *
unicode__format__(PyObject* self, PyObject* args)
{
@@ -8855,13 +9207,12 @@ unicode_getnewargs(PyUnicodeObject *v)
return Py_BuildValue("(u#)", v->str, v->length);
}
-
static PyMethodDef unicode_methods[] = {
/* Order is according to common usage: often used methods should
appear first, since lookup is done sequentially. */
- {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
+ {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
{"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
{"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
{"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
@@ -8902,9 +9253,8 @@ static PyMethodDef unicode_methods[] = {
{"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
{"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
+ {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
{"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
- {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
- {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
{"maketrans", (PyCFunction) unicode_maketrans,
METH_VARARGS | METH_STATIC, maketrans__doc__},
{"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
@@ -8913,8 +9263,9 @@ static PyMethodDef unicode_methods[] = {
#endif
#if 0
- /* This one is just used for debugging the implementation. */
+ /* These methods are just used for debugging the implementation. */
{"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
+ {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif
{"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
@@ -8965,7 +9316,7 @@ unicode_subscript(PyUnicodeObject* self, PyObject* item)
Py_UNICODE* result_buf;
PyObject* result;
- if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
+ if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
&start, &stop, &step, &slicelength) < 0) {
return NULL;
}
@@ -9394,8 +9745,6 @@ PyObject *PyUnicode_Format(PyObject *format,
case 'o':
case 'x':
case 'X':
- if (c == 'i')
- c = 'd';
isnumok = 0;
if (PyNumber_Check(v)) {
PyObject *iobj=NULL;
@@ -9410,7 +9759,7 @@ PyObject *PyUnicode_Format(PyObject *format,
if (iobj!=NULL) {
if (PyLong_Check(iobj)) {
isnumok = 1;
- temp = formatlong(iobj, flags, prec, c);
+ temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Py_DECREF(iobj);
if (!temp)
goto onError;
@@ -10019,6 +10368,15 @@ Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
return s1;
}
+/* Append the NUL-terminated string s2 to the end of s1 and return s1.
+
+   The end of s1 is located with Py_UNICODE_strlen() and s2 is copied there
+   with Py_UNICODE_strcpy().  No bounds are checked here, so the caller must
+   guarantee that s1 has room for the combined string. */
+Py_UNICODE*
+Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
+{
+    Py_UNICODE *tail = s1 + Py_UNICODE_strlen(s1);
+
+    Py_UNICODE_strcpy(tail, s2);
+    return s1;
+}
+
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
@@ -10033,6 +10391,23 @@ Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
return 0;
}
+/* Compare at most n code units of s1 and s2.
+
+   Returns -1 or +1 at the first position where the units differ (according
+   to their Py_UNICODE values), and 0 when the first n units are equal or
+   when both strings reach their NUL terminator before any difference. */
+int
+Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
+{
+    size_t i;
+
+    for (i = 0; i < n; i++) {
+        const Py_UNICODE left = s1[i];
+        const Py_UNICODE right = s2[i];
+
+        if (left != right)
+            return (left < right) ? -1 : +1;
+        /* Units are equal here, so a NUL ends both strings at once. */
+        if (left == '\0')
+            break;
+    }
+    return 0;
+}
+
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
@@ -10043,15 +10418,72 @@ Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
return NULL;
}
+/* Return a pointer to the last occurrence of c in the NUL-terminated string
+   s, or NULL when c does not occur.
+
+   The scan starts one unit before the offset reported by Py_UNICODE_strlen()
+   and walks back to s[0], so the terminator position itself is never
+   tested. */
+Py_UNICODE*
+Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
+{
+    size_t remaining = Py_UNICODE_strlen(s);
+
+    while (remaining > 0) {
+        remaining--;
+        if (s[remaining] == c)
+            return (Py_UNICODE*)(s + remaining);
+    }
+    return NULL;
+}
+
+/* Return a newly allocated copy of the Py_UNICODE buffer of `object`,
+   including its terminating nul character.
+
+   The copy is obtained from PyMem_Malloc(); the caller owns it and is
+   responsible for releasing it (PyMem_Free() is the matching deallocator).
+
+   On failure NULL is returned with an exception set:
+   - the PyErr_BadArgument() error if `object` is not a unicode object;
+   - the PyErr_NoMemory() error if the byte size would overflow or the
+     allocation fails. */
+Py_UNICODE*
+PyUnicode_AsUnicodeCopy(PyObject *object)
+{
+    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
+    Py_UNICODE *copy;
+    Py_ssize_t size;
-#ifdef __cplusplus
+    /* The macros below read the object's fields through the cast without
+       any checking, so reject foreign types before touching them. */
+    if (!PyUnicode_Check(object)) {
+        PyErr_BadArgument();
+        return NULL;
+    }
+    /* Ensure (length + 1) * sizeof(Py_UNICODE) won't overflow the size. */
+    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
+    size *= sizeof(Py_UNICODE);
+    copy = PyMem_Malloc(size);
+    if (copy == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
+    return copy;
}
-#endif
+/* A _string module, to export formatter_parser and formatter_field_name_split
+ to the string.Formatter class implemented in Python. */
-/*
- Local variables:
- c-basic-offset: 4
- indent-tabs-mode: nil
- End:
-*/
+static PyMethodDef _string_methods[] = { /* functions exposed by the _string module (referenced from _string_module) */
+ {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
+ METH_O, PyDoc_STR("split the argument as a field name")}, /* METH_O: called with a single object argument */
+ {"formatter_parser", (PyCFunction) formatter_parser,
+ METH_O, PyDoc_STR("parse the argument as a format string")}, /* METH_O: called with a single object argument */
+ {NULL, NULL} /* sentinel entry terminating the table */
+};
+
+static struct PyModuleDef _string_module = { /* module definition handed to PyModule_Create() in PyInit__string */
+ PyModuleDef_HEAD_INIT,
+ "_string", /* module name */
+ PyDoc_STR("string helper module"), /* module docstring */
+ 0, /* NOTE(review): presumably m_size, i.e. no per-module state -- confirm against the PyModuleDef layout */
+ _string_methods, /* method table of the module */
+ NULL, /* the remaining optional slots are left unset */
+ NULL,
+ NULL,
+ NULL
+};
+
+/* Initialization function of the "_string" module: create the module object
+   described by _string_module and return the result of PyModule_Create()
+   as is. */
+PyMODINIT_FUNC
+PyInit__string(void)
+{
+    PyObject *module = PyModule_Create(&_string_module);
+
+    return module;
+}
+
+
+#ifdef __cplusplus
+}
+#endif