diff options
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 1732 |
1 files changed, 1082 insertions, 650 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 2ec878f916..b70666106d 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -41,9 +41,6 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #define PY_SSIZE_T_CLEAN #include "Python.h" -#include "bytes_methods.h" - -#include "unicodeobject.h" #include "ucnhash.h" #ifdef MS_WINDOWS @@ -114,21 +111,12 @@ static PyUnicodeObject *unicode_empty; shared as well. */ static PyUnicodeObject *unicode_latin1[256]; -/* Default encoding to use and assume when NULL is passed as encoding - parameter; it is fixed to "utf-8". Always use the - PyUnicode_GetDefaultEncoding() API to access this global. - - Don't forget to alter Py_FileSystemDefaultEncoding if you change the - hard coded default! -*/ -static const char unicode_default_encoding[] = "utf-8"; - /* Fast detection of the most frequent whitespace characters */ const unsigned char _Py_ascii_whitespace[] = { 0, 0, 0, 0, 0, 0, 0, 0, -/* case 0x0009: * HORIZONTAL TABULATION */ +/* case 0x0009: * CHARACTER TABULATION */ /* case 0x000A: * LINE FEED */ -/* case 0x000B: * VERTICAL TABULATION */ +/* case 0x000B: * LINE TABULATION */ /* case 0x000C: * FORM FEED */ /* case 0x000D: * CARRIAGE RETURN */ 0, 1, 1, 1, 1, 1, 0, 0, @@ -169,8 +157,10 @@ static void raise_encode_exception(PyObject **exceptionObject, static unsigned char ascii_linebreak[] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0x000A, * LINE FEED */ +/* 0x000B, * LINE TABULATION */ +/* 0x000C, * FORM FEED */ /* 0x000D, * CARRIAGE RETURN */ - 0, 0, 1, 0, 0, 1, 0, 0, + 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x001C, * FILE SEPARATOR */ /* 0x001D, * GROUP SEPARATOR */ @@ -212,11 +202,22 @@ PyUnicode_GetMax(void) /* the linebreak mask is set up by Unicode_Init below */ +#if LONG_BIT >= 128 +#define BLOOM_WIDTH 128 +#elif LONG_BIT >= 64 +#define BLOOM_WIDTH 64 +#elif LONG_BIT >= 32 +#define BLOOM_WIDTH 32 +#else +#error "LONG_BIT is smaller than 32" +#endif + #define BLOOM_MASK unsigned long static BLOOM_MASK bloom_linebreak; -#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) +#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) +#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) #define BLOOM_LINEBREAK(ch) \ ((ch) < 128U ? ascii_linebreak[(ch)] : \ @@ -226,12 +227,12 @@ Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) { /* calculate simple bloom-style bitmask for a given unicode string */ - long mask; + BLOOM_MASK mask; Py_ssize_t i; mask = 0; for (i = 0; i < len; i++) - mask |= (1 << (ptr[i] & 0x1F)); + BLOOM_ADD(mask, ptr[i]); return mask; } @@ -651,7 +652,7 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w, return NULL; /* Copy the wchar_t data into the new object */ -#ifdef HAVE_USABLE_WCHAR_T +#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T memcpy(unicode->str, w, size * sizeof(wchar_t)); #else { @@ -671,7 +672,8 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w, #undef CONVERT_WCHAR_TO_SURROGATES static void -makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c) +makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, + int zeropad, int width, int precision, char c) { *fmt++ = '%'; if (width) { @@ -683,6 +685,19 @@ makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int pre fmt += sprintf(fmt, ".%d", precision); if (longflag) *fmt++ = 'l'; + else if (longlongflag) { + /* longlongflag should only ever be nonzero on machines with + HAVE_LONG_LONG defined */ +#ifdef HAVE_LONG_LONG + char *f = PY_FORMAT_LONG_LONG; + while (*f) + *fmt++ = *f++; +#else + /* we shouldn't ever get here */ + assert(0); + *fmt++ = 'l'; +#endif + } else if (size_tflag) { char *f = PY_FORMAT_SIZE_T; while (*f) @@ -694,6 +709,16 @@ makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int pre #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} +/* size of fixed-size buffer for formatting single arguments */ +#define ITEM_BUFFER_LEN 21 +/* maximum number of characters required for output of %ld. 21 characters + allows for 64-bit integers (in decimal) and an optional sign. */ +#define MAX_LONG_CHARS 21 +/* maximum number of characters required for output of %lld. + We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, + plus 1 for the sign. 53/22 is an upper bound for log10(256). */ +#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) + PyObject * PyUnicode_FromFormatV(const char *format, va_list vargs) { @@ -709,24 +734,16 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) Py_UNICODE *s; PyObject *string; /* used by sprintf */ - char buffer[21]; + char buffer[ITEM_BUFFER_LEN+1]; /* use abuffer instead of buffer, if we need more space * (which can happen if there's a format specifier with width). */ char *abuffer = NULL; char *realbuffer; Py_ssize_t abuffersize = 0; - char fmt[60]; /* should be enough for %0width.precisionld */ + char fmt[61]; /* should be enough for %0width.precisionlld */ const char *copy; -#ifdef VA_LIST_IS_ARRAY - Py_MEMCPY(count, vargs, sizeof(va_list)); -#else -#ifdef __va_copy - __va_copy(count, vargs); -#else - count = vargs; -#endif -#endif + Py_VA_COPY(count, vargs); /* step 1: count the number of %S/%R/%A/%s format specifications * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the @@ -735,15 +752,22 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) if (*f == '%') { if (*(f+1)=='%') continue; - if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A') + if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V') ++callcount; - while (ISDIGIT((unsigned)*f)) + while (Py_ISDIGIT((unsigned)*f)) width = (width*10) + *f++ - '0'; - while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) + while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f)) ; if (*f == 's') ++callcount; } + else if (128 <= (unsigned char)*f) { + PyErr_Format(PyExc_ValueError, + "PyUnicode_FromFormatV() expects an ASCII-encoded format " + "string, got a non-ASCII byte: 0x%02x", + (unsigned char)*f); + return NULL; + } } /* step 2: allocate memory for the results of * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ @@ -758,37 +782,70 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) /* step 3: figure out how large a buffer we need */ for (f = format; *f; f++) { if (*f == '%') { +#ifdef HAVE_LONG_LONG + int longlongflag = 0; +#endif const char* p = f; width = 0; - while (ISDIGIT((unsigned)*f)) + while (Py_ISDIGIT((unsigned)*f)) width = (width*10) + *f++ - '0'; - while (*++f && *f != '%' && !ISALPHA((unsigned)*f)) + while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f)) ; /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since * they don't affect the amount of space we reserve. */ - if ((*f == 'l' || *f == 'z') && - (f[1] == 'd' || f[1] == 'u')) + if (*f == 'l') { + if (f[1] == 'd' || f[1] == 'u') { + ++f; + } +#ifdef HAVE_LONG_LONG + else if (f[1] == 'l' && + (f[2] == 'd' || f[2] == 'u')) { + longlongflag = 1; + f += 2; + } +#endif + } + else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { ++f; + } switch (*f) { case 'c': + { +#ifndef Py_UNICODE_WIDE + int ordinal = va_arg(count, int); + if (ordinal > 0xffff) + n += 2; + else + n++; +#else (void)va_arg(count, int); - /* fall through... */ + n++; +#endif + break; + } case '%': n++; break; case 'd': case 'u': case 'i': case 'x': (void) va_arg(count, int); - /* 20 bytes is enough to hold a 64-bit - integer. Decimal takes the most space. - This isn't enough for octal. - If a width is specified we need more - (which we allocate later). */ - if (width < 20) - width = 20; +#ifdef HAVE_LONG_LONG + if (longlongflag) { + if (width < MAX_LONG_LONG_CHARS) + width = MAX_LONG_LONG_CHARS; + } + else +#endif + /* MAX_LONG_CHARS is enough to hold a 64-bit integer, + including sign. Decimal takes the most space. This + isn't enough for octal. If a width is specified we + need more (which we allocate later). */ + if (width < MAX_LONG_CHARS) + width = MAX_LONG_CHARS; n += width; + /* XXX should allow for large precision here too. */ if (abuffersize < width) abuffersize = width; break; @@ -815,12 +872,20 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) { PyObject *obj = va_arg(count, PyObject *); const char *str = va_arg(count, const char *); + PyObject *str_obj; assert(obj || str); assert(!obj || PyUnicode_Check(obj)); - if (obj) + if (obj) { n += PyUnicode_GET_SIZE(obj); - else - n += strlen(str); + *callresult++ = NULL; + } + else { + str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); + if (!str_obj) + goto fail; + n += PyUnicode_GET_SIZE(str_obj); + *callresult++ = str_obj; + } break; } case 'S': @@ -885,8 +950,9 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) n++; } expand: - if (abuffersize > 20) { - abuffer = PyObject_Malloc(abuffersize); + if (abuffersize > ITEM_BUFFER_LEN) { + /* add 1 for sprintf's trailing null byte */ + abuffer = PyObject_Malloc(abuffersize + 1); if (!abuffer) { PyErr_NoMemory(); goto fail; @@ -910,23 +976,32 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) if (*f == '%') { const char* p = f++; int longflag = 0; + int longlongflag = 0; int size_tflag = 0; zeropad = (*f == '0'); /* parse the width.precision part */ width = 0; - while (ISDIGIT((unsigned)*f)) + while (Py_ISDIGIT((unsigned)*f)) width = (width*10) + *f++ - '0'; precision = 0; if (*f == '.') { f++; - while (ISDIGIT((unsigned)*f)) + while (Py_ISDIGIT((unsigned)*f)) precision = (precision*10) + *f++ - '0'; } - /* handle the long flag, but only for %ld and %lu. - others can be added when necessary. */ - if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { - longflag = 1; - ++f; + /* Handle %ld, %lu, %lld and %llu. */ + if (*f == 'l') { + if (f[1] == 'd' || f[1] == 'u') { + longflag = 1; + ++f; + } +#ifdef HAVE_LONG_LONG + else if (f[1] == 'l' && + (f[2] == 'd' || f[2] == 'u')) { + longlongflag = 1; + f += 2; + } +#endif } /* handle the size_t flag. */ if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { @@ -936,12 +1011,27 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) switch (*f) { case 'c': - *s++ = va_arg(vargs, int); + { + int ordinal = va_arg(vargs, int); +#ifndef Py_UNICODE_WIDE + if (ordinal > 0xffff) { + ordinal -= 0x10000; + *s++ = 0xD800 | (ordinal >> 10); + *s++ = 0xDC00 | (ordinal & 0x3FF); + } else +#endif + *s++ = ordinal; break; + } case 'd': - makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd'); + makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, + width, precision, 'd'); if (longflag) sprintf(realbuffer, fmt, va_arg(vargs, long)); +#ifdef HAVE_LONG_LONG + else if (longlongflag) + sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG)); +#endif else if (size_tflag) sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); else @@ -949,9 +1039,15 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) appendstring(realbuffer); break; case 'u': - makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u'); + makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, + width, precision, 'u'); if (longflag) sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); +#ifdef HAVE_LONG_LONG + else if (longlongflag) + sprintf(realbuffer, fmt, va_arg(vargs, + unsigned PY_LONG_LONG)); +#endif else if (size_tflag) sprintf(realbuffer, fmt, va_arg(vargs, size_t)); else @@ -959,12 +1055,12 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) appendstring(realbuffer); break; case 'i': - makefmt(fmt, 0, 0, zeropad, width, precision, 'i'); + makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i'); sprintf(realbuffer, fmt, va_arg(vargs, int)); appendstring(realbuffer); break; case 'x': - makefmt(fmt, 0, 0, zeropad, width, precision, 'x'); + makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); sprintf(realbuffer, fmt, va_arg(vargs, int)); appendstring(realbuffer); break; @@ -992,18 +1088,23 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) case 'V': { PyObject *obj = va_arg(vargs, PyObject *); - const char *str = va_arg(vargs, const char *); + va_arg(vargs, const char *); if (obj) { Py_ssize_t size = PyUnicode_GET_SIZE(obj); Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); s += size; } else { - appendstring(str); + Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult), + PyUnicode_GET_SIZE(*callresult)); + s += PyUnicode_GET_SIZE(*callresult); + Py_DECREF(*callresult); } + ++callresult; break; } case 'S': case 'R': + case 'A': { Py_UNICODE *ucopy; Py_ssize_t usize; @@ -1039,7 +1140,8 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) appendstring(p); goto end; } - } else + } + else *s++ = *f; } @@ -1054,7 +1156,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) if (callresults) { PyObject **callresult2 = callresults; while (callresult2 < callresult) { - Py_DECREF(*callresult2); + Py_XDECREF(*callresult2); ++callresult2; } PyObject_Free(callresults); @@ -1082,35 +1184,154 @@ PyUnicode_FromFormat(const char *format, ...) return ret; } -Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, - wchar_t *w, - Py_ssize_t size) +/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(): + convert a Unicode object to a wide character string. + + - If w is NULL: return the number of wide characters (including the null + character) required to convert the unicode object. Ignore size argument. + + - Otherwise: return the number of wide characters (excluding the null + character) written into w. Write at most size wide characters (including + the null character). */ +static Py_ssize_t +unicode_aswidechar(PyUnicodeObject *unicode, + wchar_t *w, + Py_ssize_t size) +{ +#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T + Py_ssize_t res; + if (w != NULL) { + res = PyUnicode_GET_SIZE(unicode); + if (size > res) + size = res + 1; + else + res = size; + memcpy(w, unicode->str, size * sizeof(wchar_t)); + return res; + } + else + return PyUnicode_GET_SIZE(unicode) + 1; +#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4 + register const Py_UNICODE *u; + const Py_UNICODE *uend; + const wchar_t *worig, *wend; + Py_ssize_t nchar; + + u = PyUnicode_AS_UNICODE(unicode); + uend = u + PyUnicode_GET_SIZE(unicode); + if (w != NULL) { + worig = w; + wend = w + size; + while (u != uend && w != wend) { + if (0xD800 <= u[0] && u[0] <= 0xDBFF + && 0xDC00 <= u[1] && u[1] <= 0xDFFF) + { + *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000; + u += 2; + } + else { + *w = *u; + u++; + } + w++; + } + if (w != wend) + *w = L'\0'; + return w - worig; + } + else { + nchar = 1; /* null character at the end */ + while (u != uend) { + if (0xD800 <= u[0] && u[0] <= 0xDBFF + && 0xDC00 <= u[1] && u[1] <= 0xDFFF) + u += 2; + else + u++; + nchar++; + } + } + return nchar; +#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2 + register Py_UNICODE *u, *uend, ordinal; + register Py_ssize_t i; + wchar_t *worig, *wend; + Py_ssize_t nchar; + + u = PyUnicode_AS_UNICODE(unicode); + uend = u + PyUnicode_GET_SIZE(u); + if (w != NULL) { + worig = w; + wend = w + size; + while (u != uend && w != wend) { + ordinal = *u; + if (ordinal > 0xffff) { + ordinal -= 0x10000; + *w++ = 0xD800 | (ordinal >> 10); + *w++ = 0xDC00 | (ordinal & 0x3FF); + } + else + *w++ = ordinal; + u++; + } + if (w != wend) + *w = 0; + return w - worig; + } + else { + nchar = 1; /* null character */ + while (u != uend) { + if (*u > 0xffff) + nchar += 2; + else + nchar++; + u++; + } + return nchar; + } +#else +# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670" +#endif +} + +Py_ssize_t +PyUnicode_AsWideChar(PyObject *unicode, + wchar_t *w, + Py_ssize_t size) { if (unicode == NULL) { PyErr_BadInternalCall(); return -1; } + return unicode_aswidechar((PyUnicodeObject*)unicode, w, size); +} - /* If possible, try to copy the 0-termination as well */ - if (size > PyUnicode_GET_SIZE(unicode)) - size = PyUnicode_GET_SIZE(unicode) + 1; +wchar_t* +PyUnicode_AsWideCharString(PyObject *unicode, + Py_ssize_t *size) +{ + wchar_t* buffer; + Py_ssize_t buflen; -#ifdef HAVE_USABLE_WCHAR_T - memcpy(w, unicode->str, size * sizeof(wchar_t)); -#else - { - register Py_UNICODE *u; - register Py_ssize_t i; - u = PyUnicode_AS_UNICODE(unicode); - for (i = size; i > 0; i--) - *w++ = *u++; + if (unicode == NULL) { + PyErr_BadInternalCall(); + return NULL; } -#endif - if (size > PyUnicode_GET_SIZE(unicode)) - return PyUnicode_GET_SIZE(unicode); - else - return size; + buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0); + if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) { + PyErr_NoMemory(); + return NULL; + } + + buffer = PyMem_MALLOC(buflen * sizeof(wchar_t)); + if (buffer == NULL) { + PyErr_NoMemory(); + return NULL; + } + buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen); + if (size != NULL) + *size = buflen; + return buffer; } #endif @@ -1210,27 +1431,26 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, return v; } -PyObject *PyUnicode_Decode(const char *s, - Py_ssize_t size, - const char *encoding, - const char *errors) +/* Convert encoding to lower case and replace '_' with '-' in order to + catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), + 1 on success. */ +static int +normalize_encoding(const char *encoding, + char *lower, + size_t lower_len) { - PyObject *buffer = NULL, *unicode; - Py_buffer info; - char lower[20]; /* Enough for any encoding name we recognize */ - char *l; const char *e; + char *l; + char *l_end; - if (encoding == NULL) - encoding = PyUnicode_GetDefaultEncoding(); - - /* Convert encoding to lower case and replace '_' with '-' in order to - catch e.g. UTF_8 */ e = encoding; l = lower; - while (*e && l < &lower[(sizeof lower) - 2]) { - if (ISUPPER(*e)) { - *l++ = TOLOWER(*e++); + l_end = &lower[lower_len - 1]; + while (*e) { + if (l == l_end) + return 0; + if (Py_ISUPPER(*e)) { + *l++ = Py_TOLOWER(*e++); } else if (*e == '_') { *l++ = '-'; @@ -1241,23 +1461,39 @@ PyObject *PyUnicode_Decode(const char *s, } } *l = '\0'; + return 1; +} + +PyObject *PyUnicode_Decode(const char *s, + Py_ssize_t size, + const char *encoding, + const char *errors) +{ + PyObject *buffer = NULL, *unicode; + Py_buffer info; + char lower[11]; /* Enough for any encoding shortcut */ + + if (encoding == NULL) + encoding = PyUnicode_GetDefaultEncoding(); /* Shortcuts for common default encodings */ - if (strcmp(lower, "utf-8") == 0) - return PyUnicode_DecodeUTF8(s, size, errors); - else if ((strcmp(lower, "latin-1") == 0) || - (strcmp(lower, "iso-8859-1") == 0)) - return PyUnicode_DecodeLatin1(s, size, errors); + if (normalize_encoding(encoding, lower, sizeof(lower))) { + if (strcmp(lower, "utf-8") == 0) + return PyUnicode_DecodeUTF8(s, size, errors); + else if ((strcmp(lower, "latin-1") == 0) || + (strcmp(lower, "iso-8859-1") == 0)) + return PyUnicode_DecodeLatin1(s, size, errors); #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) - else if (strcmp(lower, "mbcs") == 0) - return PyUnicode_DecodeMBCS(s, size, errors); + else if (strcmp(lower, "mbcs") == 0) + return PyUnicode_DecodeMBCS(s, size, errors); #endif - else if (strcmp(lower, "ascii") == 0) - return PyUnicode_DecodeASCII(s, size, errors); - else if (strcmp(lower, "utf-16") == 0) - return PyUnicode_DecodeUTF16(s, size, errors, 0); - else if (strcmp(lower, "utf-32") == 0) - return PyUnicode_DecodeUTF32(s, size, errors, 0); + else if (strcmp(lower, "ascii") == 0) + return PyUnicode_DecodeASCII(s, size, errors); + else if (strcmp(lower, "utf-16") == 0) + return PyUnicode_DecodeUTF16(s, size, errors, 0); + else if (strcmp(lower, "utf-32") == 0) + return PyUnicode_DecodeUTF32(s, size, errors, 0); + } /* Decode via the codec registry */ buffer = NULL; @@ -1378,11 +1614,77 @@ PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, return NULL; } +PyObject * +PyUnicode_EncodeFSDefault(PyObject *unicode) +{ +#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) + return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + NULL); +#elif defined(__APPLE__) + return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + "surrogateescape"); +#else + PyInterpreterState *interp = PyThreadState_GET()->interp; + /* Bootstrap check: if the filesystem codec is implemented in Python, we + cannot use it to encode and decode filenames before it is loaded. Load + the Python codec requires to encode at least its own filename. Use the C + version of the locale codec until the codec registry is initialized and + the Python codec is loaded. + + Py_FileSystemDefaultEncoding is shared between all interpreters, we + cannot only rely on it: check also interp->fscodec_initialized for + subinterpreters. */ + if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { + return PyUnicode_AsEncodedString(unicode, + Py_FileSystemDefaultEncoding, + "surrogateescape"); + } + else { + /* locale encoding with surrogateescape */ + wchar_t *wchar; + char *bytes; + PyObject *bytes_obj; + size_t error_pos; + + wchar = PyUnicode_AsWideCharString(unicode, NULL); + if (wchar == NULL) + return NULL; + bytes = _Py_wchar2char(wchar, &error_pos); + if (bytes == NULL) { + if (error_pos != (size_t)-1) { + char *errmsg = strerror(errno); + PyObject *exc = NULL; + if (errmsg == NULL) + errmsg = "Py_wchar2char() failed"; + raise_encode_exception(&exc, + "filesystemencoding", + PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), + error_pos, error_pos+1, + errmsg); + Py_XDECREF(exc); + } + else + PyErr_NoMemory(); + PyMem_Free(wchar); + return NULL; + } + PyMem_Free(wchar); + + bytes_obj = PyBytes_FromString(bytes); + PyMem_Free(bytes); + return bytes_obj; + } +#endif +} + PyObject *PyUnicode_AsEncodedString(PyObject *unicode, const char *encoding, const char *errors) { PyObject *v; + char lower[11]; /* Enough for any encoding shortcut */ if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); @@ -1393,31 +1695,42 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode, encoding = PyUnicode_GetDefaultEncoding(); /* Shortcuts for common default encodings */ - if (errors == NULL) { - if (strcmp(encoding, "utf-8") == 0) - return PyUnicode_AsUTF8String(unicode); - else if (strcmp(encoding, "latin-1") == 0) - return PyUnicode_AsLatin1String(unicode); + if (normalize_encoding(encoding, lower, sizeof(lower))) { + if (strcmp(lower, "utf-8") == 0) + return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + errors); + else if ((strcmp(lower, "latin-1") == 0) || + (strcmp(lower, "iso-8859-1") == 0)) + return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + errors); #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) - else if (strcmp(encoding, "mbcs") == 0) - return PyUnicode_AsMBCSString(unicode); + else if (strcmp(lower, "mbcs") == 0) + return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + errors); #endif - else if (strcmp(encoding, "ascii") == 0) - return PyUnicode_AsASCIIString(unicode); - /* During bootstrap, we may need to find the encodings - package, to load the file system encoding, and require the - file system encoding in order to load the encodings - package. - - Break out of this dependency by assuming that the path to - the encodings module is ASCII-only. XXX could try wcstombs - instead, if the file system encoding is the locale's - encoding. */ - else if (Py_FileSystemDefaultEncoding && - strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 && - !PyThreadState_GET()->interp->codecs_initialized) - return PyUnicode_AsASCIIString(unicode); - } + else if (strcmp(lower, "ascii") == 0) + return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + errors); + } + /* During bootstrap, we may need to find the encodings + package, to load the file system encoding, and require the + file system encoding in order to load the encodings + package. + + Break out of this dependency by assuming that the path to + the encodings module is ASCII-only. XXX could try wcstombs + instead, if the file system encoding is the locale's + encoding. */ + if (Py_FileSystemDefaultEncoding && + strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 && + !PyThreadState_GET()->interp->codecs_initialized) + return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + errors); /* Encode via the codec registry */ v = PyCodec_Encode(unicode, encoding, errors); @@ -1430,12 +1743,13 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode, /* If the codec returns a buffer, raise a warning and convert to bytes */ if (PyByteArray_Check(v)) { - char msg[100]; + int error; PyObject *b; - PyOS_snprintf(msg, sizeof(msg), - "encoder %s returned buffer instead of bytes", - encoding); - if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) { + + error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, + "encoder %s returned bytearray instead of bytes", + encoding); + if (error) { Py_DECREF(v); return NULL; } @@ -1509,32 +1823,61 @@ PyUnicode_DecodeFSDefault(const char *s) { PyObject* PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) { - /* During the early bootstrapping process, Py_FileSystemDefaultEncoding - can be undefined. If it is case, decode using UTF-8. The following assumes - that Py_FileSystemDefaultEncoding is set to a built-in encoding during the - bootstrapping process where the codecs aren't ready yet. - */ - if (Py_FileSystemDefaultEncoding) { #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) - if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) { - return PyUnicode_DecodeMBCS(s, size, "replace"); - } + return PyUnicode_DecodeMBCS(s, size, NULL); #elif defined(__APPLE__) - if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) { - return PyUnicode_DecodeUTF8(s, size, "replace"); - } -#endif + return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); +#else + PyInterpreterState *interp = PyThreadState_GET()->interp; + /* Bootstrap check: if the filesystem codec is implemented in Python, we + cannot use it to encode and decode filenames before it is loaded. Load + the Python codec requires to encode at least its own filename. Use the C + version of the locale codec until the codec registry is initialized and + the Python codec is loaded. + + Py_FileSystemDefaultEncoding is shared between all interpreters, we + cannot only rely on it: check also interp->fscodec_initialized for + subinterpreters. */ + if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) { return PyUnicode_Decode(s, size, Py_FileSystemDefaultEncoding, - "replace"); + "surrogateescape"); } else { - return PyUnicode_DecodeUTF8(s, size, "replace"); + /* locale encoding with surrogateescape */ + wchar_t *wchar; + PyObject *unicode; + size_t len; + + if (s[size] != '\0' || size != strlen(s)) { + PyErr_SetString(PyExc_TypeError, "embedded NUL character"); + return NULL; + } + + wchar = _Py_char2wchar(s, &len); + if (wchar == NULL) + return PyErr_NoMemory(); + + unicode = PyUnicode_FromWideChar(wchar, len); + PyMem_Free(wchar); + return unicode; } +#endif +} + + +int +_PyUnicode_HasNULChars(PyObject* s) +{ + static PyObject *nul = NULL; + + if (nul == NULL) + nul = PyUnicode_FromStringAndSize("\0", 1); + if (nul == NULL) + return -1; + return PyUnicode_Contains(s, nul); } -/* Convert the argument to a bytes object, according to the file - system encoding */ int PyUnicode_FSConverter(PyObject* arg, void* addr) @@ -1546,7 +1889,7 @@ PyUnicode_FSConverter(PyObject* arg, void* addr) Py_DECREF(*(PyObject**)addr); return 1; } - if (PyBytes_Check(arg) || PyByteArray_Check(arg)) { + if (PyBytes_Check(arg)) { output = arg; Py_INCREF(output); } @@ -1554,9 +1897,7 @@ PyUnicode_FSConverter(PyObject* arg, void* addr) arg = PyUnicode_FromObject(arg); if (!arg) return 0; - output = PyUnicode_AsEncodedObject(arg, - Py_FileSystemDefaultEncoding, - "surrogateescape"); + output = PyUnicode_EncodeFSDefault(arg); Py_DECREF(arg); if (!output) return 0; @@ -1566,15 +1907,50 @@ PyUnicode_FSConverter(PyObject* arg, void* addr) return 0; } } - if (PyBytes_Check(output)) { - size = PyBytes_GET_SIZE(output); - data = PyBytes_AS_STRING(output); + size = PyBytes_GET_SIZE(output); + data = PyBytes_AS_STRING(output); + if (size != strlen(data)) { + PyErr_SetString(PyExc_TypeError, "embedded NUL character"); + Py_DECREF(output); + return 0; + } + *(PyObject**)addr = output; + return Py_CLEANUP_SUPPORTED; +} + + +int +PyUnicode_FSDecoder(PyObject* arg, void* addr) +{ + PyObject *output = NULL; + Py_ssize_t size; + void *data; + if (arg == NULL) { + Py_DECREF(*(PyObject**)addr); + return 1; + } + if (PyUnicode_Check(arg)) { + output = arg; + Py_INCREF(output); } else { - size = PyByteArray_GET_SIZE(output); - data = PyByteArray_AS_STRING(output); + arg = PyBytes_FromObject(arg); + if (!arg) + return 0; + output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg), + PyBytes_GET_SIZE(arg)); + Py_DECREF(arg); + if (!output) + return 0; + if (!PyUnicode_Check(output)) { + Py_DECREF(output); + PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode"); + return 0; + } } - if (size != strlen(data)) { + size = PyUnicode_GET_SIZE(output); + data = PyUnicode_AS_UNICODE(output); + if (size != Py_UNICODE_strlen(data)) { PyErr_SetString(PyExc_TypeError, "embedded NUL character"); Py_DECREF(output); return 0; @@ -1632,18 +2008,34 @@ Py_ssize_t PyUnicode_GetSize(PyObject *unicode) const char *PyUnicode_GetDefaultEncoding(void) { - return unicode_default_encoding; + return "utf-8"; } -int PyUnicode_SetDefaultEncoding(const char *encoding) +/* create or adjust a UnicodeDecodeError */ +static void +make_decode_exception(PyObject **exceptionObject, + const char *encoding, + const char *input, Py_ssize_t length, + Py_ssize_t startpos, Py_ssize_t endpos, + const char *reason) { - if (strcmp(encoding, unicode_default_encoding) != 0) { - PyErr_Format(PyExc_ValueError, - "Can only set default encoding to %s", - unicode_default_encoding); - return -1; + if (*exceptionObject == NULL) { + *exceptionObject = PyUnicodeDecodeError_Create( + encoding, input, length, startpos, endpos, reason); } - return 0; + else { + if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos)) + goto onError; + if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos)) + goto onError; + if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) + goto onError; + } + return; + +onError: + Py_DECREF(*exceptionObject); + *exceptionObject = NULL; } /* error handling callback helper: @@ -1679,20 +2071,13 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler goto onError; } - if (*exceptionObject == NULL) { - *exceptionObject = PyUnicodeDecodeError_Create( - encoding, *input, *inend-*input, *startinpos, *endinpos, reason); - if (*exceptionObject == NULL) - goto onError; - } - else { - if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) - goto onError; - if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) - goto onError; - if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) - goto onError; - } + make_decode_exception(exceptionObject, + encoding, + *input, *inend - *input, + *startinpos, *endinpos, + reason); + if (*exceptionObject == NULL) + goto onError; restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); if (restuple == NULL) @@ -1910,21 +2295,17 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, *p++ = outCh; #endif surrogate = 0; + continue; } else { + *p++ = surrogate; surrogate = 0; - errmsg = "second surrogate missing"; - goto utf7Error; } } - else if (outCh >= 0xD800 && outCh <= 0xDBFF) { + if (outCh >= 0xD800 && outCh <= 0xDBFF) { /* first surrogate */ surrogate = outCh; } - else if (outCh >= 0xDC00 && outCh <= 0xDFFF) { - errmsg = "unexpected second surrogate"; - goto utf7Error; - } else { *p++ = outCh; } @@ -1934,8 +2315,8 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, inShift = 0; s++; if (surrogate) { - errmsg = "second surrogate missing at end of shift sequence"; - goto utf7Error; + *p++ = surrogate; + surrogate = 0; } if (base64bits > 0) { /* left-over bits */ if (base64bits >= 6) { @@ -2382,7 +2763,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, outpos = p-PyUnicode_AS_UNICODE(unicode); if (unicode_decode_call_errorhandler( errors, &errorHandler, - "utf8", errmsg, + "utf-8", errmsg, &starts, &e, &startinpos, &endinpos, &exc, &s, &unicode, &outpos, &p)) goto onError; @@ -2408,6 +2789,120 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, #undef ASCII_CHAR_MASK +#ifdef __APPLE__ + +/* Simplified UTF-8 decoder using surrogateescape error handler, + used to decode the command line arguments on Mac OS X. */ + +wchar_t* +_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) +{ + int n; + const char *e; + wchar_t *unicode, *p; + + /* Note: size will always be longer than the resulting Unicode + character count */ + if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) { + PyErr_NoMemory(); + return NULL; + } + unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t)); + if (!unicode) + return NULL; + + /* Unpack UTF-8 encoded data */ + p = unicode; + e = s + size; + while (s < e) { + Py_UCS4 ch = (unsigned char)*s; + + if (ch < 0x80) { + *p++ = (wchar_t)ch; + s++; + continue; + } + + n = utf8_code_length[ch]; + if (s + n > e) { + goto surrogateescape; + } + + switch (n) { + case 0: + case 1: + goto surrogateescape; + + case 2: + if ((s[1] & 0xc0) != 0x80) + goto surrogateescape; + ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); + assert ((ch > 0x007F) && (ch <= 0x07FF)); + *p++ = (wchar_t)ch; + break; + + case 3: + /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf + will result in surrogates in range d800-dfff. Surrogates are + not valid UTF-8 so they are rejected. + See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf + (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */ + if ((s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + ((unsigned char)s[0] == 0xE0 && + (unsigned char)s[1] < 0xA0) || + ((unsigned char)s[0] == 0xED && + (unsigned char)s[1] > 0x9F)) { + + goto surrogateescape; + } + ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); + assert ((ch > 0x07FF) && (ch <= 0xFFFF)); + *p++ = (Py_UNICODE)ch; + break; + + case 4: + if ((s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[3] & 0xc0) != 0x80 || + ((unsigned char)s[0] == 0xF0 && + (unsigned char)s[1] < 0x90) || + ((unsigned char)s[0] == 0xF4 && + (unsigned char)s[1] > 0x8F)) { + goto surrogateescape; + } + ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + + ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); + assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); + +#if SIZEOF_WCHAR_T == 4 + *p++ = (wchar_t)ch; +#else + /* compute and append the two surrogates: */ + + /* translate from 10000..10FFFF to 0..FFFF */ + ch -= 0x10000; + + /* high surrogate = top 10 bits added to D800 */ + *p++ = (wchar_t)(0xD800 + (ch >> 10)); + + /* low surrogate = bottom 10 bits added to DC00 */ + *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF)); +#endif + break; + } + s += n; + continue; + + surrogateescape: + *p++ = 0xDC00 + ch; + s++; + } + *p = L'\0'; + return unicode; +} + +#endif /* __APPLE__ */ /* Allocation strategy: if the string is short, convert into a stack buffer and allocate exactly as much space needed at the end. Else allocate the @@ -3366,7 +3861,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, } for (i = 0; i < digits; ++i) { c = (unsigned char) s[i]; - if (!ISXDIGIT(c)) { + if (!Py_ISXDIGIT(c)) { endinpos = (s+i+1)-starts; if (unicode_decode_call_errorhandler( errors, &errorHandler, @@ -3732,7 +4227,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, outpos = p-PyUnicode_AS_UNICODE(v); for (x = 0, i = 0; i < count; ++i, ++s) { c = (unsigned char)*s; - if (!ISXDIGIT(c)) { + if (!Py_ISXDIGIT(c)) { endinpos = s-starts; if (unicode_decode_call_errorhandler( errors, &errorHandler, @@ -4431,32 +4926,46 @@ static int is_dbcs_lead_byte(const char *s, int offset) static int decode_mbcs(PyUnicodeObject **v, const char *s, /* MBCS string */ int size, /* sizeof MBCS string */ - int final) + int final, + const char *errors) { Py_UNICODE *p; - Py_ssize_t n = 0; - int usize = 0; + Py_ssize_t n; + DWORD usize; + DWORD flags; assert(size >= 0); + /* check and handle 'errors' arg */ + if (errors==NULL || strcmp(errors, "strict")==0) + flags = MB_ERR_INVALID_CHARS; + else if (strcmp(errors, "ignore")==0) + flags = 0; + else { + PyErr_Format(PyExc_ValueError, + "mbcs encoding does not support errors='%s'", + errors); + return -1; + } + /* Skip trailing lead-byte unless 'final' is set */ if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) --size; /* First get the size of the result */ if (size > 0) { - usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); - if (usize == 0) { - PyErr_SetFromWindowsErrWithFilename(0, NULL); - return -1; - } - } + usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0); + if (usize==0) + goto mbcs_decode_error; + } else + usize = 0; if (*v == NULL) { /* Create unicode object */ *v = _PyUnicode_New(usize); if (*v == NULL) return -1; + n = 0; } else { /* Extend unicode object */ @@ -4466,15 +4975,35 @@ static int decode_mbcs(PyUnicodeObject **v, } /* Do the conversion */ - if (size > 0) { + if (usize > 0) { p = PyUnicode_AS_UNICODE(*v) + n; - if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { - PyErr_SetFromWindowsErrWithFilename(0, NULL); - return -1; + if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) { + goto mbcs_decode_error; } } - return size; + +mbcs_decode_error: + /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then + we raise a UnicodeDecodeError - else it is a 'generic' + windows error + */ + if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) { + /* Ideally, we should get reason from FormatMessage - this + is the Windows 2000 English version of the message + */ + PyObject *exc = NULL; + const char *reason = "No mapping for the Unicode character exists " + "in the target multi-byte code page."; + make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason); + if (exc != NULL) { + PyCodec_StrictErrors(exc); + Py_DECREF(exc); + } + } else { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + } + return -1; } PyObject *PyUnicode_DecodeMBCSStateful(const char *s, @@ -4491,10 +5020,10 @@ PyObject *PyUnicode_DecodeMBCSStateful(const char *s, #ifdef NEED_RETRY retry: if (size > INT_MAX) - done = decode_mbcs(&v, s, INT_MAX, 0); + done = decode_mbcs(&v, s, INT_MAX, 0, errors); else #endif - done = decode_mbcs(&v, s, (int)size, !consumed); + done = decode_mbcs(&v, s, (int)size, !consumed, errors); if (done < 0) { Py_XDECREF(v); @@ -4528,20 +5057,45 @@ PyObject *PyUnicode_DecodeMBCS(const char *s, */ static int encode_mbcs(PyObject **repr, const Py_UNICODE *p, /* unicode */ - int size) /* size of unicode */ + int size, /* size of unicode */ + const char* errors) { - int mbcssize = 0; - Py_ssize_t n = 0; + BOOL usedDefaultChar = FALSE; + BOOL *pusedDefaultChar; + int mbcssize; + Py_ssize_t n; + PyObject *exc = NULL; + DWORD flags; assert(size >= 0); + /* check and handle 'errors' arg */ + if (errors==NULL || strcmp(errors, "strict")==0) { + flags = WC_NO_BEST_FIT_CHARS; + pusedDefaultChar = &usedDefaultChar; + } else if (strcmp(errors, "replace")==0) { + flags = 0; + pusedDefaultChar = NULL; + } else { + PyErr_Format(PyExc_ValueError, + "mbcs encoding does not support errors='%s'", + errors); + return -1; + } + /* First get the size of the result */ if (size > 0) { - mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); + mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0, + NULL, pusedDefaultChar); if (mbcssize == 0) { PyErr_SetFromWindowsErrWithFilename(0, NULL); return -1; } + /* If we used a default char, then we failed! */ + if (pusedDefaultChar && *pusedDefaultChar) + goto mbcs_encode_error; + } else { + mbcssize = 0; } if (*repr == NULL) { @@ -4549,6 +5103,7 @@ static int encode_mbcs(PyObject **repr, *repr = PyBytes_FromStringAndSize(NULL, mbcssize); if (*repr == NULL) return -1; + n = 0; } else { /* Extend string object */ @@ -4560,13 +5115,20 @@ static int encode_mbcs(PyObject **repr, /* Do the conversion */ if (size > 0) { char *s = PyBytes_AS_STRING(*repr) + n; - if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { + if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize, + NULL, pusedDefaultChar)) { PyErr_SetFromWindowsErrWithFilename(0, NULL); return -1; } + if (pusedDefaultChar && *pusedDefaultChar) + goto mbcs_encode_error; } - return 0; + +mbcs_encode_error: + raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character"); + Py_XDECREF(exc); + return -1; } PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, @@ -4579,10 +5141,10 @@ PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, #ifdef NEED_RETRY retry: if (size > INT_MAX) - ret = encode_mbcs(&repr, p, INT_MAX); + ret = encode_mbcs(&repr, p, INT_MAX, errors); else #endif - ret = encode_mbcs(&repr, p, (int)size); + ret = encode_mbcs(&repr, p, (int)size, errors); if (ret < 0) { Py_XDECREF(repr); @@ -5699,6 +6261,30 @@ PyObject *PyUnicode_Translate(PyObject *str, return NULL; } +PyObject * +PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, + Py_ssize_t length) +{ + PyObject *result; + Py_UNICODE *p; /* write pointer into result */ + Py_ssize_t i; + /* Copy to a new string */ + result = (PyObject *)_PyUnicode_New(length); + Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); + if (result == NULL) + return result; + p = PyUnicode_AS_UNICODE(result); + /* Iterate over code points */ + for (i = 0; i < length; i++) { + Py_UNICODE ch =s[i]; + if (ch > 127) { + int decimal = Py_UNICODE_TODECIMAL(ch); + if (decimal >= 0) + p[i] = '0' + decimal; + } + } + return result; +} /* --- Decimal Encoder ---------------------------------------------------- */ int PyUnicode_EncodeDecimal(Py_UNICODE *s, @@ -5750,11 +6336,10 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, } /* All other characters are considered unencodable */ collstart = p; - collend = p+1; - while (collend < end) { + for (collend = p+1; collend < end; collend++) { if ((0 < *collend && *collend < 256) || - !Py_UNICODE_ISSPACE(*collend) || - Py_UNICODE_TODECIMAL(*collend)) + Py_UNICODE_ISSPACE(*collend) || + 0 <= Py_UNICODE_TODECIMAL(*collend)) break; } /* cache callback name lookup @@ -5840,28 +6425,61 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, #include "stringlib/unicodedefs.h" #include "stringlib/fastsearch.h" + #include "stringlib/count.h" -/* Include _ParseTupleFinds from find.h */ -#define FROM_UNICODE #include "stringlib/find.h" #include "stringlib/partition.h" +#include "stringlib/split.h" #define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping #define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale #include "stringlib/localeutil.h" /* helper macro to fixup start/end slice values */ -#define FIX_START_END(obj) \ - if (start < 0) \ - start += (obj)->length; \ - if (start < 0) \ - start = 0; \ - if (end > (obj)->length) \ - end = (obj)->length; \ - if (end < 0) \ - end += (obj)->length; \ - if (end < 0) \ - end = 0; +#define ADJUST_INDICES(start, end, len) \ + if (end > len) \ + end = len; \ + else if (end < 0) { \ + end += len; \ + if (end < 0) \ + end = 0; \ + } \ + if (start < 0) { \ + start += len; \ + if (start < 0) \ + start = 0; \ + } + +/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed + * by 'ptr', possibly combining surrogate pairs on narrow builds. + * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character + * that should be returned and 'end' pointing to the end of the buffer. + * ('end' is used on narrow builds to detect a lone surrogate at the + * end of the buffer that should be returned unchanged.) + * The ptr and end arguments should be side-effect free and ptr must an lvalue. + * The type of the returned char is always Py_UCS4. + * + * Note: the macro advances ptr to next char, so it might have side-effects + * (especially if used with other macros). + */ + +/* helper macros used by _Py_UNICODE_NEXT */ +#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) +#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) +/* Join two surrogate characters and return a single Py_UCS4 value. */ +#define _Py_UNICODE_JOIN_SURROGATES(high, low) \ + (((((Py_UCS4)(high) & 0x03FF) << 10) | \ + ((Py_UCS4)(low) & 0x03FF)) + 0x10000) + +#ifdef Py_UNICODE_WIDE +#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++ +#else +#define _Py_UNICODE_NEXT(ptr, end) \ + (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) && \ + _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ? \ + ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \ + (Py_UCS4)*(ptr)++) +#endif Py_ssize_t PyUnicode_Count(PyObject *str, PyObject *substr, @@ -5881,10 +6499,10 @@ Py_ssize_t PyUnicode_Count(PyObject *str, return -1; } - FIX_START_END(str_obj); - + ADJUST_INDICES(start, end, str_obj->length); result = stringlib_count( - str_obj->str + start, end - start, sub_obj->str, sub_obj->length + str_obj->str + start, end - start, sub_obj->str, sub_obj->length, + PY_SSIZE_T_MAX ); Py_DECREF(sub_obj); @@ -5939,8 +6557,7 @@ int tailmatch(PyUnicodeObject *self, if (substring->length == 0) return 1; - FIX_START_END(self); - + ADJUST_INDICES(start, end, self->length); end -= substring->length; if (end < start) return 0; @@ -6080,13 +6697,13 @@ int fixcapitalize(PyUnicodeObject *self) if (len == 0) return 0; - if (Py_UNICODE_ISLOWER(*s)) { + if (!Py_UNICODE_ISUPPER(*s)) { *s = Py_UNICODE_TOUPPER(*s); status = 1; } s++; while (--len > 0) { - if (Py_UNICODE_ISUPPER(*s)) { + if (!Py_UNICODE_ISLOWER(*s)) { *s = Py_UNICODE_TOLOWER(*s); status = 1; } @@ -6281,305 +6898,40 @@ PyUnicodeObject *pad(PyUnicodeObject *self, return u; } -#define SPLIT_APPEND(data, left, right) \ - str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \ - if (!str) \ - goto onError; \ - if (PyList_Append(list, str)) { \ - Py_DECREF(str); \ - goto onError; \ - } \ - else \ - Py_DECREF(str); - -static -PyObject *split_whitespace(PyUnicodeObject *self, - PyObject *list, - Py_ssize_t maxcount) +PyObject *PyUnicode_Splitlines(PyObject *string, int keepends) { - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - PyObject *str; - register const Py_UNICODE *buf = self->str; - - for (i = j = 0; i < len; ) { - /* find a token */ - while (i < len && Py_UNICODE_ISSPACE(buf[i])) - i++; - j = i; - while (i < len && !Py_UNICODE_ISSPACE(buf[i])) - i++; - if (j < i) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(buf, j, i); - while (i < len && Py_UNICODE_ISSPACE(buf[i])) - i++; - j = i; - } - } - if (j < len) { - SPLIT_APPEND(buf, j, len); - } - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -PyObject *PyUnicode_Splitlines(PyObject *string, - int keepends) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len; PyObject *list; - PyObject *str; - Py_UNICODE *data; string = PyUnicode_FromObject(string); if (string == NULL) return NULL; - data = PyUnicode_AS_UNICODE(string); - len = PyUnicode_GET_SIZE(string); - - list = PyList_New(0); - if (!list) - goto onError; - - for (i = j = 0; i < len; ) { - Py_ssize_t eol; - - /* Find a line and append it */ - while (i < len && !BLOOM_LINEBREAK(data[i])) - i++; - /* Skip the line break reading CRLF as one line break */ - eol = i; - if (i < len) { - if (data[i] == '\r' && i + 1 < len && - data[i+1] == '\n') - i += 2; - else - i++; - if (keepends) - eol = i; - } - SPLIT_APPEND(data, j, eol); - j = i; - } - if (j < len) { - SPLIT_APPEND(data, j, len); - } + list = stringlib_splitlines( + (PyObject*) string, PyUnicode_AS_UNICODE(string), + PyUnicode_GET_SIZE(string), keepends); Py_DECREF(string); return list; - - onError: - Py_XDECREF(list); - Py_DECREF(string); - return NULL; } static -PyObject *split_char(PyUnicodeObject *self, - PyObject *list, - Py_UNICODE ch, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - PyObject *str; - register const Py_UNICODE *buf = self->str; - - for (i = j = 0; i < len; ) { - if (buf[i] == ch) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(buf, j, i); - i = j = i + 1; - } else - i++; - } - if (j <= len) { - SPLIT_APPEND(buf, j, len); - } - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -static -PyObject *split_substring(PyUnicodeObject *self, - PyObject *list, - PyUnicodeObject *substring, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - Py_ssize_t sublen = substring->length; - PyObject *str; - - for (i = j = 0; i <= len - sublen; ) { - if (Py_UNICODE_MATCH(self, i, substring)) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(self->str, j, i); - i = j = i + sublen; - } else - i++; - } - if (j <= len) { - SPLIT_APPEND(self->str, j, len); - } - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -static -PyObject *rsplit_whitespace(PyUnicodeObject *self, - PyObject *list, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - PyObject *str; - register const Py_UNICODE *buf = self->str; - - for (i = j = len - 1; i >= 0; ) { - /* find a token */ - while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) - i--; - j = i; - while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i])) - i--; - if (j > i) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(buf, i + 1, j + 1); - while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) - i--; - j = i; - } - } - if (j >= 0) { - SPLIT_APPEND(buf, 0, j + 1); - } - if (PyList_Reverse(list) < 0) - goto onError; - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -static -PyObject *rsplit_char(PyUnicodeObject *self, - PyObject *list, - Py_UNICODE ch, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - PyObject *str; - register const Py_UNICODE *buf = self->str; - - for (i = j = len - 1; i >= 0; ) { - if (buf[i] == ch) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(buf, i + 1, j + 1); - j = i = i - 1; - } else - i--; - } - if (j >= -1) { - SPLIT_APPEND(buf, 0, j + 1); - } - if (PyList_Reverse(list) < 0) - goto onError; - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -static -PyObject *rsplit_substring(PyUnicodeObject *self, - PyObject *list, - PyUnicodeObject *substring, - Py_ssize_t maxcount) -{ - register Py_ssize_t i; - register Py_ssize_t j; - Py_ssize_t len = self->length; - Py_ssize_t sublen = substring->length; - PyObject *str; - - for (i = len - sublen, j = len; i >= 0; ) { - if (Py_UNICODE_MATCH(self, i, substring)) { - if (maxcount-- <= 0) - break; - SPLIT_APPEND(self->str, i + sublen, j); - j = i; - i -= sublen; - } else - i--; - } - if (j >= 0) { - SPLIT_APPEND(self->str, 0, j); - } - if (PyList_Reverse(list) < 0) - goto onError; - return list; - - onError: - Py_DECREF(list); - return NULL; -} - -#undef SPLIT_APPEND - -static PyObject *split(PyUnicodeObject *self, PyUnicodeObject *substring, Py_ssize_t maxcount) { - PyObject *list; - if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; - list = PyList_New(0); - if (!list) - return NULL; - if (substring == NULL) - return split_whitespace(self,list,maxcount); - - else if (substring->length == 1) - return split_char(self,list,substring->str[0],maxcount); + return stringlib_split_whitespace( + (PyObject*) self, self->str, self->length, maxcount + ); - else if (substring->length == 0) { - Py_DECREF(list); - PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; - } - else - return split_substring(self,list,substring,maxcount); + return stringlib_split( + (PyObject*) self, self->str, self->length, + substring->str, substring->length, + maxcount + ); } static @@ -6587,28 +6939,19 @@ PyObject *rsplit(PyUnicodeObject *self, PyUnicodeObject *substring, Py_ssize_t maxcount) { - PyObject *list; - if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; - list = PyList_New(0); - if (!list) - return NULL; - if (substring == NULL) - return rsplit_whitespace(self,list,maxcount); - - else if (substring->length == 1) - return rsplit_char(self,list,substring->str[0],maxcount); + return stringlib_rsplit_whitespace( + (PyObject*) self, self->str, self->length, maxcount + ); - else if (substring->length == 0) { - Py_DECREF(list); - PyErr_SetString(PyExc_ValueError, "empty separator"); - return NULL; - } - else - return rsplit_substring(self,list,substring,maxcount); + return stringlib_rsplit( + (PyObject*) self, self->str, self->length, + substring->str, substring->length, + maxcount + ); } static @@ -6621,10 +6964,14 @@ PyObject *replace(PyUnicodeObject *self, if (maxcount < 0) maxcount = PY_SSIZE_T_MAX; + else if (maxcount == 0 || self->length == 0) + goto nothing; if (str1->length == str2->length) { - /* same length */ Py_ssize_t i; + /* same length */ + if (str1->length == 0) + goto nothing; if (str1->length == 1) { /* replace characters */ Py_UNICODE u1, u2; @@ -6643,8 +6990,8 @@ PyObject *replace(PyUnicodeObject *self, u->str[i] = u2; } } else { - i = fastsearch( - self->str, self->length, str1->str, str1->length, FAST_SEARCH + i = stringlib_find( + self->str, self->length, str1->str, str1->length, 0 ); if (i < 0) goto nothing; @@ -6652,25 +6999,30 @@ PyObject *replace(PyUnicodeObject *self, if (!u) return NULL; Py_UNICODE_COPY(u->str, self->str, self->length); - while (i <= self->length - str1->length) - if (Py_UNICODE_MATCH(self, i, str1)) { - if (--maxcount < 0) - break; - Py_UNICODE_COPY(u->str+i, str2->str, str2->length); - i += str1->length; - } else - i++; + + /* change everything in-place, starting with this one */ + Py_UNICODE_COPY(u->str+i, str2->str, str2->length); + i += str1->length; + + while ( --maxcount > 0) { + i = stringlib_find(self->str+i, self->length-i, + str1->str, str1->length, + i); + if (i == -1) + break; + Py_UNICODE_COPY(u->str+i, str2->str, str2->length); + i += str1->length; + } } } else { - Py_ssize_t n, i, j, e; + Py_ssize_t n, i, j; Py_ssize_t product, new_size, delta; Py_UNICODE *p; /* replace strings */ - n = stringlib_count(self->str, self->length, str1->str, str1->length); - if (n > maxcount) - n = maxcount; + n = stringlib_count(self->str, self->length, str1->str, str1->length, + maxcount); if (n == 0) goto nothing; /* new_size = self->length + n * (str2->length - str1->length)); */ @@ -6696,19 +7048,15 @@ PyObject *replace(PyUnicodeObject *self, return NULL; i = 0; p = u->str; - e = self->length - str1->length; if (str1->length > 0) { while (n-- > 0) { /* look for next match */ - j = i; - while (j <= e) { - if (Py_UNICODE_MATCH(self, j, str1)) - break; - j++; - } - if (j > i) { - if (j > e) - break; + j = stringlib_find(self->str+i, self->length-i, + str1->str, str1->length, + i); + if (j == -1) + break; + else if (j > i) { /* copy unchanged part [i:j] */ Py_UNICODE_COPY(p, self->str+i, j-i); p += j - i; @@ -6970,10 +7318,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) return ((int)id[i] < (int)str[i]) ? -1 : 1; /* This check keeps Python strings that end in '\0' from comparing equal to C strings identical up to that point. */ - if (PyUnicode_GET_SIZE(uni) != i) - /* We'll say the Python string is longer. */ - return 1; - if (id[i]) + if (PyUnicode_GET_SIZE(uni) != i || id[i]) return 1; /* uni is longer */ if (str[i]) return -1; /* str is longer */ @@ -7154,11 +7499,11 @@ unicode_count(PyUnicodeObject *self, PyObject *args) &start, &end)) return NULL; - FIX_START_END(self); - + ADJUST_INDICES(start, end, self->length); result = PyLong_FromSsize_t( stringlib_count(self->str + start, end - start, - substring->str, substring->length) + substring->str, substring->length, + PY_SSIZE_T_MAX) ); Py_DECREF(substring); @@ -7167,39 +7512,26 @@ unicode_count(PyUnicodeObject *self, PyObject *args) } PyDoc_STRVAR(encode__doc__, - "S.encode([encoding[, errors]]) -> bytes\n\ + "S.encode(encoding='utf-8', errors='strict') -> bytes\n\ \n\ -Encode S using the codec registered for encoding. encoding defaults\n\ -to the default encoding. errors may be given to set a different error\n\ +Encode S using the codec registered for encoding. Default encoding\n\ +is 'utf-8'. errors may be given to set a different error\n\ handling scheme. Default is 'strict' meaning that encoding errors raise\n\ a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ 'xmlcharrefreplace' as well as any other name registered with\n\ codecs.register_error that can handle UnicodeEncodeErrors."); static PyObject * -unicode_encode(PyUnicodeObject *self, PyObject *args) +unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs) { + static char *kwlist[] = {"encoding", "errors", 0}; char *encoding = NULL; char *errors = NULL; - PyObject *v; - if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode", + kwlist, &encoding, &errors)) return NULL; - v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); - if (v == NULL) - goto onError; - if (!PyBytes_Check(v)) { - PyErr_Format(PyExc_TypeError, - "encoder did not return a bytes object " - "(type=%.400s)", - Py_TYPE(v)->tp_name); - Py_DECREF(v); - return NULL; - } - return v; - - onError: - return NULL; + return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); } PyDoc_STRVAR(expandtabs__doc__, @@ -7293,7 +7625,7 @@ PyDoc_STRVAR(find__doc__, "S.find(sub[, start[, end]]) -> int\n\ \n\ Return the lowest index in S where substring sub is found,\n\ -such that sub is contained within s[start:end]. Optional\n\ +such that sub is contained within S[start:end]. Optional\n\ arguments start and end are interpreted as in slice notation.\n\ \n\ Return -1 on failure."); @@ -7334,12 +7666,12 @@ unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) /* Believe it or not, this produces the same value for ASCII strings as string_hash(). */ -static long +static Py_hash_t unicode_hash(PyUnicodeObject *self) { Py_ssize_t len; Py_UNICODE *p; - long x; + Py_hash_t x; assert(_Py_HashSecret_Initialized); if (self->hash != -1) @@ -7357,7 +7689,7 @@ unicode_hash(PyUnicodeObject *self) x = _Py_HashSecret.prefix; x ^= *p << 7; while (--len >= 0) - x = (1000003*x) ^ *p++; + x = (_PyHASH_MULTIPLIER*x) ^ *p++; x ^= Py_SIZE(self); x ^= _Py_HashSecret.suffix; if (x == -1) @@ -7422,8 +7754,8 @@ unicode_islower(PyUnicodeObject *self) e = p + PyUnicode_GET_SIZE(self); cased = 0; - for (; p < e; p++) { - register const Py_UNICODE ch = *p; + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) return PyBool_FromLong(0); @@ -7456,8 +7788,8 @@ unicode_isupper(PyUnicodeObject *self) e = p + PyUnicode_GET_SIZE(self); cased = 0; - for (; p < e; p++) { - register const Py_UNICODE ch = *p; + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) return PyBool_FromLong(0); @@ -7494,8 +7826,8 @@ unicode_istitle(PyUnicodeObject *self) e = p + PyUnicode_GET_SIZE(self); cased = 0; previous_is_cased = 0; - for (; p < e; p++) { - register const Py_UNICODE ch = *p; + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { if (previous_is_cased) @@ -7537,8 +7869,9 @@ unicode_isspace(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISSPACE(*p)) + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); + if (!Py_UNICODE_ISSPACE(ch)) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7566,8 +7899,8 @@ unicode_isalpha(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISALPHA(*p)) + while (p < e) { + if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e))) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7595,8 +7928,9 @@ unicode_isalnum(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISALNUM(*p)) + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); + if (!Py_UNICODE_ISALNUM(ch)) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7624,8 +7958,8 @@ unicode_isdecimal(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISDECIMAL(*p)) + while (p < e) { + if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e))) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7653,8 +7987,8 @@ unicode_isdigit(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISDIGIT(*p)) + while (p < e) { + if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e))) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7682,8 +8016,8 @@ unicode_isnumeric(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISNUMERIC(*p)) + while (p < e) { + if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e))) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7692,8 +8026,9 @@ unicode_isnumeric(PyUnicodeObject *self) int PyUnicode_IsIdentifier(PyObject *self) { - register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); - register const Py_UNICODE *e; + const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); + const Py_UNICODE *e; + Py_UCS4 first; /* Special case for empty strings */ if (PyUnicode_GET_SIZE(self) == 0) @@ -7707,14 +8042,14 @@ PyUnicode_IsIdentifier(PyObject *self) definition of XID_Start and XID_Continue, it is sufficient to check just for these, except that _ must be allowed as starting an identifier. */ - if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) + e = p + PyUnicode_GET_SIZE(self); + first = _Py_UNICODE_NEXT(p, e); + if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) return 0; - e = p + PyUnicode_GET_SIZE(self); - for (p++; p < e; p++) { - if (!_PyUnicode_IsXidContinue(*p)) + while (p < e) + if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e))) return 0; - } return 1; } @@ -7748,8 +8083,8 @@ unicode_isprintable(PyObject *self) } e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISPRINTABLE(*p)) { + while (p < e) { + if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) { Py_RETURN_FALSE; } } @@ -8230,7 +8565,7 @@ PyDoc_STRVAR(rfind__doc__, "S.rfind(sub[, start[, end]]) -> int\n\ \n\ Return the highest index in S where substring sub is found,\n\ -such that sub is contained within s[start:end]. Optional\n\ +such that sub is contained within S[start:end]. Optional\n\ arguments start and end are interpreted as in slice notation.\n\ \n\ Return -1 on failure."); @@ -8577,9 +8912,13 @@ unicode_maketrans(PyUnicodeObject *null, PyObject *args) /* create entries for translating chars in x to those in y */ for (i = 0; i < PyUnicode_GET_SIZE(x); i++) { key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]); + if (!key) + goto err; value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]); - if (!key || !value) + if (!value) { + Py_DECREF(key); goto err; + } res = PyDict_SetItem(new, key, value); Py_DECREF(key); Py_DECREF(value); @@ -8715,6 +9054,13 @@ unicode_freelistsize(PyUnicodeObject *self) { return PyLong_FromLong(numfree); } + +static PyObject * +unicode__decimal2ascii(PyObject *self) +{ + return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self), + PyUnicode_GET_SIZE(self)); +} #endif PyDoc_STRVAR(startswith__doc__, @@ -8821,6 +9167,12 @@ PyDoc_STRVAR(format__doc__, Return a formatted version of S, using substitutions from args and kwargs.\n\ The substitutions are identified by braces ('{' and '}')."); +PyDoc_STRVAR(format_map__doc__, + "S.format_map(mapping) -> str\n\ +\n\ +Return a formatted version of S, using substitutions from mapping.\n\ +The substitutions are identified by braces ('{' and '}')."); + static PyObject * unicode__format__(PyObject* self, PyObject* args) { @@ -8855,13 +9207,12 @@ unicode_getnewargs(PyUnicodeObject *v) return Py_BuildValue("(u#)", v->str, v->length); } - static PyMethodDef unicode_methods[] = { /* Order is according to common usage: often used methods should appear first, since lookup is done sequentially. */ - {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, + {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__}, {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, @@ -8902,9 +9253,8 @@ static PyMethodDef unicode_methods[] = { {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__}, {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, + {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__}, {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, - {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS}, - {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS}, {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS | METH_STATIC, maketrans__doc__}, {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, @@ -8913,8 +9263,9 @@ static PyMethodDef unicode_methods[] = { #endif #if 0 - /* This one is just used for debugging the implementation. */ + /* These methods are just used for debugging the implementation. */ {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, + {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, #endif {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, @@ -8965,7 +9316,7 @@ unicode_subscript(PyUnicodeObject* self, PyObject* item) Py_UNICODE* result_buf; PyObject* result; - if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), + if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self), &start, &stop, &step, &slicelength) < 0) { return NULL; } @@ -9394,8 +9745,6 @@ PyObject *PyUnicode_Format(PyObject *format, case 'o': case 'x': case 'X': - if (c == 'i') - c = 'd'; isnumok = 0; if (PyNumber_Check(v)) { PyObject *iobj=NULL; @@ -9410,7 +9759,7 @@ PyObject *PyUnicode_Format(PyObject *format, if (iobj!=NULL) { if (PyLong_Check(iobj)) { isnumok = 1; - temp = formatlong(iobj, flags, prec, c); + temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c)); Py_DECREF(iobj); if (!temp) goto onError; @@ -10019,6 +10368,15 @@ Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) return s1; } +Py_UNICODE* +Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2) +{ + Py_UNICODE *u1 = s1; + u1 += Py_UNICODE_strlen(u1); + Py_UNICODE_strcpy(u1, s2); + return s1; +} + int Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) { @@ -10033,6 +10391,23 @@ Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2) return 0; } +int +Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n) +{ + register Py_UNICODE u1, u2; + for (; n != 0; n--) { + u1 = *s1; + u2 = *s2; + if (u1 != u2) + return (u1 < u2) ? -1 : +1; + if (u1 == '\0') + return 0; + s1++; + s2++; + } + return 0; +} + Py_UNICODE* Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) { @@ -10043,15 +10418,72 @@ Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c) return NULL; } +Py_UNICODE* +Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c) +{ + const Py_UNICODE *p; + p = s + Py_UNICODE_strlen(s); + while (p != s) { + p--; + if (*p == c) + return (Py_UNICODE*)p; + } + return NULL; +} + +Py_UNICODE* +PyUnicode_AsUnicodeCopy(PyObject *object) +{ + PyUnicodeObject *unicode = (PyUnicodeObject *)object; + Py_UNICODE *copy; + Py_ssize_t size; -#ifdef __cplusplus + /* Ensure we won't overflow the size. */ + if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { + PyErr_NoMemory(); + return NULL; + } + size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */ + size *= sizeof(Py_UNICODE); + copy = PyMem_Malloc(size); + if (copy == NULL) { + PyErr_NoMemory(); + return NULL; + } + memcpy(copy, PyUnicode_AS_UNICODE(unicode), size); + return copy; } -#endif +/* A _string module, to export formatter_parser and formatter_field_name_split + to the string.Formatter class implemented in Python. */ -/* - Local variables: - c-basic-offset: 4 - indent-tabs-mode: nil - End: -*/ +static PyMethodDef _string_methods[] = { + {"formatter_field_name_split", (PyCFunction) formatter_field_name_split, + METH_O, PyDoc_STR("split the argument as a field name")}, + {"formatter_parser", (PyCFunction) formatter_parser, + METH_O, PyDoc_STR("parse the argument as a format string")}, + {NULL, NULL} +}; + +static struct PyModuleDef _string_module = { + PyModuleDef_HEAD_INIT, + "_string", + PyDoc_STR("string helper module"), + 0, + _string_methods, + NULL, + NULL, + NULL, + NULL +}; + +PyMODINIT_FUNC +PyInit__string(void) +{ + return PyModule_Create(&_string_module); +} + + +#ifdef __cplusplus +} +#endif |