diff options
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 113 |
1 files changed, 81 insertions, 32 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 11a3a6d0a4..c72871074b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6408,9 +6408,10 @@ PyUnicode_AsUTF16String(PyObject *unicode) static _PyUnicode_Name_CAPI *ucnhash_capi = NULL; PyObject * -_PyUnicode_DecodeUnicodeEscape(const char *s, +_PyUnicode_DecodeUnicodeEscapeInternal(const char *s, Py_ssize_t size, const char *errors, + Py_ssize_t *consumed, const char **first_invalid_escape) { const char *starts = s; @@ -6423,6 +6424,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, *first_invalid_escape = NULL; if (size == 0) { + if (consumed) { + *consumed = 0; + } _Py_RETURN_UNICODE_EMPTY(); } /* Escaped strings will always be longer than the resulting @@ -6441,8 +6445,6 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, unsigned char c = (unsigned char) *s++; Py_UCS4 ch; int count; - Py_ssize_t startinpos; - Py_ssize_t endinpos; const char *message; #define WRITE_ASCII_CHAR(ch) \ @@ -6469,11 +6471,11 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, continue; } - startinpos = s - starts - 1; + Py_ssize_t startinpos = s - starts - 1; /* \ - Escapes */ if (s >= end) { message = "\\ at end of string"; - goto error; + goto incomplete; } c = (unsigned char) *s++; @@ -6527,7 +6529,10 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, count = 8; message = "truncated \\UXXXXXXXX escape"; hexescape: - for (ch = 0; count && s < end; ++s, --count) { + for (ch = 0; count; ++s, --count) { + if (s >= end) { + goto incomplete; + } c = (unsigned char)*s; ch <<= 4; if (c >= '0' && c <= '9') { @@ -6540,12 +6545,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, ch += c - ('A' - 10); } else { - break; + goto error; } } - if (count) { - goto error; - } /* when we get here, ch is a 32-bit unicode character */ if (ch > MAX_UNICODE) { @@ -6572,14 +6574,20 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, } message = "malformed \\N character escape"; - if (s < end && *s == '{') { + if (s >= end) { + goto incomplete; + } + if (*s == '{') { const char *start = ++s; size_t namelen; /* look for the closing brace */ while (s < end && *s != '}') s++; + if (s >= end) { + goto incomplete; + } namelen = s - start; - if (namelen && s < end) { + if (namelen) { /* found a name. look it up in the unicode database */ s++; ch = 0xffffffff; /* in case 'getcode' messes up */ @@ -6605,8 +6613,13 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, continue; } - error: - endinpos = s-starts; + incomplete: + if (consumed) { + *consumed = startinpos; + break; + } + error:; + Py_ssize_t endinpos = s-starts; writer.min_length = end - s + writer.pos; if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, @@ -6633,12 +6646,14 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, } PyObject * -PyUnicode_DecodeUnicodeEscape(const char *s, +_PyUnicode_DecodeUnicodeEscapeStateful(const char *s, Py_ssize_t size, - const char *errors) + const char *errors, + Py_ssize_t *consumed) { const char *first_invalid_escape; - PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors, + PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors, + consumed, &first_invalid_escape); if (result == NULL) return NULL; @@ -6653,6 +6668,14 @@ PyUnicode_DecodeUnicodeEscape(const char *s, return result; } +PyObject * +PyUnicode_DecodeUnicodeEscape(const char *s, + Py_ssize_t size, + const char *errors) +{ + return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL); +} + /* Return a Unicode-Escape string version of the Unicode object. */ PyObject * @@ -6791,9 +6814,10 @@ PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, /* --- Raw Unicode Escape Codec ------------------------------------------- */ PyObject * -PyUnicode_DecodeRawUnicodeEscape(const char *s, - Py_ssize_t size, - const char *errors) +_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) { const char *starts = s; _PyUnicodeWriter writer; @@ -6802,6 +6826,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, PyObject *exc = NULL; if (size == 0) { + if (consumed) { + *consumed = 0; + } _Py_RETURN_UNICODE_EMPTY(); } @@ -6820,8 +6847,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, unsigned char c = (unsigned char) *s++; Py_UCS4 ch; int count; - Py_ssize_t startinpos; - Py_ssize_t endinpos; const char *message; #define WRITE_CHAR(ch) \ @@ -6836,11 +6861,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, } while(0) /* Non-escape characters are interpreted as Unicode ordinals */ - if (c != '\\' || s >= end) { + if (c != '\\' || (s >= end && !consumed)) { WRITE_CHAR(c); continue; } + Py_ssize_t startinpos = s - starts - 1; + /* \ - Escapes */ + if (s >= end) { + assert(consumed); + // Set message to silent compiler warning. + // Actually it is never used. + message = "\\ at end of string"; + goto incomplete; + } + c = (unsigned char) *s++; if (c == 'u') { count = 4; @@ -6856,10 +6891,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, WRITE_CHAR(c); continue; } - startinpos = s - starts - 2; /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */ - for (ch = 0; count && s < end; ++s, --count) { + for (ch = 0; count; ++s, --count) { + if (s >= end) { + goto incomplete; + } c = (unsigned char)*s; ch <<= 4; if (c >= '0' && c <= '9') { @@ -6872,18 +6909,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, ch += c - ('A' - 10); } else { - break; + goto error; } } - if (!count) { - if (ch <= MAX_UNICODE) { - WRITE_CHAR(ch); - continue; - } + if (ch > MAX_UNICODE) { message = "\\Uxxxxxxxx out of range"; + goto error; } + WRITE_CHAR(ch); + continue; - endinpos = s-starts; + incomplete: + if (consumed) { + *consumed = startinpos; + break; + } + error:; + Py_ssize_t endinpos = s-starts; writer.min_length = end - s + writer.pos; if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, @@ -6905,7 +6947,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, Py_XDECREF(errorHandler); Py_XDECREF(exc); return NULL; +} +PyObject * +PyUnicode_DecodeRawUnicodeEscape(const char *s, + Py_ssize_t size, + const char *errors) +{ + return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL); } |