14 files changed, 2145 insertions, 1348 deletions
diff --git a/Objects/abstract.c b/Objects/abstract.c
index a0362e74e4..3e1ff97547 100644
--- a/Objects/abstract.c
+++ b/Objects/abstract.c
@@ -141,8 +141,11 @@ PyObject_GetItem(PyObject *o, PyObject *key)
         return null_error();
 
     m = o->ob_type->tp_as_mapping;
-    if (m && m->mp_subscript)
-        return m->mp_subscript(o, key);
+    if (m && m->mp_subscript) {
+        PyObject *item = m->mp_subscript(o, key);
+        assert((item != NULL) ^ (PyErr_Occurred() != NULL));
+        return item;
+    }
 
     if (o->ob_type->tp_as_sequence) {
         if (PyIndex_Check(key)) {
@@ -1544,8 +1547,10 @@ PySequence_GetItem(PyObject *s, Py_ssize_t i)
         if (i < 0) {
             if (m->sq_length) {
                 Py_ssize_t l = (*m->sq_length)(s);
-                if (l < 0)
+                if (l < 0) {
+                    assert(PyErr_Occurred());
                     return NULL;
+                }
                 i += l;
             }
         }
diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c
index c59ad2499a..96ab57d24c 100644
--- a/Objects/bytearrayobject.c
+++ b/Objects/bytearrayobject.c
@@ -282,26 +282,14 @@ PyByteArray_Concat(PyObject *a, PyObject *b)
 static PyObject *
 bytearray_format(PyByteArrayObject *self, PyObject *args)
 {
-    PyObject *bytes_in, *bytes_out, *res;
-    char *bytestring;
-
-    if (self == NULL || !PyByteArray_Check(self) || args == NULL) {
+    if (self == NULL || !PyByteArray_Check(self)) {
         PyErr_BadInternalCall();
         return NULL;
     }
-    bytestring = PyByteArray_AS_STRING(self);
-    bytes_in = PyBytes_FromString(bytestring);
-    if (bytes_in == NULL)
-        return NULL;
-    bytes_out = _PyBytes_Format(bytes_in, args);
-    Py_DECREF(bytes_in);
-    if (bytes_out == NULL)
-        return NULL;
-    res = PyByteArray_FromObject(bytes_out);
-    Py_DECREF(bytes_out);
-    if (res == NULL)
-        return NULL;
-    return res;
+
+    return _PyBytes_FormatEx(PyByteArray_AS_STRING(self),
+                             PyByteArray_GET_SIZE(self),
+                             args, 1);
 }
 
 /* Functions stuffed into the type object */
@@ -1171,16 +1159,15 @@ bytearray_find_internal(PyByteArrayObject *self, PyObject *args, int dir)
     ADJUST_INDICES(start, end, len);
     if (end - start < sub_len)
         res = -1;
-    else if (sub_len == 1
-#ifndef HAVE_MEMRCHR
-            && dir > 0
-#endif
-    ) {
-        unsigned char needle = *sub;
-        int mode = (dir > 0) ? FAST_SEARCH : FAST_RSEARCH;
-        res = stringlib_fastsearch_memchr_1char(
-            PyByteArray_AS_STRING(self) + start, end - start,
-            needle, needle, mode);
+    else if (sub_len == 1) {
+        if (dir > 0)
+            res = stringlib_find_char(
+                PyByteArray_AS_STRING(self) + start, end - start,
+                *sub);
+        else
+            res = stringlib_rfind_char(
+                PyByteArray_AS_STRING(self) + start, end - start,
+                *sub);
         if (res >= 0)
             res += start;
     }
@@ -2801,22 +2788,6 @@ bytearray_splitlines_impl(PyByteArrayObject *self, int keepends)
         );
 }
 
-static int
-hex_digit_to_int(Py_UCS4 c)
-{
-    if (c >= 128)
-        return -1;
-    if (Py_ISDIGIT(c))
-        return c - '0';
-    else {
-        if (Py_ISUPPER(c))
-            c = Py_TOLOWER(c);
-        if (c >= 'a' && c <= 'f')
-            return c - 'a' + 10;
-    }
-    return -1;
-}
-
 /*[clinic input]
 @classmethod
 bytearray.fromhex
@@ -2835,48 +2806,7 @@ static PyObject *
 bytearray_fromhex_impl(PyObject*cls, PyObject *string)
 /*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/
 {
-    PyObject *newbytes;
-    char *buf;
-    Py_ssize_t hexlen, byteslen, i, j;
-    int top, bot;
-    void *data;
-    unsigned int kind;
-
-    assert(PyUnicode_Check(string));
-    if (PyUnicode_READY(string))
-        return NULL;
-    kind = PyUnicode_KIND(string);
-    data = PyUnicode_DATA(string);
-    hexlen = PyUnicode_GET_LENGTH(string);
-
-    byteslen = hexlen/2; /* This overestimates if there are spaces */
-    newbytes = PyByteArray_FromStringAndSize(NULL, byteslen);
-    if (!newbytes)
-        return NULL;
-    buf = PyByteArray_AS_STRING(newbytes);
-    for (i = j = 0; i < hexlen; i += 2) {
-        /* skip over spaces in the input */
-        while (PyUnicode_READ(kind, data, i) == ' ')
-            i++;
-        if (i >= hexlen)
-            break;
-        top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
-        bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
-        if (top == -1 || bot == -1) {
-            PyErr_Format(PyExc_ValueError,
-                         "non-hexadecimal number found in "
-                         "fromhex() arg at position %zd", i);
-            goto error;
-        }
-        buf[j++] = (top << 4) + bot;
-    }
-    if (PyByteArray_Resize(newbytes, j) < 0)
-        goto error;
-    return newbytes;
-
-  error:
-    Py_DECREF(newbytes);
-    return NULL;
+    return _PyBytes_FromHex(string, 1);
 }
 
 PyDoc_STRVAR(hex__doc__,
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
index 51d0871c8b..f9805162fe 100644
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -30,6 +30,10 @@ static PyBytesObject *nullstring;
 */
 #define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1)
 
+/* Forward declaration */
+Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer,
+                                                   char *str);
+
 /*
    For PyBytes_FromString(), the parameter `str' points to a null-terminated
    string containing exactly `size' bytes.
@@ -174,190 +178,184 @@ PyBytes_FromString(const char *str)
 PyObject *
 PyBytes_FromFormatV(const char *format, va_list vargs)
 {
-    va_list count;
-    Py_ssize_t n = 0;
-    const char* f;
     char *s;
-    PyObject* string;
+    const char *f;
+    const char *p;
+    Py_ssize_t prec;
+    int longflag;
+    int size_tflag;
+    /* Longest 64-bit formatted numbers:
+       - "18446744073709551615\0" (21 bytes)
+       - "-9223372036854775808\0" (21 bytes)
+       Decimal takes the most space (it isn't enough for octal.)
+
+       Longest 64-bit pointer representation:
+       "0xffffffffffffffff\0" (19 bytes). */
+    char buffer[21];
+    _PyBytesWriter writer;
+
+    _PyBytesWriter_Init(&writer);
+
+    s = _PyBytesWriter_Alloc(&writer, strlen(format));
+    if (s == NULL)
+        return NULL;
+    writer.overallocate = 1;
+
+#define WRITE_BYTES(str) \
+    do { \
+        s = _PyBytesWriter_WriteBytes(&writer, s, (str), strlen(str)); \
+        if (s == NULL) \
+            goto error; \
+    } while (0)
 
-    Py_VA_COPY(count, vargs);
-    /* step 1: figure out how large a buffer we need */
     for (f = format; *f; f++) {
-        if (*f == '%') {
-            const char* p = f;
-            while (*++f && *f != '%' && !Py_ISALPHA(*f))
-                ;
-
-            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
-             * they don't affect the amount of space we reserve.
-             */
-            if ((*f == 'l' || *f == 'z') &&
-                            (f[1] == 'd' || f[1] == 'u'))
-                ++f;
-
-            switch (*f) {
-            case 'c':
-            {
-                int c = va_arg(count, int);
-                if (c < 0 || c > 255) {
-                    PyErr_SetString(PyExc_OverflowError,
-                                    "PyBytes_FromFormatV(): %c format "
-                                    "expects an integer in range [0; 255]");
-                    return NULL;
-                }
-                n++;
-                break;
+        if (*f != '%') {
+            *s++ = *f;
+            continue;
+        }
+
+        p = f++;
+
+        /* ignore the width (ex: 10 in "%10s") */
+        while (Py_ISDIGIT(*f))
+            f++;
+
+        /* parse the precision (ex: 10 in "%.10s") */
+        prec = 0;
+        if (*f == '.') {
+            f++;
+            for (; Py_ISDIGIT(*f); f++) {
+                prec = (prec * 10) + (*f - '0');
             }
-            case '%':
-                n++;
-                break;
-            case 'd': case 'u': case 'i': case 'x':
-                (void) va_arg(count, int);
-                /* 20 bytes is enough to hold a 64-bit
-                   integer.  Decimal takes the most space.
-                   This isn't enough for octal. */
-                n += 20;
-                break;
-            case 's':
-                s = va_arg(count, char*);
-                n += strlen(s);
-                break;
-            case 'p':
-                (void) va_arg(count, int);
-                /* maximum 64-bit pointer representation:
-                 * 0xffffffffffffffff
-                 * so 19 characters is enough.
-                 * XXX I count 18 -- what's the extra for?
-                 */
-                n += 19;
-                break;
-            default:
-                /* if we stumble upon an unknown
-                   formatting code, copy the rest of
-                   the format string to the output
-                   string. (we cannot just skip the
-                   code, since there's no way to know
-                   what's in the argument list) */
-                n += strlen(p);
-                goto expand;
+        }
+
+        while (*f && *f != '%' && !Py_ISALPHA(*f))
+            f++;
+
+        /* handle the long flag ('l'), but only for %ld and %lu.
+           others can be added when necessary. */
+        longflag = 0;
+        if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
+            longflag = 1;
+            ++f;
+        }
+
+        /* handle the size_t flag ('z'). */
+        size_tflag = 0;
+        if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
+            size_tflag = 1;
+            ++f;
+        }
+
+        /* substract bytes preallocated for the format string
+           (ex: 2 for "%s") */
+        writer.min_size -= (f - p + 1);
+
+        switch (*f) {
+        case 'c':
+        {
+            int c = va_arg(vargs, int);
+            if (c < 0 || c > 255) {
+                PyErr_SetString(PyExc_OverflowError,
+                                "PyBytes_FromFormatV(): %c format "
+                                "expects an integer in range [0; 255]");
+                goto error;
             }
-        } else
-            n++;
-    }
- expand:
-    /* step 2: fill the buffer */
-    /* Since we've analyzed how much space we need for the worst case,
-       use sprintf directly instead of the slower PyOS_snprintf. */
-    string = PyBytes_FromStringAndSize(NULL, n);
-    if (!string)
-        return NULL;
+            writer.min_size++;
+            *s++ = (unsigned char)c;
+            break;
+        }
 
-    s = PyBytes_AsString(string);
+        case 'd':
+            if (longflag)
+                sprintf(buffer, "%ld", va_arg(vargs, long));
+            else if (size_tflag)
+                sprintf(buffer, "%" PY_FORMAT_SIZE_T "d",
+                    va_arg(vargs, Py_ssize_t));
+            else
+                sprintf(buffer, "%d", va_arg(vargs, int));
+            assert(strlen(buffer) < sizeof(buffer));
+            WRITE_BYTES(buffer);
+            break;
 
-    for (f = format; *f; f++) {
-        if (*f == '%') {
-            const char* p = f++;
+        case 'u':
+            if (longflag)
+                sprintf(buffer, "%lu",
+                    va_arg(vargs, unsigned long));
+            else if (size_tflag)
+                sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
+                    va_arg(vargs, size_t));
+            else
+                sprintf(buffer, "%u",
+                    va_arg(vargs, unsigned int));
+            assert(strlen(buffer) < sizeof(buffer));
+            WRITE_BYTES(buffer);
+            break;
+
+        case 'i':
+            sprintf(buffer, "%i", va_arg(vargs, int));
+            assert(strlen(buffer) < sizeof(buffer));
+            WRITE_BYTES(buffer);
+            break;
+
+        case 'x':
+            sprintf(buffer, "%x", va_arg(vargs, int));
+            assert(strlen(buffer) < sizeof(buffer));
+            WRITE_BYTES(buffer);
+            break;
+
+        case 's':
+        {
             Py_ssize_t i;
-            int longflag = 0;
-            int size_tflag = 0;
-            /* parse the width.precision part (we're only
-               interested in the precision value, if any) */
-            n = 0;
-            while (Py_ISDIGIT(*f))
-                n = (n*10) + *f++ - '0';
-            if (*f == '.') {
-                f++;
-                n = 0;
-                while (Py_ISDIGIT(*f))
-                    n = (n*10) + *f++ - '0';
-            }
-            while (*f && *f != '%' && !Py_ISALPHA(*f))
-                f++;
-            /* handle the long flag, but only for %ld and %lu.
-               others can be added when necessary. */
-            if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
-                longflag = 1;
-                ++f;
-            }
-            /* handle the size_t flag. */
-            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
-                size_tflag = 1;
-                ++f;
-            }
 
-            switch (*f) {
-            case 'c':
-            {
-                int c = va_arg(vargs, int);
-                /* c has been checked for overflow in the first step */
-                *s++ = (unsigned char)c;
-                break;
+            p = va_arg(vargs, char*);
+            i = strlen(p);
+            if (prec > 0 && i > prec)
+                i = prec;
+            s = _PyBytesWriter_WriteBytes(&writer, s, p, i);
+            if (s == NULL)
+                goto error;
+            break;
+        }
+
+        case 'p':
+            sprintf(buffer, "%p", va_arg(vargs, void*));
+            assert(strlen(buffer) < sizeof(buffer));
+            /* %p is ill-defined:  ensure leading 0x. */
+            if (buffer[1] == 'X')
+                buffer[1] = 'x';
+            else if (buffer[1] != 'x') {
+                memmove(buffer+2, buffer, strlen(buffer)+1);
+                buffer[0] = '0';
+                buffer[1] = 'x';
             }
-            case 'd':
-                if (longflag)
-                    sprintf(s, "%ld", va_arg(vargs, long));
-                else if (size_tflag)
-                    sprintf(s, "%" PY_FORMAT_SIZE_T "d",
-                        va_arg(vargs, Py_ssize_t));
-                else
-                    sprintf(s, "%d", va_arg(vargs, int));
-                s += strlen(s);
-                break;
-            case 'u':
-                if (longflag)
-                    sprintf(s, "%lu",
-                        va_arg(vargs, unsigned long));
-                else if (size_tflag)
-                    sprintf(s, "%" PY_FORMAT_SIZE_T "u",
-                        va_arg(vargs, size_t));
-                else
-                    sprintf(s, "%u",
-                        va_arg(vargs, unsigned int));
-                s += strlen(s);
-                break;
-            case 'i':
-                sprintf(s, "%i", va_arg(vargs, int));
-                s += strlen(s);
-                break;
-            case 'x':
-                sprintf(s, "%x", va_arg(vargs, int));
-                s += strlen(s);
-                break;
-            case 's':
-                p = va_arg(vargs, char*);
-                i = strlen(p);
-                if (n > 0 && i > n)
-                    i = n;
-                Py_MEMCPY(s, p, i);
-                s += i;
-                break;
-            case 'p':
-                sprintf(s, "%p", va_arg(vargs, void*));
-                /* %p is ill-defined:  ensure leading 0x. */
-                if (s[1] == 'X')
-                    s[1] = 'x';
-                else if (s[1] != 'x') {
-                    memmove(s+2, s, strlen(s)+1);
-                    s[0] = '0';
-                    s[1] = 'x';
-                }
-                s += strlen(s);
-                break;
-            case '%':
-                *s++ = '%';
-                break;
-            default:
-                strcpy(s, p);
-                s += strlen(s);
-                goto end;
+            WRITE_BYTES(buffer);
+            break;
+
+        case '%':
+            writer.min_size++;
+            *s++ = '%';
+            break;
+
+        default:
+            if (*f == 0) {
+                /* fix min_size if we reached the end of the format string */
+                writer.min_size++;
             }
-        } else
-            *s++ = *f;
+
+            /* invalid format string: copy unformatted string and exit */
+            WRITE_BYTES(p);
+            return _PyBytesWriter_Finish(&writer, s);
+        }
     }
 
- end:
-    _PyBytes_Resize(&string, s - PyBytes_AS_STRING(string));
-    return string;
+#undef WRITE_BYTES
+
+    return _PyBytesWriter_Finish(&writer, s);
+
+ error:
+    _PyBytesWriter_Dealloc(&writer);
+    return NULL;
 }
 
 PyObject *
@@ -409,12 +407,14 @@ getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
 
 /* Returns a new reference to a PyBytes object, or NULL on failure. */
 
-static PyObject *
-formatfloat(PyObject *v, int flags, int prec, int type)
+static char*
+formatfloat(PyObject *v, int flags, int prec, int type,
+            PyObject **p_result, _PyBytesWriter *writer, char *str)
 {
     char *p;
     PyObject *result;
     double x;
+    size_t len;
 
     x = PyFloat_AsDouble(v);
     if (x == -1.0 && PyErr_Occurred()) {
@@ -431,9 +431,21 @@ formatfloat(PyObject *v, int flags, int prec, int type)
 
     if (p == NULL)
         return NULL;
-    result = PyBytes_FromStringAndSize(p, strlen(p));
+
+    len = strlen(p);
+    if (writer != NULL) {
+        str = _PyBytesWriter_Prepare(writer, str, len);
+        if (str == NULL)
+            return NULL;
+        Py_MEMCPY(str, p, len);
+        str += len;
+        return str;
+    }
+
+    result = PyBytes_FromStringAndSize(p, len);
     PyMem_Free(p);
-    return result;
+    *p_result = result;
+    return str;
 }
 
 static PyObject *
@@ -557,36 +569,36 @@ format_obj(PyObject *v, const char **pbuf, Py_ssize_t *plen)
     return NULL;
 }
 
-/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
-
-   FORMATBUFLEN is the length of the buffer in which the ints &
-   chars are formatted. XXX This is a magic number. Each formatting
-   routine does bounds checking to ensure no overflow, but a better
-   solution may be to malloc a buffer of appropriate size for each
-   format. For now, the current solution is sufficient.
-*/
-#define FORMATBUFLEN (size_t)120
+/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) */
 
 PyObject *
-_PyBytes_Format(PyObject *format, PyObject *args)
+_PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
+                  PyObject *args, int use_bytearray)
 {
-    char *fmt, *res;
+    const char *fmt;
+    char *res;
     Py_ssize_t arglen, argidx;
-    Py_ssize_t reslen, rescnt, fmtcnt;
+    Py_ssize_t fmtcnt;
     int args_owned = 0;
-    PyObject *result;
     PyObject *dict = NULL;
-    if (format == NULL || !PyBytes_Check(format) || args == NULL) {
+    _PyBytesWriter writer;
+
+    if (args == NULL) {
         PyErr_BadInternalCall();
         return NULL;
     }
-    fmt = PyBytes_AS_STRING(format);
-    fmtcnt = PyBytes_GET_SIZE(format);
-    reslen = rescnt = fmtcnt + 100;
-    result = PyBytes_FromStringAndSize((char *)NULL, reslen);
-    if (result == NULL)
+    fmt = format;
+    fmtcnt = format_len;
+
+    _PyBytesWriter_Init(&writer);
+    writer.use_bytearray = use_bytearray;
+
+    res = _PyBytesWriter_Alloc(&writer, fmtcnt);
+    if (res == NULL)
         return NULL;
-    res = PyBytes_AsString(result);
+    if (!use_bytearray)
+        writer.overallocate = 1;
+
     if (PyTuple_Check(args)) {
         arglen = PyTuple_GET_SIZE(args);
         argidx = 0;
@@ -600,18 +612,23 @@ _PyBytes_Format(PyObject *format, PyObject *args)
         !PyByteArray_Check(args)) {
             dict = args;
     }
+
     while (--fmtcnt >= 0) {
         if (*fmt != '%') {
-            if (--rescnt < 0) {
-                rescnt = fmtcnt + 100;
-                reslen += rescnt;
-                if (_PyBytes_Resize(&result, reslen))
-                    return NULL;
-                res = PyBytes_AS_STRING(result)
-                    + reslen - rescnt;
-                --rescnt;
-            }
-            *res++ = *fmt++;
+            Py_ssize_t len;
+            char *pos;
+
+            pos = strchr(fmt + 1, '%');
+            if (pos != NULL)
+                len = pos - fmt;
+            else
+                len = format_len - (fmt - format);
+            assert(len != 0);
+
+            Py_MEMCPY(res, fmt, len);
+            res += len;
+            fmt += len;
+            fmtcnt -= (len - 1);
         }
         else {
             /* Got a format specifier */
@@ -626,10 +643,14 @@ _PyBytes_Format(PyObject *format, PyObject *args)
             int sign;
             Py_ssize_t len = 0;
             char onechar; /* For byte_converter() */
+            Py_ssize_t alloc;
+#ifdef Py_DEBUG
+            char *before;
+#endif
 
             fmt++;
             if (*fmt == '(') {
-                char *keystart;
+                const char *keystart;
                 Py_ssize_t keylen;
                 PyObject *key;
                 int pcount = 1;
@@ -673,6 +694,8 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 arglen = -1;
                 argidx = -2;
             }
+
+            /* Parse flags. Example: "%+i" => flags=F_SIGN. */
             while (--fmtcnt >= 0) {
                 switch (c = *fmt++) {
                 case '-': flags |= F_LJUST; continue;
@@ -683,6 +706,8 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 }
                 break;
             }
+
+            /* Parse width. Example: "%10s" => width=10 */
             if (c == '*') {
                 v = getnextarg(args, arglen, &argidx);
                 if (v == NULL)
@@ -717,6 +742,8 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                     width = width*10 + (c - '0');
                 }
             }
+
+            /* Parse precision. Example: "%.3f" => prec=3 */
             if (c == '.') {
                 prec = 0;
                 if (--fmtcnt >= 0)
@@ -771,13 +798,19 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (v == NULL)
                     goto error;
             }
+
+            if (fmtcnt < 0) {
+                /* last writer: disable writer overallocation */
+                writer.overallocate = 0;
+            }
+
             sign = 0;
             fill = ' ';
             switch (c) {
             case '%':
-                pbuf = "%";
-                len = 1;
-                break;
+                *res++ = '%';
+                continue;
+
             case 'r':
                 // %r is only for 2/3 code; 3 only code should use %a
             case 'a':
@@ -790,6 +823,7 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (prec >= 0 && len > prec)
                     len = prec;
                 break;
+
             case 's':
                 // %s is only for 2/3 code; 3 only code should use %b
             case 'b':
@@ -799,12 +833,49 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (prec >= 0 && len > prec)
                     len = prec;
                 break;
+
             case 'i':
             case 'd':
             case 'u':
             case 'o':
             case 'x':
             case 'X':
+                if (PyLong_CheckExact(v)
+                    && width == -1 && prec == -1
+                    && !(flags & (F_SIGN | F_BLANK))
+                    && c != 'X')
+                {
+                    /* Fast path */
+                    int alternate = flags & F_ALT;
+                    int base;
+
+                    switch(c)
+                    {
+                        default:
+                            assert(0 && "'type' not in [diuoxX]");
+                        case 'd':
+                        case 'i':
+                        case 'u':
+                            base = 10;
+                            break;
+                        case 'o':
+                            base = 8;
+                            break;
+                        case 'x':
+                        case 'X':
+                            base = 16;
+                            break;
+                    }
+
+                    /* Fast path */
+                    writer.min_size -= 2; /* size preallocated for "%d" */
+                    res = _PyLong_FormatBytesWriter(&writer, res,
+                                                    v, base, alternate);
+                    if (res == NULL)
+                        goto error;
+                    continue;
+                }
+
                 temp = formatlong(v, flags, prec, c);
                 if (!temp)
                     goto error;
@@ -815,14 +886,25 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (flags & F_ZERO)
                     fill = '0';
                 break;
+
             case 'e':
             case 'E':
             case 'f':
             case 'F':
             case 'g':
             case 'G':
-                temp = formatfloat(v, flags, prec, c);
-                if (temp == NULL)
+                if (width == -1 && prec == -1
+                    && !(flags & (F_SIGN | F_BLANK)))
+                {
+                    /* Fast path */
+                    writer.min_size -= 2; /* size preallocated for "%f" */
+                    res = formatfloat(v, flags, prec, c, NULL, &writer, res);
+                    if (res == NULL)
+                        goto error;
+                    continue;
+                }
+
+                if (!formatfloat(v, flags, prec, c, &temp, NULL, res))
                     goto error;
                 pbuf = PyBytes_AS_STRING(temp);
                 len = PyBytes_GET_SIZE(temp);
@@ -830,21 +912,28 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 if (flags & F_ZERO)
                     fill = '0';
                 break;
+
             case 'c':
                 pbuf = &onechar;
                 len = byte_converter(v, &onechar);
                 if (!len)
                     goto error;
+                if (width == -1) {
+                    /* Fast path */
+                    *res++ = onechar;
+                    continue;
+                }
                 break;
+
             default:
                 PyErr_Format(PyExc_ValueError,
                   "unsupported format character '%c' (0x%x) "
                   "at index %zd",
                   c, c,
-                  (Py_ssize_t)(fmt - 1 -
-                               PyBytes_AsString(format)));
+                  (Py_ssize_t)(fmt - 1 - format));
                 goto error;
             }
+
             if (sign) {
                 if (*pbuf == '-' || *pbuf == '+') {
                     sign = *pbuf++;
@@ -859,29 +948,31 @@ _PyBytes_Format(PyObject *format, PyObject *args)
             }
             if (width < len)
                 width = len;
-            if (rescnt - (sign != 0) < width) {
-                reslen -= rescnt;
-                rescnt = width + fmtcnt + 100;
-                reslen += rescnt;
-                if (reslen < 0) {
-                    Py_DECREF(result);
-                    Py_XDECREF(temp);
-                    return PyErr_NoMemory();
-                }
-                if (_PyBytes_Resize(&result, reslen)) {
-                    Py_XDECREF(temp);
-                    return NULL;
-                }
-                res = PyBytes_AS_STRING(result)
-                    + reslen - rescnt;
+
+            alloc = width;
+            if (sign != 0 && len == width)
+                alloc++;
+            /* 2: size preallocated for %s */
+            if (alloc > 2) {
+                res = _PyBytesWriter_Prepare(&writer, res, alloc - 2);
+                if (res == NULL)
+                    goto error;
             }
+#ifdef Py_DEBUG
+            before = res;
+#endif
+
+            /* Write the sign if needed */
             if (sign) {
                 if (fill != ' ')
                     *res++ = sign;
-                rescnt--;
                 if (width > len)
                     width--;
             }
+
+            /* Write the numeric prefix for "x", "X" and "o" formats
+               if the alternate form is used.
+               For example, write "0x" for the "%#x" format. */
             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
                 assert(pbuf[0] == '0');
                 assert(pbuf[1] == c);
@@ -889,18 +980,21 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                     *res++ = *pbuf++;
                     *res++ = *pbuf++;
                 }
-                rescnt -= 2;
                 width -= 2;
                 if (width < 0)
                     width = 0;
                 len -= 2;
             }
+
+            /* Pad left with the fill character if needed */
             if (width > len && !(flags & F_LJUST)) {
-                do {
-                    --rescnt;
-                    *res++ = fill;
-                } while (--width > len);
+                memset(res, fill, width - len);
+                res += (width - len);
+                width = len;
             }
+
+            /* If padding with spaces: write sign if needed and/or numeric
+               prefix if the alternate form is used */
             if (fill == ' ') {
                 if (sign)
                     *res++ = sign;
@@ -912,13 +1006,17 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                     *res++ = *pbuf++;
                 }
             }
+
+            /* Copy bytes */
             Py_MEMCPY(res, pbuf, len);
             res += len;
-            rescnt -= len;
-            while (--width >= len) {
-                --rescnt;
-                *res++ = ' ';
+
+            /* Pad right with the fill character if needed */
+            if (width > len) {
+                memset(res, ' ', width - len);
+                res += (width - len);
             }
+
             if (dict && (argidx < arglen) && c != '%') {
                 PyErr_SetString(PyExc_TypeError,
                            "not all arguments converted during bytes formatting");
@@ -926,22 +1024,31 @@ _PyBytes_Format(PyObject *format, PyObject *args)
                 goto error;
             }
             Py_XDECREF(temp);
+
+#ifdef Py_DEBUG
+            /* check that we computed the exact size for this write */
+            assert((res - before) == alloc);
+#endif
         } /* '%' */
+
+        /* If overallocation was disabled, ensure that it was the last
+           write. Otherwise, we missed an optimization */
+        assert(writer.overallocate || fmtcnt < 0 || use_bytearray);
     } /* until end */
+
     if (argidx < arglen && !dict) {
         PyErr_SetString(PyExc_TypeError,
                         "not all arguments converted during bytes formatting");
         goto error;
     }
+
     if (args_owned) {
         Py_DECREF(args);
     }
-    if (_PyBytes_Resize(&result, reslen - rescnt))
-        return NULL;
-    return result;
+    return _PyBytesWriter_Finish(&writer, res);
 
  error:
-    Py_DECREF(result);
+    _PyBytesWriter_Dealloc(&writer);
     if (args_owned) {
         Py_DECREF(args);
     }
@@ -961,6 +1068,42 @@ bytes_dealloc(PyObject *op)
    the string is UTF-8 encoded and should be re-encoded in the
    specified encoding.  */
 
+static char *
+_PyBytes_DecodeEscapeRecode(const char **s, const char *end,
+                            const char *errors, const char *recode_encoding,
+                            _PyBytesWriter *writer, char *p)
+{
+    PyObject *u, *w;
+    const char* t;
+
+    t = *s;
+    /* Decode non-ASCII bytes as UTF-8. */
+    while (t < end && (*t & 0x80))
+        t++;
+    u = PyUnicode_DecodeUTF8(*s, t - *s, errors);
+    if (u == NULL)
+        return NULL;
+
+    /* Recode them in target encoding. */
+    w = PyUnicode_AsEncodedString(u, recode_encoding, errors);
+    Py_DECREF(u);
+    if  (w == NULL)
+        return NULL;
+    assert(PyBytes_Check(w));
+
+    /* Append bytes to output buffer. */
+    writer->min_size--;   /* substract 1 preallocated byte */
+    p = _PyBytesWriter_WriteBytes(writer, p,
+                                  PyBytes_AS_STRING(w),
+                                  PyBytes_GET_SIZE(w));
+    Py_DECREF(w);
+    if (p == NULL)
+        return NULL;
+
+    *s = t;
+    return p;
+}
+
 PyObject *PyBytes_DecodeEscape(const char *s,
                                 Py_ssize_t len,
                                 const char *errors,
@@ -968,54 +1111,42 @@ PyObject *PyBytes_DecodeEscape(const char *s,
                                 const char *recode_encoding)
 {
     int c;
-    char *p, *buf;
+    char *p;
     const char *end;
-    PyObject *v;
-    Py_ssize_t newlen = recode_encoding ? 4*len:len;
-    v = PyBytes_FromStringAndSize((char *)NULL, newlen);
-    if (v == NULL)
+    _PyBytesWriter writer;
+
+    _PyBytesWriter_Init(&writer);
+
+    p = _PyBytesWriter_Alloc(&writer, len);
+    if (p == NULL)
         return NULL;
-    p = buf = PyBytes_AsString(v);
+    writer.overallocate = 1;
+
     end = s + len;
     while (s < end) {
         if (*s != '\\') {
           non_esc:
-            if (recode_encoding && (*s & 0x80)) {
-                PyObject *u, *w;
-                char *r;
-                const char* t;
-                Py_ssize_t rn;
-                t = s;
-                /* Decode non-ASCII bytes as UTF-8. */
-                while (t < end && (*t & 0x80)) t++;
-                u = PyUnicode_DecodeUTF8(s, t - s, errors);
-                if(!u) goto failed;
-
-                /* Recode them in target encoding. */
-                w = PyUnicode_AsEncodedString(
-                    u, recode_encoding, errors);
-                Py_DECREF(u);
-                if (!w)                 goto failed;
-
-                /* Append bytes to output buffer. */
-                assert(PyBytes_Check(w));
-                r = PyBytes_AS_STRING(w);
-                rn = PyBytes_GET_SIZE(w);
-                Py_MEMCPY(p, r, rn);
-                p += rn;
-                Py_DECREF(w);
-                s = t;
-            } else {
+            if (!(recode_encoding && (*s & 0x80))) {
                 *p++ = *s++;
             }
+            else {
+                /* non-ASCII character and need to recode */
+                p = _PyBytes_DecodeEscapeRecode(&s, end,
+                                                errors, recode_encoding,
+                                                &writer, p);
+                if (p == NULL)
+                    goto failed;
+            }
             continue;
         }
+
         s++;
-        if (s==end) {
+        if (s == end) {
             PyErr_SetString(PyExc_ValueError,
                             "Trailing \\ in string");
             goto failed;
         }
+
         switch (*s++) {
         /* XXX This assumes ASCII! */
         case '\n': break;
@@ -1040,28 +1171,18 @@ PyObject *PyBytes_DecodeEscape(const char *s,
             *p++ = c;
             break;
         case 'x':
-            if (s+1 < end && Py_ISXDIGIT(s[0]) && Py_ISXDIGIT(s[1])) {
-                unsigned int x = 0;
-                c = Py_CHARMASK(*s);
-                s++;
-                if (Py_ISDIGIT(c))
-                    x = c - '0';
-                else if (Py_ISLOWER(c))
-                    x = 10 + c - 'a';
-                else
-                    x = 10 + c - 'A';
-                x = x << 4;
-                c = Py_CHARMASK(*s);
-                s++;
-                if (Py_ISDIGIT(c))
-                    x += c - '0';
-                else if (Py_ISLOWER(c))
-                    x += 10 + c - 'a';
-                else
-                    x += 10 + c - 'A';
-                *p++ = x;
-                break;
+            if (s+1 < end) {
+                int digit1, digit2;
+                digit1 = _PyLong_DigitValue[Py_CHARMASK(s[0])];
+                digit2 = _PyLong_DigitValue[Py_CHARMASK(s[1])];
+                if (digit1 < 16 && digit2 < 16) {
+                    *p++ = (unsigned char)((digit1 << 4) + digit2);
+                    s += 2;
+                    break;
+                }
             }
+            /* invalid hexadecimal digits */
+
             if (!errors || strcmp(errors, "strict") == 0) {
                 PyErr_Format(PyExc_ValueError,
                              "invalid \\x escape at position %d",
@@ -1083,6 +1204,7 @@ PyObject *PyBytes_DecodeEscape(const char *s,
             if (s < end && Py_ISXDIGIT(s[0]))
                 s++; /* and a hexdigit */
             break;
+
         default:
             *p++ = '\\';
             s--;
@@ -1090,11 +1212,11 @@ PyObject *PyBytes_DecodeEscape(const char *s,
                              UTF-8 bytes may follow. */
         }
     }
-    if (p-buf < newlen)
-        _PyBytes_Resize(&v, p - buf);
-    return v;
+
+    return _PyBytesWriter_Finish(&writer, p);
+
   failed:
-    Py_DECREF(v);
+    _PyBytesWriter_Dealloc(&writer);
     return NULL;
 }
 
@@ -1815,16 +1937,15 @@ bytes_find_internal(PyBytesObject *self, PyObject *args, int dir)
     ADJUST_INDICES(start, end, len);
     if (end - start < sub_len)
         res = -1;
-    else if (sub_len == 1
-#ifndef HAVE_MEMRCHR
-            && dir > 0
-#endif
-    ) {
-        unsigned char needle = *sub;
-        int mode = (dir > 0) ? FAST_SEARCH : FAST_RSEARCH;
-        res = stringlib_fastsearch_memchr_1char(
-            PyBytes_AS_STRING(self) + start, end - start,
-            needle, needle, mode);
+    else if (sub_len == 1) {
+        if (dir > 0)
+            res = stringlib_find_char(
+                PyBytes_AS_STRING(self) + start, end - start,
+                *sub);
+        else
+            res = stringlib_rfind_char(
+                PyBytes_AS_STRING(self) + start, end - start,
+                *sub);
         if (res >= 0)
             res += start;
     }
@@ -2975,22 +3096,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends)
         );
 }
 
-static int
-hex_digit_to_int(Py_UCS4 c)
-{
-    if (c >= 128)
-        return -1;
-    if (Py_ISDIGIT(c))
-        return c - '0';
-    else {
-        if (Py_ISUPPER(c))
-            c = Py_TOLOWER(c);
-        if (c >= 'a' && c <= 'f')
-            return c - 'a' + 10;
-    }
-    return -1;
-}
-
 /*[clinic input]
 @classmethod
 bytes.fromhex
@@ -3008,47 +3113,83 @@ static PyObject *
 bytes_fromhex_impl(PyTypeObject *type, PyObject *string)
 /*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/
 {
-    PyObject *newstring;
+    return _PyBytes_FromHex(string, 0);
+}
+
+PyObject*
+_PyBytes_FromHex(PyObject *string, int use_bytearray)
+{
     char *buf;
-    Py_ssize_t hexlen, byteslen, i, j;
-    int top, bot;
-    void *data;
-    unsigned int kind;
+    Py_ssize_t hexlen, invalid_char;
+    unsigned int top, bot;
+    Py_UCS1 *str, *end;
+    _PyBytesWriter writer;
+
+    _PyBytesWriter_Init(&writer);
+    writer.use_bytearray = use_bytearray;
 
     assert(PyUnicode_Check(string));
     if (PyUnicode_READY(string))
         return NULL;
-    kind = PyUnicode_KIND(string);
-    data = PyUnicode_DATA(string);
     hexlen = PyUnicode_GET_LENGTH(string);
 
-    byteslen = hexlen/2; /* This overestimates if there are spaces */
-    newstring = PyBytes_FromStringAndSize(NULL, byteslen);
-    if (!newstring)
+    if (!PyUnicode_IS_ASCII(string)) {
+        void *data = PyUnicode_DATA(string);
+        unsigned int kind = PyUnicode_KIND(string);
+        Py_ssize_t i;
+
+        /* search for the first non-ASCII character */
+        for (i = 0; i < hexlen; i++) {
+            if (PyUnicode_READ(kind, data, i) >= 128)
+                break;
+        }
+        invalid_char = i;
+        goto error;
+    }
+
+    assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND);
+    str = PyUnicode_1BYTE_DATA(string);
+
+    /* This overestimates if there are spaces */
+    buf = _PyBytesWriter_Alloc(&writer, hexlen / 2);
+    if (buf == NULL)
         return NULL;
-    buf = PyBytes_AS_STRING(newstring);
-    for (i = j = 0; i < hexlen; i += 2) {
+
+    end = str + hexlen;
+    while (str < end) {
         /* skip over spaces in the input */
-        while (PyUnicode_READ(kind, data, i) == ' ')
-            i++;
-        if (i >= hexlen)
-            break;
-        top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
-        bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
-        if (top == -1 || bot == -1) {
-            PyErr_Format(PyExc_ValueError,
-                         "non-hexadecimal number found in "
-                         "fromhex() arg at position %zd", i);
+        if (*str == ' ') {
+            do {
+                str++;
+            } while (*str == ' ');
+            if (str >= end)
+                break;
+        }
+
+        top = _PyLong_DigitValue[*str];
+        if (top >= 16) {
+            invalid_char = str - PyUnicode_1BYTE_DATA(string);
             goto error;
         }
-        buf[j++] = (top << 4) + bot;
+        str++;
+
+        bot = _PyLong_DigitValue[*str];
+        if (bot >= 16) {
+            invalid_char = str - PyUnicode_1BYTE_DATA(string);
+            goto error;
+        }
+        str++;
+
+        *buf++ = (unsigned char)((top << 4) + bot);
     }
-    if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0)
-        goto error;
-    return newstring;
+
+    return _PyBytesWriter_Finish(&writer, buf);
 
   error:
-    Py_XDECREF(newstring);
+    PyErr_Format(PyExc_ValueError,
+                 "non-hexadecimal number found in "
+                 "fromhex() arg at position %zd", invalid_char);
+    _PyBytesWriter_Dealloc(&writer);
     return NULL;
 }
 
@@ -3131,11 +3272,15 @@ bytes_methods[] = {
 };
 
 static PyObject *
-bytes_mod(PyObject *v, PyObject *w)
+bytes_mod(PyObject *self, PyObject *args)
 {
-    if (!PyBytes_Check(v))
-        Py_RETURN_NOTIMPLEMENTED;
-    return _PyBytes_Format(v, w);
+    if (self == NULL || !PyBytes_Check(self)) {
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+
+    return _PyBytes_FormatEx(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self),
+                             args, 0);
 }
 
 static PyNumberMethods bytes_as_number = {
@@ -3244,103 +3389,95 @@ bytes_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
     return PyBytes_FromObject(x);
 }
 
-PyObject *
-PyBytes_FromObject(PyObject *x)
+static PyObject*
+_PyBytes_FromBuffer(PyObject *x)
 {
-    PyObject *new, *it;
-    Py_ssize_t i, size;
+    PyObject *new;
+    Py_buffer view;
 
-    if (x == NULL) {
-        PyErr_BadInternalCall();
+    if (PyObject_GetBuffer(x, &view, PyBUF_FULL_RO) < 0)
         return NULL;
-    }
 
-    if (PyBytes_CheckExact(x)) {
-        Py_INCREF(x);
-        return x;
-    }
+    new = PyBytes_FromStringAndSize(NULL, view.len);
+    if (!new)
+        goto fail;
+    if (PyBuffer_ToContiguous(((PyBytesObject *)new)->ob_sval,
+                &view, view.len, 'C') < 0)
+        goto fail;
+    PyBuffer_Release(&view);
+    return new;
 
-    /* Use the modern buffer interface */
-    if (PyObject_CheckBuffer(x)) {
-        Py_buffer view;
-        if (PyObject_GetBuffer(x, &view, PyBUF_FULL_RO) < 0)
-            return NULL;
-        new = PyBytes_FromStringAndSize(NULL, view.len);
-        if (!new)
-            goto fail;
-        if (PyBuffer_ToContiguous(((PyBytesObject *)new)->ob_sval,
-                                  &view, view.len, 'C') < 0)
-            goto fail;
-        PyBuffer_Release(&view);
-        return new;
-      fail:
-        Py_XDECREF(new);
-        PyBuffer_Release(&view);
-        return NULL;
-    }
-    if (PyUnicode_Check(x)) {
-        PyErr_SetString(PyExc_TypeError,
-                        "cannot convert unicode object to bytes");
-        return NULL;
-    }
+fail:
+    Py_XDECREF(new);
+    PyBuffer_Release(&view);
+    return NULL;
+}
 
-    if (PyList_CheckExact(x)) {
-        new = PyBytes_FromStringAndSize(NULL, Py_SIZE(x));
-        if (new == NULL)
-            return NULL;
-        for (i = 0; i < Py_SIZE(x); i++) {
-            Py_ssize_t value = PyNumber_AsSsize_t(
-                PyList_GET_ITEM(x, i), PyExc_ValueError);
-            if (value == -1 && PyErr_Occurred()) {
-                Py_DECREF(new);
-                return NULL;
-            }
-            if (value < 0 || value >= 256) {
-                PyErr_SetString(PyExc_ValueError,
-                                "bytes must be in range(0, 256)");
-                Py_DECREF(new);
-                return NULL;
-            }
-            ((PyBytesObject *)new)->ob_sval[i] = (char) value;
-        }
-        return new;
-    }
-    if (PyTuple_CheckExact(x)) {
-        new = PyBytes_FromStringAndSize(NULL, Py_SIZE(x));
-        if (new == NULL)
-            return NULL;
-        for (i = 0; i < Py_SIZE(x); i++) {
-            Py_ssize_t value = PyNumber_AsSsize_t(
-                PyTuple_GET_ITEM(x, i), PyExc_ValueError);
-            if (value == -1 && PyErr_Occurred()) {
-                Py_DECREF(new);
-                return NULL;
-            }
-            if (value < 0 || value >= 256) {
-                PyErr_SetString(PyExc_ValueError,
-                                "bytes must be in range(0, 256)");
-                Py_DECREF(new);
-                return NULL;
-            }
-            ((PyBytesObject *)new)->ob_sval[i] = (char) value;
-        }
-        return new;
-    }
+#define _PyBytes_FROM_LIST_BODY(x, GET_ITEM)                                \
+    do {                                                                    \
+        PyObject *bytes;                                                    \
+        Py_ssize_t i;                                                       \
+        Py_ssize_t value;                                                   \
+        char *str;                                                          \
+        PyObject *item;                                                     \
+                                                                            \
+        bytes = PyBytes_FromStringAndSize(NULL, Py_SIZE(x));                \
+        if (bytes == NULL)                                                  \
+            return NULL;                                                    \
+        str = ((PyBytesObject *)bytes)->ob_sval;                            \
+                                                                            \
+        for (i = 0; i < Py_SIZE(x); i++) {                                  \
+            item = GET_ITEM((x), i);                                        \
+            value = PyNumber_AsSsize_t(item, PyExc_ValueError);             \
+            if (value == -1 && PyErr_Occurred())                            \
+                goto error;                                                 \
+                                                                            \
+            if (value < 0 || value >= 256) {                                \
+                PyErr_SetString(PyExc_ValueError,                           \
+                                "bytes must be in range(0, 256)");          \
+                goto error;                                                 \
+            }                                                               \
+            *str++ = (char) value;                                          \
+        }                                                                   \
+        return bytes;                                                       \
+                                                                            \
+    error:                                                                  \
+        Py_DECREF(bytes);                                                   \
+        return NULL;                                                        \
+    } while (0)
+
+static PyObject*
+_PyBytes_FromList(PyObject *x)
+{
+    _PyBytes_FROM_LIST_BODY(x, PyList_GET_ITEM);
+}
+
+static PyObject*
+_PyBytes_FromTuple(PyObject *x)
+{
+    _PyBytes_FROM_LIST_BODY(x, PyTuple_GET_ITEM);
+}
+
+static PyObject *
+_PyBytes_FromIterator(PyObject *x)
+{
+    char *str;
+    PyObject *it;
+    Py_ssize_t i, size;
+    _PyBytesWriter writer;
+
+    _PyBytesWriter_Init(&writer);
 
     /* For iterator version, create a string object and resize as needed */
     size = PyObject_LengthHint(x, 64);
     if (size == -1 && PyErr_Occurred())
         return NULL;
-    /* Allocate an extra byte to prevent PyBytes_FromStringAndSize() from
-       returning a shared empty bytes string. This required because we
-       want to call _PyBytes_Resize() the returned object, which we can
-       only do on bytes objects with refcount == 1. */
-    if (size == 0)
-        size = 1;
-    new = PyBytes_FromStringAndSize(NULL, size);
-    if (new == NULL)
+
+    str = _PyBytesWriter_Alloc(&writer, size);
+    if (str == NULL)
         return NULL;
-    assert(Py_REFCNT(new) == 1);
+    writer.overallocate = 1;
+    size = writer.allocated;
 
     /* Get the iterator */
     it = PyObject_GetIter(x);
@@ -3375,24 +3512,55 @@ PyBytes_FromObject(PyObject *x)
 
         /* Append the byte */
         if (i >= size) {
-            size = 2 * size + 1;
-            if (_PyBytes_Resize(&new, size) < 0)
-                goto error;
+            str = _PyBytesWriter_Resize(&writer, str, size+1);
+            if (str == NULL)
+                return NULL;
+            size = writer.allocated;
         }
-        ((PyBytesObject *)new)->ob_sval[i] = (char) value;
+        *str++ = (char) value;
     }
-    _PyBytes_Resize(&new, i);
-
-    /* Clean up and return success */
     Py_DECREF(it);
-    return new;
+
+    return _PyBytesWriter_Finish(&writer, str);
 
   error:
+    _PyBytesWriter_Dealloc(&writer);
     Py_XDECREF(it);
-    Py_XDECREF(new);
     return NULL;
 }
 
+PyObject *
+PyBytes_FromObject(PyObject *x)
+{
+    if (x == NULL) {
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+
+    if (PyBytes_CheckExact(x)) {
+        Py_INCREF(x);
+        return x;
+    }
+
+    /* Use the modern buffer interface */
+    if (PyObject_CheckBuffer(x))
+        return _PyBytes_FromBuffer(x);
+
+    if (PyList_CheckExact(x))
+        return _PyBytes_FromList(x);
+
+    if (PyTuple_CheckExact(x))
+        return _PyBytes_FromTuple(x);
+
+    if (PyUnicode_Check(x)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "cannot convert unicode object to bytes");
+        return NULL;
+    }
+
+    return _PyBytes_FromIterator(x);
+}
+
 static PyObject *
 bytes_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 {
@@ -3740,3 +3908,277 @@ bytes_iter(PyObject *seq)
     _PyObject_GC_TRACK(it);
     return (PyObject *)it;
 }
+
+
+/* _PyBytesWriter API */
+
+#ifdef MS_WINDOWS
+   /* On Windows, overallocate by 50% is the best factor */
+#  define OVERALLOCATE_FACTOR 2
+#else
+   /* On Linux, overallocate by 25% is the best factor */
+#  define OVERALLOCATE_FACTOR 4
+#endif
+
+void
+_PyBytesWriter_Init(_PyBytesWriter *writer)
+{
+    /* Set all attributes before small_buffer to 0 */
+    memset(writer, 0, offsetof(_PyBytesWriter, small_buffer));
+#ifdef Py_DEBUG
+    memset(writer->small_buffer, 0xCB, sizeof(writer->small_buffer));
+#endif
+}
+
+void
+_PyBytesWriter_Dealloc(_PyBytesWriter *writer)
+{
+    Py_CLEAR(writer->buffer);
+}
+
+Py_LOCAL_INLINE(char*)
+_PyBytesWriter_AsString(_PyBytesWriter *writer)
+{
+    if (writer->use_small_buffer) {
+        assert(writer->buffer == NULL);
+        return writer->small_buffer;
+    }
+    else if (writer->use_bytearray) {
+        assert(writer->buffer != NULL);
+        return PyByteArray_AS_STRING(writer->buffer);
+    }
+    else {
+        assert(writer->buffer != NULL);
+        return PyBytes_AS_STRING(writer->buffer);
+    }
+}
+
+Py_LOCAL_INLINE(Py_ssize_t)
+_PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str)
+{
+    char *start = _PyBytesWriter_AsString(writer);
+    assert(str != NULL);
+    assert(str >= start);
+    assert(str - start <= writer->allocated);
+    return str - start;
+}
+
+Py_LOCAL_INLINE(void)
+_PyBytesWriter_CheckConsistency(_PyBytesWriter *writer, char *str)
+{
+#ifdef Py_DEBUG
+    char *start, *end;
+
+    if (writer->use_small_buffer) {
+        assert(writer->buffer == NULL);
+    }
+    else {
+        assert(writer->buffer != NULL);
+        if (writer->use_bytearray)
+            assert(PyByteArray_CheckExact(writer->buffer));
+        else
+            assert(PyBytes_CheckExact(writer->buffer));
+        assert(Py_REFCNT(writer->buffer) == 1);
+    }
+
+    if (writer->use_bytearray) {
+        /* bytearray has its own overallocation algorithm,
+           writer overallocation must be disabled */
+        assert(!writer->overallocate);
+    }
+
+    assert(0 <= writer->allocated);
+    assert(0 <= writer->min_size && writer->min_size <= writer->allocated);
+    /* the last byte must always be null */
+    start = _PyBytesWriter_AsString(writer);
+    assert(start[writer->allocated] == 0);
+
+    end = start + writer->allocated;
+    assert(str != NULL);
+    assert(start <= str && str <= end);
+#endif
+}
+
+void*
+_PyBytesWriter_Resize(_PyBytesWriter *writer, void *str, Py_ssize_t size)
+{
+    Py_ssize_t allocated, pos;
+
+    _PyBytesWriter_CheckConsistency(writer, str);
+    assert(writer->allocated < size);
+
+    allocated = size;
+    if (writer->overallocate
+        && allocated <= (PY_SSIZE_T_MAX - allocated / OVERALLOCATE_FACTOR)) {
+        /* overallocate to limit the number of realloc() */
+        allocated += allocated / OVERALLOCATE_FACTOR;
+    }
+
+    pos = _PyBytesWriter_GetSize(writer, str);
+    if (!writer->use_small_buffer) {
+        if (writer->use_bytearray) {
+            if (PyByteArray_Resize(writer->buffer, allocated))
+                goto error;
+            /* writer->allocated can be smaller than writer->buffer->ob_alloc,
+               but we cannot use ob_alloc because bytes may need to be moved
+               to use the whole buffer. bytearray uses an internal optimization
+               to avoid moving or copying bytes when bytes are removed at the
+               beginning (ex: del bytearray[:1]). */
+        }
+        else {
+            if (_PyBytes_Resize(&writer->buffer, allocated))
+                goto error;
+        }
+    }
+    else {
+        /* convert from stack buffer to bytes object buffer */
+        assert(writer->buffer == NULL);
+
+        if (writer->use_bytearray)
+            writer->buffer = PyByteArray_FromStringAndSize(NULL, allocated);
+        else
+            writer->buffer = PyBytes_FromStringAndSize(NULL, allocated);
+        if (writer->buffer == NULL)
+            goto error;
+
+        if (pos != 0) {
+            char *dest;
+            if (writer->use_bytearray)
+                dest = PyByteArray_AS_STRING(writer->buffer);
+            else
+                dest = PyBytes_AS_STRING(writer->buffer);
+            Py_MEMCPY(dest,
+                      writer->small_buffer,
+                      pos);
+        }
+
+        writer->use_small_buffer = 0;
+#ifdef Py_DEBUG
+        memset(writer->small_buffer, 0xDB, sizeof(writer->small_buffer));
+#endif
+    }
+    writer->allocated = allocated;
+
+    str = _PyBytesWriter_AsString(writer) + pos;
+    _PyBytesWriter_CheckConsistency(writer, str);
+    return str;
+
+error:
+    _PyBytesWriter_Dealloc(writer);
+    return NULL;
+}
+
+void*
+_PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size)
+{
+    Py_ssize_t new_min_size;
+
+    _PyBytesWriter_CheckConsistency(writer, str);
+    assert(size >= 0);
+
+    if (size == 0) {
+        /* nothing to do */
+        return str;
+    }
+
+    if (writer->min_size > PY_SSIZE_T_MAX - size) {
+        PyErr_NoMemory();
+        _PyBytesWriter_Dealloc(writer);
+        return NULL;
+    }
+    new_min_size = writer->min_size + size;
+
+    if (new_min_size > writer->allocated)
+        str = _PyBytesWriter_Resize(writer, str, new_min_size);
+
+    writer->min_size = new_min_size;
+    return str;
+}
+
+/* Allocate the buffer to write size bytes.
+   Return the pointer to the beginning of buffer data.
+   Raise an exception and return NULL on error. */
+void*
+_PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size)
+{
+    /* ensure that _PyBytesWriter_Alloc() is only called once */
+    assert(writer->min_size == 0 && writer->buffer == NULL);
+    assert(size >= 0);
+
+    writer->use_small_buffer = 1;
+#ifdef Py_DEBUG
+    writer->allocated = sizeof(writer->small_buffer) - 1;
+    /* In debug mode, don't use the full small buffer because it is less
+       efficient than bytes and bytearray objects to detect buffer underflow
+       and buffer overflow. Use 10 bytes of the small buffer to test also
+       code using the smaller buffer in debug mode.
+
+       Don't modify the _PyBytesWriter structure (use a shorter small buffer)
+       in debug mode to also be able to detect stack overflow when running
+       tests in debug mode. The _PyBytesWriter is large (more than 512 bytes),
+       if Py_EnterRecursiveCall() is not used in deep C callback, we may hit a
+       stack overflow. */
+    writer->allocated = Py_MIN(writer->allocated, 10);
+    /* _PyBytesWriter_CheckConsistency() requires the last byte to be 0,
+       to detect buffer overflow */
+    writer->small_buffer[writer->allocated] = 0;
+#else
+    writer->allocated = sizeof(writer->small_buffer);
+#endif
+    return _PyBytesWriter_Prepare(writer, writer->small_buffer, size);
+}
+
+PyObject *
+_PyBytesWriter_Finish(_PyBytesWriter *writer, void *str)
+{
+    Py_ssize_t size;
+    PyObject *result;
+
+    _PyBytesWriter_CheckConsistency(writer, str);
+
+    size = _PyBytesWriter_GetSize(writer, str);
+    if (size == 0 && !writer->use_bytearray) {
+        Py_CLEAR(writer->buffer);
+        /* Get the empty byte string singleton */
+        result = PyBytes_FromStringAndSize(NULL, 0);
+    }
+    else if (writer->use_small_buffer) {
+        result = PyBytes_FromStringAndSize(writer->small_buffer, size);
+    }
+    else {
+        result = writer->buffer;
+        writer->buffer = NULL;
+
+        if (size != writer->allocated) {
+            if (writer->use_bytearray) {
+                if (PyByteArray_Resize(result, size)) {
+                    Py_DECREF(result);
+                    return NULL;
+                }
+            }
+            else {
+                if (_PyBytes_Resize(&result, size)) {
+                    assert(result == NULL);
+                    return NULL;
+                }
+            }
+        }
+    }
+    return result;
+}
+
+void*
+_PyBytesWriter_WriteBytes(_PyBytesWriter *writer, void *ptr,
+                          const void *bytes, Py_ssize_t size)
+{
+    char *str = (char *)ptr;
+
+    str = _PyBytesWriter_Prepare(writer, str, size);
+    if (str == NULL)
+        return NULL;
+
+    Py_MEMCPY(str, bytes, size);
+    str += size;
+
+    return str;
+}
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index f53cd40796..6f57db01c0 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -1165,39 +1165,42 @@ _PyDict_GetItemIdWithError(PyObject *dp, struct _Py_Identifier *key)
     return PyDict_GetItemWithError(dp, kv);
 }
 
-/* Fast version of global value lookup.
+/* Fast version of global value lookup (LOAD_GLOBAL).
  * Lookup in globals, then builtins.
+ *
+ * Raise an exception and return NULL if an error occurred (ex: computing the
+ * key hash failed, key comparison failed, ...). Return NULL if the key doesn't
+ * exist. Return the value if the key exists.
  */
 PyObject *
 _PyDict_LoadGlobal(PyDictObject *globals, PyDictObject *builtins, PyObject *key)
 {
-    PyObject *x;
-    if (PyUnicode_CheckExact(key)) {
-        PyObject **value_addr;
-        Py_hash_t hash = ((PyASCIIObject *)key)->hash;
-        if (hash != -1) {
-            PyDictKeyEntry *e;
-            e = globals->ma_keys->dk_lookup(globals, key, hash, &value_addr);
-            if (e == NULL) {
-                return NULL;
-            }
-            x = *value_addr;
-            if (x != NULL)
-                return x;
-            e = builtins->ma_keys->dk_lookup(builtins, key, hash, &value_addr);
-            if (e == NULL) {
-                return NULL;
-            }
-            x = *value_addr;
-            return x;
-        }
+    Py_hash_t hash;
+    PyDictKeyEntry *entry;
+    PyObject **value_addr;
+    PyObject *value;
+
+    if (!PyUnicode_CheckExact(key) ||
+        (hash = ((PyASCIIObject *) key)->hash) == -1)
+    {
+        hash = PyObject_Hash(key);
+        if (hash == -1)
+            return NULL;
     }
-    x = PyDict_GetItemWithError((PyObject *)globals, key);
-    if (x != NULL)
-        return x;
-    if (PyErr_Occurred())
+
+    /* namespace 1: globals */
+    entry = globals->ma_keys->dk_lookup(globals, key, hash, &value_addr);
+    if (entry == NULL)
         return NULL;
-    return PyDict_GetItemWithError((PyObject *)builtins, key);
+    value = *value_addr;
+    if (value != NULL)
+        return value;
+
+    /* namespace 2: builtins */
+    entry = builtins->ma_keys->dk_lookup(builtins, key, hash, &value_addr);
+    if (entry == NULL)
+        return NULL;
+    return *value_addr;
 }
 
 /* CAUTION: PyDict_SetItem() must guarantee that it won't resize the
diff --git a/Objects/listsort.txt b/Objects/listsort.txt
index 832e4f2ea6..fef982f8a6 100644
--- a/Objects/listsort.txt
+++ b/Objects/listsort.txt
@@ -486,7 +486,7 @@ sub-run, yet finding such very efficiently when they exist.
 I first learned about the galloping strategy in a related context; see:
 
     "Adaptive Set Intersections, Unions, and Differences" (2000)
-    Erik D. Demaine, Alejandro L�pez-Ortiz, J. Ian Munro
+    Erik D. Demaine, Alejandro López-Ortiz, J. Ian Munro
 
 and its followup(s).  An earlier paper called the same strategy
 "exponential search":
diff --git a/Objects/longobject.c b/Objects/longobject.c
index d821e4bfb6..d05de8bf07 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -1582,10 +1582,12 @@ divrem1(PyLongObject *a, digit n, digit *prem)
 static int
 long_to_decimal_string_internal(PyObject *aa,
                                 PyObject **p_output,
-                                _PyUnicodeWriter *writer)
+                                _PyUnicodeWriter *writer,
+                                _PyBytesWriter *bytes_writer,
+                                char **bytes_str)
 {
     PyLongObject *scratch, *a;
-    PyObject *str;
+    PyObject *str = NULL;
     Py_ssize_t size, strlen, size_a, i, j;
     digit *pout, *pin, rem, tenpow;
     int negative;
@@ -1662,7 +1664,13 @@ long_to_decimal_string_internal(PyObject *aa,
             return -1;
         }
         kind = writer->kind;
-        str = NULL;
+    }
+    else if (bytes_writer) {
+        *bytes_str = _PyBytesWriter_Prepare(bytes_writer, *bytes_str, strlen);
+        if (*bytes_str == NULL) {
+            Py_DECREF(scratch);
+            return -1;
+        }
     }
     else {
         str = PyUnicode_New(strlen, '9');
@@ -1673,13 +1681,8 @@ long_to_decimal_string_internal(PyObject *aa,
         kind = PyUnicode_KIND(str);
     }
 
-#define WRITE_DIGITS(TYPE)                                            \
+#define WRITE_DIGITS(p)                                               \
     do {                                                              \
-        if (writer)                                                   \
-            p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + strlen; \
-        else                                                          \
-            p = (TYPE*)PyUnicode_DATA(str) + strlen;                  \
-                                                                      \
         /* pout[0] through pout[size-2] contribute exactly            \
            _PyLong_DECIMAL_SHIFT digits each */                       \
         for (i=0; i < size - 1; i++) {                                \
@@ -1699,6 +1702,16 @@ long_to_decimal_string_internal(PyObject *aa,
         /* and sign */                                                \
         if (negative)                                                 \
             *--p = '-';                                               \
+    } while (0)
+
+#define WRITE_UNICODE_DIGITS(TYPE)                                    \
+    do {                                                              \
+        if (writer)                                                   \
+            p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + strlen; \
+        else                                                          \
+            p = (TYPE*)PyUnicode_DATA(str) + strlen;                  \
+                                                                      \
+        WRITE_DIGITS(p);                                              \
                                                                       \
         /* check we've counted correctly */                           \
         if (writer)                                                   \
@@ -1708,25 +1721,34 @@ long_to_decimal_string_internal(PyObject *aa,
     } while (0)
 
     /* fill the string right-to-left */
-    if (kind == PyUnicode_1BYTE_KIND) {
+    if (bytes_writer) {
+        char *p = *bytes_str + strlen;
+        WRITE_DIGITS(p);
+        assert(p == *bytes_str);
+    }
+    else if (kind == PyUnicode_1BYTE_KIND) {
         Py_UCS1 *p;
-        WRITE_DIGITS(Py_UCS1);
+        WRITE_UNICODE_DIGITS(Py_UCS1);
     }
     else if (kind == PyUnicode_2BYTE_KIND) {
         Py_UCS2 *p;
-        WRITE_DIGITS(Py_UCS2);
+        WRITE_UNICODE_DIGITS(Py_UCS2);
     }
     else {
         Py_UCS4 *p;
         assert (kind == PyUnicode_4BYTE_KIND);
-        WRITE_DIGITS(Py_UCS4);
+        WRITE_UNICODE_DIGITS(Py_UCS4);
     }
 #undef WRITE_DIGITS
+#undef WRITE_UNICODE_DIGITS
 
     Py_DECREF(scratch);
     if (writer) {
         writer->pos += strlen;
     }
+    else if (bytes_writer) {
+        (*bytes_str) += strlen;
+    }
     else {
         assert(_PyUnicode_CheckConsistency(str, 1));
         *p_output = (PyObject *)str;
@@ -1738,7 +1760,7 @@ static PyObject *
 long_to_decimal_string(PyObject *aa)
 {
     PyObject *v;
-    if (long_to_decimal_string_internal(aa, &v, NULL) == -1)
+    if (long_to_decimal_string_internal(aa, &v, NULL, NULL, NULL) == -1)
         return NULL;
     return v;
 }
@@ -1750,10 +1772,11 @@ long_to_decimal_string(PyObject *aa)
 
 static int
 long_format_binary(PyObject *aa, int base, int alternate,
-                   PyObject **p_output, _PyUnicodeWriter *writer)
+                   PyObject **p_output, _PyUnicodeWriter *writer,
+                   _PyBytesWriter *bytes_writer, char **bytes_str)
 {
     PyLongObject *a = (PyLongObject *)aa;
-    PyObject *v;
+    PyObject *v = NULL;
     Py_ssize_t sz;
     Py_ssize_t size_a;
     enum PyUnicode_Kind kind;
@@ -1810,7 +1833,11 @@ long_format_binary(PyObject *aa, int base, int alternate,
         if (_PyUnicodeWriter_Prepare(writer, sz, 'x') == -1)
             return -1;
         kind = writer->kind;
-        v = NULL;
+    }
+    else if (bytes_writer) {
+        *bytes_str = _PyBytesWriter_Prepare(bytes_writer, *bytes_str, sz);
+        if (*bytes_str == NULL)
+            return -1;
     }
     else {
         v = PyUnicode_New(sz, 'x');
@@ -1819,13 +1846,8 @@ long_format_binary(PyObject *aa, int base, int alternate,
         kind = PyUnicode_KIND(v);
     }
 
-#define WRITE_DIGITS(TYPE)                                              \
+#define WRITE_DIGITS(p)                                                 \
     do {                                                                \
-        if (writer)                                                     \
-            p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + sz; \
-        else                                                            \
-            p = (TYPE*)PyUnicode_DATA(v) + sz;                          \
-                                                                        \
         if (size_a == 0) {                                              \
             *--p = '0';                                                 \
         }                                                               \
@@ -1860,30 +1882,50 @@ long_format_binary(PyObject *aa, int base, int alternate,
         }                                                               \
         if (negative)                                                   \
             *--p = '-';                                                 \
+    } while (0)
+
+#define WRITE_UNICODE_DIGITS(TYPE)                                      \
+    do {                                                                \
+        if (writer)                                                     \
+            p = (TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos + sz; \
+        else                                                            \
+            p = (TYPE*)PyUnicode_DATA(v) + sz;                          \
+                                                                        \
+        WRITE_DIGITS(p);                                                \
+                                                                        \
         if (writer)                                                     \
             assert(p == ((TYPE*)PyUnicode_DATA(writer->buffer) + writer->pos)); \
         else                                                            \
             assert(p == (TYPE*)PyUnicode_DATA(v));                      \
     } while (0)
 
-    if (kind == PyUnicode_1BYTE_KIND) {
+    if (bytes_writer) {
+        char *p = *bytes_str + sz;
+        WRITE_DIGITS(p);
+        assert(p == *bytes_str);
+    }
+    else if (kind == PyUnicode_1BYTE_KIND) {
         Py_UCS1 *p;
-        WRITE_DIGITS(Py_UCS1);
+        WRITE_UNICODE_DIGITS(Py_UCS1);
     }
     else if (kind == PyUnicode_2BYTE_KIND) {
         Py_UCS2 *p;
-        WRITE_DIGITS(Py_UCS2);
+        WRITE_UNICODE_DIGITS(Py_UCS2);
     }
     else {
         Py_UCS4 *p;
         assert (kind == PyUnicode_4BYTE_KIND);
-        WRITE_DIGITS(Py_UCS4);
+        WRITE_UNICODE_DIGITS(Py_UCS4);
     }
 #undef WRITE_DIGITS
+#undef WRITE_UNICODE_DIGITS
 
     if (writer) {
         writer->pos += sz;
     }
+    else if (bytes_writer) {
+        (*bytes_str) += sz;
+    }
     else {
         assert(_PyUnicode_CheckConsistency(v, 1));
         *p_output = v;
@@ -1897,9 +1939,9 @@ _PyLong_Format(PyObject *obj, int base)
     PyObject *str;
     int err;
     if (base == 10)
-        err = long_to_decimal_string_internal(obj, &str, NULL);
+        err = long_to_decimal_string_internal(obj, &str, NULL, NULL, NULL);
     else
-        err = long_format_binary(obj, base, 1, &str, NULL);
+        err = long_format_binary(obj, base, 1, &str, NULL, NULL, NULL);
     if (err == -1)
         return NULL;
     return str;
@@ -1911,9 +1953,31 @@ _PyLong_FormatWriter(_PyUnicodeWriter *writer,
                      int base, int alternate)
 {
     if (base == 10)
-        return long_to_decimal_string_internal(obj, NULL, writer);
+        return long_to_decimal_string_internal(obj, NULL, writer,
+                                               NULL, NULL);
     else
-        return long_format_binary(obj, base, alternate, NULL, writer);
+        return long_format_binary(obj, base, alternate, NULL, writer,
+                                  NULL, NULL);
+}
+
+char*
+_PyLong_FormatBytesWriter(_PyBytesWriter *writer, char *str,
+                          PyObject *obj,
+                          int base, int alternate)
+{
+    char *str2;
+    int res;
+    str2 = str;
+    if (base == 10)
+        res = long_to_decimal_string_internal(obj, NULL, NULL,
+                                              writer, &str2);
+    else
+        res = long_format_binary(obj, base, alternate, NULL, NULL,
+                                 writer, &str2);
+    if (res < 0)
+        return NULL;
+    assert(str2 != NULL);
+    return str2;
 }
 
 /* Table of digit values for 8-bit string -> integer conversion.
diff --git a/Objects/object.c b/Objects/object.c
index 6fc4df1639..e1718eafcd 100644
--- a/Objects/object.c
+++ b/Objects/object.c
@@ -686,11 +686,10 @@ do_richcompare(PyObject *v, PyObject *w, int op)
         res = (v != w) ? Py_True : Py_False;
         break;
     default:
-        /* XXX Special-case None so it doesn't show as NoneType() */
         PyErr_Format(PyExc_TypeError,
-                     "unorderable types: %.100s() %s %.100s()",
-                     v->ob_type->tp_name,
+                     "'%s' not supported between instances of '%.100s' and '%.100s'",
                      opstrings[op],
+                     v->ob_type->tp_name,
                      w->ob_type->tp_name);
         return NULL;
     }
diff --git a/Objects/setobject.c b/Objects/setobject.c
index 2faaf12954..9468ee673e 100644
--- a/Objects/setobject.c
+++ b/Objects/setobject.c
@@ -29,7 +29,6 @@
 
 #include "Python.h"
 #include "structmember.h"
-#include "stringlib/eq.h"
 
 /* Object used as dummy key to fill deleted entries */
 static PyObject _dummy_struct;
@@ -51,19 +50,20 @@ static PyObject _dummy_struct;
 static setentry *
 set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
 {
-    setentry *table = so->table;
-    setentry *freeslot = NULL;
+    setentry *table;
     setentry *entry;
-    size_t perturb = hash;
+    size_t perturb;
     size_t mask = so->mask;
     size_t i = (size_t)hash & mask; /* Unsigned for defined overflow behavior */
     size_t j;
     int cmp;
 
-    entry = &table[i];
+    entry = &so->table[i];
     if (entry->key == NULL)
         return entry;
 
+    perturb = hash;
+
     while (1) {
         if (entry->hash == hash) {
             PyObject *startkey = entry->key;
@@ -73,8 +73,9 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
                 return entry;
             if (PyUnicode_CheckExact(startkey)
                 && PyUnicode_CheckExact(key)
-                && unicode_eq(startkey, key))
+                && _PyUnicode_EQ(startkey, key))
                 return entry;
+            table = so->table;
             Py_INCREF(startkey);
             cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
             Py_DECREF(startkey);
@@ -86,14 +87,12 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
                 return entry;
             mask = so->mask;                 /* help avoid a register spill */
         }
-        if (entry->hash == -1 && freeslot == NULL)
-            freeslot = entry;
 
         if (i + LINEAR_PROBES <= mask) {
             for (j = 0 ; j < LINEAR_PROBES ; j++) {
                 entry++;
-                if (entry->key == NULL)
-                    goto found_null;
+                if (entry->hash == 0 && entry->key == NULL)
+                    return entry;
                 if (entry->hash == hash) {
                     PyObject *startkey = entry->key;
                     assert(startkey != dummy);
@@ -101,8 +100,9 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
                         return entry;
                     if (PyUnicode_CheckExact(startkey)
                         && PyUnicode_CheckExact(key)
-                        && unicode_eq(startkey, key))
+                        && _PyUnicode_EQ(startkey, key))
                         return entry;
+                    table = so->table;
                     Py_INCREF(startkey);
                     cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
                     Py_DECREF(startkey);
@@ -114,7 +114,104 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
                         return entry;
                     mask = so->mask;
                 }
-                if (entry->hash == -1 && freeslot == NULL)
+            }
+        }
+
+        perturb >>= PERTURB_SHIFT;
+        i = (i * 5 + 1 + perturb) & mask;
+
+        entry = &so->table[i];
+        if (entry->key == NULL)
+            return entry;
+    }
+}
+
+static int set_table_resize(PySetObject *, Py_ssize_t);
+
+static int
+set_add_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
+{
+    setentry *table;
+    setentry *freeslot;
+    setentry *entry;
+    size_t perturb;
+    size_t mask;
+    size_t i;                       /* Unsigned for defined overflow behavior */
+    size_t j;
+    int cmp;
+
+    /* Pre-increment is necessary to prevent arbitrary code in the rich
+       comparison from deallocating the key just before the insertion. */
+    Py_INCREF(key);
+
+  restart:
+
+    mask = so->mask;
+    i = (size_t)hash & mask;
+
+    entry = &so->table[i];
+    if (entry->key == NULL)
+        goto found_unused;
+
+    freeslot = NULL;
+    perturb = hash;
+
+    while (1) {
+        if (entry->hash == hash) {
+            PyObject *startkey = entry->key;
+            /* startkey cannot be a dummy because the dummy hash field is -1 */
+            assert(startkey != dummy);
+            if (startkey == key)
+                goto found_active;
+            if (PyUnicode_CheckExact(startkey)
+                && PyUnicode_CheckExact(key)
+                && _PyUnicode_EQ(startkey, key))
+                goto found_active;
+            table = so->table;
+            Py_INCREF(startkey);
+            cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
+            Py_DECREF(startkey);
+            if (cmp > 0)                                          /* likely */
+                goto found_active;
+            if (cmp < 0)
+                goto comparison_error;
+            /* Continuing the search from the current entry only makes
+               sense if the table and entry are unchanged; otherwise,
+               we have to restart from the beginning */
+            if (table != so->table || entry->key != startkey)
+                goto restart;
+            mask = so->mask;                 /* help avoid a register spill */
+        }
+        else if (entry->hash == -1 && freeslot == NULL)
+            freeslot = entry;
+
+        if (i + LINEAR_PROBES <= mask) {
+            for (j = 0 ; j < LINEAR_PROBES ; j++) {
+                entry++;
+                if (entry->hash == 0 && entry->key == NULL)
+                    goto found_unused_or_dummy;
+                if (entry->hash == hash) {
+                    PyObject *startkey = entry->key;
+                    assert(startkey != dummy);
+                    if (startkey == key)
+                        goto found_active;
+                    if (PyUnicode_CheckExact(startkey)
+                        && PyUnicode_CheckExact(key)
+                        && _PyUnicode_EQ(startkey, key))
+                        goto found_active;
+                    table = so->table;
+                    Py_INCREF(startkey);
+                    cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
+                    Py_DECREF(startkey);
+                    if (cmp > 0)
+                        goto found_active;
+                    if (cmp < 0)
+                        goto comparison_error;
+                    if (table != so->table || entry->key != startkey)
+                        goto restart;
+                    mask = so->mask;
+                }
+                else if (entry->hash == -1 && freeslot == NULL)
                     freeslot = entry;
             }
         }
@@ -122,29 +219,51 @@ set_lookkey(PySetObject *so, PyObject *key, Py_hash_t hash)
         perturb >>= PERTURB_SHIFT;
         i = (i * 5 + 1 + perturb) & mask;
 
-        entry = &table[i];
+        entry = &so->table[i];
         if (entry->key == NULL)
-            goto found_null;
+            goto found_unused_or_dummy;
     }
-  found_null:
-    return freeslot == NULL ? entry : freeslot;
+
+  found_unused_or_dummy:
+    if (freeslot == NULL)
+        goto found_unused;
+    so->used++;
+    freeslot->key = key;
+    freeslot->hash = hash;
+    return 0;
+
+  found_unused:
+    so->fill++;
+    so->used++;
+    entry->key = key;
+    entry->hash = hash;
+    if ((size_t)so->fill*3 < mask*2)
+        return 0;
+    return set_table_resize(so, so->used);
+
+  found_active:
+    Py_DECREF(key);
+    return 0;
+
+  comparison_error:
+    Py_DECREF(key);
+    return -1;
 }
 
 /*
 Internal routine used by set_table_resize() to insert an item which is
 known to be absent from the set.  This routine also assumes that
 the set contains no deleted entries.  Besides the performance benefit,
-using set_insert_clean() in set_table_resize() is dangerous (SF bug #1456209).
-Note that no refcounts are changed by this routine; if needed, the caller
-is responsible for incref'ing `key`.
+there is also safety benefit since using set_add_entry() risks making
+a callback in the middle of a set_table_resize(), see issue 1456209.
+The caller is responsible for updating the key's reference count and
+the setobject's fill and used fields.
 */
 static void
-set_insert_clean(PySetObject *so, PyObject *key, Py_hash_t hash)
+set_insert_clean(setentry *table, size_t mask, PyObject *key, Py_hash_t hash)
 {
-    setentry *table = so->table;
     setentry *entry;
     size_t perturb = hash;
-    size_t mask = (size_t)so->mask;
     size_t i = (size_t)hash & mask;
     size_t j;
 
@@ -165,45 +284,11 @@ set_insert_clean(PySetObject *so, PyObject *key, Py_hash_t hash)
   found_null:
     entry->key = key;
     entry->hash = hash;
-    so->fill++;
-    so->used++;
 }
 
 /* ======== End logic for probing the hash table ========================== */
 /* ======================================================================== */
 
-
-/*
-Internal routine to insert a new key into the table.
-Used by the public insert routine.
-Eats a reference to key.
-*/
-static int
-set_insert_key(PySetObject *so, PyObject *key, Py_hash_t hash)
-{
-    setentry *entry;
-
-    entry = set_lookkey(so, key, hash);
-    if (entry == NULL)
-        return -1;
-    if (entry->key == NULL) {
-        /* UNUSED */
-        entry->key = key;
-        entry->hash = hash;
-        so->fill++;
-        so->used++;
-    } else if (entry->key == dummy) {
-        /* DUMMY */
-        entry->key = key;
-        entry->hash = hash;
-        so->used++;
-    } else {
-        /* ACTIVE */
-        Py_DECREF(key);
-    }
-    return 0;
-}
-
 /*
 Restructure the table by allocating a new table and reinserting all
 keys again.  When entries have been deleted, the new table may
@@ -216,10 +301,13 @@ set_table_resize(PySetObject *so, Py_ssize_t minused)
     setentry *oldtable, *newtable, *entry;
     Py_ssize_t oldfill = so->fill;
     Py_ssize_t oldused = so->used;
+    Py_ssize_t oldmask = so->mask;
+    size_t newmask;
     int is_oldtable_malloced;
     setentry small_copy[PySet_MINSIZE];
 
     assert(minused >= 0);
+    minused = (minused > 50000) ? minused * 2 : minused * 4;
 
     /* Find the smallest table size > minused. */
     /* XXX speed-up with intrinsics */
@@ -267,25 +355,24 @@ set_table_resize(PySetObject *so, Py_ssize_t minused)
     /* Make the set empty, using the new table. */
     assert(newtable != oldtable);
     memset(newtable, 0, sizeof(setentry) * newsize);
-    so->fill = 0;
-    so->used = 0;
+    so->fill = oldused;
+    so->used = oldused;
     so->mask = newsize - 1;
     so->table = newtable;
 
     /* Copy the data over; this is refcount-neutral for active entries;
        dummy entries aren't copied over, of course */
+    newmask = (size_t)so->mask;
     if (oldfill == oldused) {
-        for (entry = oldtable; oldused > 0; entry++) {
+        for (entry = oldtable; entry <= oldtable + oldmask; entry++) {
             if (entry->key != NULL) {
-                oldused--;
-                set_insert_clean(so, entry->key, entry->hash);
+                set_insert_clean(newtable, newmask, entry->key, entry->hash);
             }
         }
     } else {
-        for (entry = oldtable; oldused > 0; entry++) {
+        for (entry = oldtable; entry <= oldtable + oldmask; entry++) {
             if (entry->key != NULL && entry->key != dummy) {
-                oldused--;
-                set_insert_clean(so, entry->key, entry->hash);
+                set_insert_clean(newtable, newmask, entry->key, entry->hash);
             }
         }
     }
@@ -295,31 +382,42 @@ set_table_resize(PySetObject *so, Py_ssize_t minused)
     return 0;
 }
 
-/* CAUTION: set_add_key/entry() must guarantee it won't resize the table */
+static int
+set_contains_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
+{
+    setentry *entry;
+
+    entry = set_lookkey(so, key, hash);
+    if (entry != NULL)
+        return entry->key != NULL;
+    return -1;
+}
+
+#define DISCARD_NOTFOUND 0
+#define DISCARD_FOUND 1
 
 static int
-set_add_entry(PySetObject *so, setentry *entry)
+set_discard_entry(PySetObject *so, PyObject *key, Py_hash_t hash)
 {
-    Py_ssize_t n_used;
-    PyObject *key = entry->key;
-    Py_hash_t hash = entry->hash;
+    setentry *entry;
+    PyObject *old_key;
 
-    assert(so->fill <= so->mask);  /* at least one empty slot */
-    n_used = so->used;
-    Py_INCREF(key);
-    if (set_insert_key(so, key, hash)) {
-        Py_DECREF(key);
+    entry = set_lookkey(so, key, hash);
+    if (entry == NULL)
         return -1;
-    }
-    if (!(so->used > n_used && so->fill*3 >= (so->mask+1)*2))
-        return 0;
-    return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
+    if (entry->key == NULL)
+        return DISCARD_NOTFOUND;
+    old_key = entry->key;
+    entry->key = dummy;
+    entry->hash = -1;
+    so->used--;
+    Py_DECREF(old_key);
+    return DISCARD_FOUND;
 }
 
 static int
 set_add_key(PySetObject *so, PyObject *key)
 {
-    setentry entry;
     Py_hash_t hash;
 
     if (!PyUnicode_CheckExact(key) ||
@@ -328,50 +426,35 @@ set_add_key(PySetObject *so, PyObject *key)
         if (hash == -1)
             return -1;
     }
-    entry.key = key;
-    entry.hash = hash;
-    return set_add_entry(so, &entry);
+    return set_add_entry(so, key, hash);
 }
 
-#define DISCARD_NOTFOUND 0
-#define DISCARD_FOUND 1
-
 static int
-set_discard_entry(PySetObject *so, setentry *oldentry)
+set_contains_key(PySetObject *so, PyObject *key)
 {
-    setentry *entry;
-    PyObject *old_key;
+    Py_hash_t hash;
 
-    entry = set_lookkey(so, oldentry->key, oldentry->hash);
-    if (entry == NULL)
-        return -1;
-    if (entry->key == NULL  ||  entry->key == dummy)
-        return DISCARD_NOTFOUND;
-    old_key = entry->key;
-    entry->key = dummy;
-    entry->hash = -1;
-    so->used--;
-    Py_DECREF(old_key);
-    return DISCARD_FOUND;
+    if (!PyUnicode_CheckExact(key) ||
+        (hash = ((PyASCIIObject *) key)->hash) == -1) {
+        hash = PyObject_Hash(key);
+        if (hash == -1)
+            return -1;
+    }
+    return set_contains_entry(so, key, hash);
 }
 
 static int
 set_discard_key(PySetObject *so, PyObject *key)
 {
-    setentry entry;
     Py_hash_t hash;
 
-    assert (PyAnySet_Check(so));
-
     if (!PyUnicode_CheckExact(key) ||
         (hash = ((PyASCIIObject *) key)->hash) == -1) {
         hash = PyObject_Hash(key);
         if (hash == -1)
             return -1;
     }
-    entry.key = key;
-    entry.hash = hash;
-    return set_discard_entry(so, &entry);
+    return set_discard_entry(so, key, hash);
 }
 
 static void
@@ -452,20 +535,22 @@ set_next(PySetObject *so, Py_ssize_t *pos_ptr, setentry **entry_ptr)
 {
     Py_ssize_t i;
     Py_ssize_t mask;
-    setentry *table;
+    setentry *entry;
 
     assert (PyAnySet_Check(so));
     i = *pos_ptr;
     assert(i >= 0);
-    table = so->table;
     mask = so->mask;
-    while (i <= mask && (table[i].key == NULL || table[i].key == dummy))
+    entry = &so->table[i];
+    while (i <= mask && (entry->key == NULL || entry->key == dummy)) {
         i++;
+        entry++;
+    }
     *pos_ptr = i+1;
     if (i > mask)
         return 0;
-    assert(table[i].key != NULL);
-    *entry_ptr = &table[i];
+    assert(entry != NULL);
+    *entry_ptr = entry;
     return 1;
 }
 
@@ -563,8 +648,8 @@ set_merge(PySetObject *so, PyObject *otherset)
      * incrementally resizing as we insert new keys.  Expect
      * that there will be no (or few) overlapping keys.
      */
-    if ((so->fill + other->used)*3 >= (so->mask+1)*2) {
-       if (set_table_resize(so, (so->used + other->used)*2) != 0)
+    if ((so->fill + other->used)*3 >= so->mask*2) {
+       if (set_table_resize(so, so->used + other->used) != 0)
            return -1;
     }
     so_entry = so->table;
@@ -589,11 +674,15 @@ set_merge(PySetObject *so, PyObject *otherset)
 
     /* If our table is empty, we can use set_insert_clean() */
     if (so->fill == 0) {
-        for (i = 0; i <= other->mask; i++, other_entry++) {
+        setentry *newtable = so->table;
+        size_t newmask = (size_t)so->mask;
+        so->fill = other->used;
+        so->used = other->used;
+        for (i = other->mask + 1; i > 0 ; i--, other_entry++) {
             key = other_entry->key;
             if (key != NULL && key != dummy) {
                 Py_INCREF(key);
-                set_insert_clean(so, key, other_entry->hash);
+                set_insert_clean(newtable, newmask, key, other_entry->hash);
             }
         }
         return 0;
@@ -604,46 +693,13 @@ set_merge(PySetObject *so, PyObject *otherset)
         other_entry = &other->table[i];
         key = other_entry->key;
         if (key != NULL && key != dummy) {
-            Py_INCREF(key);
-            if (set_insert_key(so, key, other_entry->hash)) {
-                Py_DECREF(key);
+            if (set_add_entry(so, key, other_entry->hash))
                 return -1;
-            }
         }
     }
     return 0;
 }
 
-static int
-set_contains_entry(PySetObject *so, setentry *entry)
-{
-    PyObject *key;
-    setentry *lu_entry;
-
-    lu_entry = set_lookkey(so, entry->key, entry->hash);
-    if (lu_entry == NULL)
-        return -1;
-    key = lu_entry->key;
-    return key != NULL && key != dummy;
-}
-
-static int
-set_contains_key(PySetObject *so, PyObject *key)
-{
-    setentry entry;
-    Py_hash_t hash;
-
-    if (!PyUnicode_CheckExact(key) ||
-        (hash = ((PyASCIIObject *) key)->hash) == -1) {
-        hash = PyObject_Hash(key);
-        if (hash == -1)
-            return -1;
-    }
-    entry.key = key;
-    entry.hash = hash;
-    return set_contains_entry(so, &entry);
-}
-
 static PyObject *
 set_pop(PySetObject *so)
 {
@@ -685,43 +741,64 @@ set_traverse(PySetObject *so, visitproc visit, void *arg)
     return 0;
 }
 
-static Py_hash_t
-frozenset_hash(PyObject *self)
+/* Work to increase the bit dispersion for closely spaced hash values.
+   This is important because some use cases have many combinations of a
+   small number of elements with nearby hashes so that many distinct
+   combinations collapse to only a handful of distinct hash values. */
+
+static Py_uhash_t
+_shuffle_bits(Py_uhash_t h)
 {
-    /* Most of the constants in this hash algorithm are randomly choosen
-       large primes with "interesting bit patterns" and that passed
-       tests for good collision statistics on a variety of problematic
-       datasets such as:
+    return ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
+}
 
-          ps = []
-          for r in range(21):
-              ps += itertools.combinations(range(20), r)
-          num_distinct_hashes = len({hash(frozenset(s)) for s in ps})
+/* Most of the constants in this hash algorithm are randomly chosen
+   large primes with "interesting bit patterns" and that passed tests
+   for good collision statistics on a variety of problematic datasets
+   including powersets and graph structures (such as David Eppstein's
+   graph recipes in Lib/test/test_set.py) */
 
-    */
+static Py_hash_t
+frozenset_hash(PyObject *self)
+{
     PySetObject *so = (PySetObject *)self;
-    Py_uhash_t h, hash = 1927868237UL;
+    Py_uhash_t hash = 0;
     setentry *entry;
-    Py_ssize_t pos = 0;
 
     if (so->hash != -1)
         return so->hash;
 
-    hash *= (Py_uhash_t)PySet_GET_SIZE(self) + 1;
-    while (set_next(so, &pos, &entry)) {
-        /* Work to increase the bit dispersion for closely spaced hash
-           values.  This is important because some use cases have many
-           combinations of a small number of elements with nearby
-           hashes so that many distinct combinations collapse to only
-           a handful of distinct hash values. */
-        h = entry->hash;
-        hash ^= ((h ^ 89869747UL) ^ (h << 16)) * 3644798167UL;
-    }
-    /* Make the final result spread-out in a different pattern
-       than the algorithm for tuples or other python objects. */
+    /* Xor-in shuffled bits from every entry's hash field because xor is
+       commutative and a frozenset hash should be independent of order.
+
+       For speed, include null entries and dummy entries and then
+       subtract out their effect afterwards so that the final hash
+       depends only on active entries.  This allows the code to be
+       vectorized by the compiler and it saves the unpredictable
+       branches that would arise when trying to exclude null and dummy
+       entries on every iteration. */
+
+    for (entry = so->table; entry <= &so->table[so->mask]; entry++)
+        hash ^= _shuffle_bits(entry->hash);
+
+    /* Remove the effect of an odd number of NULL entries */
+    if ((so->mask + 1 - so->fill) & 1)
+        hash ^= _shuffle_bits(0);
+
+    /* Remove the effect of an odd number of dummy entries */
+    if ((so->fill - so->used) & 1)
+        hash ^= _shuffle_bits(-1);
+
+    /* Factor in the number of active entries */
+    hash ^= ((Py_uhash_t)PySet_GET_SIZE(self) + 1) * 1927868237UL;
+
+    /* Disperse patterns arising in nested frozensets */
     hash = hash * 69069U + 907133923UL;
+
+    /* -1 is reserved as an error code */
     if (hash == (Py_uhash_t)-1)
         hash = 590923713UL;
+
     so->hash = hash;
     return hash;
 }
@@ -868,7 +945,7 @@ PyTypeObject PySetIter_Type = {
     PyObject_GenericGetAttr,                    /* tp_getattro */
     0,                                          /* tp_setattro */
     0,                                          /* tp_as_buffer */
-    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,    /* tp_flags */
     0,                                          /* tp_doc */
     (traverseproc)setiter_traverse,             /* tp_traverse */
     0,                                          /* tp_clear */
@@ -913,18 +990,14 @@ set_update_internal(PySetObject *so, PyObject *other)
         * incrementally resizing as we insert new keys.  Expect
         * that there will be no (or few) overlapping keys.
         */
-        if (dictsize == -1)
+        if (dictsize < 0)
             return -1;
-        if ((so->fill + dictsize)*3 >= (so->mask+1)*2) {
-            if (set_table_resize(so, (so->used + dictsize)*2) != 0)
+        if ((so->fill + dictsize)*3 >= so->mask*2) {
+            if (set_table_resize(so, so->used + dictsize) != 0)
                 return -1;
         }
         while (_PyDict_Next(other, &pos, &key, &value, &hash)) {
-            setentry an_entry;
-
-            an_entry.hash = hash;
-            an_entry.key = key;
-            if (set_add_entry(so, &an_entry))
+            if (set_add_entry(so, key, hash))
                 return -1;
         }
         return 0;
@@ -1204,6 +1277,8 @@ set_intersection(PySetObject *so, PyObject *other)
 {
     PySetObject *result;
     PyObject *key, *it, *tmp;
+    Py_hash_t hash;
+    int rv;
 
     if ((PyObject *)so == other)
         return set_copy(so);
@@ -1223,13 +1298,15 @@ set_intersection(PySetObject *so, PyObject *other)
         }
 
         while (set_next((PySetObject *)other, &pos, &entry)) {
-            int rv = set_contains_entry(so, entry);
-            if (rv == -1) {
+            key = entry->key;
+            hash = entry->hash;
+            rv = set_contains_entry(so, key, hash);
+            if (rv < 0) {
                 Py_DECREF(result);
                 return NULL;
             }
             if (rv) {
-                if (set_add_entry(result, entry)) {
+                if (set_add_entry(result, key, hash)) {
                     Py_DECREF(result);
                     return NULL;
                 }
@@ -1245,32 +1322,15 @@ set_intersection(PySetObject *so, PyObject *other)
     }
 
     while ((key = PyIter_Next(it)) != NULL) {
-        int rv;
-        setentry entry;
-        Py_hash_t hash = PyObject_Hash(key);
-
-        if (hash == -1) {
-            Py_DECREF(it);
-            Py_DECREF(result);
-            Py_DECREF(key);
-            return NULL;
-        }
-        entry.hash = hash;
-        entry.key = key;
-        rv = set_contains_entry(so, &entry);
-        if (rv == -1) {
-            Py_DECREF(it);
-            Py_DECREF(result);
-            Py_DECREF(key);
-            return NULL;
-        }
+        hash = PyObject_Hash(key);
+        if (hash == -1)
+            goto error;
+        rv = set_contains_entry(so, key, hash);
+        if (rv < 0)
+            goto error;
         if (rv) {
-            if (set_add_entry(result, &entry)) {
-                Py_DECREF(it);
-                Py_DECREF(result);
-                Py_DECREF(key);
-                return NULL;
-            }
+            if (set_add_entry(result, key, hash))
+                goto error;
         }
         Py_DECREF(key);
     }
@@ -1280,6 +1340,11 @@ set_intersection(PySetObject *so, PyObject *other)
         return NULL;
     }
     return (PyObject *)result;
+  error:
+    Py_DECREF(it);
+    Py_DECREF(result);
+    Py_DECREF(key);
+    return NULL;
 }
 
 static PyObject *
@@ -1366,6 +1431,7 @@ static PyObject *
 set_isdisjoint(PySetObject *so, PyObject *other)
 {
     PyObject *key, *it, *tmp;
+    int rv;
 
     if ((PyObject *)so == other) {
         if (PySet_GET_SIZE(so) == 0)
@@ -1384,8 +1450,8 @@ set_isdisjoint(PySetObject *so, PyObject *other)
             other = tmp;
         }
         while (set_next((PySetObject *)other, &pos, &entry)) {
-            int rv = set_contains_entry(so, entry);
-            if (rv == -1)
+            rv = set_contains_entry(so, entry->key, entry->hash);
+            if (rv < 0)
                 return NULL;
             if (rv)
                 Py_RETURN_FALSE;
@@ -1398,8 +1464,6 @@ set_isdisjoint(PySetObject *so, PyObject *other)
         return NULL;
 
     while ((key = PyIter_Next(it)) != NULL) {
-        int rv;
-        setentry entry;
         Py_hash_t hash = PyObject_Hash(key);
 
         if (hash == -1) {
@@ -1407,11 +1471,9 @@ set_isdisjoint(PySetObject *so, PyObject *other)
             Py_DECREF(it);
             return NULL;
         }
-        entry.hash = hash;
-        entry.key = key;
-        rv = set_contains_entry(so, &entry);
+        rv = set_contains_entry(so, key, hash);
         Py_DECREF(key);
-        if (rv == -1) {
+        if (rv < 0) {
             Py_DECREF(it);
             return NULL;
         }
@@ -1440,7 +1502,7 @@ set_difference_update_internal(PySetObject *so, PyObject *other)
         Py_ssize_t pos = 0;
 
         while (set_next((PySetObject *)other, &pos, &entry))
-            if (set_discard_entry(so, entry) == -1)
+            if (set_discard_entry(so, entry->key, entry->hash) < 0)
                 return -1;
     } else {
         PyObject *key, *it;
@@ -1449,7 +1511,7 @@ set_difference_update_internal(PySetObject *so, PyObject *other)
             return -1;
 
         while ((key = PyIter_Next(it)) != NULL) {
-            if (set_discard_key(so, key) == -1) {
+            if (set_discard_key(so, key) < 0) {
                 Py_DECREF(it);
                 Py_DECREF(key);
                 return -1;
@@ -1460,10 +1522,10 @@ set_difference_update_internal(PySetObject *so, PyObject *other)
         if (PyErr_Occurred())
             return -1;
     }
-    /* If more than 1/5 are dummies, then resize them away. */
-    if ((so->fill - so->used) * 5 < so->mask)
+    /* If more than 1/4th are dummies, then resize them away. */
+    if ((size_t)(so->fill - so->used) <= (size_t)so->mask / 4)
         return 0;
-    return set_table_resize(so, so->used>50000 ? so->used*2 : so->used*4);
+    return set_table_resize(so, so->used);
 }
 
 static PyObject *
@@ -1490,7 +1552,7 @@ set_copy_and_difference(PySetObject *so, PyObject *other)
     result = set_copy(so);
     if (result == NULL)
         return NULL;
-    if (set_difference_update_internal((PySetObject *) result, other) != -1)
+    if (set_difference_update_internal((PySetObject *) result, other) == 0)
         return result;
     Py_DECREF(result);
     return NULL;
@@ -1500,8 +1562,11 @@ static PyObject *
 set_difference(PySetObject *so, PyObject *other)
 {
     PyObject *result;
+    PyObject *key;
+    Py_hash_t hash;
     setentry *entry;
     Py_ssize_t pos = 0;
+    int rv;
 
     if (!PyAnySet_Check(other)  && !PyDict_CheckExact(other)) {
         return set_copy_and_difference(so, other);
@@ -1519,17 +1584,15 @@ set_difference(PySetObject *so, PyObject *other)
 
     if (PyDict_CheckExact(other)) {
         while (set_next(so, &pos, &entry)) {
-            setentry entrycopy;
-            int rv;
-            entrycopy.hash = entry->hash;
-            entrycopy.key = entry->key;
-            rv = _PyDict_Contains(other, entry->key, entry->hash);
+            key = entry->key;
+            hash = entry->hash;
+            rv = _PyDict_Contains(other, key, hash);
             if (rv < 0) {
                 Py_DECREF(result);
                 return NULL;
             }
             if (!rv) {
-                if (set_add_entry((PySetObject *)result, &entrycopy)) {
+                if (set_add_entry((PySetObject *)result, key, hash)) {
                     Py_DECREF(result);
                     return NULL;
                 }
@@ -1540,13 +1603,15 @@ set_difference(PySetObject *so, PyObject *other)
 
     /* Iterate over so, checking for common elements in other. */
     while (set_next(so, &pos, &entry)) {
-        int rv = set_contains_entry((PySetObject *)other, entry);
-        if (rv == -1) {
+        key = entry->key;
+        hash = entry->hash;
+        rv = set_contains_entry((PySetObject *)other, key, hash);
+        if (rv < 0) {
             Py_DECREF(result);
             return NULL;
         }
         if (!rv) {
-            if (set_add_entry((PySetObject *)result, entry)) {
+            if (set_add_entry((PySetObject *)result, key, hash)) {
                 Py_DECREF(result);
                 return NULL;
             }
@@ -1608,29 +1673,24 @@ set_symmetric_difference_update(PySetObject *so, PyObject *other)
     PySetObject *otherset;
     PyObject *key;
     Py_ssize_t pos = 0;
+    Py_hash_t hash;
     setentry *entry;
+    int rv;
 
     if ((PyObject *)so == other)
         return set_clear(so);
 
     if (PyDict_CheckExact(other)) {
         PyObject *value;
-        int rv;
-        Py_hash_t hash;
         while (_PyDict_Next(other, &pos, &key, &value, &hash)) {
-            setentry an_entry;
-
             Py_INCREF(key);
-            an_entry.hash = hash;
-            an_entry.key = key;
-
-            rv = set_discard_entry(so, &an_entry);
-            if (rv == -1) {
+            rv = set_discard_entry(so, key, hash);
+            if (rv < 0) {
                 Py_DECREF(key);
                 return NULL;
             }
             if (rv == DISCARD_NOTFOUND) {
-                if (set_add_entry(so, &an_entry)) {
+                if (set_add_entry(so, key, hash)) {
                     Py_DECREF(key);
                     return NULL;
                 }
@@ -1650,13 +1710,15 @@ set_symmetric_difference_update(PySetObject *so, PyObject *other)
     }
 
     while (set_next(otherset, &pos, &entry)) {
-        int rv = set_discard_entry(so, entry);
-        if (rv == -1) {
+        key = entry->key;
+        hash = entry->hash;
+        rv = set_discard_entry(so, key, hash);
+        if (rv < 0) {
             Py_DECREF(otherset);
             return NULL;
         }
         if (rv == DISCARD_NOTFOUND) {
-            if (set_add_entry(so, entry)) {
+            if (set_add_entry(so, key, hash)) {
                 Py_DECREF(otherset);
                 return NULL;
             }
@@ -1718,6 +1780,7 @@ set_issubset(PySetObject *so, PyObject *other)
 {
     setentry *entry;
     Py_ssize_t pos = 0;
+    int rv;
 
     if (!PyAnySet_Check(other)) {
         PyObject *tmp, *result;
@@ -1732,8 +1795,8 @@ set_issubset(PySetObject *so, PyObject *other)
         Py_RETURN_FALSE;
 
     while (set_next(so, &pos, &entry)) {
-        int rv = set_contains_entry((PySetObject *)other, entry);
-        if (rv == -1)
+        rv = set_contains_entry((PySetObject *)other, entry->key, entry->hash);
+        if (rv < 0)
             return NULL;
         if (!rv)
             Py_RETURN_FALSE;
@@ -1824,7 +1887,7 @@ set_contains(PySetObject *so, PyObject *key)
     int rv;
 
     rv = set_contains_key(so, key);
-    if (rv == -1) {
+    if (rv < 0) {
         if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError))
             return -1;
         PyErr_Clear();
@@ -1843,7 +1906,7 @@ set_direct_contains(PySetObject *so, PyObject *key)
     long result;
 
     result = set_contains(so, key);
-    if (result == -1)
+    if (result < 0)
         return NULL;
     return PyBool_FromLong(result);
 }
@@ -1857,7 +1920,7 @@ set_remove(PySetObject *so, PyObject *key)
     int rv;
 
     rv = set_discard_key(so, key);
-    if (rv == -1) {
+    if (rv < 0) {
         if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError))
             return NULL;
         PyErr_Clear();
@@ -1866,7 +1929,7 @@ set_remove(PySetObject *so, PyObject *key)
             return NULL;
         rv = set_discard_key(so, tmpkey);
         Py_DECREF(tmpkey);
-        if (rv == -1)
+        if (rv < 0)
             return NULL;
     }
 
@@ -1889,7 +1952,7 @@ set_discard(PySetObject *so, PyObject *key)
     int rv;
 
     rv = set_discard_key(so, key);
-    if (rv == -1) {
+    if (rv < 0) {
         if (!PySet_Check(key) || !PyErr_ExceptionMatches(PyExc_TypeError))
             return NULL;
         PyErr_Clear();
@@ -1898,7 +1961,7 @@ set_discard(PySetObject *so, PyObject *key)
             return NULL;
         rv = set_discard_key(so, tmpkey);
         Py_DECREF(tmpkey);
-        if (rv == -1)
+        if (rv < 0)
             return NULL;
     }
     Py_RETURN_NONE;
@@ -2125,7 +2188,7 @@ static PyMethodDef frozenset_methods[] = {
      copy_doc},
     {"difference",      (PyCFunction)set_difference_multi,      METH_VARARGS,
      difference_doc},
-    {"intersection",(PyCFunction)set_intersection_multi,        METH_VARARGS,
+    {"intersection",    (PyCFunction)set_intersection_multi,    METH_VARARGS,
      intersection_doc},
     {"isdisjoint",      (PyCFunction)set_isdisjoint,    METH_O,
      isdisjoint_doc},
@@ -2196,7 +2259,7 @@ PyTypeObject PyFrozenSet_Type = {
     (traverseproc)set_traverse,         /* tp_traverse */
     (inquiry)set_clear_internal,        /* tp_clear */
     (richcmpfunc)set_richcompare,       /* tp_richcompare */
-    offsetof(PySetObject, weakreflist),         /* tp_weaklistoffset */
+    offsetof(PySetObject, weakreflist), /* tp_weaklistoffset */
     (getiterfunc)set_iter,              /* tp_iter */
     0,                                  /* tp_iternext */
     frozenset_methods,                  /* tp_methods */
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index 0fc6b582d2..2beb604f11 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -263,50 +263,34 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
 
     Py_ssize_t i;                /* index into s of next input byte */
-    PyObject *result;            /* result string object */
     char *p;                     /* next free byte in output buffer */
-    Py_ssize_t nallocated;      /* number of result bytes allocated */
-    Py_ssize_t nneeded;            /* number of result bytes needed */
 #if STRINGLIB_SIZEOF_CHAR > 1
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
     PyObject *rep = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 #endif
 #if STRINGLIB_SIZEOF_CHAR == 1
     const Py_ssize_t max_char_size = 2;
-    char stackbuf[MAX_SHORT_UNICHARS * 2];
 #elif STRINGLIB_SIZEOF_CHAR == 2
     const Py_ssize_t max_char_size = 3;
-    char stackbuf[MAX_SHORT_UNICHARS * 3];
 #else /*  STRINGLIB_SIZEOF_CHAR == 4 */
     const Py_ssize_t max_char_size = 4;
-    char stackbuf[MAX_SHORT_UNICHARS * 4];
 #endif
+    _PyBytesWriter writer;
 
     assert(size >= 0);
+    _PyBytesWriter_Init(&writer);
 
-    if (size <= MAX_SHORT_UNICHARS) {
-        /* Write into the stack buffer; nallocated can't overflow.
-         * At the end, we'll allocate exactly as much heap space as it
-         * turns out we need.
-         */
-        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
-        result = NULL;   /* will allocate after we're done */
-        p = stackbuf;
-    }
-    else {
-        if (size > PY_SSIZE_T_MAX / max_char_size) {
-            /* integer overflow */
-            return PyErr_NoMemory();
-        }
-        /* Overallocate on the heap, and give the excess back at the end. */
-        nallocated = size * max_char_size;
-        result = PyBytes_FromStringAndSize(NULL, nallocated);
-        if (result == NULL)
-            return NULL;
-        p = PyBytes_AS_STRING(result);
+    if (size > PY_SSIZE_T_MAX / max_char_size) {
+        /* integer overflow */
+        return PyErr_NoMemory();
     }
 
+    p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
+    if (p == NULL)
+        return NULL;
+
     for (i = 0; i < size;) {
         Py_UCS4 ch = data[i++];
 
@@ -326,72 +310,118 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
         }
 #if STRINGLIB_SIZEOF_CHAR > 1
         else if (Py_UNICODE_IS_SURROGATE(ch)) {
-            Py_ssize_t newpos;
-            Py_ssize_t repsize, k, startpos;
+            Py_ssize_t startpos, endpos, newpos;
+            Py_ssize_t k;
+            if (error_handler == _Py_ERROR_UNKNOWN)
+                error_handler = get_error_handler(errors);
+
             startpos = i-1;
-            rep = unicode_encode_call_errorhandler(
-                  errors, &errorHandler, "utf-8", "surrogates not allowed",
-                  unicode, &exc, startpos, startpos+1, &newpos);
-            if (!rep)
-                goto error;
-
-            if (PyBytes_Check(rep))
-                repsize = PyBytes_GET_SIZE(rep);
-            else
-                repsize = PyUnicode_GET_LENGTH(rep);
+            endpos = startpos+1;
+
+            while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
+                endpos++;
+
+            /* Only overallocate the buffer if it's not the last write */
+            writer.overallocate = (endpos < size);
+
+            switch (error_handler)
+            {
+            case _Py_ERROR_REPLACE:
+                memset(p, '?', endpos - startpos);
+                p += (endpos - startpos);
+                /* fall through the ignore handler */
+            case _Py_ERROR_IGNORE:
+                i += (endpos - startpos - 1);
+                break;
 
-            if (repsize > max_char_size) {
-                Py_ssize_t offset;
+            case _Py_ERROR_SURROGATEPASS:
+                for (k=startpos; k<endpos; k++) {
+                    ch = data[k];
+                    *p++ = (char)(0xe0 | (ch >> 12));
+                    *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+                    *p++ = (char)(0x80 | (ch & 0x3f));
+                }
+                i += (endpos - startpos - 1);
+                break;
 
-                if (result == NULL)
-                    offset = p - stackbuf;
-                else
-                    offset = p - PyBytes_AS_STRING(result);
+            case _Py_ERROR_BACKSLASHREPLACE:
+                /* substract preallocated bytes */
+                writer.min_size -= max_char_size * (endpos - startpos);
+                p = backslashreplace(&writer, p,
+                                     unicode, startpos, endpos);
+                if (p == NULL)
+                    goto error;
+                i += (endpos - startpos - 1);
+                break;
 
-                if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
-                    /* integer overflow */
-                    PyErr_NoMemory();
+            case _Py_ERROR_XMLCHARREFREPLACE:
+                /* substract preallocated bytes */
+                writer.min_size -= max_char_size * (endpos - startpos);
+                p = xmlcharrefreplace(&writer, p,
+                                      unicode, startpos, endpos);
+                if (p == NULL)
                     goto error;
+                i += (endpos - startpos - 1);
+                break;
+
+            case _Py_ERROR_SURROGATEESCAPE:
+                for (k=startpos; k<endpos; k++) {
+                    ch = data[k];
+                    if (!(0xDC80 <= ch && ch <= 0xDCFF))
+                        break;
+                    *p++ = (char)(ch & 0xff);
                 }
-                nallocated += repsize - max_char_size;
-                if (result != NULL) {
-                    if (_PyBytes_Resize(&result, nallocated) < 0)
-                        goto error;
-                } else {
-                    result = PyBytes_FromStringAndSize(NULL, nallocated);
-                    if (result == NULL)
-                        goto error;
-                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
+                if (k >= endpos) {
+                    i += (endpos - startpos - 1);
+                    break;
                 }
-                p = PyBytes_AS_STRING(result) + offset;
-            }
+                startpos = k;
+                assert(startpos < endpos);
+                /* fall through the default handler */
+            default:
+                rep = unicode_encode_call_errorhandler(
+                      errors, &error_handler_obj, "utf-8", "surrogates not allowed",
+                      unicode, &exc, startpos, endpos, &newpos);
+                if (!rep)
+                    goto error;
 
-            if (PyBytes_Check(rep)) {
-                char *prep = PyBytes_AS_STRING(rep);
-                for(k = repsize; k > 0; k--)
-                    *p++ = *prep++;
-            } else /* rep is unicode */ {
-                enum PyUnicode_Kind repkind;
-                void *repdata;
+                /* substract preallocated bytes */
+                writer.min_size -= max_char_size;
 
-                if (PyUnicode_READY(rep) < 0)
-                    goto error;
-                repkind = PyUnicode_KIND(rep);
-                repdata = PyUnicode_DATA(rep);
+                if (PyBytes_Check(rep)) {
+                    p = _PyBytesWriter_WriteBytes(&writer, p,
+                                                  PyBytes_AS_STRING(rep),
+                                                  PyBytes_GET_SIZE(rep));
+                }
+                else {
+                    /* rep is unicode */
+                    if (PyUnicode_READY(rep) < 0)
+                        goto error;
 
-                for(k=0; k<repsize; k++) {
-                    Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
-                    if (0x80 <= c) {
+                    if (!PyUnicode_IS_ASCII(rep)) {
                         raise_encode_exception(&exc, "utf-8",
                                                unicode,
                                                i-1, i,
                                                "surrogates not allowed");
                         goto error;
                     }
-                    *p++ = (char)c;
+
+                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
+                    p = _PyBytesWriter_WriteBytes(&writer, p,
+                                                  PyUnicode_DATA(rep),
+                                                  PyUnicode_GET_LENGTH(rep));
                 }
+
+                if (p == NULL)
+                    goto error;
+                Py_CLEAR(rep);
+
+                i = newpos;
             }
-            Py_CLEAR(rep);
+
+            /* If overallocation was disabled, ensure that it was the last
+               write. Otherwise, we missed an optimization */
+            assert(writer.overallocate || i == size);
         }
         else
 #if STRINGLIB_SIZEOF_CHAR > 2
@@ -416,31 +446,18 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 #endif /* STRINGLIB_SIZEOF_CHAR > 1 */
     }
 
-    if (result == NULL) {
-        /* This was stack allocated. */
-        nneeded = p - stackbuf;
-        assert(nneeded <= nallocated);
-        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
-    }
-    else {
-        /* Cut back to size actually needed. */
-        nneeded = p - PyBytes_AS_STRING(result);
-        assert(nneeded <= nallocated);
-        _PyBytes_Resize(&result, nneeded);
-    }
-
 #if STRINGLIB_SIZEOF_CHAR > 1
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
 #endif
-    return result;
+    return _PyBytesWriter_Finish(&writer, p);
 
 #if STRINGLIB_SIZEOF_CHAR > 1
  error:
     Py_XDECREF(rep);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
-    Py_XDECREF(result);
+    _PyBytesWriter_Dealloc(&writer);
     return NULL;
 #endif
 
diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h
index cda68e77c8..98165ad114 100644
--- a/Objects/stringlib/fastsearch.h
+++ b/Objects/stringlib/fastsearch.h
@@ -32,52 +32,98 @@
 #define STRINGLIB_BLOOM(mask, ch)     \
     ((mask &  (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
 
-
 Py_LOCAL_INLINE(Py_ssize_t)
-STRINGLIB(fastsearch_memchr_1char)(const STRINGLIB_CHAR* s, Py_ssize_t n,
-                                   STRINGLIB_CHAR ch, unsigned char needle,
-                                   int mode)
+STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
 {
-    if (mode == FAST_SEARCH) {
-        const STRINGLIB_CHAR *ptr = s;
-        const STRINGLIB_CHAR *e = s + n;
-        while (ptr < e) {
-            void *candidate = memchr((const void *) ptr, needle, (e - ptr) * sizeof(STRINGLIB_CHAR));
-            if (candidate == NULL)
-                return -1;
-            ptr = (const STRINGLIB_CHAR *) _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR));
-            if (sizeof(STRINGLIB_CHAR) == 1 || *ptr == ch)
-                return (ptr - s);
-            /* False positive */
-            ptr++;
-        }
+    const STRINGLIB_CHAR *p, *e;
+
+    p = s;
+    e = s + n;
+    if (n > 10) {
+#if STRINGLIB_SIZEOF_CHAR == 1
+        p = memchr(s, ch, n);
+        if (p != NULL)
+            return (p - s);
         return -1;
+#else
+        /* use memchr if we can choose a needle without two many likely
+           false positives */
+        unsigned char needle = ch & 0xff;
+        /* If looking for a multiple of 256, we'd have too
+           many false positives looking for the '\0' byte in UCS2
+           and UCS4 representations. */
+        if (needle != 0) {
+            while (p < e) {
+                void *candidate = memchr(p, needle,
+                                         (e - p) * sizeof(STRINGLIB_CHAR));
+                if (candidate == NULL)
+                    return -1;
+                p = (const STRINGLIB_CHAR *)
+                        _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR));
+                if (*p == ch)
+                    return (p - s);
+                /* False positive */
+                p++;
+            }
+            return -1;
+        }
+#endif
     }
+    while (p < e) {
+        if (*p == ch)
+            return (p - s);
+        p++;
+    }
+    return -1;
+}
+
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
+{
+    const STRINGLIB_CHAR *p;
 #ifdef HAVE_MEMRCHR
     /* memrchr() is a GNU extension, available since glibc 2.1.91.
        it doesn't seem as optimized as memchr(), but is still quite
-       faster than our hand-written loop in FASTSEARCH below */
-    else if (mode == FAST_RSEARCH) {
-        while (n > 0) {
-            const STRINGLIB_CHAR *found;
-            void *candidate = memrchr((const void *) s, needle, n * sizeof(STRINGLIB_CHAR));
-            if (candidate == NULL)
-                return -1;
-            found = (const STRINGLIB_CHAR *) _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR));
-            n = found - s;
-            if (sizeof(STRINGLIB_CHAR) == 1 || *found == ch)
-                return n;
-            /* False positive */
-        }
+       faster than our hand-written loop below */
+
+    if (n > 10) {
+#if STRINGLIB_SIZEOF_CHAR == 1
+        p = memrchr(s, ch, n);
+        if (p != NULL)
+            return (p - s);
         return -1;
-    }
+#else
+        /* use memrchr if we can choose a needle without two many likely
+           false positives */
+        unsigned char needle = ch & 0xff;
+        /* If looking for a multiple of 256, we'd have too
+           many false positives looking for the '\0' byte in UCS2
+           and UCS4 representations. */
+        if (needle != 0) {
+            while (n > 0) {
+                void *candidate = memrchr(s, needle,
+                                          n * sizeof(STRINGLIB_CHAR));
+                if (candidate == NULL)
+                    return -1;
+                p = (const STRINGLIB_CHAR *)
+                        _Py_ALIGN_DOWN(candidate, sizeof(STRINGLIB_CHAR));
+                n = p - s;
+                if (*p == ch)
+                    return n;
+                /* False positive */
+            }
+            return -1;
+        }
 #endif
-    else {
-        assert(0); /* Should never get here */
-        return 0;
     }
-
-#undef DO_MEMCHR
+#endif  /* HAVE_MEMRCHR */
+    p = s + n;
+    while (p > s) {
+        p--;
+        if (*p == ch)
+            return (p - s);
+    }
+    return -1;
 }
 
 Py_LOCAL_INLINE(Py_ssize_t)
@@ -99,25 +145,11 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
         if (m <= 0)
             return -1;
         /* use special case for 1-character strings */
-        if (n > 10 && (mode == FAST_SEARCH
-#ifdef HAVE_MEMRCHR
-                    || mode == FAST_RSEARCH
-#endif
-                    )) {
-            /* use memchr if we can choose a needle without two many likely
-               false positives */
-            unsigned char needle;
-            needle = p[0] & 0xff;
-#if STRINGLIB_SIZEOF_CHAR > 1
-            /* If looking for a multiple of 256, we'd have too
-               many false positives looking for the '\0' byte in UCS2
-               and UCS4 representations. */
-            if (needle != 0)
-#endif
-                return STRINGLIB(fastsearch_memchr_1char)
-                       (s, n, p[0], needle, mode);
-        }
-        if (mode == FAST_COUNT) {
+        if (mode == FAST_SEARCH)
+            return STRINGLIB(find_char)(s, n, p[0]);
+        else if (mode == FAST_RSEARCH)
+            return STRINGLIB(rfind_char)(s, n, p[0]);
+        else {  /* FAST_COUNT */
             for (i = 0; i < n; i++)
                 if (s[i] == p[0]) {
                     count++;
@@ -125,14 +157,6 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
                         return maxcount;
                 }
             return count;
-        } else if (mode == FAST_SEARCH) {
-            for (i = 0; i < n; i++)
-                if (s[i] == p[0])
-                    return i;
-        } else {    /* FAST_RSEARCH */
-            for (i = n - 1; i > -1; i--)
-                if (s[i] == p[0])
-                    return i;
         }
         return -1;
     }
diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h
index aec221acff..d72e47d348 100644
--- a/Objects/stringlib/unicode_format.h
+++ b/Objects/stringlib/unicode_format.h
@@ -67,7 +67,7 @@ SubString_new_object(SubString *str)
     return PyUnicode_Substring(str->str, str->start, str->end);
 }
 
-/* return a new string.  if str->str is NULL, return None */
+/* return a new string.  if str->str is NULL, return a new empty string */
 Py_LOCAL_INLINE(PyObject *)
 SubString_new_object_or_empty(SubString *str)
 {
diff --git a/Objects/structseq.c b/Objects/structseq.c
index 664344be6c..720973816e 100644
--- a/Objects/structseq.c
+++ b/Objects/structseq.c
@@ -16,14 +16,14 @@ _Py_IDENTIFIER(n_fields);
 _Py_IDENTIFIER(n_unnamed_fields);
 
 #define VISIBLE_SIZE(op) Py_SIZE(op)
-#define VISIBLE_SIZE_TP(tp) PyLong_AsLong( \
+#define VISIBLE_SIZE_TP(tp) PyLong_AsSsize_t( \
                       _PyDict_GetItemId((tp)->tp_dict, &PyId_n_sequence_fields))
 
-#define REAL_SIZE_TP(tp) PyLong_AsLong( \
+#define REAL_SIZE_TP(tp) PyLong_AsSsize_t( \
                       _PyDict_GetItemId((tp)->tp_dict, &PyId_n_fields))
 #define REAL_SIZE(op) REAL_SIZE_TP(Py_TYPE(op))
 
-#define UNNAMED_FIELDS_TP(tp) PyLong_AsLong( \
+#define UNNAMED_FIELDS_TP(tp) PyLong_AsSsize_t( \
                       _PyDict_GetItemId((tp)->tp_dict, &PyId_n_unnamed_fields))
 #define UNNAMED_FIELDS(op) UNNAMED_FIELDS_TP(Py_TYPE(op))
 
@@ -164,7 +164,8 @@ structseq_repr(PyStructSequence *obj)
 #define TYPE_MAXSIZE 100
 
     PyTypeObject *typ = Py_TYPE(obj);
-    int i, removelast = 0;
+    Py_ssize_t i;
+    int removelast = 0;
     Py_ssize_t len;
     char buf[REPR_BUFFER_SIZE];
     char *endofbuf, *pbuf = buf;
@@ -236,8 +237,7 @@ structseq_reduce(PyStructSequence* self)
     PyObject* tup = NULL;
     PyObject* dict = NULL;
     PyObject* result;
-    Py_ssize_t n_fields, n_visible_fields, n_unnamed_fields;
-    int i;
+    Py_ssize_t n_fields, n_visible_fields, n_unnamed_fields, i;
 
     n_fields = REAL_SIZE(self);
     n_visible_fields = VISIBLE_SIZE(self);
@@ -325,7 +325,7 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc)
 {
     PyObject *dict;
     PyMemberDef* members;
-    int n_members, n_unnamed_members, i, k;
+    Py_ssize_t n_members, n_unnamed_members, i, k;
     PyObject *v;
 
 #ifdef Py_TRACE_REFS
@@ -373,9 +373,9 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc)
     Py_INCREF(type);
 
     dict = type->tp_dict;
-#define SET_DICT_FROM_INT(key, value)                           \
+#define SET_DICT_FROM_SIZE(key, value)                          \
     do {                                                        \
-        v = PyLong_FromLong((long) value);                      \
+        v = PyLong_FromSsize_t(value);                          \
         if (v == NULL)                                          \
             return -1;                                          \
         if (PyDict_SetItemString(dict, key, v) < 0) {           \
@@ -385,9 +385,9 @@ PyStructSequence_InitType2(PyTypeObject *type, PyStructSequence_Desc *desc)
         Py_DECREF(v);                                           \
     } while (0)
 
-    SET_DICT_FROM_INT(visible_length_key, desc->n_in_sequence);
-    SET_DICT_FROM_INT(real_length_key, n_members);
-    SET_DICT_FROM_INT(unnamed_fields_key, n_unnamed_members);
+    SET_DICT_FROM_SIZE(visible_length_key, desc->n_in_sequence);
+    SET_DICT_FROM_SIZE(real_length_key, n_members);
+    SET_DICT_FROM_SIZE(unnamed_fields_key, n_unnamed_members);
 
     return 0;
 }
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index 5b858bde03..341e18c455 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -904,25 +904,33 @@ type_call(PyTypeObject *type, PyObject *args, PyObject *kwds)
 #endif
 
     obj = type->tp_new(type, args, kwds);
-    if (obj != NULL) {
-        /* Ugly exception: when the call was type(something),
-           don't call tp_init on the result. */
-        if (type == &PyType_Type &&
-            PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 1 &&
-            (kwds == NULL ||
-             (PyDict_Check(kwds) && PyDict_Size(kwds) == 0)))
-            return obj;
-        /* If the returned object is not an instance of type,
-           it won't be initialized. */
-        if (!PyType_IsSubtype(Py_TYPE(obj), type))
-            return obj;
-        type = Py_TYPE(obj);
-        if (type->tp_init != NULL) {
-            int res = type->tp_init(obj, args, kwds);
-            if (res < 0) {
-                Py_DECREF(obj);
-                obj = NULL;
-            }
+    obj = _Py_CheckFunctionResult((PyObject*)type, obj, NULL);
+    if (obj == NULL)
+        return NULL;
+
+    /* Ugly exception: when the call was type(something),
+       don't call tp_init on the result. */
+    if (type == &PyType_Type &&
+        PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 1 &&
+        (kwds == NULL ||
+         (PyDict_Check(kwds) && PyDict_Size(kwds) == 0)))
+        return obj;
+
+    /* If the returned object is not an instance of type,
+       it won't be initialized. */
+    if (!PyType_IsSubtype(Py_TYPE(obj), type))
+        return obj;
+
+    type = Py_TYPE(obj);
+    if (type->tp_init != NULL) {
+        int res = type->tp_init(obj, args, kwds);
+        if (res < 0) {
+            assert(PyErr_Occurred());
+            Py_DECREF(obj);
+            obj = NULL;
+        }
+        else {
+            assert(!PyErr_Occurred());
         }
     }
     return obj;
@@ -4093,7 +4101,7 @@ _PyObject_GetItemsIter(PyObject *obj, PyObject **listitems,
 }
 
 static PyObject *
-reduce_newobj(PyObject *obj, int proto)
+reduce_newobj(PyObject *obj)
 {
     PyObject *args = NULL, *kwargs = NULL;
     PyObject *copyreg;
@@ -4151,7 +4159,7 @@ reduce_newobj(PyObject *obj, int proto)
         }
         Py_DECREF(args);
     }
-    else if (proto >= 4) {
+    else {
         _Py_IDENTIFIER(__newobj_ex__);
 
         newobj = _PyObject_GetAttrId(copyreg, &PyId___newobj_ex__);
@@ -4169,16 +4177,6 @@ reduce_newobj(PyObject *obj, int proto)
             return NULL;
         }
     }
-    else {
-        PyErr_SetString(PyExc_ValueError,
-                        "must use protocol 4 or greater to copy this "
-                        "object; since __getnewargs_ex__ returned "
-                        "keyword arguments.");
-        Py_DECREF(args);
-        Py_DECREF(kwargs);
-        Py_DECREF(copyreg);
-        return NULL;
-    }
 
     state = _PyObject_GetState(obj);
     if (state == NULL) {
@@ -4223,7 +4221,7 @@ _common_reduce(PyObject *self, int proto)
     PyObject *copyreg, *res;
 
     if (proto >= 2)
-        return reduce_newobj(self, proto);
+        return reduce_newobj(self);
 
     copyreg = import_copyreg();
     if (!copyreg)
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index c5b35e1194..a985d6fb42 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -42,6 +42,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #include "Python.h"
 #include "ucnhash.h"
 #include "bytes_methods.h"
+#include "stringlib/eq.h"
 
 #ifdef MS_WINDOWS
 #include <windows.h>
@@ -162,6 +163,14 @@ extern "C" {
             *_to++ = (to_type) *_iter++;                \
     } while (0)
 
+#ifdef MS_WINDOWS
+   /* On Windows, overallocate by 50% is the best factor */
+#  define OVERALLOCATE_FACTOR 2
+#else
+   /* On Linux, overallocate by 25% is the best factor */
+#  define OVERALLOCATE_FACTOR 4
+#endif
+
 /* This dictionary holds all interned unicode strings.  Note that references
    to strings in this dictionary are *not* counted in the string's ob_refcnt.
    When the interned string reaches a refcnt of 0 the string deallocation
@@ -292,6 +301,38 @@ static unsigned char ascii_linebreak[] = {
 
 #include "clinic/unicodeobject.c.h"
 
+typedef enum {
+    _Py_ERROR_UNKNOWN=0,
+    _Py_ERROR_STRICT,
+    _Py_ERROR_SURROGATEESCAPE,
+    _Py_ERROR_REPLACE,
+    _Py_ERROR_IGNORE,
+    _Py_ERROR_BACKSLASHREPLACE,
+    _Py_ERROR_SURROGATEPASS,
+    _Py_ERROR_XMLCHARREFREPLACE,
+    _Py_ERROR_OTHER
+} _Py_error_handler;
+
+static _Py_error_handler
+get_error_handler(const char *errors)
+{
+    if (errors == NULL || strcmp(errors, "strict") == 0)
+        return _Py_ERROR_STRICT;
+    if (strcmp(errors, "surrogateescape") == 0)
+        return _Py_ERROR_SURROGATEESCAPE;
+    if (strcmp(errors, "replace") == 0)
+        return _Py_ERROR_REPLACE;
+    if (strcmp(errors, "ignore") == 0)
+        return _Py_ERROR_IGNORE;
+    if (strcmp(errors, "backslashreplace") == 0)
+        return _Py_ERROR_BACKSLASHREPLACE;
+    if (strcmp(errors, "surrogatepass") == 0)
+        return _Py_ERROR_SURROGATEPASS;
+    if (strcmp(errors, "xmlcharrefreplace") == 0)
+        return _Py_ERROR_XMLCHARREFREPLACE;
+    return _Py_ERROR_OTHER;
+}
+
 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
    This function is kept for backward compatibility with the old API. */
 Py_UNICODE
@@ -521,6 +562,129 @@ unicode_result_unchanged(PyObject *unicode)
         return _PyUnicode_Copy(unicode);
 }
 
+/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
+   ASCII, Latin1, UTF-8, etc. */
+static char*
+backslashreplace(_PyBytesWriter *writer, char *str,
+                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
+{
+    Py_ssize_t size, i;
+    Py_UCS4 ch;
+    enum PyUnicode_Kind kind;
+    void *data;
+
+    assert(PyUnicode_IS_READY(unicode));
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    size = 0;
+    /* determine replacement size */
+    for (i = collstart; i < collend; ++i) {
+        Py_ssize_t incr;
+
+        ch = PyUnicode_READ(kind, data, i);
+        if (ch < 0x100)
+            incr = 2+2;
+        else if (ch < 0x10000)
+            incr = 2+4;
+        else {
+            assert(ch <= MAX_UNICODE);
+            incr = 2+8;
+        }
+        if (size > PY_SSIZE_T_MAX - incr) {
+            PyErr_SetString(PyExc_OverflowError,
+                            "encoded result is too long for a Python string");
+            return NULL;
+        }
+        size += incr;
+    }
+
+    str = _PyBytesWriter_Prepare(writer, str, size);
+    if (str == NULL)
+        return NULL;
+
+    /* generate replacement */
+    for (i = collstart; i < collend; ++i) {
+        ch = PyUnicode_READ(kind, data, i);
+        *str++ = '\\';
+        if (ch >= 0x00010000) {
+            *str++ = 'U';
+            *str++ = Py_hexdigits[(ch>>28)&0xf];
+            *str++ = Py_hexdigits[(ch>>24)&0xf];
+            *str++ = Py_hexdigits[(ch>>20)&0xf];
+            *str++ = Py_hexdigits[(ch>>16)&0xf];
+            *str++ = Py_hexdigits[(ch>>12)&0xf];
+            *str++ = Py_hexdigits[(ch>>8)&0xf];
+        }
+        else if (ch >= 0x100) {
+            *str++ = 'u';
+            *str++ = Py_hexdigits[(ch>>12)&0xf];
+            *str++ = Py_hexdigits[(ch>>8)&0xf];
+        }
+        else
+            *str++ = 'x';
+        *str++ = Py_hexdigits[(ch>>4)&0xf];
+        *str++ = Py_hexdigits[ch&0xf];
+    }
+    return str;
+}
+
+/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
+   ASCII, Latin1, UTF-8, etc. */
+static char*
+xmlcharrefreplace(_PyBytesWriter *writer, char *str,
+                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
+{
+    Py_ssize_t size, i;
+    Py_UCS4 ch;
+    enum PyUnicode_Kind kind;
+    void *data;
+
+    assert(PyUnicode_IS_READY(unicode));
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    size = 0;
+    /* determine replacement size */
+    for (i = collstart; i < collend; ++i) {
+        Py_ssize_t incr;
+
+        ch = PyUnicode_READ(kind, data, i);
+        if (ch < 10)
+            incr = 2+1+1;
+        else if (ch < 100)
+            incr = 2+2+1;
+        else if (ch < 1000)
+            incr = 2+3+1;
+        else if (ch < 10000)
+            incr = 2+4+1;
+        else if (ch < 100000)
+            incr = 2+5+1;
+        else if (ch < 1000000)
+            incr = 2+6+1;
+        else {
+            assert(ch <= MAX_UNICODE);
+            incr = 2+7+1;
+        }
+        if (size > PY_SSIZE_T_MAX - incr) {
+            PyErr_SetString(PyExc_OverflowError,
+                            "encoded result is too long for a Python string");
+            return NULL;
+        }
+        size += incr;
+    }
+
+    str = _PyBytesWriter_Prepare(writer, str, size);
+    if (str == NULL)
+        return NULL;
+
+    /* generate replacement */
+    for (i = collstart; i < collend; ++i) {
+        str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
+    }
+    return str;
+}
+
 /* --- Bloom Filters ----------------------------------------------------- */
 
 /* stuff to implement simple "bloom filters" for Unicode characters.
@@ -647,27 +811,26 @@ Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
                                      Py_ssize_t size, Py_UCS4 ch,
                                      int direction)
 {
-    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
-
     switch (kind) {
     case PyUnicode_1BYTE_KIND:
-        {
-            Py_UCS1 ch1 = (Py_UCS1) ch;
-            if (ch1 == ch)
-                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
-            else
-                return -1;
-        }
+        if ((Py_UCS1) ch != ch)
+            return -1;
+        if (direction > 0)
+            return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
+        else
+            return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
     case PyUnicode_2BYTE_KIND:
-        {
-            Py_UCS2 ch2 = (Py_UCS2) ch;
-            if (ch2 == ch)
-                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
-            else
-                return -1;
-        }
+        if ((Py_UCS2) ch != ch)
+            return -1;
+        if (direction > 0)
+            return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
+        else
+            return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
     case PyUnicode_4BYTE_KIND:
-        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
+        if (direction > 0)
+            return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
+        else
+            return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
     default:
         assert(0);
         return -1;
@@ -3165,24 +3328,22 @@ wcstombs_errorpos(const wchar_t *wstr)
 static int
 locale_error_handler(const char *errors, int *surrogateescape)
 {
-    if (errors == NULL) {
-        *surrogateescape = 0;
-        return 0;
-    }
-
-    if (strcmp(errors, "strict") == 0) {
+    _Py_error_handler error_handler = get_error_handler(errors);
+    switch (error_handler)
+    {
+    case _Py_ERROR_STRICT:
         *surrogateescape = 0;
         return 0;
-    }
-    if (strcmp(errors, "surrogateescape") == 0) {
+    case _Py_ERROR_SURROGATEESCAPE:
         *surrogateescape = 1;
         return 0;
+    default:
+        PyErr_Format(PyExc_ValueError,
+                     "only 'strict' and 'surrogateescape' error handlers "
+                     "are supported, not '%s'",
+                     errors);
+        return -1;
     }
-    PyErr_Format(PyExc_ValueError,
-                 "only 'strict' and 'surrogateescape' error handlers "
-                 "are supported, not '%s'",
-                 errors);
-    return -1;
 }
 
 PyObject *
@@ -4688,8 +4849,9 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
     Py_ssize_t startinpos;
     Py_ssize_t endinpos;
     const char *errmsg = "";
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 
     if (size == 0) {
         if (consumed)
@@ -4714,6 +4876,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
     while (s < end) {
         Py_UCS4 ch;
         int kind = writer.kind;
+
         if (kind == PyUnicode_1BYTE_KIND) {
             if (PyUnicode_IS_ASCII(writer.buffer))
                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
@@ -4752,24 +4915,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
             continue;
         }
 
-        if (unicode_decode_call_errorhandler_writer(
-                errors, &errorHandler,
-                "utf-8", errmsg,
-                &starts, &end, &startinpos, &endinpos, &exc, &s,
-                &writer))
-            goto onError;
+        if (error_handler == _Py_ERROR_UNKNOWN)
+            error_handler = get_error_handler(errors);
+
+        switch (error_handler) {
+        case _Py_ERROR_IGNORE:
+            s += (endinpos - startinpos);
+            break;
+
+        case _Py_ERROR_REPLACE:
+            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
+                goto onError;
+            s += (endinpos - startinpos);
+            break;
+
+        case _Py_ERROR_SURROGATEESCAPE:
+        {
+            Py_ssize_t i;
+
+            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+                goto onError;
+            for (i=startinpos; i<endinpos; i++) {
+                ch = (Py_UCS4)(unsigned char)(starts[i]);
+                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
+                                ch + 0xdc00);
+                writer.pos++;
+            }
+            s += (endinpos - startinpos);
+            break;
+        }
+
+        default:
+            if (unicode_decode_call_errorhandler_writer(
+                    errors, &error_handler_obj,
+                    "utf-8", errmsg,
+                    &starts, &end, &startinpos, &endinpos, &exc, &s,
+                    &writer))
+                goto onError;
+        }
     }
 
 End:
     if (consumed)
         *consumed = s - starts;
 
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return _PyUnicodeWriter_Finish(&writer);
 
 onError:
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     _PyUnicodeWriter_Dealloc(&writer);
     return NULL;
@@ -5860,11 +6055,10 @@ PyObject *
 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
 {
     Py_ssize_t i, len;
-    PyObject *repr;
     char *p;
     int kind;
     void *data;
-    Py_ssize_t expandsize = 0;
+    _PyBytesWriter writer;
 
     /* Initial allocation is based on the longest-possible character
        escape.
@@ -5880,35 +6074,28 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
     }
     if (PyUnicode_READY(unicode) == -1)
         return NULL;
+
+    _PyBytesWriter_Init(&writer);
+
     len = PyUnicode_GET_LENGTH(unicode);
     kind = PyUnicode_KIND(unicode);
     data = PyUnicode_DATA(unicode);
-    switch (kind) {
-    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
-    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
-    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
-    }
 
-    if (len == 0)
-        return PyBytes_FromStringAndSize(NULL, 0);
-
-    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
-        return PyErr_NoMemory();
-
-    repr = PyBytes_FromStringAndSize(NULL,
-                                     2
-                                     + expandsize*len
-                                     + 1);
-    if (repr == NULL)
-        return NULL;
-
-    p = PyBytes_AS_STRING(repr);
+    p = _PyBytesWriter_Alloc(&writer, len);
+    if (p == NULL)
+        goto error;
+    writer.overallocate = 1;
 
     for (i = 0; i < len; i++) {
         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
 
         /* Escape backslashes */
         if (ch == '\\') {
+            /* -1: substract 1 preallocated byte */
+            p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = (char) ch;
             continue;
@@ -5917,6 +6104,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
         /* Map 21-bit characters to '\U00xxxxxx' */
         else if (ch >= 0x10000) {
             assert(ch <= MAX_UNICODE);
+
+            p = _PyBytesWriter_Prepare(&writer, p, 10-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 'U';
             *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
@@ -5932,6 +6124,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
 
         /* Map 16-bit characters to '\uxxxx' */
         if (ch >= 256) {
+            p = _PyBytesWriter_Prepare(&writer, p, 6-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 'u';
             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
@@ -5942,20 +6138,37 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
 
         /* Map special whitespace to '\t', \n', '\r' */
         else if (ch == '\t') {
+            p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 't';
         }
         else if (ch == '\n') {
+            p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 'n';
         }
         else if (ch == '\r') {
+            p = _PyBytesWriter_Prepare(&writer, p, 2-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 'r';
         }
 
         /* Map non-printable US ASCII to '\xhh' */
         else if (ch < ' ' || ch >= 0x7F) {
+            /* -1: substract 1 preallocated byte */
+            p = _PyBytesWriter_Prepare(&writer, p, 4-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 'x';
             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
@@ -5967,10 +6180,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
             *p++ = (char) ch;
     }
 
-    assert(p - PyBytes_AS_STRING(repr) > 0);
-    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
-        return NULL;
-    return repr;
+    return _PyBytesWriter_Finish(&writer, p);
+
+error:
+    _PyBytesWriter_Dealloc(&writer);
+    return NULL;
 }
 
 PyObject *
@@ -6099,13 +6313,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
 PyObject *
 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
 {
-    PyObject *repr;
     char *p;
-    char *q;
-    Py_ssize_t expandsize, pos;
+    Py_ssize_t pos;
     int kind;
     void *data;
     Py_ssize_t len;
+    _PyBytesWriter writer;
 
     if (!PyUnicode_Check(unicode)) {
         PyErr_BadArgument();
@@ -6113,28 +6326,29 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
     }
     if (PyUnicode_READY(unicode) == -1)
         return NULL;
+
+    _PyBytesWriter_Init(&writer);
+
     kind = PyUnicode_KIND(unicode);
     data = PyUnicode_DATA(unicode);
     len = PyUnicode_GET_LENGTH(unicode);
-    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
-       bytes, and 1 byte characters 4. */
-    expandsize = kind * 2 + 2;
 
-    if (len > PY_SSIZE_T_MAX / expandsize)
-        return PyErr_NoMemory();
-
-    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
-    if (repr == NULL)
-        return NULL;
-    if (len == 0)
-        return repr;
+    p = _PyBytesWriter_Alloc(&writer, len);
+    if (p == NULL)
+        goto error;
+    writer.overallocate = 1;
 
-    p = q = PyBytes_AS_STRING(repr);
     for (pos = 0; pos < len; pos++) {
         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
         /* Map 32-bit characters to '\Uxxxxxxxx' */
         if (ch >= 0x10000) {
             assert(ch <= MAX_UNICODE);
+
+            /* -1: substract 1 preallocated byte */
+            p = _PyBytesWriter_Prepare(&writer, p, 10-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 'U';
             *p++ = Py_hexdigits[(ch >> 28) & 0xf];
@@ -6148,6 +6362,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
         }
         /* Map 16-bit characters to '\uxxxx' */
         else if (ch >= 256) {
+            /* -1: substract 1 preallocated byte */
+            p = _PyBytesWriter_Prepare(&writer, p, 6-1);
+            if (p == NULL)
+                goto error;
+
             *p++ = '\\';
             *p++ = 'u';
             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
@@ -6160,10 +6379,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
             *p++ = (char) ch;
     }
 
-    assert(p > q);
-    if (_PyBytes_Resize(&repr, p - q) < 0)
-        return NULL;
-    return repr;
+    return _PyBytesWriter_Finish(&writer, p);
+
+error:
+    _PyBytesWriter_Dealloc(&writer);
+    return NULL;
 }
 
 PyObject *
@@ -6394,25 +6614,22 @@ unicode_encode_call_errorhandler(const char *errors,
 static PyObject *
 unicode_encode_ucs1(PyObject *unicode,
                     const char *errors,
-                    unsigned int limit)
+                    const Py_UCS4 limit)
 {
     /* input state */
     Py_ssize_t pos=0, size;
     int kind;
     void *data;
-    /* output object */
-    PyObject *res;
     /* pointer into the output */
     char *str;
-    /* current output position */
-    Py_ssize_t ressize;
     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
-    /* the following variable is used for caching string comparisons
-     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
-    int known_errorHandler = -1;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
+    PyObject *rep = NULL;
+    /* output object */
+    _PyBytesWriter writer;
 
     if (PyUnicode_READY(unicode) == -1)
         return NULL;
@@ -6423,186 +6640,157 @@ unicode_encode_ucs1(PyObject *unicode,
        replacements, if we need more, we'll resize */
     if (size == 0)
         return PyBytes_FromStringAndSize(NULL, 0);
-    res = PyBytes_FromStringAndSize(NULL, size);
-    if (res == NULL)
+
+    _PyBytesWriter_Init(&writer);
+    str = _PyBytesWriter_Alloc(&writer, size);
+    if (str == NULL)
         return NULL;
-    str = PyBytes_AS_STRING(res);
-    ressize = size;
 
     while (pos < size) {
-        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
+        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
 
         /* can we encode this? */
-        if (c<limit) {
+        if (ch < limit) {
             /* no overflow check, because we know that the space is enough */
-            *str++ = (char)c;
+            *str++ = (char)ch;
             ++pos;
         }
         else {
-            Py_ssize_t requiredsize;
-            PyObject *repunicode;
-            Py_ssize_t repsize, newpos, respos, i;
+            Py_ssize_t newpos, i;
             /* startpos for collecting unencodable chars */
             Py_ssize_t collstart = pos;
-            Py_ssize_t collend = pos;
+            Py_ssize_t collend = collstart + 1;
             /* find all unecodable characters */
+
             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
                 ++collend;
+
+            /* Only overallocate the buffer if it's not the last write */
+            writer.overallocate = (collend < size);
+
             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
-            if (known_errorHandler==-1) {
-                if ((errors==NULL) || (!strcmp(errors, "strict")))
-                    known_errorHandler = 1;
-                else if (!strcmp(errors, "replace"))
-                    known_errorHandler = 2;
-                else if (!strcmp(errors, "ignore"))
-                    known_errorHandler = 3;
-                else if (!strcmp(errors, "xmlcharrefreplace"))
-                    known_errorHandler = 4;
-                else
-                    known_errorHandler = 0;
-            }
-            switch (known_errorHandler) {
-            case 1: /* strict */
+            if (error_handler == _Py_ERROR_UNKNOWN)
+                error_handler = get_error_handler(errors);
+
+            switch (error_handler) {
+            case _Py_ERROR_STRICT:
                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
                 goto onError;
-            case 2: /* replace */
-                while (collstart++ < collend)
-                    *str++ = '?'; /* fall through */
-            case 3: /* ignore */
+
+            case _Py_ERROR_REPLACE:
+                memset(str, '?', collend - collstart);
+                str += (collend - collstart);
+                /* fall through ignore error handler */
+            case _Py_ERROR_IGNORE:
                 pos = collend;
                 break;
-            case 4: /* xmlcharrefreplace */
-                respos = str - PyBytes_AS_STRING(res);
-                requiredsize = respos;
-                /* determine replacement size */
+
+            case _Py_ERROR_BACKSLASHREPLACE:
+                /* substract preallocated bytes */
+                writer.min_size -= (collend - collstart);
+                str = backslashreplace(&writer, str,
+                                       unicode, collstart, collend);
+                if (str == NULL)
+                    goto onError;
+                pos = collend;
+                break;
+
+            case _Py_ERROR_XMLCHARREFREPLACE:
+                /* substract preallocated bytes */
+                writer.min_size -= (collend - collstart);
+                str = xmlcharrefreplace(&writer, str,
+                                        unicode, collstart, collend);
+                if (str == NULL)
+                    goto onError;
+                pos = collend;
+                break;
+
+            case _Py_ERROR_SURROGATEESCAPE:
                 for (i = collstart; i < collend; ++i) {
-                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
-                    Py_ssize_t incr;
-                    if (ch < 10)
-                        incr = 2+1+1;
-                    else if (ch < 100)
-                        incr = 2+2+1;
-                    else if (ch < 1000)
-                        incr = 2+3+1;
-                    else if (ch < 10000)
-                        incr = 2+4+1;
-                    else if (ch < 100000)
-                        incr = 2+5+1;
-                    else if (ch < 1000000)
-                        incr = 2+6+1;
-                    else {
-                        assert(ch <= MAX_UNICODE);
-                        incr = 2+7+1;
+                    ch = PyUnicode_READ(kind, data, i);
+                    if (ch < 0xdc80 || 0xdcff < ch) {
+                        /* Not a UTF-8b surrogate */
+                        break;
                     }
-                    if (requiredsize > PY_SSIZE_T_MAX - incr)
-                        goto overflow;
-                    requiredsize += incr;
-                }
-                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
-                    goto overflow;
-                requiredsize += size - collend;
-                if (requiredsize > ressize) {
-                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
-                        requiredsize = 2*ressize;
-                    if (_PyBytes_Resize(&res, requiredsize))
-                        goto onError;
-                    str = PyBytes_AS_STRING(res) + respos;
-                    ressize = requiredsize;
-                }
-                /* generate replacement */
-                for (i = collstart; i < collend; ++i) {
-                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
+                    *str++ = (char)(ch - 0xdc00);
+                    ++pos;
                 }
-                pos = collend;
-                break;
+                if (i >= collend)
+                    break;
+                collstart = pos;
+                assert(collstart != collend);
+                /* fallback to general error handling */
+
             default:
-                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
-                                                              encoding, reason, unicode, &exc,
-                                                              collstart, collend, &newpos);
-                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
-                                           PyUnicode_READY(repunicode) == -1))
+                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
+                                                       encoding, reason, unicode, &exc,
+                                                       collstart, collend, &newpos);
+                if (rep == NULL)
                     goto onError;
-                if (PyBytes_Check(repunicode)) {
+
+                /* substract preallocated bytes */
+                writer.min_size -= 1;
+
+                if (PyBytes_Check(rep)) {
                     /* Directly copy bytes result to output. */
-                    repsize = PyBytes_Size(repunicode);
-                    if (repsize > 1) {
-                        /* Make room for all additional bytes. */
-                        respos = str - PyBytes_AS_STRING(res);
-                        if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
-                            Py_DECREF(repunicode);
-                            goto overflow;
-                        }
-                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
-                            Py_DECREF(repunicode);
-                            goto onError;
-                        }
-                        str = PyBytes_AS_STRING(res) + respos;
-                        ressize += repsize-1;
-                    }
-                    memcpy(str, PyBytes_AsString(repunicode), repsize);
-                    str += repsize;
-                    pos = newpos;
-                    Py_DECREF(repunicode);
-                    break;
-                }
-                /* need more space? (at least enough for what we
-                   have+the replacement+the rest of the string, so
-                   we won't have to check space for encodable characters) */
-                respos = str - PyBytes_AS_STRING(res);
-                repsize = PyUnicode_GET_LENGTH(repunicode);
-                requiredsize = respos;
-                if (requiredsize > PY_SSIZE_T_MAX - repsize)
-                    goto overflow;
-                requiredsize += repsize;
-                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
-                    goto overflow;
-                requiredsize += size - collend;
-                if (requiredsize > ressize) {
-                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
-                        requiredsize = 2*ressize;
-                    if (_PyBytes_Resize(&res, requiredsize)) {
-                        Py_DECREF(repunicode);
+                    str = _PyBytesWriter_WriteBytes(&writer, str,
+                                                    PyBytes_AS_STRING(rep),
+                                                    PyBytes_GET_SIZE(rep));
+                    if (str == NULL)
                         goto onError;
-                    }
-                    str = PyBytes_AS_STRING(res) + respos;
-                    ressize = requiredsize;
                 }
-                /* check if there is anything unencodable in the replacement
-                   and copy it to the output */
-                for (i = 0; repsize-->0; ++i, ++str) {
-                    c = PyUnicode_READ_CHAR(repunicode, i);
-                    if (c >= limit) {
-                        raise_encode_exception(&exc, encoding, unicode,
-                                               pos, pos+1, reason);
-                        Py_DECREF(repunicode);
+                else {
+                    assert(PyUnicode_Check(rep));
+
+                    if (PyUnicode_READY(rep) < 0)
                         goto onError;
+
+                    if (PyUnicode_IS_ASCII(rep)) {
+                        /* Fast path: all characters are smaller than limit */
+                        assert(limit >= 128);
+                        assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
+                        str = _PyBytesWriter_WriteBytes(&writer, str,
+                                                        PyUnicode_DATA(rep),
+                                                        PyUnicode_GET_LENGTH(rep));
+                    }
+                    else {
+                        Py_ssize_t repsize = PyUnicode_GET_LENGTH(rep);
+
+                        str = _PyBytesWriter_Prepare(&writer, str, repsize);
+                        if (str == NULL)
+                            goto onError;
+
+                        /* check if there is anything unencodable in the
+                           replacement and copy it to the output */
+                        for (i = 0; repsize-->0; ++i, ++str) {
+                            ch = PyUnicode_READ_CHAR(rep, i);
+                            if (ch >= limit) {
+                                raise_encode_exception(&exc, encoding, unicode,
+                                                       pos, pos+1, reason);
+                                goto onError;
+                            }
+                            *str = (char)ch;
+                        }
                     }
-                    *str = (char)c;
                 }
                 pos = newpos;
-                Py_DECREF(repunicode);
+                Py_CLEAR(rep);
             }
+
+            /* If overallocation was disabled, ensure that it was the last
+               write. Otherwise, we missed an optimization */
+            assert(writer.overallocate || pos == size);
         }
     }
-    /* Resize if we allocated to much */
-    size = str - PyBytes_AS_STRING(res);
-    if (size < ressize) { /* If this falls res will be NULL */
-        assert(size >= 0);
-        if (_PyBytes_Resize(&res, size) < 0)
-            goto onError;
-    }
 
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
-    return res;
-
-  overflow:
-    PyErr_SetString(PyExc_OverflowError,
-                    "encoded result is too long for a Python string");
+    return _PyBytesWriter_Finish(&writer, str);
 
   onError:
-    Py_XDECREF(res);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(rep);
+    _PyBytesWriter_Dealloc(&writer);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return NULL;
 }
@@ -6662,8 +6850,9 @@ PyUnicode_DecodeASCII(const char *s,
     Py_ssize_t endinpos;
     Py_ssize_t outpos;
     const char *e;
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 
     if (size == 0)
         _Py_RETURN_UNICODE_EMPTY();
@@ -6692,12 +6881,42 @@ PyUnicode_DecodeASCII(const char *s,
             PyUnicode_WRITE(kind, data, writer.pos, c);
             writer.pos++;
             ++s;
+            continue;
         }
-        else {
+
+        /* byte outsize range 0x00..0x7f: call the error handler */
+
+        if (error_handler == _Py_ERROR_UNKNOWN)
+            error_handler = get_error_handler(errors);
+
+        switch (error_handler)
+        {
+        case _Py_ERROR_REPLACE:
+        case _Py_ERROR_SURROGATEESCAPE:
+            /* Fast-path: the error handler only writes one character,
+               but we may switch to UCS2 at the first write */
+            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
+                goto onError;
+            kind = writer.kind;
+            data = writer.data;
+
+            if (error_handler == _Py_ERROR_REPLACE)
+                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
+            else
+                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
+            writer.pos++;
+            ++s;
+            break;
+
+        case _Py_ERROR_IGNORE:
+            ++s;
+            break;
+
+        default:
             startinpos = s-starts;
             endinpos = startinpos + 1;
             if (unicode_decode_call_errorhandler_writer(
-                    errors, &errorHandler,
+                    errors, &error_handler_obj,
                     "ascii", "ordinal not in range(128)",
                     &starts, &e, &startinpos, &endinpos, &exc, &s,
                     &writer))
@@ -6706,13 +6925,13 @@ PyUnicode_DecodeASCII(const char *s,
             data = writer.data;
         }
     }
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return _PyUnicodeWriter_Finish(&writer);
 
   onError:
     _PyUnicodeWriter_Dealloc(&writer);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     Py_XDECREF(exc);
     return NULL;
 }
@@ -7111,7 +7330,6 @@ encode_code_page_strict(UINT code_page, PyObject **outbytes,
     BOOL usedDefaultChar = FALSE;
     BOOL *pusedDefaultChar = &usedDefaultChar;
     int outsize;
-    PyObject *exc = NULL;
     wchar_t *p;
     Py_ssize_t size;
     const DWORD flags = encode_code_page_flags(code_page, NULL);
@@ -8078,7 +8296,7 @@ static int
 charmap_encoding_error(
     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
     PyObject **exceptionObject,
-    int *known_errorHandler, PyObject **errorHandler, const char *errors,
+    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
     PyObject **res, Py_ssize_t *respos)
 {
     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
@@ -8125,23 +8343,15 @@ charmap_encoding_error(
     }
     /* cache callback name lookup
      * (if not done yet, i.e. it's the first error) */
-    if (*known_errorHandler==-1) {
-        if ((errors==NULL) || (!strcmp(errors, "strict")))
-            *known_errorHandler = 1;
-        else if (!strcmp(errors, "replace"))
-            *known_errorHandler = 2;
-        else if (!strcmp(errors, "ignore"))
-            *known_errorHandler = 3;
-        else if (!strcmp(errors, "xmlcharrefreplace"))
-            *known_errorHandler = 4;
-        else
-            *known_errorHandler = 0;
-    }
-    switch (*known_errorHandler) {
-    case 1: /* strict */
+    if (*error_handler == _Py_ERROR_UNKNOWN)
+        *error_handler = get_error_handler(errors);
+
+    switch (*error_handler) {
+    case _Py_ERROR_STRICT:
         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
         return -1;
-    case 2: /* replace */
+
+    case _Py_ERROR_REPLACE:
         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
             x = charmapencode_output('?', mapping, res, respos);
             if (x==enc_EXCEPTION) {
@@ -8153,10 +8363,11 @@ charmap_encoding_error(
             }
         }
         /* fall through */
-    case 3: /* ignore */
+    case _Py_ERROR_IGNORE:
         *inpos = collendpos;
         break;
-    case 4: /* xmlcharrefreplace */
+
+    case _Py_ERROR_XMLCHARREFREPLACE:
         /* generate replacement (temporarily (mis)uses p) */
         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
             char buffer[2+29+1+1];
@@ -8174,8 +8385,9 @@ charmap_encoding_error(
         }
         *inpos = collendpos;
         break;
+
     default:
-        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
+        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                       encoding, reason, unicode, exceptionObject,
                                                       collstartpos, collendpos, &newpos);
         if (repunicode == NULL)
@@ -8238,12 +8450,9 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
     Py_ssize_t size;
     /* current output position */
     Py_ssize_t respos = 0;
-    PyObject *errorHandler = NULL;
+    PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
-    /* the following variable is used for caching string comparisons
-     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
-     * 3=ignore, 4=xmlcharrefreplace */
-    int known_errorHandler = -1;
+    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
     void *data;
     int kind;
 
@@ -8274,7 +8483,7 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
         if (x==enc_FAILED) { /* unencodable character */
             if (charmap_encoding_error(unicode, &inpos, mapping,
                                        &exc,
-                                       &known_errorHandler, &errorHandler, errors,
+                                       &error_handler, &error_handler_obj, errors,
                                        &res, &respos)) {
                 goto onError;
             }
@@ -8290,13 +8499,13 @@ _PyUnicode_EncodeCharmap(PyObject *unicode,
             goto onError;
 
     Py_XDECREF(exc);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     return res;
 
   onError:
     Py_XDECREF(res);
     Py_XDECREF(exc);
-    Py_XDECREF(errorHandler);
+    Py_XDECREF(error_handler_obj);
     return NULL;
 }
 
@@ -8622,7 +8831,7 @@ exit:
     return res;
 }
 
-PyObject *
+static PyObject *
 _PyUnicode_TranslateCharmap(PyObject *input,
                             PyObject *mapping,
                             const char *errors)
@@ -10893,6 +11102,12 @@ PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
 }
 
 int
+_PyUnicode_EQ(PyObject *aa, PyObject *bb)
+{
+    return unicode_eq(aa, bb);
+}
+
+int
 PyUnicode_Contains(PyObject *container, PyObject *element)
 {
     PyObject *str, *sub;
@@ -13225,44 +13440,50 @@ unicode_endswith(PyObject *self,
 Py_LOCAL_INLINE(void)
 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
 {
-    if (!writer->readonly)
+    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
+    writer->data = PyUnicode_DATA(writer->buffer);
+
+    if (!writer->readonly) {
+        writer->kind = PyUnicode_KIND(writer->buffer);
         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+    }
     else {
+        /* use a value smaller than PyUnicode_1BYTE_KIND() so
+           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
+        writer->kind = PyUnicode_WCHAR_KIND;
+        assert(writer->kind <= PyUnicode_1BYTE_KIND);
+
         /* Copy-on-write mode: set buffer size to 0 so
          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
          * next write. */
         writer->size = 0;
     }
-    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
-    writer->data = PyUnicode_DATA(writer->buffer);
-    writer->kind = PyUnicode_KIND(writer->buffer);
 }
 
 void
 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
 {
     memset(writer, 0, sizeof(*writer));
-#ifdef Py_DEBUG
-    writer->kind = 5;    /* invalid kind */
-#endif
+
+    /* ASCII is the bare minimum */
     writer->min_char = 127;
+
+    /* use a value smaller than PyUnicode_1BYTE_KIND() so
+       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
+    writer->kind = PyUnicode_WCHAR_KIND;
+    assert(writer->kind <= PyUnicode_1BYTE_KIND);
 }
 
 int
 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                  Py_ssize_t length, Py_UCS4 maxchar)
 {
-#ifdef MS_WINDOWS
-   /* On Windows, overallocate by 50% is the best factor */
-#  define OVERALLOCATE_FACTOR 2
-#else
-   /* On Linux, overallocate by 25% is the best factor */
-#  define OVERALLOCATE_FACTOR 4
-#endif
     Py_ssize_t newlen;
     PyObject *newbuffer;
 
-    assert(length > 0);
+    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
+    assert((maxchar > writer->maxchar && length >= 0)
+           || length > 0);
 
     if (length > PY_SSIZE_T_MAX - writer->pos) {
         PyErr_NoMemory();
@@ -13328,6 +13549,28 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
 #undef OVERALLOCATE_FACTOR
 }
 
+int
+_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
+                                     enum PyUnicode_Kind kind)
+{
+    Py_UCS4 maxchar;
+
+    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
+    assert(writer->kind < kind);
+
+    switch (kind)
+    {
+    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
+    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
+    case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
+    default:
+        assert(0 && "invalid kind");
+        return -1;
+    }
+
+    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
+}
+
 Py_LOCAL_INLINE(int)
 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
 {
@@ -13498,17 +13741,26 @@ _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
         return str;
     }
-    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
-        PyObject *newbuffer;
-        newbuffer = resize_compact(writer->buffer, writer->pos);
-        if (newbuffer == NULL) {
-            Py_CLEAR(writer->buffer);
-            return NULL;
+    if (writer->pos == 0) {
+        Py_CLEAR(writer->buffer);
+
+        /* Get the empty Unicode string singleton ('') */
+        _Py_INCREF_UNICODE_EMPTY();
+        str  = unicode_empty;
+    }
+    else {
+        str = writer->buffer;
+        writer->buffer = NULL;
+
+        if (PyUnicode_GET_LENGTH(str) != writer->pos) {
+            PyObject *str2;
+            str2 = resize_compact(str, writer->pos);
+            if (str2 == NULL)
+                return NULL;
+            str = str2;
         }
-        writer->buffer = newbuffer;
     }
-    str = writer->buffer;
-    writer->buffer = NULL;
+
     assert(_PyUnicode_CheckConsistency(str, 1));
     return unicode_result_ready(str);
 }