Diffstat (limited to 'numpy')
28 files changed, 404 insertions, 412 deletions
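The conftest.py hunk below registers a project-wide Hypothesis profile (no deadline, reproducible failure blobs). A minimal sketch of the same settings API, using a hypothetical "slow" profile name to show how a registered profile is opted into:

    import hypothesis

    # register once (e.g. in conftest.py), then activate by name
    hypothesis.settings.register_profile(
        name="slow", deadline=None, max_examples=1000,
    )
    hypothesis.settings.load_profile("slow")

Instead of calling load_profile in code, a registered profile can also be selected per run with pytest --hypothesis-profile=<name>.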
diff --git a/numpy/conftest.py b/numpy/conftest.py index a843f725f..1d3e0349f 100644 --- a/numpy/conftest.py +++ b/numpy/conftest.py @@ -3,6 +3,7 @@ Pytest configuration and fixtures for the Numpy test suite. """ import os +import hypothesis import pytest import numpy @@ -12,6 +13,12 @@ from numpy.core._multiarray_tests import get_fpu_mode _old_fpu_mode = None _collect_results = {} +# See https://hypothesis.readthedocs.io/en/latest/settings.html +hypothesis.settings.register_profile( + name="numpy-profile", deadline=None, print_blob=True, +) +hypothesis.settings.load_profile("numpy-profile") + def pytest_configure(config): config.addinivalue_line("markers", diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py index 942a698a9..b22d6b85e 100644 --- a/numpy/core/defchararray.py +++ b/numpy/core/defchararray.py @@ -2679,25 +2679,6 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None): itemsize = len(obj) shape = len(obj) // itemsize - if unicode: - if sys.maxunicode == 0xffff: - # On a narrow Python build, the buffer for Unicode - # strings is UCS2, which doesn't match the buffer for - # NumPy Unicode types, which is ALWAYS UCS4. - # Therefore, we need to convert the buffer. On Python - # 2.6 and later, we can use the utf_32 codec. Earlier - # versions don't have that codec, so we convert to a - # numerical array that matches the input buffer, and - # then use NumPy to convert it to UCS4. All of this - # should happen in native endianness. - obj = obj.encode('utf_32') - else: - obj = str(obj) - else: - # Let the default Unicode -> string encoding (if any) take - # precedence. - obj = bytes(obj) - return chararray(shape, itemsize=itemsize, unicode=unicode, buffer=obj, order=order) diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py index ab45ddfe8..acd2d2bea 100644 --- a/numpy/core/fromnumeric.py +++ b/numpy/core/fromnumeric.py @@ -2039,8 +2039,8 @@ def clip(a, a_min, a_max, out=None, **kwargs): is specified, values smaller than 0 become 0, and values larger than 1 become 1. - Equivalent to but faster than ``np.maximum(a_min, np.minimum(a, a_max))`` - assuming ``a_min < a_max``. + Equivalent to but faster than ``np.minimum(a_max, np.maximum(a, a_min))``. + No check is performed to ensure ``a_min < a_max``. Parameters diff --git a/numpy/core/include/numpy/arrayscalars.h b/numpy/core/include/numpy/arrayscalars.h index 64450e713..42a0df76a 100644 --- a/numpy/core/include/numpy/arrayscalars.h +++ b/numpy/core/include/numpy/arrayscalars.h @@ -135,7 +135,13 @@ typedef struct { } PyScalarObject; #define PyStringScalarObject PyStringObject -#define PyUnicodeScalarObject PyUnicodeObject +typedef struct { + /* note that the PyObject_HEAD macro lives right here */ + PyUnicodeObject base; + Py_UCS4 *obval; +} PyUnicodeScalarObject; + typedef struct { PyObject_VAR_HEAD diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py index d50c53e30..a920b3687 100644 --- a/numpy/core/multiarray.py +++ b/numpy/core/multiarray.py @@ -12,6 +12,9 @@ import warnings from . import overrides from . import _multiarray_umath from ._multiarray_umath import * # noqa: F403 +# These imports are needed for backward compatibility, +# do not change them. issue gh-15518 +# _get_ndarray_c_version is semi-public, on purpose not added to __all__ from ._multiarray_umath import ( _fastCopyAndTranspose, _flagdict, _insert, _reconstruct, _vec_string, _ARRAY_API, _monotonicity, _get_ndarray_c_version
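The clip docstring hunk above documents the corrected ordering, ``np.minimum(a_max, np.maximum(a, a_min))``, and that inverted bounds are not checked. A minimal sketch of what that ordering implies when a_min > a_max:

    import numpy as np

    a = np.arange(10)
    # a_min > a_max raises no error; maximum() is applied first,
    # so a_max simply wins everywhere
    assert np.clip(a, 8, 2).tolist() == [2] * 10
    assert np.array_equal(np.clip(a, 8, 2), np.minimum(2, np.maximum(a, 8)))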
diff --git a/numpy/core/src/common/ucsnarrow.c b/numpy/core/src/common/ucsnarrow.c index 946a72257..3ef5d6878 100644 --- a/numpy/core/src/common/ucsnarrow.c +++ b/numpy/core/src/common/ucsnarrow.c @@ -16,76 +16,12 @@ #include "ctors.h" /* - * Functions only needed on narrow builds of Python for converting back and - * forth between the NumPy Unicode data-type (always 4-bytes) and the - * Python Unicode scalar (2-bytes on a narrow build). - */ - -/* - * The ucs2 buffer must be large enough to hold 2*ucs4length characters - * due to the use of surrogate pairs. + * This file originally contained functions only needed on narrow builds of + * Python for converting back and forth between the NumPy Unicode data-type + * (always 4-bytes) and the Python Unicode scalar (2-bytes on a narrow build). * - * The return value is the number of ucs2 bytes used-up which - * is ucs4length + number of surrogate pairs found. - * - * Values above 0xffff are converted to surrogate pairs. + * This "narrow" interface is now deprecated in Python and unused in NumPy. */ -NPY_NO_EXPORT int -PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, npy_ucs4 const *ucs4, int ucs4length) -{ - int i; - int numucs2 = 0; - npy_ucs4 chr; - for (i = 0; i < ucs4length; i++) { - chr = *ucs4++; - if (chr > 0xffff) { - numucs2++; - chr -= 0x10000L; - *ucs2++ = 0xD800 + (Py_UNICODE) (chr >> 10); - *ucs2++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); - } - else { - *ucs2++ = (Py_UNICODE) chr; - } - numucs2++; - } - return numucs2; -} - - -/* - * This converts a UCS2 buffer of the given length to UCS4 buffer. - * It converts up to ucs4len characters of UCS2 - * - * It returns the number of characters converted which can - * be less than ucs2len if there are surrogate pairs in ucs2. - * - * The return value is the actual size of the used part of the ucs4 buffer. 
- */ -NPY_NO_EXPORT int -PyUCS2Buffer_AsUCS4(Py_UNICODE const *ucs2, npy_ucs4 *ucs4, int ucs2len, int ucs4len) -{ - int i; - npy_ucs4 chr; - Py_UNICODE ch; - int numchars=0; - - for (i = 0; (i < ucs2len) && (numchars < ucs4len); i++) { - ch = *ucs2++; - if (ch >= 0xd800 && ch <= 0xdfff) { - /* surrogate pair */ - chr = ((npy_ucs4)(ch-0xd800)) << 10; - chr += *ucs2++ + 0x2400; /* -0xdc00 + 0x10000 */ - i++; - } - else { - chr = (npy_ucs4) ch; - } - *ucs4++ = chr; - numchars++; - } - return numchars; -} /* * Returns a PyUnicodeObject initialized from a buffer containing @@ -112,14 +48,13 @@ PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align) Py_ssize_t ucs4len = size / sizeof(npy_ucs4); npy_ucs4 const *src = (npy_ucs4 const *)src_char; npy_ucs4 *buf = NULL; - PyUnicodeObject *ret; /* swap and align if needed */ if (swap || align) { buf = (npy_ucs4 *)malloc(size); if (buf == NULL) { PyErr_NoMemory(); - goto fail; + return NULL; } memcpy(buf, src, size); if (swap) { @@ -132,43 +67,8 @@ PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align) while (ucs4len > 0 && src[ucs4len - 1] == 0) { ucs4len--; } - - /* produce PyUnicode object */ -#ifdef Py_UNICODE_WIDE - { - ret = (PyUnicodeObject *)PyUnicode_FromUnicode((Py_UNICODE const*)src, - (Py_ssize_t) ucs4len); - if (ret == NULL) { - goto fail; - } - } -#else - { - Py_ssize_t tmpsiz = 2 * sizeof(Py_UNICODE) * ucs4len; - Py_ssize_t ucs2len; - Py_UNICODE *tmp; - - if ((tmp = (Py_UNICODE *)malloc(tmpsiz)) == NULL) { - PyErr_NoMemory(); - goto fail; - } - ucs2len = PyUCS2Buffer_FromUCS4(tmp, src, ucs4len); - ret = (PyUnicodeObject *)PyUnicode_FromUnicode(tmp, (Py_ssize_t) ucs2len); - free(tmp); - if (ret == NULL) { - goto fail; - } - } -#endif - - if (buf) { - free(buf); - } + PyUnicodeObject *ret = (PyUnicodeObject *)PyUnicode_FromKindAndData( + PyUnicode_4BYTE_KIND, src, ucs4len); + free(buf); return ret; - -fail: - if (buf) { - free(buf); - } - return NULL; } diff --git a/numpy/core/src/common/ucsnarrow.h b/numpy/core/src/common/ucsnarrow.h index fe31a5e25..c811e1f2c 100644 --- a/numpy/core/src/common/ucsnarrow.h +++ b/numpy/core/src/common/ucsnarrow.h @@ -1,12 +1,6 @@ #ifndef _NPY_UCSNARROW_H_ #define _NPY_UCSNARROW_H_ -NPY_NO_EXPORT int -PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, npy_ucs4 *ucs4, int ucs4length); - -NPY_NO_EXPORT int -PyUCS2Buffer_AsUCS4(Py_UNICODE *ucs2, npy_ucs4 *ucs4, int ucs2len, int ucs4len); - NPY_NO_EXPORT PyUnicodeObject * PyUnicode_FromUCS4(char *src, Py_ssize_t size, int swap, int align); diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index ce288d62e..c16e0f311 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -450,12 +450,6 @@ static int UNICODE_setitem(PyObject *op, void *ov, void *vap) { PyArrayObject *ap = vap; - PyObject *temp; - Py_UNICODE *ptr; - int datalen; -#ifndef Py_UNICODE_WIDE - char *buffer; -#endif if (PyArray_IsZeroDim(op)) { return convert_to_scalar_and_retry(op, ov, vap, UNICODE_setitem); @@ -466,6 +460,8 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap) "setting an array element with a sequence"); return -1; } + + PyObject *temp; if (PyBytes_Check(op)) { /* Try to decode from ASCII */ temp = PyUnicode_FromEncodedObject(op, "ASCII", "strict"); @@ -476,18 +472,27 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap) else if ((temp=PyObject_Str(op)) == NULL) { return -1; } - ptr = PyUnicode_AS_UNICODE(temp); - if ((ptr == NULL) || 
(PyErr_Occurred())) { + + /* truncate if needed */ + Py_ssize_t max_len = PyArray_DESCR(ap)->elsize >> 2; + Py_ssize_t actual_len = PyUnicode_GetLength(temp); + if (actual_len < 0) { Py_DECREF(temp); return -1; } - datalen = PyUnicode_GET_DATA_SIZE(temp); + if (actual_len > max_len) { + Py_SETREF(temp, PyUnicode_Substring(temp, 0, max_len)); + if (temp == NULL) { + return -1; + } + actual_len = max_len; + } -#ifdef Py_UNICODE_WIDE - memcpy(ov, ptr, PyArray_MIN(PyArray_DESCR(ap)->elsize, datalen)); -#else + Py_ssize_t num_bytes = actual_len * 4; + + char *buffer; if (!PyArray_ISALIGNED(ap)) { - buffer = PyArray_malloc(PyArray_DESCR(ap)->elsize); + buffer = PyArray_malloc(num_bytes); if (buffer == NULL) { Py_DECREF(temp); PyErr_NoMemory(); @@ -497,20 +502,23 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap) else { buffer = ov; } - datalen = PyUCS2Buffer_AsUCS4(ptr, (npy_ucs4 *)buffer, - datalen >> 1, PyArray_DESCR(ap)->elsize >> 2); - datalen <<= 2; + if (PyUnicode_AsUCS4(temp, (Py_UCS4 *)buffer, actual_len, 0) == NULL) { + PyArray_free(buffer); + Py_DECREF(temp); + return -1; + } + if (!PyArray_ISALIGNED(ap)) { - memcpy(ov, buffer, datalen); + memcpy(ov, buffer, num_bytes); PyArray_free(buffer); } -#endif + /* Fill in the rest of the space with 0 */ - if (PyArray_DESCR(ap)->elsize > datalen) { - memset((char*)ov + datalen, 0, (PyArray_DESCR(ap)->elsize - datalen)); + if (PyArray_DESCR(ap)->elsize > num_bytes) { + memset((char*)ov + num_bytes, 0, (PyArray_DESCR(ap)->elsize - num_bytes)); } if (PyArray_ISBYTESWAPPED(ap)) { - byte_swap_vector(ov, PyArray_DESCR(ap)->elsize >> 2, 4); + byte_swap_vector(ov, actual_len, 4); } Py_DECREF(temp); return 0; @@ -2650,12 +2658,6 @@ STRING_nonzero (char *ip, PyArrayObject *ap) return nonz; } -#ifdef Py_UNICODE_WIDE -#define PyArray_UCS4_ISSPACE Py_UNICODE_ISSPACE -#else -#define PyArray_UCS4_ISSPACE(ch) Py_STRING_ISSPACE((char)ch) -#endif - static npy_bool UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap) { @@ -2681,7 +2683,7 @@ UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap) if (*ip == '\0') { seen_null = NPY_TRUE; } - else if (seen_null || !PyArray_UCS4_ISSPACE(*ip)) { + else if (seen_null || !Py_UNICODE_ISSPACE(*ip)) { nonz = NPY_TRUE; break; } diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c index 576186362..9a1f7b230 100644 --- a/numpy/core/src/multiarray/buffer.c +++ b/numpy/core/src/multiarray/buffer.c @@ -832,11 +832,6 @@ gentype_getbuffer(PyObject *self, Py_buffer *view, int flags) descr = PyArray_DescrFromScalar(self); view->buf = (void *)scalar_value(self, descr); elsize = descr->elsize; -#ifndef Py_UNICODE_WIDE - if (descr->type_num == NPY_UNICODE) { - elsize >>= 1; - } -#endif view->len = elsize; if (PyArray_IsScalar(self, Datetime) || PyArray_IsScalar(self, Timedelta)) { elsize = 1; /* descr->elsize,char is 8,'M', but we return 1,'B' */ diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c index 3ee2cc6c6..0150ae10e 100644 --- a/numpy/core/src/multiarray/common.c +++ b/numpy/core/src/multiarray/common.c @@ -130,27 +130,34 @@ PyArray_DTypeFromObjectStringDiscovery( PyObject *obj, PyArray_Descr *last_dtype, int string_type) { int itemsize; - PyObject *temp; if (string_type == NPY_STRING) { - if ((temp = PyObject_Str(obj)) == NULL) { + PyObject *temp = PyObject_Str(obj); + if (temp == NULL) { return NULL; } + /* assume that when we do the encoding elsewhere we'll use ASCII */ itemsize = PyUnicode_GetLength(temp); + Py_DECREF(temp); + if (itemsize < 0) { + return NULL; + } 
} else if (string_type == NPY_UNICODE) { - if ((temp = PyObject_Str(obj)) == NULL) { + PyObject *temp = PyObject_Str(obj); + if (temp == NULL) { return NULL; } - itemsize = PyUnicode_GET_DATA_SIZE(temp); -#ifndef Py_UNICODE_WIDE - itemsize <<= 1; -#endif + itemsize = PyUnicode_GetLength(temp); + Py_DECREF(temp); + if (itemsize < 0) { + return NULL; + } + itemsize *= 4; /* convert UCS4 codepoints to bytes */ } else { return NULL; } - Py_DECREF(temp); if (last_dtype != NULL && last_dtype->type_num == string_type && last_dtype->elsize >= itemsize) { @@ -258,10 +265,11 @@ PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims, /* Check if it's a Unicode string */ if (PyUnicode_Check(obj)) { - int itemsize = PyUnicode_GET_DATA_SIZE(obj); -#ifndef Py_UNICODE_WIDE - itemsize <<= 1; -#endif + int itemsize = PyUnicode_GetLength(obj); + if (itemsize < 0) { + goto fail; + } + itemsize *= 4; /* * If it's already a big enough unicode object, diff --git a/numpy/core/src/multiarray/scalarapi.c b/numpy/core/src/multiarray/scalarapi.c index 5c4332364..6d3276e18 100644 --- a/numpy/core/src/multiarray/scalarapi.c +++ b/numpy/core/src/multiarray/scalarapi.c @@ -71,7 +71,16 @@ scalar_value(PyObject *scalar, PyArray_Descr *descr) case NPY_STRING: return (void *)PyString_AS_STRING(scalar); case NPY_UNICODE: - return (void *)PyUnicode_AS_DATA(scalar); + /* lazy initialization, to reduce the memory used by string scalars */ + if (PyArrayScalar_VAL(scalar, Unicode) == NULL) { + Py_UCS4 *raw_data = PyUnicode_AsUCS4Copy(scalar); + if (raw_data == NULL) { + return NULL; + } + PyArrayScalar_VAL(scalar, Unicode) = raw_data; + return (void *)raw_data; + } + return PyArrayScalar_VAL(scalar, Unicode); case NPY_VOID: /* Note: no & needed here, so can't use CASE */ return PyArrayScalar_VAL(scalar, Void); @@ -319,21 +328,10 @@ PyArray_FromScalar(PyObject *scalar, PyArray_Descr *outcode) memptr = scalar_value(scalar, typecode); -#ifndef Py_UNICODE_WIDE - if (typecode->type_num == NPY_UNICODE) { - PyUCS2Buffer_AsUCS4((Py_UNICODE *)memptr, - (npy_ucs4 *)PyArray_DATA(r), - PyUnicode_GET_SIZE(scalar), - PyArray_ITEMSIZE(r) >> 2); - } - else -#endif - { - memcpy(PyArray_DATA(r), memptr, PyArray_ITEMSIZE(r)); - if (PyDataType_FLAGCHK(typecode, NPY_ITEM_HASOBJECT)) { - /* Need to INCREF just the PyObject portion */ - PyArray_Item_INCREF(memptr, typecode); - } + memcpy(PyArray_DATA(r), memptr, PyArray_ITEMSIZE(r)); + if (PyDataType_FLAGCHK(typecode, NPY_ITEM_HASOBJECT)) { + /* Need to INCREF just the PyObject portion */ + PyArray_Item_INCREF(memptr, typecode); } finish: @@ -568,10 +566,7 @@ PyArray_DescrFromScalar(PyObject *sc) descr->elsize = PyString_GET_SIZE(sc); } else if (type_num == NPY_UNICODE) { - descr->elsize = PyUnicode_GET_DATA_SIZE(sc); -#ifndef Py_UNICODE_WIDE - descr->elsize <<= 1; -#endif + descr->elsize = PyUnicode_GET_LENGTH(sc) * 4; } else { PyArray_Descr *dtype; @@ -654,23 +649,30 @@ PyArray_Scalar(void *data, PyArray_Descr *descr, PyObject *base) } } if (type_num == NPY_UNICODE) { - PyObject *u, *args; - int byteorder; - -#if NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN - byteorder = -1; -#elif NPY_BYTE_ORDER == NPY_BIG_ENDIAN - byteorder = +1; -#else - #error Endianness undefined ? 
-#endif - if (swap) byteorder *= -1; - - u = PyUnicode_DecodeUTF32(data, itemsize, NULL, &byteorder); + /* we need the full string length here, else copyswap will write too + many bytes */ + void *buff = PyArray_malloc(descr->elsize); + if (buff == NULL) { + return PyErr_NoMemory(); + } + /* copyswap needs an array object, but only actually cares about the + * dtype + */ + PyArrayObject_fields dummy_arr; + if (base == NULL) { + dummy_arr.descr = descr; + base = (PyObject *)&dummy_arr; + } + copyswap(buff, data, swap, base); + + /* truncation occurs here */ + PyObject *u = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buff, itemsize / 4); + PyArray_free(buff); if (u == NULL) { return NULL; } - args = Py_BuildValue("(O)", u); + + PyObject *args = Py_BuildValue("(O)", u); if (args == NULL) { Py_DECREF(u); return NULL; diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src index 7657e39ee..eafa13ff2 100644 --- a/numpy/core/src/multiarray/scalartypes.c.src +++ b/numpy/core/src/multiarray/scalartypes.c.src @@ -345,6 +345,10 @@ format_@name@(@type@ val, npy_bool scientific, * over-ride repr and str of array-scalar strings and unicode to * remove NULL bytes and then call the corresponding functions * of string and unicode. + * + * FIXME: + * is this really a good idea? + * stop using Py_UNICODE here. */ /**begin repeat @@ -1094,11 +1098,6 @@ gentype_itemsize_get(PyObject *self) typecode = PyArray_DescrFromScalar(self); elsize = typecode->elsize; -#ifndef Py_UNICODE_WIDE - if (typecode->type_num == NPY_UNICODE) { - elsize >>= 1; - } -#endif ret = PyInt_FromLong((long) elsize); Py_DECREF(typecode); return ret; @@ -1658,12 +1657,7 @@ gentype_reduce(PyObject *self, PyObject *NPY_UNUSED(args)) return NULL; } - if (PyArray_IsScalar(self, Unicode)) { - /* Unicode on Python 3 does not expose the buffer interface */ - buffer = PyUnicode_AS_DATA(self); - buflen = PyUnicode_GET_DATA_SIZE(self); - } - else if (PyObject_GetBuffer(self, &view, PyBUF_SIMPLE) >= 0) { + if (PyObject_GetBuffer(self, &view, PyBUF_SIMPLE) >= 0) { buffer = view.buf; buflen = view.len; /* @@ -1718,48 +1712,13 @@ gentype_reduce(PyObject *self, PyObject *NPY_UNUSED(args)) PyTuple_SET_ITEM(ret, 1, tup); } else { -#ifndef Py_UNICODE_WIDE - /* - * We need to expand the buffer so that we always write - * UCS4 to disk for pickle of unicode scalars. - * - * This could be in a unicode_reduce function, but - * that would require re-factoring. 
- */ - int alloc = 0; - char *tmp; - int newlen; - - if (PyArray_IsScalar(self, Unicode)) { - tmp = PyArray_malloc(buflen*2); - if (tmp == NULL) { - Py_DECREF(ret); - return PyErr_NoMemory(); - } - alloc = 1; - newlen = PyUCS2Buffer_AsUCS4((Py_UNICODE *)buffer, - (npy_ucs4 *)tmp, - buflen / 2, buflen / 2); - buflen = newlen*4; - buffer = tmp; - } -#endif mod = PyBytes_FromStringAndSize(buffer, buflen); if (mod == NULL) { Py_DECREF(ret); -#ifndef Py_UNICODE_WIDE - ret = NULL; - goto fail; -#else return NULL; -#endif } PyTuple_SET_ITEM(ret, 1, Py_BuildValue("NN", obj, mod)); -#ifndef Py_UNICODE_WIDE -fail: - if (alloc) PyArray_free((char *)buffer); -#endif } return ret; } @@ -2409,6 +2368,15 @@ object_arrtype_dealloc(PyObject *v) Py_TYPE(v)->tp_free(v); } +static void +unicode_arrtype_dealloc(PyObject *v) +{ + /* note: may be null if it was never requested */ + PyMem_Free(PyArrayScalar_VAL(v, Unicode)); + /* delegate to the base class */ + PyUnicode_Type.tp_dealloc(v); +} + /**begin repeat * #name = byte, short, int, long, longlong, ubyte, ushort, uint, ulong, * ulonglong, half, float, double, longdouble, cfloat, cdouble, @@ -2444,6 +2412,9 @@ static PyObject * PyErr_Clear(); } else { +#if defined(_@TYPE@_IS_UNICODE) + PyArrayScalar_VAL(from_superclass, Unicode) = NULL; +#endif return from_superclass; } #endif @@ -3667,6 +3638,9 @@ initialize_numeric_types(void) /**end repeat**/ + PyUnicodeArrType_Type.tp_dealloc = unicode_arrtype_dealloc; + PyUnicodeArrType_Type.tp_as_buffer = &gentype_as_buffer; + /**begin repeat * #name = bool, byte, short, ubyte, ushort, uint, ulong, ulonglong, * half, float, longdouble, cfloat, clongdouble, void, object, diff --git a/numpy/core/src/multiarray/sequence.c b/numpy/core/src/multiarray/sequence.c index 4769bdad9..1efdd204f 100644 --- a/numpy/core/src/multiarray/sequence.c +++ b/numpy/core/src/multiarray/sequence.c @@ -38,8 +38,13 @@ array_contains(PyArrayObject *self, PyObject *el) if (res == NULL) { return -1; } + any = PyArray_Any((PyArrayObject *)res, NPY_MAXDIMS, NULL); Py_DECREF(res); + if (any == NULL) { + return -1; + } + ret = PyObject_IsTrue(any); Py_DECREF(any); return ret; diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 7ec90f9c8..4265476b5 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -55,6 +55,37 @@ abs_ptrdiff(char *a, char *b) return (a > b) ? (a - b) : (b - a); } +/* + * nomemoverlap - returns true if two strided arrays do not have an + * overlapping region in memory. ip_size/op_size = size of the arrays which + * can be negative indicating negative steps. + */ +static NPY_INLINE npy_bool +nomemoverlap(char *ip, + npy_intp ip_size, + char *op, + npy_intp op_size) +{ + char *ip_start, *ip_end, *op_start, *op_end; + if (ip_size < 0) { + ip_start = ip + ip_size; + ip_end = ip; + } + else { + ip_start = ip; + ip_end = ip + ip_size; + } + if (op_size < 0) { + op_start = op + op_size; + op_end = op; + } + else { + op_start = op; + op_end = op + op_size; + } + return (ip_start > op_end) | (op_start > ip_end); +} +
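The new nomemoverlap helper above gates the vectorized loops. A direct Python transcription of its interval check (illustrative only; the real helper operates on raw char pointers):

    def no_mem_overlap(ip, ip_size, op, op_size):
        # negative sizes mean the array is traversed with a negative step
        ip_start, ip_end = (ip + ip_size, ip) if ip_size < 0 else (ip, ip + ip_size)
        op_start, op_end = (op + op_size, op) if op_size < 0 else (op, op + op_size)
        # disjoint address ranges => safe to vectorize
        return ip_start > op_end or op_start > ip_end

    assert no_mem_overlap(0, 40, 100, 40)       # disjoint buffers
    assert not no_mem_overlap(0, 40, 16, 40)    # overlapping buffers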
#define IS_BINARY_STRIDE_ONE(esize, vsize) \ ((steps[0] == esize) && \ (steps[1] == esize) && \ @@ -83,22 +114,25 @@ abs_ptrdiff(char *a, char *b) * cross page boundaries. * * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index - * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE ensures this. + * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE + * ensures this. The condition also requires that the input and output arrays + * have no overlap in memory. */ -#define IS_BINARY_SMALL_STEPS \ +#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \ ((abs(steps[0]) < MAX_STEP_SIZE) && \ (abs(steps[1]) < MAX_STEP_SIZE) && \ - (abs(steps[2]) < MAX_STEP_SIZE)) + (abs(steps[2]) < MAX_STEP_SIZE) && \ + (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \ + (nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0]))) /* - * output should be contiguous, can handle strided input data - * Input step should be smaller than MAX_STEP_SIZE for performance + * 1) Output should be contiguous, can handle strided input data + * 2) Input step should be smaller than MAX_STEP_SIZE for performance + * 3) Input and output arrays should have no overlap in memory */ #define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \ (steps[1] == (esize) && abs(steps[0]) < MAX_STEP_SIZE && \ - (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \ - ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \ - ((abs_ptrdiff(args[1], args[0]) == 0)))) + (nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0]))) #define IS_BLOCKABLE_REDUCE(esize, vsize) \ (steps[1] == (esize) && abs_ptrdiff(args[1], args[0]) >= (vsize) && \ @@ -253,7 +287,7 @@ static NPY_INLINE int run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) { #if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@ - if (IS_BINARY_SMALL_STEPS) { + if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) { AVX512F_@func@_@TYPE@(args, dimensions, steps); return 1; } @@ -1943,7 +1977,7 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s /* * Note: while generally indices are npy_intp, we ensure that our maximum index * will fit in an int32 as a precondition for this function via - * IS_BINARY_SMALL_STEPS + * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP */ npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@]; diff --git a/numpy/core/tests/data/umath-validation-set-cos b/numpy/core/tests/data/umath-validation-set-cos index 360ebcd6a..2e75f044c 100644 --- a/numpy/core/tests/data/umath-validation-set-cos +++ b/numpy/core/tests/data/umath-validation-set-cos @@ -19,9 +19,7 @@ np.float32,0x80000001,0x3f800000,2 np.float32,0x00000000,0x3f800000,2 np.float32,0x80000000,0x3f800000,2 np.float32,0x00800000,0x3f800000,2 -np.float32,0x7f7fffff,0x3f5a5f96,2 np.float32,0x80800000,0x3f800000,2 -np.float32,0xff7fffff,0x3f5a5f96,2 ## 1.00f + 0x00000001 ## np.float32,0x3f800000,0x3f0a5140,2 np.float32,0x3f800001,0x3f0a513f,2 @@ -36,26 +34,6 @@ np.float32,0x41d92388,0xbed987c7,2 np.float32,0x422dd66c,0x3f5dcab3,2 np.float32,0xc28f5be6,0xbf5688d8,2 np.float32,0x41ab2674,0xbf53aa3b,2 -np.float32,0xd0102756,0x3f45d12d,2 -np.float32,0xcf99405e,0xbe9cf281,2 -np.float32,0xcfd83a12,0x3eaae4ca,2 -np.float32,0x4fb54db0,0xbf7b2894,2 -np.float32,0xcfcca29d,0x3f752e4e,2 -np.float32,0xceec2ac0,0xbf745303,2 -np.float32,0xcfdca97f,0x3ef554a7,2 -np.float32,0xcfe92b0a,0x3f4618f2,2 -np.float32,0x5014b0eb,0x3ee933e6,2 -np.float32,0xcfa7ee96,0xbeedeeb2,2 -np.float32,0x754c09a0,0xbef298de,2 -np.float32,0x77a731fb,0x3f24599f,2 -np.float32,0x76de2494,0x3f79576c,2 -np.float32,0xf74920dc,0xbf4d196e,2 -np.float32,0x7707a312,0xbeb5cb8e,2 -np.float32,0x75bf9790,0xbf7fd7fe,2 -np.float32,0xf4ca7c40,0xbe15107d,2 -np.float32,0x77e91899,0xbe8a968b,2 
-np.float32,0xf74c9820,0xbf7f9677,2 -np.float32,0x7785ca29,0xbe6ef93b,2 np.float32,0x3f490fdb,0x3f3504f3,2 np.float32,0xbf490fdb,0x3f3504f3,2 np.float32,0x3fc90fdb,0xb33bbd2e,2 @@ -660,26 +638,6 @@ np.float32,0x4350ea79,0x3631dadb,2 np.float32,0x42dbe957,0xbf800000,2 np.float32,0x425be957,0xb505522a,2 np.float32,0x435be957,0x3f800000,2 -np.float32,0x487fe5ab,0xba140185,2 -np.float32,0x497fe5ab,0x3f7fffd5,2 -np.float32,0x49ffe5ab,0x3f7fff55,2 -np.float32,0x49ffeb37,0x3b9382f5,2 -np.float32,0x497ff0c3,0x3b13049f,2 -np.float32,0x49fff0c3,0xbf7fff57,2 -np.float32,0x49fff64f,0xbb928618,2 -np.float32,0x497ffbdb,0xbf7fffd6,2 -np.float32,0x49fffbdb,0x3f7fff59,2 -np.float32,0x48fffbdb,0xba9207c6,2 -np.float32,0x4e736e56,0xbf800000,2 -np.float32,0x4d4da377,0xbf800000,2 -np.float32,0x4ece58c3,0xbf800000,2 -np.float32,0x4ee0db9c,0xbf800000,2 -np.float32,0x4dee7002,0x3f800000,2 -np.float32,0x4ee86afc,0x38857a23,2 -np.float32,0x4dca4f3f,0xbf800000,2 -np.float32,0x4ecb48af,0xb95d1e10,2 -np.float32,0x4e51e33f,0xbf800000,2 -np.float32,0x4ef5f421,0xbf800000,2 np.float32,0x46027eb2,0x3e7d94c9,2 np.float32,0x4477baed,0xbe7f1824,2 np.float32,0x454b8024,0x3e7f5268,2 diff --git a/numpy/core/tests/data/umath-validation-set-sin b/numpy/core/tests/data/umath-validation-set-sin index a56273195..64e78ae15 100644 --- a/numpy/core/tests/data/umath-validation-set-sin +++ b/numpy/core/tests/data/umath-validation-set-sin @@ -19,9 +19,7 @@ np.float32,0x80000001,0x80000001,2 np.float32,0x00000000,0x00000000,2 np.float32,0x80000000,0x80000000,2 np.float32,0x00800000,0x00800000,2 -np.float32,0x7f7fffff,0xbf0599b3,2 np.float32,0x80800000,0x80800000,2 -np.float32,0xff7fffff,0x3f0599b3,2 ## 1.00f ## np.float32,0x3f800000,0x3f576aa4,2 np.float32,0x3f800001,0x3f576aa6,2 @@ -36,26 +34,6 @@ np.float32,0x41d92388,0x3f67beef,2 np.float32,0x422dd66c,0xbeffb0c1,2 np.float32,0xc28f5be6,0xbf0bae79,2 np.float32,0x41ab2674,0x3f0ffe2b,2 -np.float32,0xd0102756,0x3f227e8a,2 -np.float32,0xcf99405e,0x3f73ad00,2 -np.float32,0xcfd83a12,0xbf7151a7,2 -np.float32,0x4fb54db0,0xbe46354b,2 -np.float32,0xcfcca29d,0xbe9345e6,2 -np.float32,0xceec2ac0,0x3e98dc89,2 -np.float32,0xcfdca97f,0xbf60b2b4,2 -np.float32,0xcfe92b0a,0xbf222705,2 -np.float32,0x5014b0eb,0x3f63e75c,2 -np.float32,0xcfa7ee96,0x3f62ada4,2 -np.float32,0x754c09a0,0xbf617056,2 -np.float32,0x77a731fb,0x3f44472b,2 -np.float32,0x76de2494,0xbe680739,2 -np.float32,0xf74920dc,0xbf193338,2 -np.float32,0x7707a312,0xbf6f51b1,2 -np.float32,0x75bf9790,0xbd0f1a47,2 -np.float32,0xf4ca7c40,0xbf7d45e7,2 -np.float32,0x77e91899,0x3f767181,2 -np.float32,0xf74c9820,0xbd685b75,2 -np.float32,0x7785ca29,0x3f78ee61,2 np.float32,0x3f490fdb,0x3f3504f3,2 np.float32,0xbf490fdb,0xbf3504f3,2 np.float32,0x3fc90fdb,0x3f800000,2 @@ -660,46 +638,21 @@ np.float32,0x4350ea79,0x3f800000,2 np.float32,0x42dbe957,0x3585522a,2 np.float32,0x425be957,0xbf800000,2 np.float32,0x435be957,0xb605522a,2 -np.float32,0x487fe5ab,0xbf7ffffd,2 -np.float32,0x497fe5ab,0xbb14017d,2 -np.float32,0x49ffe5ab,0xbb940164,2 -np.float32,0x49ffeb37,0x3f7fff56,2 -np.float32,0x497ff0c3,0x3f7fffd6,2 -np.float32,0x49fff0c3,0x3b930487,2 -np.float32,0x49fff64f,0xbf7fff58,2 -np.float32,0x497ffbdb,0x3b1207c0,2 -np.float32,0x49fffbdb,0xbb9207a9,2 -np.float32,0x48fffbdb,0xbf7ffff6,2 -np.float32,0x4e736e56,0x397fa7f2,2 -np.float32,0x4d4da377,0xb57c64bc,2 -np.float32,0x4ece58c3,0xb80846c8,2 -np.float32,0x4ee0db9c,0x394c4786,2 -np.float32,0x4dee7002,0x381bce96,2 -np.float32,0x4ee86afc,0x3f800000,2 -np.float32,0x4dca4f3f,0xb8e25111,2 -np.float32,0x4ecb48af,0xbf800000,2 
-np.float32,0x4e51e33f,0xb8a4fa6f,2 -np.float32,0x4ef5f421,0x387ca7df,2 np.float32,0x476362a2,0xbd7ff911,2 np.float32,0x464c99a4,0x3e7f4d41,2 np.float32,0x4471f73d,0x3e7fe1b0,2 np.float32,0x445a6752,0x3e7ef367,2 np.float32,0x474fa400,0x3e7f9fcd,2 -np.float32,0x47c9e70e,0xbb4bba09,2 np.float32,0x45c1e72f,0xbe7fc7af,2 np.float32,0x4558c91d,0x3e7e9f31,2 np.float32,0x43784f94,0xbdff6654,2 np.float32,0x466e8500,0xbe7ea0a3,2 np.float32,0x468e1c25,0x3e7e22fb,2 -np.float32,0x47d28adc,0xbe7d5e6b,2 np.float32,0x44ea6cfc,0x3dff70c3,2 np.float32,0x4605126c,0x3e7f89ef,2 np.float32,0x4788b3c6,0xbb87d853,2 np.float32,0x4531b042,0x3dffd163,2 -np.float32,0x47e46c29,0xbe7def2b,2 -np.float32,0x47c10e07,0xbdff63d4,2 np.float32,0x43f1f71d,0x3dfff387,2 -np.float32,0x47c3e38c,0x3e7f0b2f,2 np.float32,0x462c3fa5,0xbd7fe13d,2 np.float32,0x441c5354,0xbdff76b4,2 np.float32,0x44908b69,0x3e7dcf0d,2 diff --git a/numpy/core/tests/test_arrayprint.py b/numpy/core/tests/test_arrayprint.py index cd3e501a5..008ca20e6 100644 --- a/numpy/core/tests/test_arrayprint.py +++ b/numpy/core/tests/test_arrayprint.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- import sys import gc +from hypothesis import given +from hypothesis.extra import numpy as hynp import pytest import numpy as np @@ -393,6 +395,18 @@ class TestArray2String: "[ 'xxxxx']" ) + @given(hynp.from_dtype(np.dtype("U"))) + def test_any_text(self, text): + # This test checks that, given any value that can be represented in an + # array of dtype("U") (i.e. unicode string), ... + a = np.array([text, text, text]) + # casting a list of them to an array does not e.g. truncate the value + assert_equal(a[0], text) + # and that np.array2string puts a newline in the expected location + expected_repr = "[{0!r} {0!r}\n {0!r}]".format(text) + result = np.array2string(a, max_line_width=len(repr(text)) * 2 + 3) + assert_equal(result, expected_repr) + @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts") def test_refcount(self): # make sure we do not hold references to the array due to a recursive diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index ad38911cb..13244f3ba 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -7854,6 +7854,34 @@ class TestBytestringArrayNonzero: assert_(a) +class TestUnicodeEncoding: + """ + Tests for encoding related bugs, such as UCS2 vs UCS4, round-tripping + issues, etc + """ + def test_round_trip(self): + """ Tests that GETITEM, SETITEM, and PyArray_Scalar roundtrip """ + # gh-15363 + arr = np.zeros(shape=(), dtype="U1") + for i in range(1, sys.maxunicode + 1): + expected = chr(i) + arr[()] = expected + assert arr[()] == expected + assert arr.item() == expected + + def test_assign_scalar(self): + # gh-3258 + l = np.array(['aa', 'bb']) + l[:] = np.unicode_('cc') + assert_equal(l, ['cc', 'cc']) + + def test_fill_scalar(self): + # gh-7227 + l = np.array(['aa', 'bb']) + l.fill(np.unicode_('cc')) + assert_equal(l, ['cc', 'cc']) + + class TestUnicodeArrayNonzero: def test_empty_ustring_array_is_falsey(self): diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py index 3bc4cd187..05f59d9dc 100644 --- a/numpy/core/tests/test_numeric.py +++ b/numpy/core/tests/test_numeric.py @@ -14,6 +14,9 @@ from numpy.testing import ( assert_warns, HAS_REFCOUNT ) +from hypothesis import assume, given, strategies as st +from hypothesis.extra import numpy as hynp + class TestResize: def test_copies(self): @@ -1997,12 +2000,12 @@ class TestClip: np.array(np.nan), 
np.zeros(10, dtype=np.int32)), ]) + @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_clip_scalar_nan_propagation(self, arr, amin, amax): # enforcement of scalar nan propagation for comparisons # called through clip() expected = np.minimum(np.maximum(arr, amin), amax) - with assert_warns(DeprecationWarning): - actual = np.clip(arr, amin, amax) + actual = np.clip(arr, amin, amax) assert_equal(actual, expected) @pytest.mark.xfail(reason="propagation doesn't match spec") @@ -2011,6 +2014,7 @@ class TestClip: np.timedelta64('NaT'), np.zeros(10, dtype=np.int32)), ]) + @pytest.mark.filterwarnings("ignore::DeprecationWarning") def test_NaT_propagation(self, arr, amin, amax): # NOTE: the expected function spec doesn't # propagate NaT, but clip() now does actual = np.clip(arr, amin, amax) assert_equal(actual, expected) @@ -2018,6 +2022,68 @@ class TestClip: + @given(data=st.data(), shape=hynp.array_shapes()) + def test_clip_property(self, data, shape): + """A property-based test using Hypothesis. + + This aims for maximum generality: it could in principle generate *any* + valid inputs to np.clip, and in practice generates much more varied + inputs than human testers come up with. + + Because many of the inputs have tricky dependencies - compatible dtypes + and mutually-broadcastable shapes - we use the `st.data()` strategy to + draw values *inside* the test function, from strategies we construct + based on previous values. An alternative would be to define a custom + strategy with `@st.composite`, but until we have duplicated code, + inline is fine. + + That accounts for most of the function; the actual test is just three + lines to calculate and compare actual vs expected results! + """ + # Our base array and bounds should not need to be of the same type as + # long as they are all compatible - so we allow any int or float type. + dtype_strategy = hynp.integer_dtypes() | hynp.floating_dtypes() + + # The following line is a total hack to disable the varied-dtypes + # component of this test, because result != expected if dtypes can vary. + dtype_strategy = st.just(data.draw(dtype_strategy)) + + # Generate an arbitrary array of the chosen shape and dtype + # This is the value that we clip. + arr = data.draw(hynp.arrays(dtype=dtype_strategy, shape=shape)) + + # Generate shapes for the bounds which can be broadcast with each other + # and with the base shape. Below, we might decide to use scalar bounds, + # but it's clearer to generate these shapes unconditionally in advance. + in_shapes, result_shape = data.draw( + hynp.mutually_broadcastable_shapes( + num_shapes=2, + base_shape=shape, + # Commenting out the min_dims line allows zero-dimensional arrays, + # and zero-dimensional arrays containing NaN make the test fail. + min_dims=1 + + ) + ) + amin = data.draw( + dtype_strategy.flatmap(hynp.from_dtype) + | hynp.arrays(dtype=dtype_strategy, shape=in_shapes[0]) + ) + amax = data.draw( + dtype_strategy.flatmap(hynp.from_dtype) + | hynp.arrays(dtype=dtype_strategy, shape=in_shapes[1]) + ) + # If we allow either bound to be a scalar `nan`, the test will fail - + # so we just "assume" that away (if it is, this raises a special + # exception and Hypothesis will try again with different inputs) + assume(not np.isscalar(amin) or not np.isnan(amin)) + assume(not np.isscalar(amax) or not np.isnan(amax)) + + # Then calculate our result and expected result and check that they're + # equal! See gh-12519 for discussion deciding on this property. + result = np.clip(arr, amin, amax) + expected = np.minimum(amax, np.maximum(arr, amin)) + assert_array_equal(result, expected) +
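test_clip_property above draws later values from strategies built on earlier draws via st.data(). A stripped-down sketch of that pattern (the test name and strategies here are illustrative, not from the commit):

    from hypothesis import given, strategies as st
    from hypothesis.extra import numpy as hynp

    @given(data=st.data(), shape=hynp.array_shapes())
    def test_interactive_draws(data, shape):
        # the dtype is drawn first, then an array strategy is built from it
        dtype = data.draw(hynp.integer_dtypes())
        arr = data.draw(hynp.arrays(dtype=dtype, shape=shape))
        assert arr.shape == shape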
class TestAllclose: rtol = 1e-5 diff --git a/numpy/core/tests/test_scalarbuffer.py b/numpy/core/tests/test_scalarbuffer.py index b8c6dd4aa..b1c1bbbb1 100644 --- a/numpy/core/tests/test_scalarbuffer.py +++ b/numpy/core/tests/test_scalarbuffer.py @@ -76,27 +76,44 @@ class TestScalarPEP3118: assert_equal(mv_x.itemsize, mv_a.itemsize) assert_equal(mv_x.format, mv_a.format) + def _as_dict(self, m): + return dict(strides=m.strides, shape=m.shape, itemsize=m.itemsize, + ndim=m.ndim, format=m.format) + def test_datetime_memoryview(self): # gh-11656 # Values verified with v1.13.3, shape is not () as in test_scalar_dim - def as_dict(m): - return dict(strides=m.strides, shape=m.shape, itemsize=m.itemsize, - ndim=m.ndim, format=m.format) dt1 = np.datetime64('2016-01-01') dt2 = np.datetime64('2017-01-01') - expected = {'strides': (1,), 'itemsize': 1, 'ndim': 1, - 'shape': (8,), 'format': 'B'} + expected = dict(strides=(1,), itemsize=1, ndim=1, shape=(8,), + format='B') v = memoryview(dt1) - res = as_dict(v) - assert_equal(res, expected) + assert self._as_dict(v) == expected v = memoryview(dt2 - dt1) - res = as_dict(v) - assert_equal(res, expected) + assert self._as_dict(v) == expected dt = np.dtype([('a', 'uint16'), ('b', 'M8[s]')]) a = np.empty(1, dt) # Fails to create a PEP 3118 valid buffer assert_raises((ValueError, BufferError), memoryview, a[0]) + @pytest.mark.parametrize('s', [ + pytest.param("\x32\x32", id="ascii"), + pytest.param("\uFE0F\uFE0F", id="basic multilingual"), + pytest.param("\U0001f4bb\U0001f4bb", id="non-BMP"), + ]) + def test_str_ucs4(self, s): + s = np.str_(s) # only our subclass implements the buffer protocol + + # all the same, characters always encode as ucs4 + expected = dict(strides=(), itemsize=8, ndim=0, shape=(), format='2w') + + v = memoryview(s) + assert self._as_dict(v) == expected + + # integers of the platform-appropriate endianness + code_points = np.frombuffer(v, dtype='i4') + + assert_equal(code_points, [ord(c) for c in s]) diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index d1d4467d6..233a0b1d6 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -3157,6 +3157,14 @@ def test_rint_big_int(): # Rint should not change the value assert_equal(val, np.rint(val)) +@pytest.mark.parametrize('ftype', [np.float32, np.float64]) +def test_memoverlap_accumulate(ftype): + # Reproduces bug https://github.com/numpy/numpy/issues/15597 + arr = np.array([0.61, 0.60, 0.77, 0.41, 0.19], dtype=ftype) + out_max = np.array([0.61, 0.61, 0.77, 0.77, 0.77], dtype=ftype) + out_min = np.array([0.61, 0.60, 0.60, 0.41, 0.19], dtype=ftype) + assert_equal(np.maximum.accumulate(arr), out_max) + assert_equal(np.minimum.accumulate(arr), out_min) def test_signaling_nan_exceptions(): with assert_no_warnings():
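The new test_memoverlap_accumulate above pins the gh-15597 regression: accumulate feeds its own output back in as an input, so the vectorized maximum/minimum loops must detect the overlap and fall back to the scalar path. The same check as a standalone snippet:

    import numpy as np

    arr = np.array([0.61, 0.60, 0.77, 0.41, 0.19], dtype=np.float32)
    expected = np.array([0.61, 0.61, 0.77, 0.77, 0.77], dtype=np.float32)
    assert np.array_equal(np.maximum.accumulate(arr), expected)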
diff --git a/numpy/core/tests/test_umath_accuracy.py b/numpy/core/tests/test_umath_accuracy.py index 32211974c..fd7214396 100644 --- a/numpy/core/tests/test_umath_accuracy.py +++ b/numpy/core/tests/test_umath_accuracy.py @@ -5,14 +5,14 @@ import sys import pytest from ctypes import c_float, c_int, cast, pointer, POINTER from numpy.testing import assert_array_max_ulp +from numpy.core._multiarray_umath import __cpu_features__ -runtest = sys.platform.startswith('linux') and (platform.machine() == 'x86_64') +IS_AVX = __cpu_features__.get('AVX512F', False) or \ + (__cpu_features__.get('FMA3', False) and __cpu_features__.get('AVX2', False)) +runtest = sys.platform.startswith('linux') and IS_AVX platform_skip = pytest.mark.skipif(not runtest, - reason=""" - stick to x86_64 and linux platforms. - test seems to fail on some of ARM and power - architectures. - """) + reason="avoid testing inconsistent platform " + "library implementations") # convert string to hex function taken from: # https://stackoverflow.com/questions/1592158/convert-hex-to-float # @@ -29,7 +29,7 @@ files = ['umath-validation-set-exp', 'umath-validation-set-cos'] class TestAccuracy: - @pytest.mark.xfail(reason="Fails for MacPython/numpy-wheels builds") + @platform_skip def test_validate_transcendentals(self): with np.errstate(all='ignore'): for filename in files: @@ -37,18 +37,24 @@ class TestAccuracy: filepath = path.join(data_dir, filename) with open(filepath) as fid: file_without_comments = (r for r in fid if not r[0] in ('$', '#')) - data = np.genfromtxt(file_without_comments, - dtype=('|S39','|S39','|S39',int), - names=('type','input','output','ulperr'), - delimiter=',', - skip_header=1) - npfunc = getattr(np, filename.split('-')[3]) - for datatype in np.unique(data['type']): - data_subset = data[data['type'] == datatype] - inval = np.array(str_to_float(data_subset['input'].astype(str)), dtype=eval(datatype)) - outval = np.array(str_to_float(data_subset['output'].astype(str)), dtype=eval(datatype)) - perm = np.random.permutation(len(inval)) - inval = inval[perm] - outval = outval[perm] - maxulperr = data_subset['ulperr'].max() - assert_array_max_ulp(npfunc(inval), outval, maxulperr) + data = np.genfromtxt(file_without_comments, + dtype=('|S39','|S39','|S39',int), + names=('type','input','output','ulperr'), + delimiter=',', + skip_header=1) + npfunc = getattr(np, filename.split('-')[3]) + for datatype in np.unique(data['type']): + data_subset = data[data['type'] == datatype] + inval = np.array(str_to_float(data_subset['input'].astype(str)), dtype=eval(datatype)) + outval = np.array(str_to_float(data_subset['output'].astype(str)), dtype=eval(datatype)) + perm = np.random.permutation(len(inval)) + inval = inval[perm] + outval = outval[perm] + maxulperr = data_subset['ulperr'].max() + assert_array_max_ulp(npfunc(inval), outval, maxulperr) + + def test_ignore_nan_ulperror(self): + # Ignore ULP differences between various NAN's + nan1_f32 = np.array(str_to_float('0xffffffff'), dtype=np.float32) + nan2_f32 = np.array(str_to_float('0x7fddbfbf'), dtype=np.float32) + assert_array_max_ulp(nan1_f32, nan2_f32, 0) diff --git a/numpy/core/umath.py b/numpy/core/umath.py index d2f769505..6a5474ffe 100644 --- a/numpy/core/umath.py +++ b/numpy/core/umath.py @@ -8,7 +8,10 @@ by importing from the extension module. from . import _multiarray_umath from ._multiarray_umath import * # noqa: F403 -from ._multiarray_umath import _UFUNC_API, _add_newdoc_ufunc +# These imports are needed for backward compatibility, +# do not change them. 
issue gh-11862 +# _ones_like is semi-public, on purpose not added to __all__ from ._multiarray_umath import _UFUNC_API, _add_newdoc_ufunc, _ones_like __all__ = [ '_UFUNC_API', 'ERR_CALL', 'ERR_DEFAULT', 'ERR_IGNORE', 'ERR_LOG', diff --git a/numpy/fft/_pocketfft.py b/numpy/fft/_pocketfft.py index f2510a6c2..3eab242e5 100644 --- a/numpy/fft/_pocketfft.py +++ b/numpy/fft/_pocketfft.py @@ -59,12 +59,11 @@ def _raw_fft(a, n, axis, is_real, is_forward, inv_norm): if a.shape[axis] != n: s = list(a.shape) + index = [slice(None)]*len(s) if s[axis] > n: - index = [slice(None)]*len(s) index[axis] = slice(0, n) a = a[tuple(index)] else: - index = [slice(None)]*len(s) index[axis] = slice(0, s[axis]) s[axis] = n z = zeros(s, a.dtype.char) diff --git a/numpy/lib/shape_base.py b/numpy/lib/shape_base.py index 7634af010..b7f1f16f2 100644 --- a/numpy/lib/shape_base.py +++ b/numpy/lib/shape_base.py @@ -269,8 +269,8 @@ def apply_along_axis(func1d, axis, arr, *args, **kwargs): """ Apply a function to 1-D slices along the given axis. - Execute `func1d(a, *args)` where `func1d` operates on 1-D arrays and `a` - is a 1-D slice of `arr` along `axis`. + Execute `func1d(a, *args, **kwargs)` where `func1d` operates on 1-D arrays + and `a` is a 1-D slice of `arr` along `axis`. This is equivalent to (but faster than) the following use of `ndindex` and `s_`, which sets each of ``ii``, ``jj``, and ``kk`` to a tuple of indices:: diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py index 1d572f876..85f714ebf 100644 --- a/numpy/linalg/linalg.py +++ b/numpy/linalg/linalg.py @@ -1223,8 +1223,10 @@ def eig(a): Hermitian (conjugate symmetric) array. eigvalsh : eigenvalues of a real symmetric or complex Hermitian (conjugate symmetric) array. - scipy.linalg.eig : Similar function in SciPy (but also solves the - generalized eigenvalue problem). + scipy.linalg.eig : Similar function in SciPy that also solves the + generalized eigenvalue problem. + scipy.linalg.schur : Best choice for unitary and other non-Hermitian + normal matrices. Notes ----- @@ -1238,21 +1240,26 @@ def eig(a): the eigenvalues and eigenvectors of general square arrays. The number `w` is an eigenvalue of `a` if there exists a vector - `v` such that ``dot(a,v) = w * v``. Thus, the arrays `a`, `w`, and - `v` satisfy the equations ``dot(a[:,:], v[:,i]) = w[i] * v[:,i]`` + `v` such that ``a @ v = w * v``. Thus, the arrays `a`, `w`, and + `v` satisfy the equations ``a @ v[:,i] = w[i] * v[:,i]`` for :math:`i \\in \\{0,...,M-1\\}`. The array `v` of eigenvectors may not be of maximum rank, that is, some of the columns may be linearly dependent, although round-off error may obscure that fact. If the eigenvalues are all different, then theoretically - the eigenvectors are linearly independent. Likewise, the (complex-valued) - matrix of eigenvectors `v` is unitary if the matrix `a` is normal, i.e., - if ``dot(a, a.H) = dot(a.H, a)``, where `a.H` denotes the conjugate - transpose of `a`. + the eigenvectors are linearly independent and `a` can be diagonalized by + a similarity transformation using `v`, i.e., ``inv(v) @ a @ v`` is diagonal. + + For non-Hermitian normal matrices the SciPy function `scipy.linalg.schur` + is preferred because the matrix `v` is guaranteed to be unitary, which is + not the case when using `eig`. The Schur factorization produces an + upper triangular matrix rather than a diagonal matrix, but for normal + matrices only the diagonal of the upper triangular matrix is needed, the + rest is roundoff error. 
Finally, it is emphasized that `v` consists of the *right* (as in right-hand side) eigenvectors of `a`. A vector `y` satisfying - ``dot(y.T, a) = z * y.T`` for some number `z` is called a *left* + ``y.T @ a = z * y.T`` for some number `z` is called a *left* eigenvector of `a`, and, in general, the left and right eigenvectors of a matrix are not necessarily the (perhaps conjugate) transposes of each other. diff --git a/numpy/random/mtrand.pyx b/numpy/random/mtrand.pyx index 3d0318752..6387814c8 100644 --- a/numpy/random/mtrand.pyx +++ b/numpy/random/mtrand.pyx @@ -1017,7 +1017,7 @@ cdef class RandomState: greater than or equal to low. The default value is 0. high : float or array_like of floats Upper boundary of the output interval. All values generated will be - less than high. The default value is 1.0. + less than or equal to high. The default value is 1.0. size : int or tuple of ints, optional Output shape. If the given shape is, e.g., ``(m, n, k)``, then ``m * n * k`` samples are drawn. If size is ``None`` (default), @@ -1053,7 +1053,14 @@ cdef class RandomState: If ``high`` < ``low``, the results are officially undefined and may eventually raise an error, i.e. do not rely on this function to behave when passed arguments satisfying that - inequality condition. + inequality condition. The ``high`` limit may be included in the + returned array of floats due to floating-point rounding in the + equation ``low + (high-low) * random_sample()``. For example: + + >>> x = np.float32(5*0.99999999) + >>> x + 5.0 + Examples -------- diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py index 67b7d317c..2842eb147 100644 --- a/numpy/testing/_private/utils.py +++ b/numpy/testing/_private/utils.py @@ -284,7 +284,7 @@ def assert_equal(actual, desired, err_msg='', verbose=True): the scalar. This function handles NaN comparisons as if NaN was a "normal" number. - That is, no assertion is raised if both objects have NaNs in the same + That is, AssertionError is not raised if both objects have NaNs in the same positions. This is in contrast to the IEEE standard on NaNs, which says that NaN compared to anything must return False. @@ -1609,6 +1609,12 @@ def assert_array_max_ulp(a, b, maxulp=1, dtype=None): AssertionError If one or more elements differ by more than `maxulp`. + Notes + ----- + For computing the ULP difference, this API does not differentiate between + various representations of NAN (ULP difference between 0x7fc00000 and 0xffc00000 + is zero). + See Also -------- assert_array_almost_equal_nulp : Compare two arrays relatively to their @@ -1649,6 +1655,12 @@ def nulp_diff(x, y, dtype=None): number of representable floating point numbers between each item in x and y. + Notes + ----- + For computing the ULP difference, this API does not differentiate between + various representations of NAN (ULP difference between 0x7fc00000 and 0xffc00000 + is zero). + Examples -------- # By definition, epsilon is the smallest number such that 1 + eps != 1, so @@ -1668,8 +1680,11 @@ def nulp_diff(x, y, dtype=None): if np.iscomplexobj(x) or np.iscomplexobj(y): raise NotImplementedError("_nulp not implemented for complex array") - x = np.array(x, dtype=t) - y = np.array(y, dtype=t) + x = np.array([x], dtype=t) + y = np.array([y], dtype=t) + + x[np.isnan(x)] = np.nan + y[np.isnan(y)] = np.nan if not x.shape == y.shape: raise ValueError("x and y do not have the same shape: %s - %s" %