summaryrefslogtreecommitdiff
path: root/numpy
diff options
context:
space:
mode:
Diffstat (limited to 'numpy')
-rw-r--r--numpy/__init__.cython-30.pxd1055
-rw-r--r--numpy/__init__.pxd110
-rw-r--r--numpy/__init__.py47
-rw-r--r--numpy/__init__.pyi2646
-rw-r--r--numpy/_pytesttester.py3
-rw-r--r--numpy/_version.py525
-rw-r--r--numpy/char.pyi53
-rw-r--r--numpy/compat/py3k.py65
-rw-r--r--numpy/conftest.py29
-rw-r--r--numpy/core/__init__.py19
-rw-r--r--numpy/core/__init__.pyi0
-rw-r--r--numpy/core/_add_newdocs.py371
-rw-r--r--numpy/core/_add_newdocs_scalars.py251
-rw-r--r--numpy/core/_asarray.py113
-rw-r--r--numpy/core/_asarray.pyi77
-rw-r--r--numpy/core/_dtype.py2
-rw-r--r--numpy/core/_exceptions.py2
-rw-r--r--numpy/core/_internal.py34
-rw-r--r--numpy/core/_methods.py91
-rw-r--r--numpy/core/_type_aliases.pyi19
-rw-r--r--numpy/core/_ufunc_config.pyi43
-rw-r--r--numpy/core/arrayprint.py3
-rw-r--r--numpy/core/code_generators/cversions.txt5
-rw-r--r--numpy/core/code_generators/genapi.py1
-rw-r--r--numpy/core/code_generators/generate_umath.py42
-rw-r--r--numpy/core/code_generators/ufunc_docstrings.py232
-rw-r--r--numpy/core/defchararray.py24
-rw-r--r--numpy/core/einsumfunc.py11
-rw-r--r--numpy/core/fromnumeric.py174
-rw-r--r--numpy/core/fromnumeric.pyi483
-rw-r--r--numpy/core/function_base.py12
-rw-r--r--numpy/core/function_base.pyi56
-rw-r--r--numpy/core/include/numpy/arrayscalars.h4
-rw-r--r--numpy/core/include/numpy/libdivide/LICENSE.txt21
-rw-r--r--numpy/core/include/numpy/libdivide/libdivide.h2079
-rw-r--r--numpy/core/include/numpy/ndarraytypes.h40
-rw-r--r--numpy/core/include/numpy/npy_3kcompat.h27
-rw-r--r--numpy/core/include/numpy/npy_common.h31
-rw-r--r--numpy/core/include/numpy/npy_cpu.h3
-rw-r--r--numpy/core/include/numpy/npy_interrupt.h91
-rw-r--r--numpy/core/include/numpy/npy_math.h12
-rw-r--r--numpy/core/include/numpy/numpyconfig.h3
-rw-r--r--numpy/core/memmap.py21
-rw-r--r--numpy/core/multiarray.py31
-rw-r--r--numpy/core/numeric.py88
-rw-r--r--numpy/core/numeric.pyi189
-rw-r--r--numpy/core/numerictypes.py40
-rw-r--r--numpy/core/numerictypes.pyi29
-rw-r--r--numpy/core/overrides.py21
-rw-r--r--numpy/core/records.py34
-rw-r--r--numpy/core/setup.py90
-rw-r--r--numpy/core/setup_common.py11
-rw-r--r--numpy/core/shape_base.py3
-rw-r--r--numpy/core/shape_base.pyi41
-rw-r--r--numpy/core/src/_simd/_simd.c73
-rw-r--r--numpy/core/src/_simd/_simd.dispatch.c.src612
-rw-r--r--numpy/core/src/_simd/_simd.h30
-rw-r--r--numpy/core/src/_simd/_simd_arg.inc85
-rw-r--r--numpy/core/src/_simd/_simd_convert.inc210
-rw-r--r--numpy/core/src/_simd/_simd_data.inc.src93
-rw-r--r--numpy/core/src/_simd/_simd_easyintrin.inc214
-rw-r--r--numpy/core/src/_simd/_simd_inc.h.src421
-rw-r--r--numpy/core/src/_simd/_simd_vector.inc178
-rw-r--r--numpy/core/src/common/array_assign.c28
-rw-r--r--numpy/core/src/common/lowlevel_strided_loops.h37
-rw-r--r--numpy/core/src/common/npy_binsearch.h.src8
-rw-r--r--numpy/core/src/common/npy_cblas.h35
-rw-r--r--numpy/core/src/common/npy_config.h9
-rw-r--r--numpy/core/src/common/npy_cpu_dispatch.h45
-rw-r--r--numpy/core/src/common/npy_cpu_features.c.src76
-rw-r--r--numpy/core/src/common/npy_cpu_features.h2
-rw-r--r--numpy/core/src/common/npy_partition.h.src4
-rw-r--r--numpy/core/src/common/npy_sort.h.src52
-rw-r--r--numpy/core/src/common/simd/avx2/arithmetic.h67
-rw-r--r--numpy/core/src/common/simd/avx2/avx2.h3
-rw-r--r--numpy/core/src/common/simd/avx2/conversion.h22
-rw-r--r--numpy/core/src/common/simd/avx2/math.h40
-rw-r--r--numpy/core/src/common/simd/avx2/memory.h286
-rw-r--r--numpy/core/src/common/simd/avx2/reorder.h32
-rw-r--r--numpy/core/src/common/simd/avx512/arithmetic.h59
-rw-r--r--numpy/core/src/common/simd/avx512/avx512.h4
-rw-r--r--numpy/core/src/common/simd/avx512/conversion.h31
-rw-r--r--numpy/core/src/common/simd/avx512/math.h49
-rw-r--r--numpy/core/src/common/simd/avx512/memory.h238
-rw-r--r--numpy/core/src/common/simd/avx512/reorder.h56
-rw-r--r--numpy/core/src/common/simd/neon/arithmetic.h78
-rw-r--r--numpy/core/src/common/simd/neon/conversion.h66
-rw-r--r--numpy/core/src/common/simd/neon/math.h86
-rw-r--r--numpy/core/src/common/simd/neon/memory.h287
-rw-r--r--numpy/core/src/common/simd/neon/neon.h1
-rw-r--r--numpy/core/src/common/simd/neon/reorder.h9
-rw-r--r--numpy/core/src/common/simd/simd.h49
-rw-r--r--numpy/core/src/common/simd/sse/arithmetic.h82
-rw-r--r--numpy/core/src/common/simd/sse/conversion.h21
-rw-r--r--numpy/core/src/common/simd/sse/math.h40
-rw-r--r--numpy/core/src/common/simd/sse/memory.h424
-rw-r--r--numpy/core/src/common/simd/sse/reorder.h41
-rw-r--r--numpy/core/src/common/simd/sse/sse.h1
-rw-r--r--numpy/core/src/common/simd/vsx/arithmetic.h28
-rw-r--r--numpy/core/src/common/simd/vsx/conversion.h22
-rw-r--r--numpy/core/src/common/simd/vsx/math.h36
-rw-r--r--numpy/core/src/common/simd/vsx/memory.h432
-rw-r--r--numpy/core/src/common/simd/vsx/reorder.h41
-rw-r--r--numpy/core/src/common/simd/vsx/vsx.h1
-rw-r--r--numpy/core/src/multiarray/_datetime.h13
-rw-r--r--numpy/core/src/multiarray/_multiarray_tests.c.src254
-rw-r--r--numpy/core/src/multiarray/alloc.c5
-rw-r--r--numpy/core/src/multiarray/array_assign_array.c15
-rw-r--r--numpy/core/src/multiarray/array_assign_scalar.c15
-rw-r--r--numpy/core/src/multiarray/array_coercion.c199
-rw-r--r--numpy/core/src/multiarray/array_method.c614
-rw-r--r--numpy/core/src/multiarray/array_method.h150
-rw-r--r--numpy/core/src/multiarray/arrayfunction_override.c187
-rw-r--r--numpy/core/src/multiarray/arrayfunction_override.h4
-rw-r--r--numpy/core/src/multiarray/arrayobject.c8
-rw-r--r--numpy/core/src/multiarray/arraytypes.c.src69
-rw-r--r--numpy/core/src/multiarray/buffer.c445
-rw-r--r--numpy/core/src/multiarray/calculation.c4
-rw-r--r--numpy/core/src/multiarray/common.c59
-rw-r--r--numpy/core/src/multiarray/common.h38
-rw-r--r--numpy/core/src/multiarray/compiled_base.c39
-rw-r--r--numpy/core/src/multiarray/conversion_utils.c3
-rw-r--r--numpy/core/src/multiarray/convert.c9
-rw-r--r--numpy/core/src/multiarray/convert_datatype.c2439
-rw-r--r--numpy/core/src/multiarray/convert_datatype.h64
-rw-r--r--numpy/core/src/multiarray/ctors.c491
-rw-r--r--numpy/core/src/multiarray/datetime.c676
-rw-r--r--numpy/core/src/multiarray/datetime_busday.c21
-rw-r--r--numpy/core/src/multiarray/datetime_busdaycal.c30
-rw-r--r--numpy/core/src/multiarray/datetime_strings.c43
-rw-r--r--numpy/core/src/multiarray/descriptor.c128
-rw-r--r--numpy/core/src/multiarray/dragon4.c4
-rw-r--r--numpy/core/src/multiarray/dtype_transfer.c781
-rw-r--r--numpy/core/src/multiarray/dtypemeta.c231
-rw-r--r--numpy/core/src/multiarray/dtypemeta.h16
-rw-r--r--numpy/core/src/multiarray/einsum.c.src1895
-rw-r--r--numpy/core/src/multiarray/einsum_debug.h28
-rw-r--r--numpy/core/src/multiarray/einsum_sumprod.c.src1878
-rw-r--r--numpy/core/src/multiarray/einsum_sumprod.h12
-rw-r--r--numpy/core/src/multiarray/flagsobject.c4
-rw-r--r--numpy/core/src/multiarray/getset.c44
-rw-r--r--numpy/core/src/multiarray/hashdescr.c6
-rw-r--r--numpy/core/src/multiarray/item_selection.c4
-rw-r--r--numpy/core/src/multiarray/iterators.c12
-rw-r--r--numpy/core/src/multiarray/legacy_dtype_implementation.c716
-rw-r--r--numpy/core/src/multiarray/legacy_dtype_implementation.h40
-rw-r--r--numpy/core/src/multiarray/lowlevel_strided_loops.c.src173
-rw-r--r--numpy/core/src/multiarray/mapping.c144
-rw-r--r--numpy/core/src/multiarray/methods.c125
-rw-r--r--numpy/core/src/multiarray/multiarraymodule.c535
-rw-r--r--numpy/core/src/multiarray/multiarraymodule.h2
-rw-r--r--numpy/core/src/multiarray/nditer_api.c182
-rw-r--r--numpy/core/src/multiarray/nditer_constr.c277
-rw-r--r--numpy/core/src/multiarray/nditer_impl.h9
-rw-r--r--numpy/core/src/multiarray/nditer_pywrap.c48
-rw-r--r--numpy/core/src/multiarray/nditer_templ.c.src20
-rw-r--r--numpy/core/src/multiarray/npy_buffer.h4
-rw-r--r--numpy/core/src/multiarray/number.c19
-rw-r--r--numpy/core/src/multiarray/refcount.c38
-rw-r--r--numpy/core/src/multiarray/scalarapi.c74
-rw-r--r--numpy/core/src/multiarray/scalartypes.c.src258
-rw-r--r--numpy/core/src/multiarray/shape.c65
-rw-r--r--numpy/core/src/multiarray/shape.h7
-rw-r--r--numpy/core/src/multiarray/strfuncs.c186
-rw-r--r--numpy/core/src/multiarray/temp_elide.c4
-rw-r--r--numpy/core/src/multiarray/usertypes.c223
-rw-r--r--numpy/core/src/multiarray/usertypes.h10
-rw-r--r--numpy/core/src/npymath/npy_math_internal.h.src102
-rw-r--r--numpy/core/src/npymath/npy_math_private.h1
-rw-r--r--numpy/core/src/npysort/binsearch.c.src8
-rw-r--r--numpy/core/src/npysort/heapsort.c.src12
-rw-r--r--numpy/core/src/npysort/mergesort.c.src12
-rw-r--r--numpy/core/src/npysort/quicksort.c.src12
-rw-r--r--numpy/core/src/npysort/radixsort.c.src8
-rw-r--r--numpy/core/src/npysort/selection.c.src5
-rw-r--r--numpy/core/src/npysort/timsort.c.src16
-rw-r--r--numpy/core/src/umath/_rational_tests.c.src39
-rw-r--r--numpy/core/src/umath/_umath_tests.c.src25
-rw-r--r--numpy/core/src/umath/extobj.c8
-rw-r--r--numpy/core/src/umath/fast_loop_macros.h15
-rw-r--r--numpy/core/src/umath/funcs.inc.src4
-rw-r--r--numpy/core/src/umath/loops.c.src390
-rw-r--r--numpy/core/src/umath/loops.h.src37
-rw-r--r--numpy/core/src/umath/loops_unary_fp.dispatch.c.src219
-rw-r--r--numpy/core/src/umath/loops_utils.h42
-rw-r--r--numpy/core/src/umath/matmul.c.src2
-rw-r--r--numpy/core/src/umath/npy_simd_data.h154
-rw-r--r--numpy/core/src/umath/override.c2
-rw-r--r--numpy/core/src/umath/reduction.c8
-rw-r--r--numpy/core/src/umath/scalarmath.c.src185
-rw-r--r--numpy/core/src/umath/simd.inc.src544
-rw-r--r--numpy/core/src/umath/ufunc_object.c171
-rw-r--r--numpy/core/src/umath/ufunc_type_resolution.c65
-rw-r--r--numpy/core/src/umath/umathmodule.c74
-rw-r--r--numpy/core/tests/data/umath-validation-set-log153
-rw-r--r--numpy/core/tests/examples/checks.pyx4
-rw-r--r--numpy/core/tests/examples/setup.py3
-rw-r--r--numpy/core/tests/test__exceptions.py16
-rw-r--r--numpy/core/tests/test_api.py36
-rw-r--r--numpy/core/tests/test_array_coercion.py129
-rw-r--r--numpy/core/tests/test_casting_unittests.py301
-rw-r--r--numpy/core/tests/test_cython.py16
-rw-r--r--numpy/core/tests/test_datetime.py26
-rw-r--r--numpy/core/tests/test_defchararray.py6
-rw-r--r--numpy/core/tests/test_deprecations.py109
-rw-r--r--numpy/core/tests/test_dtype.py74
-rw-r--r--numpy/core/tests/test_function_base.py5
-rw-r--r--numpy/core/tests/test_indexing.py39
-rw-r--r--numpy/core/tests/test_memmap.py7
-rw-r--r--numpy/core/tests/test_multiarray.py323
-rw-r--r--numpy/core/tests/test_nditer.py65
-rw-r--r--numpy/core/tests/test_numeric.py147
-rw-r--r--numpy/core/tests/test_overrides.py154
-rw-r--r--numpy/core/tests/test_records.py18
-rw-r--r--numpy/core/tests/test_regression.py47
-rw-r--r--numpy/core/tests/test_scalarbuffer.py39
-rw-r--r--numpy/core/tests/test_scalarmath.py7
-rw-r--r--numpy/core/tests/test_shape_base.py39
-rw-r--r--numpy/core/tests/test_simd.py671
-rw-r--r--numpy/core/tests/test_simd_module.py97
-rw-r--r--numpy/core/tests/test_ufunc.py85
-rw-r--r--numpy/core/tests/test_umath.py263
-rw-r--r--numpy/ctypeslib.py8
-rw-r--r--numpy/ctypeslib.pyi7
-rw-r--r--numpy/distutils/__init__.py4
-rw-r--r--numpy/distutils/__init__.pyi4
-rw-r--r--numpy/distutils/ccompiler_opt.py106
-rw-r--r--numpy/distutils/checks/extra_avx512bw_mask.c18
-rw-r--r--numpy/distutils/checks/extra_avx512f_reduce.c41
-rw-r--r--numpy/distutils/checks/extra_vsx_asm.c36
-rw-r--r--numpy/distutils/command/autodist.py33
-rw-r--r--numpy/distutils/command/build.py12
-rw-r--r--numpy/distutils/command/build_clib.py67
-rw-r--r--numpy/distutils/command/build_ext.py69
-rw-r--r--numpy/distutils/command/config.py14
-rw-r--r--numpy/distutils/fcompiler/__init__.py63
-rw-r--r--numpy/distutils/fcompiler/environment.py5
-rw-r--r--numpy/distutils/fcompiler/fujitsu.py46
-rw-r--r--numpy/distutils/fcompiler/gnu.py17
-rw-r--r--numpy/distutils/fcompiler/nag.py2
-rw-r--r--numpy/distutils/fcompiler/nv.py55
-rw-r--r--numpy/distutils/fcompiler/pg.py2
-rw-r--r--numpy/distutils/mingw32ccompiler.py24
-rw-r--r--numpy/distutils/misc_util.py33
-rw-r--r--numpy/distutils/setup.py1
-rw-r--r--numpy/distutils/system_info.py161
-rw-r--r--numpy/distutils/tests/test_ccompiler_opt_conf.py51
-rw-r--r--numpy/distutils/tests/test_system_info.py35
-rw-r--r--numpy/distutils/unixccompiler.py10
-rw-r--r--numpy/doc/basics.py341
-rw-r--r--numpy/doc/broadcasting.py180
-rw-r--r--numpy/doc/byteswapping.py155
-rw-r--r--numpy/doc/constants.py4
-rw-r--r--numpy/doc/creation.py143
-rw-r--r--numpy/doc/dispatch.py271
-rw-r--r--numpy/doc/glossary.py475
-rw-r--r--numpy/doc/indexing.py456
-rw-r--r--numpy/doc/internals.py162
-rw-r--r--numpy/doc/misc.py226
-rw-r--r--numpy/doc/structured_arrays.py646
-rw-r--r--numpy/doc/subclassing.py752
-rw-r--r--numpy/emath.pyi11
-rw-r--r--numpy/f2py/__init__.py71
-rw-r--r--numpy/f2py/__init__.pyi5
-rw-r--r--numpy/f2py/__version__.py9
-rw-r--r--numpy/f2py/capi_maps.py2
-rw-r--r--numpy/f2py/cfuncs.py284
-rw-r--r--numpy/f2py/common_rules.py2
-rwxr-xr-xnumpy/f2py/crackfortran.py54
-rwxr-xr-xnumpy/f2py/f2py2e.py26
-rwxr-xr-xnumpy/f2py/rules.py15
-rw-r--r--numpy/f2py/setup.py1
-rw-r--r--numpy/f2py/src/fortranobject.c7
-rw-r--r--numpy/f2py/src/test/foomodule.c2
-rw-r--r--numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c29
-rw-r--r--numpy/f2py/tests/src/module_data/mod.modbin0 -> 412 bytes
-rw-r--r--numpy/f2py/tests/src/module_data/module_data_docstring.f9012
-rw-r--r--numpy/f2py/tests/test_block_docstring.py2
-rw-r--r--numpy/f2py/tests/test_module_doc.py30
-rw-r--r--numpy/f2py/tests/util.py2
-rw-r--r--numpy/fft/__init__.py3
-rw-r--r--numpy/fft/__init__.pyi20
-rw-r--r--numpy/fft/_pocketfft.c6
-rw-r--r--numpy/fft/_pocketfft.py10
-rw-r--r--numpy/fft/helper.py3
-rw-r--r--numpy/fft/setup.py1
-rw-r--r--numpy/fft/tests/test_helper.py5
-rw-r--r--numpy/lib/__init__.py2
-rw-r--r--numpy/lib/__init__.pyi177
-rw-r--r--numpy/lib/_iotools.py9
-rw-r--r--numpy/lib/arraypad.py31
-rw-r--r--numpy/lib/arraysetops.py33
-rw-r--r--numpy/lib/financial.py967
-rw-r--r--numpy/lib/format.py4
-rw-r--r--numpy/lib/function_base.py117
-rw-r--r--numpy/lib/index_tricks.py31
-rw-r--r--numpy/lib/nanfunctions.py18
-rw-r--r--numpy/lib/npyio.py166
-rw-r--r--numpy/lib/polynomial.py21
-rw-r--r--numpy/lib/recfunctions.py6
-rw-r--r--numpy/lib/scimath.py16
-rw-r--r--numpy/lib/setup.py1
-rw-r--r--numpy/lib/shape_base.py10
-rw-r--r--numpy/lib/stride_tricks.py286
-rw-r--r--numpy/lib/tests/test_arraysetops.py18
-rw-r--r--numpy/lib/tests/test_financial.py380
-rw-r--r--numpy/lib/tests/test_financial_expired.py13
-rw-r--r--numpy/lib/tests/test_format.py138
-rw-r--r--numpy/lib/tests/test_function_base.py95
-rw-r--r--numpy/lib/tests/test_index_tricks.py42
-rw-r--r--numpy/lib/tests/test_io.py88
-rw-r--r--numpy/lib/tests/test_polynomial.py23
-rw-r--r--numpy/lib/tests/test_stride_tricks.py172
-rw-r--r--numpy/lib/tests/test_utils.py19
-rw-r--r--numpy/lib/twodim_base.py39
-rw-r--r--numpy/lib/utils.py8
-rw-r--r--numpy/linalg/__init__.pyi23
-rwxr-xr-xnumpy/linalg/lapack_lite/make_lite.py18
-rw-r--r--numpy/linalg/linalg.py23
-rw-r--r--numpy/linalg/setup.py1
-rw-r--r--numpy/linalg/tests/test_build.py4
-rw-r--r--numpy/linalg/tests/test_linalg.py6
-rw-r--r--numpy/linalg/umath_linalg.c.src2
-rw-r--r--numpy/ma/__init__.pyi225
-rw-r--r--numpy/ma/bench.py10
-rw-r--r--numpy/ma/core.py56
-rw-r--r--numpy/ma/extras.py10
-rw-r--r--numpy/ma/mrecords.py18
-rw-r--r--numpy/ma/setup.py1
-rw-r--r--numpy/ma/tests/test_core.py34
-rw-r--r--numpy/ma/tests/test_old_ma.py2
-rw-r--r--numpy/ma/tests/test_subclassing.py8
-rw-r--r--numpy/ma/testutils.py10
-rw-r--r--numpy/ma/timer_comparison.py22
-rw-r--r--numpy/matrixlib/__init__.pyi6
-rw-r--r--numpy/matrixlib/setup.py1
-rw-r--r--numpy/matrixlib/tests/test_masked_matrix.py4
-rw-r--r--numpy/polynomial/__init__.py101
-rw-r--r--numpy/polynomial/__init__.pyi9
-rw-r--r--numpy/polynomial/_polybase.py8
-rw-r--r--numpy/polynomial/chebyshev.py32
-rw-r--r--numpy/polynomial/hermite.py28
-rw-r--r--numpy/polynomial/hermite_e.py30
-rw-r--r--numpy/polynomial/laguerre.py30
-rw-r--r--numpy/polynomial/legendre.py28
-rw-r--r--numpy/polynomial/polynomial.py29
-rw-r--r--numpy/polynomial/setup.py1
-rw-r--r--numpy/polynomial/tests/test_printing.py13
-rw-r--r--numpy/random/__init__.pyi61
-rw-r--r--numpy/random/_common.pxd2
-rw-r--r--numpy/random/_examples/cython/setup.py4
-rw-r--r--numpy/random/_examples/numba/extending.py4
-rw-r--r--numpy/random/_generator.pyx315
-rw-r--r--numpy/random/bit_generator.pxd2
-rw-r--r--numpy/random/mtrand.pyx86
-rw-r--r--numpy/random/setup.py1
-rw-r--r--numpy/random/src/pcg64/pcg64.h6
-rw-r--r--numpy/random/tests/test_direct.py4
-rw-r--r--numpy/random/tests/test_generator_mt19937.py110
-rw-r--r--numpy/random/tests/test_generator_mt19937_regressions.py4
-rw-r--r--numpy/random/tests/test_random.py24
-rw-r--r--numpy/random/tests/test_randomstate.py74
-rw-r--r--numpy/random/tests/test_randomstate_regression.py4
-rw-r--r--numpy/random/tests/test_regression.py4
-rw-r--r--numpy/random/tests/test_smoke.py9
-rw-r--r--numpy/rec.pyi5
-rw-r--r--numpy/testing/__init__.pyi44
-rw-r--r--numpy/testing/_private/decorators.py2
-rw-r--r--numpy/testing/_private/noseclasses.py4
-rw-r--r--numpy/testing/_private/nosetester.py18
-rw-r--r--numpy/testing/_private/parameterized.py2
-rw-r--r--numpy/testing/_private/utils.py89
-rwxr-xr-xnumpy/testing/print_coercion_tables.py110
-rwxr-xr-xnumpy/testing/setup.py1
-rw-r--r--numpy/testing/tests/test_utils.py2
-rw-r--r--numpy/tests/test_numpy_version.py2
-rw-r--r--numpy/tests/test_public_api.py28
-rw-r--r--numpy/tests/test_scripts.py4
-rw-r--r--numpy/tests/typing/fail/simple.py10
-rw-r--r--numpy/tests/typing/pass/dtype.py3
-rw-r--r--numpy/tests/typing/pass/scalars.py98
-rw-r--r--numpy/tests/typing/reveal/fromnumeric.py205
-rw-r--r--numpy/tests/typing/reveal/scalars.py30
-rw-r--r--numpy/typing/__init__.py181
-rw-r--r--numpy/typing/_add_docstring.py96
-rw-r--r--numpy/typing/_array_like.py29
-rw-r--r--numpy/typing/_callable.py345
-rw-r--r--numpy/typing/_dtype_like.py93
-rw-r--r--numpy/typing/_scalars.py30
-rw-r--r--numpy/typing/setup.py (renamed from numpy/tests/setup.py)5
-rw-r--r--numpy/typing/tests/__init__.py0
-rw-r--r--numpy/typing/tests/data/fail/arithmetic.py16
-rw-r--r--numpy/typing/tests/data/fail/array_constructors.py31
-rw-r--r--numpy/typing/tests/data/fail/array_like.py (renamed from numpy/tests/typing/fail/array_like.py)4
-rw-r--r--numpy/typing/tests/data/fail/bitwise_ops.py20
-rw-r--r--numpy/typing/tests/data/fail/constants.py6
-rw-r--r--numpy/typing/tests/data/fail/dtype.py16
-rw-r--r--numpy/typing/tests/data/fail/flatiter.py25
-rw-r--r--numpy/typing/tests/data/fail/fromnumeric.py (renamed from numpy/tests/typing/fail/fromnumeric.py)28
-rw-r--r--numpy/typing/tests/data/fail/modules.py10
-rw-r--r--numpy/typing/tests/data/fail/ndarray.py (renamed from numpy/tests/typing/fail/ndarray.py)0
-rw-r--r--numpy/typing/tests/data/fail/ndarray_misc.py21
-rw-r--r--numpy/typing/tests/data/fail/numerictypes.py (renamed from numpy/tests/typing/fail/numerictypes.py)2
-rw-r--r--numpy/typing/tests/data/fail/scalars.py (renamed from numpy/tests/typing/fail/scalars.py)25
-rw-r--r--numpy/typing/tests/data/fail/ufunc_config.py21
-rw-r--r--numpy/typing/tests/data/fail/ufuncs.py (renamed from numpy/tests/typing/fail/ufuncs.py)0
-rw-r--r--numpy/typing/tests/data/fail/warnings_and_errors.py (renamed from numpy/tests/typing/fail/warnings_and_errors.py)0
-rw-r--r--numpy/typing/tests/data/mypy.ini (renamed from numpy/tests/typing/mypy.ini)0
-rw-r--r--numpy/typing/tests/data/pass/arithmetic.py293
-rw-r--r--numpy/typing/tests/data/pass/array_constructors.py128
-rw-r--r--numpy/typing/tests/data/pass/array_like.py (renamed from numpy/tests/typing/pass/array_like.py)12
-rw-r--r--numpy/typing/tests/data/pass/bitwise_ops.py131
-rw-r--r--numpy/typing/tests/data/pass/comparisons.py247
-rw-r--r--numpy/typing/tests/data/pass/dtype.py43
-rw-r--r--numpy/typing/tests/data/pass/flatiter.py16
-rw-r--r--numpy/typing/tests/data/pass/fromnumeric.py (renamed from numpy/tests/typing/pass/fromnumeric.py)75
-rw-r--r--numpy/typing/tests/data/pass/literal.py45
-rw-r--r--numpy/typing/tests/data/pass/mod.py149
-rw-r--r--numpy/typing/tests/data/pass/ndarray_conversion.py (renamed from numpy/tests/typing/pass/ndarray_conversion.py)0
-rw-r--r--numpy/typing/tests/data/pass/ndarray_misc.py159
-rw-r--r--numpy/typing/tests/data/pass/ndarray_shape_manipulation.py (renamed from numpy/tests/typing/pass/ndarray_shape_manipulation.py)0
-rw-r--r--numpy/typing/tests/data/pass/numeric.py89
-rw-r--r--numpy/typing/tests/data/pass/numerictypes.py (renamed from numpy/tests/typing/pass/numerictypes.py)0
-rw-r--r--numpy/typing/tests/data/pass/scalars.py165
-rw-r--r--numpy/typing/tests/data/pass/simple.py (renamed from numpy/tests/typing/pass/simple.py)11
-rw-r--r--numpy/typing/tests/data/pass/simple_py3.py (renamed from numpy/tests/typing/pass/simple_py3.py)0
-rw-r--r--numpy/typing/tests/data/pass/ufunc_config.py50
-rw-r--r--numpy/typing/tests/data/pass/ufuncs.py (renamed from numpy/tests/typing/pass/ufuncs.py)5
-rw-r--r--numpy/typing/tests/data/pass/warnings_and_errors.py (renamed from numpy/tests/typing/pass/warnings_and_errors.py)0
-rw-r--r--numpy/typing/tests/data/reveal/arithmetic.py291
-rw-r--r--numpy/typing/tests/data/reveal/array_constructors.py102
-rw-r--r--numpy/typing/tests/data/reveal/bitwise_ops.py131
-rw-r--r--numpy/typing/tests/data/reveal/comparisons.py247
-rw-r--r--numpy/typing/tests/data/reveal/constants.py (renamed from numpy/tests/typing/reveal/constants.py)10
-rw-r--r--numpy/typing/tests/data/reveal/dtype.py41
-rw-r--r--numpy/typing/tests/data/reveal/flatiter.py17
-rw-r--r--numpy/typing/tests/data/reveal/fromnumeric.py278
-rw-r--r--numpy/typing/tests/data/reveal/mod.py149
-rw-r--r--numpy/typing/tests/data/reveal/modules.py20
-rw-r--r--numpy/typing/tests/data/reveal/nbit_base_example.py18
-rw-r--r--numpy/typing/tests/data/reveal/ndarray_conversion.py (renamed from numpy/tests/typing/reveal/ndarray_conversion.py)2
-rw-r--r--numpy/typing/tests/data/reveal/ndarray_misc.py150
-rw-r--r--numpy/typing/tests/data/reveal/ndarray_shape_manipulation.py (renamed from numpy/tests/typing/reveal/ndarray_shape_manipulation.py)0
-rw-r--r--numpy/typing/tests/data/reveal/numeric.py89
-rw-r--r--numpy/typing/tests/data/reveal/numerictypes.py (renamed from numpy/tests/typing/reveal/numerictypes.py)0
-rw-r--r--numpy/typing/tests/data/reveal/scalars.py28
-rw-r--r--numpy/typing/tests/data/reveal/ufunc_config.py25
-rw-r--r--numpy/typing/tests/data/reveal/warnings_and_errors.py (renamed from numpy/tests/typing/reveal/warnings_and_errors.py)0
-rw-r--r--numpy/typing/tests/test_isfile.py33
-rw-r--r--numpy/typing/tests/test_typing.py (renamed from numpy/tests/test_typing.py)74
-rw-r--r--numpy/version.py11
451 files changed, 33891 insertions, 14374 deletions
diff --git a/numpy/__init__.cython-30.pxd b/numpy/__init__.cython-30.pxd
new file mode 100644
index 000000000..a2c451bc1
--- /dev/null
+++ b/numpy/__init__.cython-30.pxd
@@ -0,0 +1,1055 @@
+# NumPy static imports for Cython >= 3.0
+#
+# If any of the PyArray_* functions are called, import_array must be
+# called first. This is done automatically by Cython 3.0+ if a call
+# is not detected inside of the module.
+#
+# Author: Dag Sverre Seljebotn
+#
+
+from cpython.ref cimport Py_INCREF
+from cpython.object cimport PyObject, PyTypeObject, PyObject_TypeCheck
+cimport libc.stdio as stdio
+
+
+cdef extern from *:
+ # Leave a marker that the NumPy declarations came from NumPy itself and not from Cython.
+ # See https://github.com/cython/cython/issues/3573
+ """
+ /* Using NumPy API declarations from "numpy/__init__.cython-30.pxd" */
+ """
+
+
+cdef extern from "Python.h":
+ ctypedef Py_ssize_t Py_intptr_t
+
+cdef extern from "numpy/arrayobject.h":
+ ctypedef Py_intptr_t npy_intp
+ ctypedef size_t npy_uintp
+
+ cdef enum NPY_TYPES:
+ NPY_BOOL
+ NPY_BYTE
+ NPY_UBYTE
+ NPY_SHORT
+ NPY_USHORT
+ NPY_INT
+ NPY_UINT
+ NPY_LONG
+ NPY_ULONG
+ NPY_LONGLONG
+ NPY_ULONGLONG
+ NPY_FLOAT
+ NPY_DOUBLE
+ NPY_LONGDOUBLE
+ NPY_CFLOAT
+ NPY_CDOUBLE
+ NPY_CLONGDOUBLE
+ NPY_OBJECT
+ NPY_STRING
+ NPY_UNICODE
+ NPY_VOID
+ NPY_DATETIME
+ NPY_TIMEDELTA
+ NPY_NTYPES
+ NPY_NOTYPE
+
+ NPY_INT8
+ NPY_INT16
+ NPY_INT32
+ NPY_INT64
+ NPY_INT128
+ NPY_INT256
+ NPY_UINT8
+ NPY_UINT16
+ NPY_UINT32
+ NPY_UINT64
+ NPY_UINT128
+ NPY_UINT256
+ NPY_FLOAT16
+ NPY_FLOAT32
+ NPY_FLOAT64
+ NPY_FLOAT80
+ NPY_FLOAT96
+ NPY_FLOAT128
+ NPY_FLOAT256
+ NPY_COMPLEX32
+ NPY_COMPLEX64
+ NPY_COMPLEX128
+ NPY_COMPLEX160
+ NPY_COMPLEX192
+ NPY_COMPLEX256
+ NPY_COMPLEX512
+
+ NPY_INTP
+
+ ctypedef enum NPY_ORDER:
+ NPY_ANYORDER
+ NPY_CORDER
+ NPY_FORTRANORDER
+ NPY_KEEPORDER
+
+ ctypedef enum NPY_CASTING:
+ NPY_NO_CASTING
+ NPY_EQUIV_CASTING
+ NPY_SAFE_CASTING
+ NPY_SAME_KIND_CASTING
+ NPY_UNSAFE_CASTING
+
+ ctypedef enum NPY_CLIPMODE:
+ NPY_CLIP
+ NPY_WRAP
+ NPY_RAISE
+
+ ctypedef enum NPY_SCALARKIND:
+ NPY_NOSCALAR,
+ NPY_BOOL_SCALAR,
+ NPY_INTPOS_SCALAR,
+ NPY_INTNEG_SCALAR,
+ NPY_FLOAT_SCALAR,
+ NPY_COMPLEX_SCALAR,
+ NPY_OBJECT_SCALAR
+
+ ctypedef enum NPY_SORTKIND:
+ NPY_QUICKSORT
+ NPY_HEAPSORT
+ NPY_MERGESORT
+
+ ctypedef enum NPY_SEARCHSIDE:
+ NPY_SEARCHLEFT
+ NPY_SEARCHRIGHT
+
+ enum:
+ # DEPRECATED since NumPy 1.7 ! Do not use in new code!
+ NPY_C_CONTIGUOUS
+ NPY_F_CONTIGUOUS
+ NPY_CONTIGUOUS
+ NPY_FORTRAN
+ NPY_OWNDATA
+ NPY_FORCECAST
+ NPY_ENSURECOPY
+ NPY_ENSUREARRAY
+ NPY_ELEMENTSTRIDES
+ NPY_ALIGNED
+ NPY_NOTSWAPPED
+ NPY_WRITEABLE
+ NPY_UPDATEIFCOPY
+ NPY_ARR_HAS_DESCR
+
+ NPY_BEHAVED
+ NPY_BEHAVED_NS
+ NPY_CARRAY
+ NPY_CARRAY_RO
+ NPY_FARRAY
+ NPY_FARRAY_RO
+ NPY_DEFAULT
+
+ NPY_IN_ARRAY
+ NPY_OUT_ARRAY
+ NPY_INOUT_ARRAY
+ NPY_IN_FARRAY
+ NPY_OUT_FARRAY
+ NPY_INOUT_FARRAY
+
+ NPY_UPDATE_ALL
+
+ enum:
+ # Added in NumPy 1.7 to replace the deprecated enums above.
+ NPY_ARRAY_C_CONTIGUOUS
+ NPY_ARRAY_F_CONTIGUOUS
+ NPY_ARRAY_OWNDATA
+ NPY_ARRAY_FORCECAST
+ NPY_ARRAY_ENSURECOPY
+ NPY_ARRAY_ENSUREARRAY
+ NPY_ARRAY_ELEMENTSTRIDES
+ NPY_ARRAY_ALIGNED
+ NPY_ARRAY_NOTSWAPPED
+ NPY_ARRAY_WRITEABLE
+ NPY_ARRAY_UPDATEIFCOPY
+
+ NPY_ARRAY_BEHAVED
+ NPY_ARRAY_BEHAVED_NS
+ NPY_ARRAY_CARRAY
+ NPY_ARRAY_CARRAY_RO
+ NPY_ARRAY_FARRAY
+ NPY_ARRAY_FARRAY_RO
+ NPY_ARRAY_DEFAULT
+
+ NPY_ARRAY_IN_ARRAY
+ NPY_ARRAY_OUT_ARRAY
+ NPY_ARRAY_INOUT_ARRAY
+ NPY_ARRAY_IN_FARRAY
+ NPY_ARRAY_OUT_FARRAY
+ NPY_ARRAY_INOUT_FARRAY
+
+ NPY_ARRAY_UPDATE_ALL
+
+ cdef enum:
+ NPY_MAXDIMS
+
+ npy_intp NPY_MAX_ELSIZE
+
+ ctypedef void (*PyArray_VectorUnaryFunc)(void *, void *, npy_intp, void *, void *)
+
+ ctypedef struct PyArray_ArrayDescr:
+ # shape is a tuple, but Cython doesn't support "tuple shape"
+ # inside a non-PyObject declaration, so we have to declare it
+ # as just a PyObject*.
+ PyObject* shape
+
+ ctypedef struct PyArray_Descr:
+ pass
+
+ ctypedef class numpy.dtype [object PyArray_Descr, check_size ignore]:
+ # Use PyDataType_* macros when possible, however there are no macros
+ # for accessing some of the fields, so some are defined.
+ cdef PyTypeObject* typeobj
+ cdef char kind
+ cdef char type
+ # Numpy sometimes mutates this without warning (e.g. it'll
+ # sometimes change "|" to "<" in shared dtype objects on
+ # little-endian machines). If this matters to you, use
+ # PyArray_IsNativeByteOrder(dtype.byteorder) instead of
+ # directly accessing this field.
+ cdef char byteorder
+ cdef char flags
+ cdef int type_num
+ cdef int itemsize "elsize"
+ cdef int alignment
+ cdef object fields
+ cdef tuple names
+ # Use PyDataType_HASSUBARRAY to test whether this field is
+ # valid (the pointer can be NULL). Most users should access
+ # this field via the inline helper method PyDataType_SHAPE.
+ cdef PyArray_ArrayDescr* subarray
+
+ ctypedef class numpy.flatiter [object PyArrayIterObject, check_size ignore]:
+ # Use through macros
+ pass
+
+ ctypedef class numpy.broadcast [object PyArrayMultiIterObject, check_size ignore]:
+ # Use through macros
+ pass
+
+ ctypedef struct PyArrayObject:
+ # For use in situations where ndarray can't replace PyArrayObject*,
+ # like PyArrayObject**.
+ pass
+
+ ctypedef class numpy.ndarray [object PyArrayObject, check_size ignore]:
+ cdef __cythonbufferdefaults__ = {"mode": "strided"}
+
+ # NOTE: no field declarations since direct access is deprecated since NumPy 1.7
+ # Instead, we use properties that map to the corresponding C-API functions.
+
+ @property
+ cdef inline PyObject* base(self) nogil:
+ """Returns a borrowed reference to the object owning the data/memory.
+ """
+ return PyArray_BASE(self)
+
+ @property
+ cdef inline dtype descr(self):
+ """Returns an owned reference to the dtype of the array.
+ """
+ return <dtype>PyArray_DESCR(self)
+
+ @property
+ cdef inline int ndim(self) nogil:
+ """Returns the number of dimensions in the array.
+ """
+ return PyArray_NDIM(self)
+
+ @property
+ cdef inline npy_intp *shape(self) nogil:
+ """Returns a pointer to the dimensions/shape of the array.
+ The number of elements matches the number of dimensions of the array (ndim).
+ Can return NULL for 0-dimensional arrays.
+ """
+ return PyArray_DIMS(self)
+
+ @property
+ cdef inline npy_intp *strides(self) nogil:
+ """Returns a pointer to the strides of the array.
+ The number of elements matches the number of dimensions of the array (ndim).
+ """
+ return PyArray_STRIDES(self)
+
+ @property
+ cdef inline npy_intp size(self) nogil:
+ """Returns the total size (in number of elements) of the array.
+ """
+ return PyArray_SIZE(self)
+
+ @property
+ cdef inline char* data(self) nogil:
+ """The pointer to the data buffer as a char*.
+ This is provided for legacy reasons to avoid direct struct field access.
+ For new code that needs this access, you probably want to cast the result
+ of `PyArray_DATA()` instead, which returns a 'void*'.
+ """
+ return PyArray_BYTES(self)
+
+ ctypedef unsigned char npy_bool
+
+ ctypedef signed char npy_byte
+ ctypedef signed short npy_short
+ ctypedef signed int npy_int
+ ctypedef signed long npy_long
+ ctypedef signed long long npy_longlong
+
+ ctypedef unsigned char npy_ubyte
+ ctypedef unsigned short npy_ushort
+ ctypedef unsigned int npy_uint
+ ctypedef unsigned long npy_ulong
+ ctypedef unsigned long long npy_ulonglong
+
+ ctypedef float npy_float
+ ctypedef double npy_double
+ ctypedef long double npy_longdouble
+
+ ctypedef signed char npy_int8
+ ctypedef signed short npy_int16
+ ctypedef signed int npy_int32
+ ctypedef signed long long npy_int64
+ ctypedef signed long long npy_int96
+ ctypedef signed long long npy_int128
+
+ ctypedef unsigned char npy_uint8
+ ctypedef unsigned short npy_uint16
+ ctypedef unsigned int npy_uint32
+ ctypedef unsigned long long npy_uint64
+ ctypedef unsigned long long npy_uint96
+ ctypedef unsigned long long npy_uint128
+
+ ctypedef float npy_float32
+ ctypedef double npy_float64
+ ctypedef long double npy_float80
+ ctypedef long double npy_float96
+ ctypedef long double npy_float128
+
+ ctypedef struct npy_cfloat:
+ float real
+ float imag
+
+ ctypedef struct npy_cdouble:
+ double real
+ double imag
+
+ ctypedef struct npy_clongdouble:
+ long double real
+ long double imag
+
+ ctypedef struct npy_complex64:
+ float real
+ float imag
+
+ ctypedef struct npy_complex128:
+ double real
+ double imag
+
+ ctypedef struct npy_complex160:
+ long double real
+ long double imag
+
+ ctypedef struct npy_complex192:
+ long double real
+ long double imag
+
+ ctypedef struct npy_complex256:
+ long double real
+ long double imag
+
+ ctypedef struct PyArray_Dims:
+ npy_intp *ptr
+ int len
+
+ int _import_array() except -1
+ # A second definition so _import_array isn't marked as used when we use it here.
+ # Do not use - subject to change any time.
+ int __pyx_import_array "_import_array"() except -1
+
+ #
+ # Macros from ndarrayobject.h
+ #
+ bint PyArray_CHKFLAGS(ndarray m, int flags) nogil
+ bint PyArray_IS_C_CONTIGUOUS(ndarray arr) nogil
+ bint PyArray_IS_F_CONTIGUOUS(ndarray arr) nogil
+ bint PyArray_ISCONTIGUOUS(ndarray m) nogil
+ bint PyArray_ISWRITEABLE(ndarray m) nogil
+ bint PyArray_ISALIGNED(ndarray m) nogil
+
+ int PyArray_NDIM(ndarray) nogil
+ bint PyArray_ISONESEGMENT(ndarray) nogil
+ bint PyArray_ISFORTRAN(ndarray) nogil
+ int PyArray_FORTRANIF(ndarray) nogil
+
+ void* PyArray_DATA(ndarray) nogil
+ char* PyArray_BYTES(ndarray) nogil
+
+ npy_intp* PyArray_DIMS(ndarray) nogil
+ npy_intp* PyArray_STRIDES(ndarray) nogil
+ npy_intp PyArray_DIM(ndarray, size_t) nogil
+ npy_intp PyArray_STRIDE(ndarray, size_t) nogil
+
+ PyObject *PyArray_BASE(ndarray) nogil # returns borrowed reference!
+ PyArray_Descr *PyArray_DESCR(ndarray) nogil # returns borrowed reference to dtype!
+ PyArray_Descr *PyArray_DTYPE(ndarray) nogil # returns borrowed reference to dtype! NP 1.7+ alias for descr.
+ int PyArray_FLAGS(ndarray) nogil
+ void PyArray_CLEARFLAGS(ndarray, int flags) nogil # Added in NumPy 1.7
+ void PyArray_ENABLEFLAGS(ndarray, int flags) nogil # Added in NumPy 1.7
+ npy_intp PyArray_ITEMSIZE(ndarray) nogil
+ int PyArray_TYPE(ndarray arr) nogil
+
+ object PyArray_GETITEM(ndarray arr, void *itemptr)
+ int PyArray_SETITEM(ndarray arr, void *itemptr, object obj)
+
+ bint PyTypeNum_ISBOOL(int) nogil
+ bint PyTypeNum_ISUNSIGNED(int) nogil
+ bint PyTypeNum_ISSIGNED(int) nogil
+ bint PyTypeNum_ISINTEGER(int) nogil
+ bint PyTypeNum_ISFLOAT(int) nogil
+ bint PyTypeNum_ISNUMBER(int) nogil
+ bint PyTypeNum_ISSTRING(int) nogil
+ bint PyTypeNum_ISCOMPLEX(int) nogil
+ bint PyTypeNum_ISPYTHON(int) nogil
+ bint PyTypeNum_ISFLEXIBLE(int) nogil
+ bint PyTypeNum_ISUSERDEF(int) nogil
+ bint PyTypeNum_ISEXTENDED(int) nogil
+ bint PyTypeNum_ISOBJECT(int) nogil
+
+ bint PyDataType_ISBOOL(dtype) nogil
+ bint PyDataType_ISUNSIGNED(dtype) nogil
+ bint PyDataType_ISSIGNED(dtype) nogil
+ bint PyDataType_ISINTEGER(dtype) nogil
+ bint PyDataType_ISFLOAT(dtype) nogil
+ bint PyDataType_ISNUMBER(dtype) nogil
+ bint PyDataType_ISSTRING(dtype) nogil
+ bint PyDataType_ISCOMPLEX(dtype) nogil
+ bint PyDataType_ISPYTHON(dtype) nogil
+ bint PyDataType_ISFLEXIBLE(dtype) nogil
+ bint PyDataType_ISUSERDEF(dtype) nogil
+ bint PyDataType_ISEXTENDED(dtype) nogil
+ bint PyDataType_ISOBJECT(dtype) nogil
+ bint PyDataType_HASFIELDS(dtype) nogil
+ bint PyDataType_HASSUBARRAY(dtype) nogil
+
+ bint PyArray_ISBOOL(ndarray) nogil
+ bint PyArray_ISUNSIGNED(ndarray) nogil
+ bint PyArray_ISSIGNED(ndarray) nogil
+ bint PyArray_ISINTEGER(ndarray) nogil
+ bint PyArray_ISFLOAT(ndarray) nogil
+ bint PyArray_ISNUMBER(ndarray) nogil
+ bint PyArray_ISSTRING(ndarray) nogil
+ bint PyArray_ISCOMPLEX(ndarray) nogil
+ bint PyArray_ISPYTHON(ndarray) nogil
+ bint PyArray_ISFLEXIBLE(ndarray) nogil
+ bint PyArray_ISUSERDEF(ndarray) nogil
+ bint PyArray_ISEXTENDED(ndarray) nogil
+ bint PyArray_ISOBJECT(ndarray) nogil
+ bint PyArray_HASFIELDS(ndarray) nogil
+
+ bint PyArray_ISVARIABLE(ndarray) nogil
+
+ bint PyArray_SAFEALIGNEDCOPY(ndarray) nogil
+ bint PyArray_ISNBO(char) nogil # works on ndarray.byteorder
+ bint PyArray_IsNativeByteOrder(char) nogil # works on ndarray.byteorder
+ bint PyArray_ISNOTSWAPPED(ndarray) nogil
+ bint PyArray_ISBYTESWAPPED(ndarray) nogil
+
+ bint PyArray_FLAGSWAP(ndarray, int) nogil
+
+ bint PyArray_ISCARRAY(ndarray) nogil
+ bint PyArray_ISCARRAY_RO(ndarray) nogil
+ bint PyArray_ISFARRAY(ndarray) nogil
+ bint PyArray_ISFARRAY_RO(ndarray) nogil
+ bint PyArray_ISBEHAVED(ndarray) nogil
+ bint PyArray_ISBEHAVED_RO(ndarray) nogil
+
+
+ bint PyDataType_ISNOTSWAPPED(dtype) nogil
+ bint PyDataType_ISBYTESWAPPED(dtype) nogil
+
+ bint PyArray_DescrCheck(object)
+
+ bint PyArray_Check(object)
+ bint PyArray_CheckExact(object)
+
+ # Cannot be supported due to out arg:
+ # bint PyArray_HasArrayInterfaceType(object, dtype, object, object&)
+ # bint PyArray_HasArrayInterface(op, out)
+
+
+ bint PyArray_IsZeroDim(object)
+ # Cannot be supported due to ## ## in macro:
+ # bint PyArray_IsScalar(object, verbatim work)
+ bint PyArray_CheckScalar(object)
+ bint PyArray_IsPythonNumber(object)
+ bint PyArray_IsPythonScalar(object)
+ bint PyArray_IsAnyScalar(object)
+ bint PyArray_CheckAnyScalar(object)
+
+ ndarray PyArray_GETCONTIGUOUS(ndarray)
+ bint PyArray_SAMESHAPE(ndarray, ndarray) nogil
+ npy_intp PyArray_SIZE(ndarray) nogil
+ npy_intp PyArray_NBYTES(ndarray) nogil
+
+ object PyArray_FROM_O(object)
+ object PyArray_FROM_OF(object m, int flags)
+ object PyArray_FROM_OT(object m, int type)
+ object PyArray_FROM_OTF(object m, int type, int flags)
+ object PyArray_FROMANY(object m, int type, int min, int max, int flags)
+ object PyArray_ZEROS(int nd, npy_intp* dims, int type, int fortran)
+ object PyArray_EMPTY(int nd, npy_intp* dims, int type, int fortran)
+ void PyArray_FILLWBYTE(object, int val)
+ npy_intp PyArray_REFCOUNT(object)
+ object PyArray_ContiguousFromAny(op, int, int min_depth, int max_depth)
+ unsigned char PyArray_EquivArrTypes(ndarray a1, ndarray a2)
+ bint PyArray_EquivByteorders(int b1, int b2) nogil
+ object PyArray_SimpleNew(int nd, npy_intp* dims, int typenum)
+ object PyArray_SimpleNewFromData(int nd, npy_intp* dims, int typenum, void* data)
+ #object PyArray_SimpleNewFromDescr(int nd, npy_intp* dims, dtype descr)
+ object PyArray_ToScalar(void* data, ndarray arr)
+
+ void* PyArray_GETPTR1(ndarray m, npy_intp i) nogil
+ void* PyArray_GETPTR2(ndarray m, npy_intp i, npy_intp j) nogil
+ void* PyArray_GETPTR3(ndarray m, npy_intp i, npy_intp j, npy_intp k) nogil
+ void* PyArray_GETPTR4(ndarray m, npy_intp i, npy_intp j, npy_intp k, npy_intp l) nogil
+
+ void PyArray_XDECREF_ERR(ndarray)
+ # Cannot be supported due to out arg
+ # void PyArray_DESCR_REPLACE(descr)
+
+
+ object PyArray_Copy(ndarray)
+ object PyArray_FromObject(object op, int type, int min_depth, int max_depth)
+ object PyArray_ContiguousFromObject(object op, int type, int min_depth, int max_depth)
+ object PyArray_CopyFromObject(object op, int type, int min_depth, int max_depth)
+
+ object PyArray_Cast(ndarray mp, int type_num)
+ object PyArray_Take(ndarray ap, object items, int axis)
+ object PyArray_Put(ndarray ap, object items, object values)
+
+ void PyArray_ITER_RESET(flatiter it) nogil
+ void PyArray_ITER_NEXT(flatiter it) nogil
+ void PyArray_ITER_GOTO(flatiter it, npy_intp* destination) nogil
+ void PyArray_ITER_GOTO1D(flatiter it, npy_intp ind) nogil
+ void* PyArray_ITER_DATA(flatiter it) nogil
+ bint PyArray_ITER_NOTDONE(flatiter it) nogil
+
+ void PyArray_MultiIter_RESET(broadcast multi) nogil
+ void PyArray_MultiIter_NEXT(broadcast multi) nogil
+ void PyArray_MultiIter_GOTO(broadcast multi, npy_intp dest) nogil
+ void PyArray_MultiIter_GOTO1D(broadcast multi, npy_intp ind) nogil
+ void* PyArray_MultiIter_DATA(broadcast multi, npy_intp i) nogil
+ void PyArray_MultiIter_NEXTi(broadcast multi, npy_intp i) nogil
+ bint PyArray_MultiIter_NOTDONE(broadcast multi) nogil
+
+ # Functions from __multiarray_api.h
+
+ # Functions taking dtype and returning object/ndarray are disabled
+ # for now as they steal dtype references. I'm conservative and disable
+ # more than is probably needed until it can be checked further.
+ int PyArray_SetNumericOps (object)
+ object PyArray_GetNumericOps ()
+ int PyArray_INCREF (ndarray)
+ int PyArray_XDECREF (ndarray)
+ void PyArray_SetStringFunction (object, int)
+ dtype PyArray_DescrFromType (int)
+ object PyArray_TypeObjectFromType (int)
+ char * PyArray_Zero (ndarray)
+ char * PyArray_One (ndarray)
+ #object PyArray_CastToType (ndarray, dtype, int)
+ int PyArray_CastTo (ndarray, ndarray)
+ int PyArray_CastAnyTo (ndarray, ndarray)
+ int PyArray_CanCastSafely (int, int)
+ npy_bool PyArray_CanCastTo (dtype, dtype)
+ int PyArray_ObjectType (object, int)
+ dtype PyArray_DescrFromObject (object, dtype)
+ #ndarray* PyArray_ConvertToCommonType (object, int *)
+ dtype PyArray_DescrFromScalar (object)
+ dtype PyArray_DescrFromTypeObject (object)
+ npy_intp PyArray_Size (object)
+ #object PyArray_Scalar (void *, dtype, object)
+ #object PyArray_FromScalar (object, dtype)
+ void PyArray_ScalarAsCtype (object, void *)
+ #int PyArray_CastScalarToCtype (object, void *, dtype)
+ #int PyArray_CastScalarDirect (object, dtype, void *, int)
+ object PyArray_ScalarFromObject (object)
+ #PyArray_VectorUnaryFunc * PyArray_GetCastFunc (dtype, int)
+ object PyArray_FromDims (int, int *, int)
+ #object PyArray_FromDimsAndDataAndDescr (int, int *, dtype, char *)
+ #object PyArray_FromAny (object, dtype, int, int, int, object)
+ object PyArray_EnsureArray (object)
+ object PyArray_EnsureAnyArray (object)
+ #object PyArray_FromFile (stdio.FILE *, dtype, npy_intp, char *)
+ #object PyArray_FromString (char *, npy_intp, dtype, npy_intp, char *)
+ #object PyArray_FromBuffer (object, dtype, npy_intp, npy_intp)
+ #object PyArray_FromIter (object, dtype, npy_intp)
+ object PyArray_Return (ndarray)
+ #object PyArray_GetField (ndarray, dtype, int)
+ #int PyArray_SetField (ndarray, dtype, int, object)
+ object PyArray_Byteswap (ndarray, npy_bool)
+ object PyArray_Resize (ndarray, PyArray_Dims *, int, NPY_ORDER)
+ int PyArray_MoveInto (ndarray, ndarray)
+ int PyArray_CopyInto (ndarray, ndarray)
+ int PyArray_CopyAnyInto (ndarray, ndarray)
+ int PyArray_CopyObject (ndarray, object)
+ object PyArray_NewCopy (ndarray, NPY_ORDER)
+ object PyArray_ToList (ndarray)
+ object PyArray_ToString (ndarray, NPY_ORDER)
+ int PyArray_ToFile (ndarray, stdio.FILE *, char *, char *)
+ int PyArray_Dump (object, object, int)
+ object PyArray_Dumps (object, int)
+ int PyArray_ValidType (int)
+ void PyArray_UpdateFlags (ndarray, int)
+ object PyArray_New (type, int, npy_intp *, int, npy_intp *, void *, int, int, object)
+ #object PyArray_NewFromDescr (type, dtype, int, npy_intp *, npy_intp *, void *, int, object)
+ #dtype PyArray_DescrNew (dtype)
+ dtype PyArray_DescrNewFromType (int)
+ double PyArray_GetPriority (object, double)
+ object PyArray_IterNew (object)
+ object PyArray_MultiIterNew (int, ...)
+
+ int PyArray_PyIntAsInt (object)
+ npy_intp PyArray_PyIntAsIntp (object)
+ int PyArray_Broadcast (broadcast)
+ void PyArray_FillObjectArray (ndarray, object)
+ int PyArray_FillWithScalar (ndarray, object)
+ npy_bool PyArray_CheckStrides (int, int, npy_intp, npy_intp, npy_intp *, npy_intp *)
+ dtype PyArray_DescrNewByteorder (dtype, char)
+ object PyArray_IterAllButAxis (object, int *)
+ #object PyArray_CheckFromAny (object, dtype, int, int, int, object)
+ #object PyArray_FromArray (ndarray, dtype, int)
+ object PyArray_FromInterface (object)
+ object PyArray_FromStructInterface (object)
+ #object PyArray_FromArrayAttr (object, dtype, object)
+ #NPY_SCALARKIND PyArray_ScalarKind (int, ndarray*)
+ int PyArray_CanCoerceScalar (int, int, NPY_SCALARKIND)
+ object PyArray_NewFlagsObject (object)
+ npy_bool PyArray_CanCastScalar (type, type)
+ #int PyArray_CompareUCS4 (npy_ucs4 *, npy_ucs4 *, register size_t)
+ int PyArray_RemoveSmallest (broadcast)
+ int PyArray_ElementStrides (object)
+ void PyArray_Item_INCREF (char *, dtype)
+ void PyArray_Item_XDECREF (char *, dtype)
+ object PyArray_FieldNames (object)
+ object PyArray_Transpose (ndarray, PyArray_Dims *)
+ object PyArray_TakeFrom (ndarray, object, int, ndarray, NPY_CLIPMODE)
+ object PyArray_PutTo (ndarray, object, object, NPY_CLIPMODE)
+ object PyArray_PutMask (ndarray, object, object)
+ object PyArray_Repeat (ndarray, object, int)
+ object PyArray_Choose (ndarray, object, ndarray, NPY_CLIPMODE)
+ int PyArray_Sort (ndarray, int, NPY_SORTKIND)
+ object PyArray_ArgSort (ndarray, int, NPY_SORTKIND)
+ object PyArray_SearchSorted (ndarray, object, NPY_SEARCHSIDE, PyObject *)
+ object PyArray_ArgMax (ndarray, int, ndarray)
+ object PyArray_ArgMin (ndarray, int, ndarray)
+ object PyArray_Reshape (ndarray, object)
+ object PyArray_Newshape (ndarray, PyArray_Dims *, NPY_ORDER)
+ object PyArray_Squeeze (ndarray)
+ #object PyArray_View (ndarray, dtype, type)
+ object PyArray_SwapAxes (ndarray, int, int)
+ object PyArray_Max (ndarray, int, ndarray)
+ object PyArray_Min (ndarray, int, ndarray)
+ object PyArray_Ptp (ndarray, int, ndarray)
+ object PyArray_Mean (ndarray, int, int, ndarray)
+ object PyArray_Trace (ndarray, int, int, int, int, ndarray)
+ object PyArray_Diagonal (ndarray, int, int, int)
+ object PyArray_Clip (ndarray, object, object, ndarray)
+ object PyArray_Conjugate (ndarray, ndarray)
+ object PyArray_Nonzero (ndarray)
+ object PyArray_Std (ndarray, int, int, ndarray, int)
+ object PyArray_Sum (ndarray, int, int, ndarray)
+ object PyArray_CumSum (ndarray, int, int, ndarray)
+ object PyArray_Prod (ndarray, int, int, ndarray)
+ object PyArray_CumProd (ndarray, int, int, ndarray)
+ object PyArray_All (ndarray, int, ndarray)
+ object PyArray_Any (ndarray, int, ndarray)
+ object PyArray_Compress (ndarray, object, int, ndarray)
+ object PyArray_Flatten (ndarray, NPY_ORDER)
+ object PyArray_Ravel (ndarray, NPY_ORDER)
+ npy_intp PyArray_MultiplyList (npy_intp *, int)
+ int PyArray_MultiplyIntList (int *, int)
+ void * PyArray_GetPtr (ndarray, npy_intp*)
+ int PyArray_CompareLists (npy_intp *, npy_intp *, int)
+ #int PyArray_AsCArray (object*, void *, npy_intp *, int, dtype)
+ #int PyArray_As1D (object*, char **, int *, int)
+ #int PyArray_As2D (object*, char ***, int *, int *, int)
+ int PyArray_Free (object, void *)
+ #int PyArray_Converter (object, object*)
+ int PyArray_IntpFromSequence (object, npy_intp *, int)
+ object PyArray_Concatenate (object, int)
+ object PyArray_InnerProduct (object, object)
+ object PyArray_MatrixProduct (object, object)
+ object PyArray_CopyAndTranspose (object)
+ object PyArray_Correlate (object, object, int)
+ int PyArray_TypestrConvert (int, int)
+ #int PyArray_DescrConverter (object, dtype*)
+ #int PyArray_DescrConverter2 (object, dtype*)
+ int PyArray_IntpConverter (object, PyArray_Dims *)
+ #int PyArray_BufferConverter (object, chunk)
+ int PyArray_AxisConverter (object, int *)
+ int PyArray_BoolConverter (object, npy_bool *)
+ int PyArray_ByteorderConverter (object, char *)
+ int PyArray_OrderConverter (object, NPY_ORDER *)
+ unsigned char PyArray_EquivTypes (dtype, dtype)
+ #object PyArray_Zeros (int, npy_intp *, dtype, int)
+ #object PyArray_Empty (int, npy_intp *, dtype, int)
+ object PyArray_Where (object, object, object)
+ object PyArray_Arange (double, double, double, int)
+ #object PyArray_ArangeObj (object, object, object, dtype)
+ int PyArray_SortkindConverter (object, NPY_SORTKIND *)
+ object PyArray_LexSort (object, int)
+ object PyArray_Round (ndarray, int, ndarray)
+ unsigned char PyArray_EquivTypenums (int, int)
+ int PyArray_RegisterDataType (dtype)
+ int PyArray_RegisterCastFunc (dtype, int, PyArray_VectorUnaryFunc *)
+ int PyArray_RegisterCanCast (dtype, int, NPY_SCALARKIND)
+ #void PyArray_InitArrFuncs (PyArray_ArrFuncs *)
+ object PyArray_IntTupleFromIntp (int, npy_intp *)
+ int PyArray_TypeNumFromName (char *)
+ int PyArray_ClipmodeConverter (object, NPY_CLIPMODE *)
+ #int PyArray_OutputConverter (object, ndarray*)
+ object PyArray_BroadcastToShape (object, npy_intp *, int)
+ void _PyArray_SigintHandler (int)
+ void* _PyArray_GetSigintBuf ()
+ #int PyArray_DescrAlignConverter (object, dtype*)
+ #int PyArray_DescrAlignConverter2 (object, dtype*)
+ int PyArray_SearchsideConverter (object, void *)
+ object PyArray_CheckAxis (ndarray, int *, int)
+ npy_intp PyArray_OverflowMultiplyList (npy_intp *, int)
+ int PyArray_CompareString (char *, char *, size_t)
+ int PyArray_SetBaseObject(ndarray, base) # NOTE: steals a reference to base! Use "set_array_base()" instead.
+
+
+# Typedefs that matches the runtime dtype objects in
+# the numpy module.
+
+# The ones that are commented out needs an IFDEF function
+# in Cython to enable them only on the right systems.
+
+ctypedef npy_int8 int8_t
+ctypedef npy_int16 int16_t
+ctypedef npy_int32 int32_t
+ctypedef npy_int64 int64_t
+#ctypedef npy_int96 int96_t
+#ctypedef npy_int128 int128_t
+
+ctypedef npy_uint8 uint8_t
+ctypedef npy_uint16 uint16_t
+ctypedef npy_uint32 uint32_t
+ctypedef npy_uint64 uint64_t
+#ctypedef npy_uint96 uint96_t
+#ctypedef npy_uint128 uint128_t
+
+ctypedef npy_float32 float32_t
+ctypedef npy_float64 float64_t
+#ctypedef npy_float80 float80_t
+#ctypedef npy_float128 float128_t
+
+ctypedef float complex complex64_t
+ctypedef double complex complex128_t
+
+# The int types are mapped a bit surprising --
+# numpy.int corresponds to 'l' and numpy.long to 'q'
+ctypedef npy_long int_t
+ctypedef npy_longlong long_t
+ctypedef npy_longlong longlong_t
+
+ctypedef npy_ulong uint_t
+ctypedef npy_ulonglong ulong_t
+ctypedef npy_ulonglong ulonglong_t
+
+ctypedef npy_intp intp_t
+ctypedef npy_uintp uintp_t
+
+ctypedef npy_double float_t
+ctypedef npy_double double_t
+ctypedef npy_longdouble longdouble_t
+
+ctypedef npy_cfloat cfloat_t
+ctypedef npy_cdouble cdouble_t
+ctypedef npy_clongdouble clongdouble_t
+
+ctypedef npy_cdouble complex_t
+
+cdef inline object PyArray_MultiIterNew1(a):
+ return PyArray_MultiIterNew(1, <void*>a)
+
+cdef inline object PyArray_MultiIterNew2(a, b):
+ return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+
+cdef inline object PyArray_MultiIterNew3(a, b, c):
+ return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+
+cdef inline object PyArray_MultiIterNew4(a, b, c, d):
+ return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+
+cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):
+ return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+
+cdef inline tuple PyDataType_SHAPE(dtype d):
+ if PyDataType_HASSUBARRAY(d):
+ return <tuple>d.subarray.shape
+ else:
+ return ()
+
+
+cdef extern from "numpy/ndarrayobject.h":
+ PyTypeObject PyTimedeltaArrType_Type
+ PyTypeObject PyDatetimeArrType_Type
+ ctypedef int64_t npy_timedelta
+ ctypedef int64_t npy_datetime
+
+cdef extern from "numpy/ndarraytypes.h":
+ ctypedef struct PyArray_DatetimeMetaData:
+ NPY_DATETIMEUNIT base
+ int64_t num
+
+cdef extern from "numpy/arrayscalars.h":
+
+ # abstract types
+ ctypedef class numpy.generic [object PyObject]:
+ pass
+ ctypedef class numpy.number [object PyObject]:
+ pass
+ ctypedef class numpy.integer [object PyObject]:
+ pass
+ ctypedef class numpy.signedinteger [object PyObject]:
+ pass
+ ctypedef class numpy.unsignedinteger [object PyObject]:
+ pass
+ ctypedef class numpy.inexact [object PyObject]:
+ pass
+ ctypedef class numpy.floating [object PyObject]:
+ pass
+ ctypedef class numpy.complexfloating [object PyObject]:
+ pass
+ ctypedef class numpy.flexible [object PyObject]:
+ pass
+ ctypedef class numpy.character [object PyObject]:
+ pass
+
+ ctypedef struct PyDatetimeScalarObject:
+ # PyObject_HEAD
+ npy_datetime obval
+ PyArray_DatetimeMetaData obmeta
+
+ ctypedef struct PyTimedeltaScalarObject:
+ # PyObject_HEAD
+ npy_timedelta obval
+ PyArray_DatetimeMetaData obmeta
+
+ ctypedef enum NPY_DATETIMEUNIT:
+ NPY_FR_Y
+ NPY_FR_M
+ NPY_FR_W
+ NPY_FR_D
+ NPY_FR_B
+ NPY_FR_h
+ NPY_FR_m
+ NPY_FR_s
+ NPY_FR_ms
+ NPY_FR_us
+ NPY_FR_ns
+ NPY_FR_ps
+ NPY_FR_fs
+ NPY_FR_as
+
+
+#
+# ufunc API
+#
+
+cdef extern from "numpy/ufuncobject.h":
+
+ ctypedef void (*PyUFuncGenericFunction) (char **, npy_intp *, npy_intp *, void *)
+
+ ctypedef class numpy.ufunc [object PyUFuncObject, check_size ignore]:
+ cdef:
+ int nin, nout, nargs
+ int identity
+ PyUFuncGenericFunction *functions
+ void **data
+ int ntypes
+ int check_return
+ char *name
+ char *types
+ char *doc
+ void *ptr
+ PyObject *obj
+ PyObject *userloops
+
+ cdef enum:
+ PyUFunc_Zero
+ PyUFunc_One
+ PyUFunc_None
+ UFUNC_ERR_IGNORE
+ UFUNC_ERR_WARN
+ UFUNC_ERR_RAISE
+ UFUNC_ERR_CALL
+ UFUNC_ERR_PRINT
+ UFUNC_ERR_LOG
+ UFUNC_MASK_DIVIDEBYZERO
+ UFUNC_MASK_OVERFLOW
+ UFUNC_MASK_UNDERFLOW
+ UFUNC_MASK_INVALID
+ UFUNC_SHIFT_DIVIDEBYZERO
+ UFUNC_SHIFT_OVERFLOW
+ UFUNC_SHIFT_UNDERFLOW
+ UFUNC_SHIFT_INVALID
+ UFUNC_FPE_DIVIDEBYZERO
+ UFUNC_FPE_OVERFLOW
+ UFUNC_FPE_UNDERFLOW
+ UFUNC_FPE_INVALID
+ UFUNC_ERR_DEFAULT
+ UFUNC_ERR_DEFAULT2
+
+ object PyUFunc_FromFuncAndData(PyUFuncGenericFunction *,
+ void **, char *, int, int, int, int, char *, char *, int)
+ int PyUFunc_RegisterLoopForType(ufunc, int,
+ PyUFuncGenericFunction, int *, void *)
+ int PyUFunc_GenericFunction \
+ (ufunc, PyObject *, PyObject *, PyArrayObject **)
+ void PyUFunc_f_f_As_d_d \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_d_d \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_f_f \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_g_g \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_F_F_As_D_D \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_F_F \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_D_D \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_G_G \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_O_O \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_ff_f_As_dd_d \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_ff_f \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_dd_d \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_gg_g \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_FF_F_As_DD_D \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_DD_D \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_FF_F \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_GG_G \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_OO_O \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_O_O_method \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_OO_O_method \
+ (char **, npy_intp *, npy_intp *, void *)
+ void PyUFunc_On_Om \
+ (char **, npy_intp *, npy_intp *, void *)
+ int PyUFunc_GetPyValues \
+ (char *, int *, int *, PyObject **)
+ int PyUFunc_checkfperr \
+ (int, PyObject *, int *)
+ void PyUFunc_clearfperr()
+ int PyUFunc_getfperr()
+ int PyUFunc_handlefperr \
+ (int, PyObject *, int, int *)
+ int PyUFunc_ReplaceLoopBySignature \
+ (ufunc, PyUFuncGenericFunction, int *, PyUFuncGenericFunction *)
+ object PyUFunc_FromFuncAndDataAndSignature \
+ (PyUFuncGenericFunction *, void **, char *, int, int, int,
+ int, char *, char *, int, char *)
+
+ int _import_umath() except -1
+
+cdef inline void set_array_base(ndarray arr, object base):
+ Py_INCREF(base) # important to do this before stealing the reference below!
+ PyArray_SetBaseObject(arr, base)
+
+cdef inline object get_array_base(ndarray arr):
+ base = PyArray_BASE(arr)
+ if base is NULL:
+ return None
+ return <object>base
+
+# Versions of the import_* functions which are more suitable for
+# Cython code.
+cdef inline int import_array() except -1:
+ try:
+ __pyx_import_array()
+ except Exception:
+ raise ImportError("numpy.core.multiarray failed to import")
+
+cdef inline int import_umath() except -1:
+ try:
+ _import_umath()
+ except Exception:
+ raise ImportError("numpy.core.umath failed to import")
+
+cdef inline int import_ufunc() except -1:
+ try:
+ _import_umath()
+ except Exception:
+ raise ImportError("numpy.core.umath failed to import")
+
+
+cdef inline bint is_timedelta64_object(object obj):
+ """
+ Cython equivalent of `isinstance(obj, np.timedelta64)`
+
+ Parameters
+ ----------
+ obj : object
+
+ Returns
+ -------
+ bool
+ """
+ return PyObject_TypeCheck(obj, &PyTimedeltaArrType_Type)
+
+
+cdef inline bint is_datetime64_object(object obj):
+ """
+ Cython equivalent of `isinstance(obj, np.datetime64)`
+
+ Parameters
+ ----------
+ obj : object
+
+ Returns
+ -------
+ bool
+ """
+ return PyObject_TypeCheck(obj, &PyDatetimeArrType_Type)
+
+
+cdef inline npy_datetime get_datetime64_value(object obj) nogil:
+ """
+ returns the int64 value underlying scalar numpy datetime64 object
+
+ Note that to interpret this as a datetime, the corresponding unit is
+ also needed. That can be found using `get_datetime64_unit`.
+ """
+ return (<PyDatetimeScalarObject*>obj).obval
+
+
+cdef inline npy_timedelta get_timedelta64_value(object obj) nogil:
+ """
+ returns the int64 value underlying scalar numpy timedelta64 object
+ """
+ return (<PyTimedeltaScalarObject*>obj).obval
+
+
+cdef inline NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil:
+ """
+ returns the unit part of the dtype for a numpy datetime64 object.
+ """
+ return <NPY_DATETIMEUNIT>(<PyDatetimeScalarObject*>obj).obmeta.base
diff --git a/numpy/__init__.pxd b/numpy/__init__.pxd
index 33d90c496..fd704b7e3 100644
--- a/numpy/__init__.pxd
+++ b/numpy/__init__.pxd
@@ -1,16 +1,8 @@
-# NumPy static imports for Cython
+# NumPy static imports for Cython < 3.0
#
# If any of the PyArray_* functions are called, import_array must be
# called first.
#
-# This also defines backwards-compatibility buffer acquisition
-# code for use in Python 2.x (or Python <= 2.5 when NumPy starts
-# implementing PEP-3118 directly).
-#
-# Because of laziness, the format string of the buffer is statically
-# allocated. Increase the size if this is not enough, or submit a
-# patch to do this properly.
-#
# Author: Dag Sverre Seljebotn
#
@@ -228,11 +220,11 @@ cdef extern from "numpy/arrayobject.h":
# this field via the inline helper method PyDataType_SHAPE.
cdef PyArray_ArrayDescr* subarray
- ctypedef extern class numpy.flatiter [object PyArrayIterObject, check_size ignore]:
+ ctypedef class numpy.flatiter [object PyArrayIterObject, check_size ignore]:
# Use through macros
pass
- ctypedef extern class numpy.broadcast [object PyArrayMultiIterObject, check_size ignore]:
+ ctypedef class numpy.broadcast [object PyArrayMultiIterObject, check_size ignore]:
cdef int numiter
cdef npy_intp size, index
cdef int nd
@@ -298,8 +290,8 @@ cdef extern from "numpy/arrayobject.h":
ctypedef long double npy_float128
ctypedef struct npy_cfloat:
- double real
- double imag
+ float real
+ float imag
ctypedef struct npy_cdouble:
double real
@@ -761,73 +753,6 @@ cdef inline tuple PyDataType_SHAPE(dtype d):
else:
return ()
-cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL:
- # Recursive utility function used in __getbuffer__ to get format
- # string. The new location in the format string is returned.
-
- cdef dtype child
- cdef int endian_detector = 1
- cdef bint little_endian = ((<char*>&endian_detector)[0] != 0)
- cdef tuple fields
-
- for childname in descr.names:
- fields = descr.fields[childname]
- child, new_offset = fields
-
- if (end - f) - <int>(new_offset - offset[0]) < 15:
- raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd")
-
- if ((child.byteorder == c'>' and little_endian) or
- (child.byteorder == c'<' and not little_endian)):
- raise ValueError(u"Non-native byte order not supported")
- # One could encode it in the format string and have Cython
- # complain instead, BUT: < and > in format strings also imply
- # standardized sizes for datatypes, and we rely on native in
- # order to avoid reencoding data types based on their size.
- #
- # A proper PEP 3118 exporter for other clients than Cython
- # must deal properly with this!
-
- # Output padding bytes
- while offset[0] < new_offset:
- f[0] = 120 # "x"; pad byte
- f += 1
- offset[0] += 1
-
- offset[0] += child.itemsize
-
- if not PyDataType_HASFIELDS(child):
- t = child.type_num
- if end - f < 5:
- raise RuntimeError(u"Format string allocated too short.")
-
- # Until ticket #99 is fixed, use integers to avoid warnings
- if t == NPY_BYTE: f[0] = 98 #"b"
- elif t == NPY_UBYTE: f[0] = 66 #"B"
- elif t == NPY_SHORT: f[0] = 104 #"h"
- elif t == NPY_USHORT: f[0] = 72 #"H"
- elif t == NPY_INT: f[0] = 105 #"i"
- elif t == NPY_UINT: f[0] = 73 #"I"
- elif t == NPY_LONG: f[0] = 108 #"l"
- elif t == NPY_ULONG: f[0] = 76 #"L"
- elif t == NPY_LONGLONG: f[0] = 113 #"q"
- elif t == NPY_ULONGLONG: f[0] = 81 #"Q"
- elif t == NPY_FLOAT: f[0] = 102 #"f"
- elif t == NPY_DOUBLE: f[0] = 100 #"d"
- elif t == NPY_LONGDOUBLE: f[0] = 103 #"g"
- elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf
- elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd
- elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg
- elif t == NPY_OBJECT: f[0] = 79 #"O"
- else:
- raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
- f += 1
- else:
- # Cython ignores struct boundary information ("T{...}"),
- # so don't output it
- f = _util_dtypestring(child, f, end, offset)
- return f
-
cdef extern from "numpy/ndarrayobject.h":
PyTypeObject PyTimedeltaArrType_Type
@@ -841,6 +766,29 @@ cdef extern from "numpy/ndarraytypes.h":
int64_t num
cdef extern from "numpy/arrayscalars.h":
+
+ # abstract types
+ ctypedef class numpy.generic [object PyObject]:
+ pass
+ ctypedef class numpy.number [object PyObject]:
+ pass
+ ctypedef class numpy.integer [object PyObject]:
+ pass
+ ctypedef class numpy.signedinteger [object PyObject]:
+ pass
+ ctypedef class numpy.unsignedinteger [object PyObject]:
+ pass
+ ctypedef class numpy.inexact [object PyObject]:
+ pass
+ ctypedef class numpy.floating [object PyObject]:
+ pass
+ ctypedef class numpy.complexfloating [object PyObject]:
+ pass
+ ctypedef class numpy.flexible [object PyObject]:
+ pass
+ ctypedef class numpy.character [object PyObject]:
+ pass
+
ctypedef struct PyDatetimeScalarObject:
# PyObject_HEAD
npy_datetime obval
@@ -876,7 +824,7 @@ cdef extern from "numpy/ufuncobject.h":
ctypedef void (*PyUFuncGenericFunction) (char **, npy_intp *, npy_intp *, void *)
- ctypedef extern class numpy.ufunc [object PyUFuncObject, check_size ignore]:
+ ctypedef class numpy.ufunc [object PyUFuncObject, check_size ignore]:
cdef:
int nin, nout, nargs
int identity
diff --git a/numpy/__init__.py b/numpy/__init__.py
index 0fe8818ef..879e8f013 100644
--- a/numpy/__init__.py
+++ b/numpy/__init__.py
@@ -124,18 +124,22 @@ if __NUMPY_SETUP__:
else:
try:
from numpy.__config__ import show as show_config
- except ImportError:
+ except ImportError as e:
msg = """Error importing numpy: you should not try to import numpy from
its source directory; please exit the numpy source tree, and relaunch
your python interpreter from there."""
- raise ImportError(msg)
-
- from .version import git_revision as __git_revision__
- from .version import version as __version__
+ raise ImportError(msg) from e
__all__ = ['ModuleDeprecationWarning',
'VisibleDeprecationWarning']
+ # get the version using versioneer
+ from ._version import get_versions
+ vinfo = get_versions()
+ __version__ = vinfo.get("closest-tag", vinfo["version"])
+ __git_version__ = vinfo.get("full-revisionid")
+ del get_versions, vinfo
+
# mapping of {name: (value, deprecation_msg)}
__deprecated_attrs__ = {}
@@ -214,6 +218,18 @@ else:
__all__.remove('Arrayterator')
del Arrayterator
+ # These names were removed in NumPy 1.20. For at least one release,
+ # attempts to access these names in the numpy namespace will trigger
+ # a warning, and calling the function will raise an exception.
+ _financial_names = ['fv', 'ipmt', 'irr', 'mirr', 'nper', 'npv', 'pmt',
+ 'ppmt', 'pv', 'rate']
+ __expired_functions__ = {
+ name: (f'In accordance with NEP 32, the function {name} was removed '
+ 'from NumPy version 1.20. A replacement for this function '
+ 'is available in the numpy_financial library: '
+ 'https://pypi.org/project/numpy-financial')
+ for name in _financial_names}
+
# Filter out Cython harmless warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
@@ -228,6 +244,20 @@ else:
# module level getattr is only supported in 3.7 onwards
# https://www.python.org/dev/peps/pep-0562/
def __getattr__(attr):
+ # Warn for expired attributes, and return a dummy function
+ # that always raises an exception.
+ try:
+ msg = __expired_functions__[attr]
+ except KeyError:
+ pass
+ else:
+ warnings.warn(msg, DeprecationWarning, stacklevel=2)
+
+ def _expired(*args, **kwds):
+ raise RuntimeError(msg)
+
+ return _expired
+
# Emit warnings for deprecated attributes
try:
val, msg = __deprecated_attrs__[attr]
@@ -295,7 +325,7 @@ else:
"by incorrect BLAS library being linked in, or by mixing "
"package managers (pip, conda, apt, ...). Search closed "
"numpy issues for similar problems.")
- raise RuntimeError(msg.format(__file__))
+ raise RuntimeError(msg.format(__file__)) from None
_sanity_check()
del _sanity_check
@@ -358,3 +388,8 @@ else:
# Note that this will currently only make a difference on Linux
core.multiarray._set_madvise_hugepage(use_hugepage)
+
+
+from ._version import get_versions
+__version__ = get_versions()['version']
+del get_versions
diff --git a/numpy/__init__.pyi b/numpy/__init__.pyi
index ff2cc565e..cf9b3e86a 100644
--- a/numpy/__init__.pyi
+++ b/numpy/__init__.pyi
@@ -2,9 +2,54 @@ import builtins
import sys
import datetime as dt
from abc import abstractmethod
+from types import TracebackType
+from contextlib import ContextDecorator
from numpy.core._internal import _ctypes
-from numpy.typing import ArrayLike, DtypeLike, _Shape, _ShapeLike
+from numpy.typing import (
+ ArrayLike,
+ DTypeLike,
+ _Shape,
+ _ShapeLike,
+ _CharLike,
+ _BoolLike,
+ _IntLike,
+ _FloatLike,
+ _ComplexLike,
+ _TD64Like,
+ _NumberLike,
+ _SupportsDType,
+ _VoidDTypeLike,
+ NBitBase,
+ _64Bit,
+ _32Bit,
+ _16Bit,
+ _8Bit,
+)
+from numpy.typing._callable import (
+ _BoolOp,
+ _BoolBitOp,
+ _BoolSub,
+ _BoolTrueDiv,
+ _BoolMod,
+ _BoolDivMod,
+ _TD64Div,
+ _IntTrueDiv,
+ _UnsignedIntOp,
+ _UnsignedIntBitOp,
+ _UnsignedIntMod,
+ _UnsignedIntDivMod,
+ _SignedIntOp,
+ _SignedIntBitOp,
+ _SignedIntMod,
+ _SignedIntDivMod,
+ _FloatOp,
+ _FloatMod,
+ _FloatDivMod,
+ _ComplexOp,
+ _NumberOp,
+ _ComparisonOp,
+)
from typing import (
Any,
@@ -22,7 +67,6 @@ from typing import (
overload,
Sequence,
Sized,
- SupportsAbs,
SupportsComplex,
SupportsFloat,
SupportsInt,
@@ -33,40 +77,800 @@ from typing import (
Union,
)
-if sys.version_info[0] < 3:
- class SupportsBytes: ...
-
-else:
- from typing import SupportsBytes
-
if sys.version_info >= (3, 8):
- from typing import Literal, Protocol
+ from typing import Literal, Protocol, SupportsIndex, Final
else:
- from typing_extensions import Literal, Protocol
+ from typing_extensions import Literal, Protocol, Final
+ class SupportsIndex(Protocol):
+ def __index__(self) -> int: ...
+
+# Ensures that the stubs are picked up
+from numpy import (
+ char,
+ ctypeslib,
+ emath,
+ fft,
+ lib,
+ linalg,
+ ma,
+ matrixlib,
+ polynomial,
+ random,
+ rec,
+ testing,
+ version,
+)
+
+from numpy.core.function_base import (
+ linspace,
+ logspace,
+ geomspace,
+)
-# TODO: remove when the full numpy namespace is defined
-def __getattr__(name: str) -> Any: ...
+from numpy.core.fromnumeric import (
+ take,
+ reshape,
+ choose,
+ repeat,
+ put,
+ swapaxes,
+ transpose,
+ partition,
+ argpartition,
+ sort,
+ argsort,
+ argmax,
+ argmin,
+ searchsorted,
+ resize,
+ squeeze,
+ diagonal,
+ trace,
+ ravel,
+ nonzero,
+ shape,
+ compress,
+ clip,
+ sum,
+ all,
+ any,
+ cumsum,
+ ptp,
+ amax,
+ amin,
+ prod,
+ cumprod,
+ ndim,
+ size,
+ around,
+ mean,
+ std,
+ var,
+)
+
+from numpy.core._asarray import (
+ asarray as asarray,
+ asanyarray as asanyarray,
+ ascontiguousarray as ascontiguousarray,
+ asfortranarray as asfortranarray,
+ require as require,
+)
+
+from numpy.core._type_aliases import (
+ sctypes as sctypes,
+ sctypeDict as sctypeDict,
+)
+
+from numpy.core._ufunc_config import (
+ seterr as seterr,
+ geterr as geterr,
+ setbufsize as setbufsize,
+ getbufsize as getbufsize,
+ seterrcall as seterrcall,
+ geterrcall as geterrcall,
+ _SupportsWrite,
+ _ErrKind,
+ _ErrFunc,
+ _ErrDictOptional,
+)
+
+from numpy.core.numeric import (
+ zeros_like as zeros_like,
+ ones as ones,
+ ones_like as ones_like,
+ empty_like as empty_like,
+ full as full,
+ full_like as full_like,
+ count_nonzero as count_nonzero,
+ isfortran as isfortran,
+ argwhere as argwhere,
+ flatnonzero as flatnonzero,
+ correlate as correlate,
+ convolve as convolve,
+ outer as outer,
+ tensordot as tensordot,
+ roll as roll,
+ rollaxis as rollaxis,
+ moveaxis as moveaxis,
+ cross as cross,
+ indices as indices,
+ fromfunction as fromfunction,
+ isscalar as isscalar,
+ binary_repr as binary_repr,
+ base_repr as base_repr,
+ identity as identity,
+ allclose as allclose,
+ isclose as isclose,
+ array_equal as array_equal,
+ array_equiv as array_equiv,
+)
+
+from numpy.core.numerictypes import (
+ maximum_sctype as maximum_sctype,
+ issctype as issctype,
+ obj2sctype as obj2sctype,
+ issubclass_ as issubclass_,
+ issubsctype as issubsctype,
+ issubdtype as issubdtype,
+ sctype2char as sctype2char,
+ find_common_type as find_common_type,
+)
+
+from numpy.core.shape_base import (
+ atleast_1d as atleast_1d,
+ atleast_2d as atleast_2d,
+ atleast_3d as atleast_3d,
+ block as block,
+ hstack as hstack,
+ stack as stack,
+ vstack as vstack,
+)
+
+# Add an object to `__all__` if their stubs are defined in an external file;
+# their stubs will not be recognized otherwise.
+# NOTE: This is redundant for objects defined within this file.
+__all__ = [
+ "linspace",
+ "logspace",
+ "geomspace",
+ "take",
+ "reshape",
+ "choose",
+ "repeat",
+ "put",
+ "swapaxes",
+ "transpose",
+ "partition",
+ "argpartition",
+ "sort",
+ "argsort",
+ "argmax",
+ "argmin",
+ "searchsorted",
+ "resize",
+ "squeeze",
+ "diagonal",
+ "trace",
+ "ravel",
+ "nonzero",
+ "shape",
+ "compress",
+ "clip",
+ "sum",
+ "all",
+ "any",
+ "cumsum",
+ "ptp",
+ "amax",
+ "amin",
+ "prod",
+ "cumprod",
+ "ndim",
+ "size",
+ "around",
+ "mean",
+ "std",
+ "var",
+]
+
+DataSource: Any
+MachAr: Any
+ScalarType: Any
+angle: Any
+append: Any
+apply_along_axis: Any
+apply_over_axes: Any
+arange: Any
+array2string: Any
+array_repr: Any
+array_split: Any
+array_str: Any
+asarray_chkfinite: Any
+asfarray: Any
+asmatrix: Any
+asscalar: Any
+average: Any
+bartlett: Any
+bincount: Any
+bitwise_not: Any
+blackman: Any
+bmat: Any
+bool8: Any
+broadcast: Any
+broadcast_arrays: Any
+broadcast_to: Any
+busday_count: Any
+busday_offset: Any
+busdaycalendar: Any
+byte: Any
+byte_bounds: Any
+bytes0: Any
+c_: Any
+can_cast: Any
+cast: Any
+cdouble: Any
+cfloat: Any
+chararray: Any
+clongdouble: Any
+clongfloat: Any
+column_stack: Any
+common_type: Any
+compare_chararrays: Any
+complex256: Any
+complex_: Any
+concatenate: Any
+conj: Any
+copy: Any
+copyto: Any
+corrcoef: Any
+cov: Any
+csingle: Any
+cumproduct: Any
+datetime_as_string: Any
+datetime_data: Any
+delete: Any
+deprecate: Any
+deprecate_with_doc: Any
+diag: Any
+diag_indices: Any
+diag_indices_from: Any
+diagflat: Any
+diff: Any
+digitize: Any
+disp: Any
+divide: Any
+dot: Any
+double: Any
+dsplit: Any
+dstack: Any
+ediff1d: Any
+einsum: Any
+einsum_path: Any
+expand_dims: Any
+extract: Any
+eye: Any
+fill_diagonal: Any
+finfo: Any
+fix: Any
+flip: Any
+fliplr: Any
+flipud: Any
+float128: Any
+float_: Any
+format_float_positional: Any
+format_float_scientific: Any
+format_parser: Any
+frombuffer: Any
+fromfile: Any
+fromiter: Any
+frompyfunc: Any
+fromregex: Any
+fromstring: Any
+genfromtxt: Any
+get_include: Any
+get_printoptions: Any
+geterrobj: Any
+gradient: Any
+half: Any
+hamming: Any
+hanning: Any
+histogram: Any
+histogram2d: Any
+histogram_bin_edges: Any
+histogramdd: Any
+hsplit: Any
+i0: Any
+iinfo: Any
+imag: Any
+in1d: Any
+index_exp: Any
+info: Any
+inner: Any
+insert: Any
+int0: Any
+int_: Any
+intc: Any
+interp: Any
+intersect1d: Any
+intp: Any
+is_busday: Any
+iscomplex: Any
+iscomplexobj: Any
+isin: Any
+isneginf: Any
+isposinf: Any
+isreal: Any
+isrealobj: Any
+iterable: Any
+ix_: Any
+kaiser: Any
+kron: Any
+lexsort: Any
+load: Any
+loads: Any
+loadtxt: Any
+longcomplex: Any
+longdouble: Any
+longfloat: Any
+longlong: Any
+lookfor: Any
+mafromtxt: Any
+mask_indices: Any
+mat: Any
+matrix: Any
+max: Any
+may_share_memory: Any
+median: Any
+memmap: Any
+meshgrid: Any
+mgrid: Any
+min: Any
+min_scalar_type: Any
+mintypecode: Any
+mod: Any
+msort: Any
+nan_to_num: Any
+nanargmax: Any
+nanargmin: Any
+nancumprod: Any
+nancumsum: Any
+nanmax: Any
+nanmean: Any
+nanmedian: Any
+nanmin: Any
+nanpercentile: Any
+nanprod: Any
+nanquantile: Any
+nanstd: Any
+nansum: Any
+nanvar: Any
+nbytes: Any
+ndenumerate: Any
+ndfromtxt: Any
+ndindex: Any
+nditer: Any
+nested_iters: Any
+newaxis: Any
+numarray: Any
+object0: Any
+ogrid: Any
+packbits: Any
+pad: Any
+percentile: Any
+piecewise: Any
+place: Any
+poly: Any
+poly1d: Any
+polyadd: Any
+polyder: Any
+polydiv: Any
+polyfit: Any
+polyint: Any
+polymul: Any
+polysub: Any
+polyval: Any
+printoptions: Any
+product: Any
+promote_types: Any
+put_along_axis: Any
+putmask: Any
+quantile: Any
+r_: Any
+ravel_multi_index: Any
+real: Any
+real_if_close: Any
+recarray: Any
+recfromcsv: Any
+recfromtxt: Any
+record: Any
+result_type: Any
+roots: Any
+rot90: Any
+round: Any
+round_: Any
+row_stack: Any
+s_: Any
+save: Any
+savetxt: Any
+savez: Any
+savez_compressed: Any
+select: Any
+set_printoptions: Any
+set_string_function: Any
+setdiff1d: Any
+seterrobj: Any
+setxor1d: Any
+shares_memory: Any
+short: Any
+show_config: Any
+sinc: Any
+single: Any
+singlecomplex: Any
+sort_complex: Any
+source: Any
+split: Any
+string_: Any
+take_along_axis: Any
+tile: Any
+trapz: Any
+tri: Any
+tril: Any
+tril_indices: Any
+tril_indices_from: Any
+trim_zeros: Any
+triu: Any
+triu_indices: Any
+triu_indices_from: Any
+typeDict: Any
+typecodes: Any
+typename: Any
+ubyte: Any
+uint: Any
+uint0: Any
+uintc: Any
+uintp: Any
+ulonglong: Any
+union1d: Any
+unique: Any
+unpackbits: Any
+unravel_index: Any
+unwrap: Any
+ushort: Any
+vander: Any
+vdot: Any
+vectorize: Any
+void0: Any
+vsplit: Any
+where: Any
+who: Any
_NdArraySubClass = TypeVar("_NdArraySubClass", bound=ndarray)
+_DTypeScalar = TypeVar("_DTypeScalar", bound=generic)
+_ByteOrder = Literal["S", "<", ">", "=", "|", "L", "B", "N", "I"]
-class dtype:
+class dtype(Generic[_DTypeScalar]):
names: Optional[Tuple[str, ...]]
- def __init__(
- self,
- dtype: DtypeLike,
+ # Overload for subclass of generic
+ @overload
+ def __new__(
+ cls,
+ dtype: Type[_DTypeScalar],
align: bool = ...,
copy: bool = ...,
- ) -> None: ...
- def __eq__(self, other: DtypeLike) -> bool: ...
- def __ne__(self, other: DtypeLike) -> bool: ...
- def __gt__(self, other: DtypeLike) -> bool: ...
- def __ge__(self, other: DtypeLike) -> bool: ...
- def __lt__(self, other: DtypeLike) -> bool: ...
- def __le__(self, other: DtypeLike) -> bool: ...
+ ) -> dtype[_DTypeScalar]: ...
+ # Overloads for string aliases, Python types, and some assorted
+ # other special cases. Order is sometimes important because of the
+ # subtype relationships
+ #
+ # bool < int < float < complex
+ #
+ # so we have to make sure the overloads for the narrowest type is
+ # first.
+ @overload
+ def __new__(
+ cls,
+ dtype: Union[
+ Type[bool],
+ Literal[
+ "?",
+ "=?",
+ "<?",
+ ">?",
+ "bool",
+ "bool_",
+ ],
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[bool_]: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Literal[
+ "uint8",
+ "u1",
+ "=u1",
+ "<u1",
+ ">u1",
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[uint8]: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Literal[
+ "uint16",
+ "u2",
+ "=u2",
+ "<u2",
+ ">u2",
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[uint16]: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Literal[
+ "uint32",
+ "u4",
+ "=u4",
+ "<u4",
+ ">u4",
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[uint32]: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Literal[
+ "uint64",
+ "u8",
+ "=u8",
+ "<u8",
+ ">u8",
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[uint64]: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Literal[
+ "int8",
+ "i1",
+ "=i1",
+ "<i1",
+ ">i1",
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[int8]: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Literal[
+ "int16",
+ "i2",
+ "=i2",
+ "<i2",
+ ">i2",
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[int16]: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Literal[
+ "int32",
+ "i4",
+ "=i4",
+ "<i4",
+ ">i4",
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[int32]: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Literal[
+ "int64",
+ "i8",
+ "=i8",
+ "<i8",
+ ">i8",
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[int64]: ...
+ # "int"/int resolve to int_, which is system dependent and as of
+ # now untyped. Long-term we'll do something fancier here.
+ @overload
+ def __new__(
+ cls,
+ dtype: Union[Type[int], Literal["int"]],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Literal[
+ "float16",
+ "f4",
+ "=f4",
+ "<f4",
+ ">f4",
+ "e",
+ "=e",
+ "<e",
+ ">e",
+ "half",
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[float16]: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Literal[
+ "float32",
+ "f4",
+ "=f4",
+ "<f4",
+ ">f4",
+ "f",
+ "=f",
+ "<f",
+ ">f",
+ "single",
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[float32]: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Union[
+ None,
+ Type[float],
+ Literal[
+ "float64",
+ "f8",
+ "=f8",
+ "<f8",
+ ">f8",
+ "d",
+ "<d",
+ ">d",
+ "float",
+ "double",
+ "float_",
+ ],
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[float64]: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Literal[
+ "complex64",
+ "c8",
+ "=c8",
+ "<c8",
+ ">c8",
+ "F",
+ "=F",
+ "<F",
+ ">F",
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[complex64]: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Union[
+ Type[complex],
+ Literal[
+ "complex128",
+ "c16",
+ "=c16",
+ "<c16",
+ ">c16",
+ "D",
+ "=D",
+ "<D",
+ ">D",
+ ],
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[complex128]: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Union[
+ Type[bytes],
+ Literal[
+ "S",
+ "=S",
+ "<S",
+ ">S",
+ "bytes",
+ "bytes_",
+ "bytes0",
+ ],
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[bytes_]: ...
+ @overload
+ def __new__(
+ cls,
+ dtype: Union[
+ Type[str],
+ Literal[
+ "U",
+ "=U",
+ # <U and >U intentionally not included; they are not
+ # the same dtype and which one dtype("U") translates
+ # to is platform-dependent.
+ "str",
+ "str_",
+ "str0",
+ ],
+ ],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[str_]: ...
+ # dtype of a dtype is the same dtype
+ @overload
+ def __new__(
+ cls,
+ dtype: dtype[_DTypeScalar],
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[_DTypeScalar]: ...
+ # TODO: handle _SupportsDType better
+ @overload
+ def __new__(
+ cls,
+ dtype: _SupportsDType,
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[Any]: ...
+ # Handle strings that can't be expressed as literals; i.e. s1, s2, ...
+ @overload
+ def __new__(
+ cls,
+ dtype: str,
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[Any]: ...
+ # Catchall overload
+ @overload
+ def __new__(
+ cls,
+ dtype: _VoidDTypeLike,
+ align: bool = ...,
+ copy: bool = ...,
+ ) -> dtype[void]: ...
+ def __eq__(self, other: DTypeLike) -> bool: ...
+ def __ne__(self, other: DTypeLike) -> bool: ...
+ def __gt__(self, other: DTypeLike) -> bool: ...
+ def __ge__(self, other: DTypeLike) -> bool: ...
+ def __lt__(self, other: DTypeLike) -> bool: ...
+ def __le__(self, other: DTypeLike) -> bool: ...
@property
def alignment(self) -> int: ...
@property
- def base(self) -> dtype: ...
+ def base(self: _DType) -> _DType: ...
@property
def byteorder(self) -> str: ...
@property
@@ -76,7 +880,7 @@ class dtype:
@property
def fields(
self,
- ) -> Optional[Mapping[str, Union[Tuple[dtype, int], Tuple[dtype, int, Any]]]]: ...
+ ) -> Optional[Mapping[str, Union[Tuple[dtype[Any], int], Tuple[dtype[Any], int, Any]]]]: ...
@property
def flags(self) -> int: ...
@property
@@ -102,16 +906,14 @@ class dtype:
@property
def ndim(self) -> int: ...
@property
- def subdtype(self) -> Optional[Tuple[dtype, _Shape]]: ...
- def newbyteorder(self, new_order: str = ...) -> dtype: ...
+ def subdtype(self: _DType) -> Optional[Tuple[_DType, _Shape]]: ...
+ def newbyteorder(self: _DType, __new_order: _ByteOrder = ...) -> _DType: ...
# Leave str and type for end to avoid having to use `builtins.str`
# everywhere. See https://github.com/python/mypy/issues/3775
@property
def str(self) -> builtins.str: ...
@property
- def type(self) -> Type[generic]: ...
-
-_Dtype = dtype # to avoid name conflicts with ndarray.dtype
+ def type(self) -> Type[_DTypeScalar]: ...
class _flagsobj:
aligned: bool
@@ -143,124 +945,517 @@ class _flagsobj:
def __getitem__(self, key: str) -> bool: ...
def __setitem__(self, key: str, value: bool) -> None: ...
+_ArrayLikeInt = Union[
+ int,
+ integer,
+ Sequence[Union[int, integer]],
+ Sequence[Sequence[Any]], # TODO: wait for support for recursive types
+ ndarray
+]
+
_FlatIterSelf = TypeVar("_FlatIterSelf", bound=flatiter)
-class flatiter(Generic[_ArraySelf]):
+class flatiter(Generic[_NdArraySubClass]):
@property
- def base(self) -> _ArraySelf: ...
+ def base(self) -> _NdArraySubClass: ...
@property
def coords(self) -> _Shape: ...
@property
def index(self) -> int: ...
- def copy(self) -> _ArraySelf: ...
+ def copy(self) -> _NdArraySubClass: ...
def __iter__(self: _FlatIterSelf) -> _FlatIterSelf: ...
- def __next__(self) -> generic: ...
+ def __next__(self: flatiter[ndarray[Any, dtype[_ScalarType]]]) -> _ScalarType: ...
+ def __len__(self) -> int: ...
+ @overload
+ def __getitem__(
+ self: flatiter[ndarray[Any, dtype[_ScalarType]]],
+ key: Union[int, integer],
+ ) -> _ScalarType: ...
+ @overload
+ def __getitem__(
+ self, key: Union[_ArrayLikeInt, slice, ellipsis],
+ ) -> _NdArraySubClass: ...
+ @overload
+ def __array__(self: flatiter[ndarray[Any, _DType]], __dtype: None = ...) -> ndarray[Any, _DType]: ...
+ @overload
+ def __array__(self, __dtype: DTypeLike) -> ndarray[Any, dtype[Any]]: ...
+
+_OrderKACF = Optional[Literal["K", "A", "C", "F"]]
+_OrderACF = Optional[Literal["A", "C", "F"]]
+_OrderCF = Optional[Literal["C", "F"]]
+
+_ModeKind = Literal["raise", "wrap", "clip"]
+_PartitionKind = Literal["introselect"]
+_SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"]
+_SortSide = Literal["left", "right"]
+
+_ArrayLikeBool = Union[_BoolLike, Sequence[_BoolLike], ndarray]
+_ArrayLikeIntOrBool = Union[
+ _IntLike,
+ ndarray,
+ Sequence[_IntLike],
+ Sequence[Sequence[Any]], # TODO: wait for support for recursive types
+]
_ArraySelf = TypeVar("_ArraySelf", bound=_ArrayOrScalarCommon)
-class _ArrayOrScalarCommon(
- SupportsInt, SupportsFloat, SupportsComplex, SupportsBytes, SupportsAbs[Any]
-):
+class _ArrayOrScalarCommon:
@property
def T(self: _ArraySelf) -> _ArraySelf: ...
@property
- def base(self) -> Optional[ndarray]: ...
- @property
- def dtype(self) -> _Dtype: ...
- @property
def data(self) -> memoryview: ...
@property
def flags(self) -> _flagsobj: ...
@property
- def size(self) -> int: ...
- @property
def itemsize(self) -> int: ...
@property
def nbytes(self) -> int: ...
- @property
- def ndim(self) -> int: ...
- @property
- def shape(self) -> _Shape: ...
- @property
- def strides(self) -> _Shape: ...
- def __array__(self, __dtype: DtypeLike = ...) -> ndarray: ...
- def __int__(self) -> int: ...
- def __float__(self) -> float: ...
- def __complex__(self) -> complex: ...
- if sys.version_info[0] < 3:
- def __oct__(self) -> str: ...
- def __hex__(self) -> str: ...
- def __nonzero__(self) -> bool: ...
- def __unicode__(self) -> Text: ...
- else:
- def __bool__(self) -> bool: ...
- def __bytes__(self) -> bytes: ...
+ def __bool__(self) -> bool: ...
+ def __bytes__(self) -> bytes: ...
def __str__(self) -> str: ...
def __repr__(self) -> str: ...
- def __copy__(self: _ArraySelf, order: str = ...) -> _ArraySelf: ...
- def __deepcopy__(self: _ArraySelf, memo: dict) -> _ArraySelf: ...
- def __lt__(self, other): ...
- def __le__(self, other): ...
+ def __copy__(self: _ArraySelf) -> _ArraySelf: ...
+ def __deepcopy__(self: _ArraySelf, __memo: Optional[dict] = ...) -> _ArraySelf: ...
def __eq__(self, other): ...
def __ne__(self, other): ...
- def __gt__(self, other): ...
- def __ge__(self, other): ...
- def __add__(self, other): ...
- def __radd__(self, other): ...
- def __iadd__(self, other): ...
- def __sub__(self, other): ...
- def __rsub__(self, other): ...
- def __isub__(self, other): ...
- def __mul__(self, other): ...
- def __rmul__(self, other): ...
- def __imul__(self, other): ...
- if sys.version_info[0] < 3:
- def __div__(self, other): ...
- def __rdiv__(self, other): ...
- def __idiv__(self, other): ...
- def __truediv__(self, other): ...
- def __rtruediv__(self, other): ...
- def __itruediv__(self, other): ...
- def __floordiv__(self, other): ...
- def __rfloordiv__(self, other): ...
- def __ifloordiv__(self, other): ...
- def __mod__(self, other): ...
- def __rmod__(self, other): ...
- def __imod__(self, other): ...
- def __divmod__(self, other): ...
- def __rdivmod__(self, other): ...
- # NumPy's __pow__ doesn't handle a third argument
- def __pow__(self, other): ...
- def __rpow__(self, other): ...
- def __ipow__(self, other): ...
- def __lshift__(self, other): ...
- def __rlshift__(self, other): ...
- def __ilshift__(self, other): ...
- def __rshift__(self, other): ...
- def __rrshift__(self, other): ...
- def __irshift__(self, other): ...
- def __and__(self, other): ...
- def __rand__(self, other): ...
- def __iand__(self, other): ...
- def __xor__(self, other): ...
- def __rxor__(self, other): ...
- def __ixor__(self, other): ...
- def __or__(self, other): ...
- def __ror__(self, other): ...
- def __ior__(self, other): ...
- if sys.version_info[:2] >= (3, 5):
- def __matmul__(self, other): ...
- def __rmatmul__(self, other): ...
- def __neg__(self: _ArraySelf) -> _ArraySelf: ...
- def __pos__(self: _ArraySelf) -> _ArraySelf: ...
- def __abs__(self: _ArraySelf) -> _ArraySelf: ...
- def __invert__(self: _ArraySelf) -> _ArraySelf: ...
- # TODO(shoyer): remove when all methods are defined
- def __getattr__(self, name) -> Any: ...
+ def astype(
+ self: _ArraySelf,
+ dtype: DTypeLike,
+ order: _OrderKACF = ...,
+ casting: _Casting = ...,
+ subok: bool = ...,
+ copy: bool = ...,
+ ) -> _ArraySelf: ...
+ def copy(self: _ArraySelf, order: _OrderKACF = ...) -> _ArraySelf: ...
+ def dump(self, file: str) -> None: ...
+ def dumps(self) -> bytes: ...
+ def flatten(self, order: _OrderKACF = ...) -> ndarray: ...
+ def getfield(
+ self: _ArraySelf, dtype: DTypeLike, offset: int = ...
+ ) -> _ArraySelf: ...
+ def ravel(self, order: _OrderKACF = ...) -> ndarray: ...
+ @overload
+ def reshape(
+ self, __shape: Sequence[int], *, order: _OrderACF = ...
+ ) -> ndarray: ...
+ @overload
+ def reshape(
+ self, *shape: int, order: _OrderACF = ...
+ ) -> ndarray: ...
+ def tobytes(self, order: _OrderKACF = ...) -> bytes: ...
+ # NOTE: `tostring()` is deprecated and therefore excluded
+ # def tostring(self, order=...): ...
+ def tofile(
+ self, fid: Union[IO[bytes], str], sep: str = ..., format: str = ...
+ ) -> None: ...
+ # generics and 0d arrays return builtin scalars
+ def tolist(self) -> Any: ...
+ @overload
+ def view(self, type: Type[_NdArraySubClass]) -> _NdArraySubClass: ...
+ @overload
+ def view(self: _ArraySelf, dtype: DTypeLike = ...) -> _ArraySelf: ...
+ @overload
+ def view(
+ self, dtype: DTypeLike, type: Type[_NdArraySubClass]
+ ) -> _NdArraySubClass: ...
+
+ # TODO: Add proper signatures
+ def __getitem__(self, key) -> Any: ...
+ @property
+ def __array_interface__(self): ...
+ @property
+ def __array_priority__(self): ...
+ @property
+ def __array_struct__(self): ...
+ def __array_wrap__(array, context=...): ...
+ def __setstate__(self, __state): ...
+ # a `bool_` is returned when `keepdims=True` and `self` is a 0d array
+ @overload
+ def all(
+ self, axis: None = ..., out: None = ..., keepdims: Literal[False] = ...
+ ) -> bool_: ...
+ @overload
+ def all(
+ self, axis: Optional[_ShapeLike] = ..., out: None = ..., keepdims: bool = ...
+ ) -> Union[bool_, ndarray]: ...
+ @overload
+ def all(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ out: _NdArraySubClass = ...,
+ keepdims: bool = ...,
+ ) -> _NdArraySubClass: ...
+ @overload
+ def any(
+ self, axis: None = ..., out: None = ..., keepdims: Literal[False] = ...
+ ) -> bool_: ...
+ @overload
+ def any(
+ self, axis: Optional[_ShapeLike] = ..., out: None = ..., keepdims: bool = ...
+ ) -> Union[bool_, ndarray]: ...
+ @overload
+ def any(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ out: _NdArraySubClass = ...,
+ keepdims: bool = ...,
+ ) -> _NdArraySubClass: ...
+ @overload
+ def argmax(self, axis: None = ..., out: None = ...) -> signedinteger: ...
+ @overload
+ def argmax(
+ self, axis: _ShapeLike = ..., out: None = ...
+ ) -> Union[signedinteger, ndarray]: ...
+ @overload
+ def argmax(
+ self, axis: Optional[_ShapeLike] = ..., out: _NdArraySubClass = ...
+ ) -> _NdArraySubClass: ...
+ @overload
+ def argmin(self, axis: None = ..., out: None = ...) -> signedinteger: ...
+ @overload
+ def argmin(
+ self, axis: _ShapeLike = ..., out: None = ...
+ ) -> Union[signedinteger, ndarray]: ...
+ @overload
+ def argmin(
+ self, axis: Optional[_ShapeLike] = ..., out: _NdArraySubClass = ...
+ ) -> _NdArraySubClass: ...
+ def argsort(
+ self,
+ axis: Optional[int] = ...,
+ kind: Optional[_SortKind] = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+ ) -> ndarray: ...
+ @overload
+ def choose(
+ self, choices: ArrayLike, out: None = ..., mode: _ModeKind = ...,
+ ) -> ndarray: ...
+ @overload
+ def choose(
+ self, choices: ArrayLike, out: _NdArraySubClass = ..., mode: _ModeKind = ...,
+ ) -> _NdArraySubClass: ...
+ @overload
+ def clip(
+ self,
+ min: ArrayLike = ...,
+ max: Optional[ArrayLike] = ...,
+ out: None = ...,
+ **kwargs: Any,
+ ) -> Union[number, ndarray]: ...
+ @overload
+ def clip(
+ self,
+ min: None = ...,
+ max: ArrayLike = ...,
+ out: None = ...,
+ **kwargs: Any,
+ ) -> Union[number, ndarray]: ...
+ @overload
+ def clip(
+ self,
+ min: ArrayLike = ...,
+ max: Optional[ArrayLike] = ...,
+ out: _NdArraySubClass = ...,
+ **kwargs: Any,
+ ) -> _NdArraySubClass: ...
+ @overload
+ def clip(
+ self,
+ min: None = ...,
+ max: ArrayLike = ...,
+ out: _NdArraySubClass = ...,
+ **kwargs: Any,
+ ) -> _NdArraySubClass: ...
+ @overload
+ def compress(
+ self, a: ArrayLike, axis: Optional[int] = ..., out: None = ...,
+ ) -> ndarray: ...
+ @overload
+ def compress(
+ self, a: ArrayLike, axis: Optional[int] = ..., out: _NdArraySubClass = ...,
+ ) -> _NdArraySubClass: ...
+ def conj(self: _ArraySelf) -> _ArraySelf: ...
+ def conjugate(self: _ArraySelf) -> _ArraySelf: ...
+ @overload
+ def cumprod(
+ self, axis: Optional[int] = ..., dtype: DTypeLike = ..., out: None = ...,
+ ) -> ndarray: ...
+ @overload
+ def cumprod(
+ self,
+ axis: Optional[int] = ...,
+ dtype: DTypeLike = ...,
+ out: _NdArraySubClass = ...,
+ ) -> _NdArraySubClass: ...
+ @overload
+ def cumsum(
+ self, axis: Optional[int] = ..., dtype: DTypeLike = ..., out: None = ...,
+ ) -> ndarray: ...
+ @overload
+ def cumsum(
+ self,
+ axis: Optional[int] = ...,
+ dtype: DTypeLike = ...,
+ out: _NdArraySubClass = ...,
+ ) -> _NdArraySubClass: ...
+ @overload
+ def max(
+ self,
+ axis: None = ...,
+ out: None = ...,
+ keepdims: Literal[False] = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+ ) -> number: ...
+ @overload
+ def max(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ out: None = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+ ) -> Union[number, ndarray]: ...
+ @overload
+ def max(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ out: _NdArraySubClass = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+ ) -> _NdArraySubClass: ...
+ @overload
+ def mean(
+ self,
+ axis: None = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ keepdims: Literal[False] = ...,
+ ) -> number: ...
+ @overload
+ def mean(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ keepdims: bool = ...,
+ ) -> Union[number, ndarray]: ...
+ @overload
+ def mean(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: _NdArraySubClass = ...,
+ keepdims: bool = ...,
+ ) -> _NdArraySubClass: ...
+ @overload
+ def min(
+ self,
+ axis: None = ...,
+ out: None = ...,
+ keepdims: Literal[False] = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+ ) -> number: ...
+ @overload
+ def min(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ out: None = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+ ) -> Union[number, ndarray]: ...
+ @overload
+ def min(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ out: _NdArraySubClass = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+ ) -> _NdArraySubClass: ...
+ def newbyteorder(self: _ArraySelf, __new_order: _ByteOrder = ...) -> _ArraySelf: ...
+ @overload
+ def prod(
+ self,
+ axis: None = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ keepdims: Literal[False] = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+ ) -> number: ...
+ @overload
+ def prod(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+ ) -> Union[number, ndarray]: ...
+ @overload
+ def prod(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: _NdArraySubClass = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+ ) -> _NdArraySubClass: ...
+ @overload
+ def ptp(
+ self, axis: None = ..., out: None = ..., keepdims: Literal[False] = ...,
+ ) -> number: ...
+ @overload
+ def ptp(
+ self, axis: Optional[_ShapeLike] = ..., out: None = ..., keepdims: bool = ...,
+ ) -> Union[number, ndarray]: ...
+ @overload
+ def ptp(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ out: _NdArraySubClass = ...,
+ keepdims: bool = ...,
+ ) -> _NdArraySubClass: ...
+ def repeat(
+ self, repeats: _ArrayLikeIntOrBool, axis: Optional[int] = ...
+ ) -> ndarray: ...
+ @overload
+ def round(self: _ArraySelf, decimals: int = ..., out: None = ...) -> _ArraySelf: ...
+ @overload
+ def round(
+ self, decimals: int = ..., out: _NdArraySubClass = ...
+ ) -> _NdArraySubClass: ...
+ @overload
+ def std(
+ self,
+ axis: None = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ ddof: int = ...,
+ keepdims: Literal[False] = ...,
+ ) -> number: ...
+ @overload
+ def std(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ ddof: int = ...,
+ keepdims: bool = ...,
+ ) -> Union[number, ndarray]: ...
+ @overload
+ def std(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: _NdArraySubClass = ...,
+ ddof: int = ...,
+ keepdims: bool = ...,
+ ) -> _NdArraySubClass: ...
+ @overload
+ def sum(
+ self,
+ axis: None = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ keepdims: Literal[False] = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+ ) -> number: ...
+ @overload
+ def sum(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+ ) -> Union[number, ndarray]: ...
+ @overload
+ def sum(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: _NdArraySubClass = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+ ) -> _NdArraySubClass: ...
+ @overload
+ def take(
+ self,
+ indices: _IntLike,
+ axis: Optional[int] = ...,
+ out: None = ...,
+ mode: _ModeKind = ...,
+ ) -> generic: ...
+ @overload
+ def take(
+ self,
+ indices: _ArrayLikeIntOrBool,
+ axis: Optional[int] = ...,
+ out: None = ...,
+ mode: _ModeKind = ...,
+ ) -> ndarray: ...
+ @overload
+ def take(
+ self,
+ indices: _ArrayLikeIntOrBool,
+ axis: Optional[int] = ...,
+ out: _NdArraySubClass = ...,
+ mode: _ModeKind = ...,
+ ) -> _NdArraySubClass: ...
+ @overload
+ def var(
+ self,
+ axis: None = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ ddof: int = ...,
+ keepdims: Literal[False] = ...,
+ ) -> number: ...
+ @overload
+ def var(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ ddof: int = ...,
+ keepdims: bool = ...,
+ ) -> Union[number, ndarray]: ...
+ @overload
+ def var(
+ self,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: _NdArraySubClass = ...,
+ ddof: int = ...,
+ keepdims: bool = ...,
+ ) -> _NdArraySubClass: ...
+
+_DType = TypeVar("_DType", bound=dtype[Any])
+
+# TODO: Set the `bound` to something more suitable once we
+# have proper shape support
+_ShapeType = TypeVar("_ShapeType", bound=Any)
_BufferType = Union[ndarray, bytes, bytearray, memoryview]
+_Casting = Literal["no", "equiv", "safe", "same_kind", "unsafe"]
-class ndarray(_ArrayOrScalarCommon, Iterable, Sized, Container):
+class ndarray(_ArrayOrScalarCommon, Generic[_ShapeType, _DType]):
+ @property
+ def base(self) -> Optional[ndarray]: ...
+ @property
+ def ndim(self) -> int: ...
+ @property
+ def size(self) -> int: ...
@property
def real(self: _ArraySelf) -> _ArraySelf: ...
@real.setter
@@ -272,14 +1467,16 @@ class ndarray(_ArrayOrScalarCommon, Iterable, Sized, Container):
def __new__(
cls: Type[_ArraySelf],
shape: Sequence[int],
- dtype: DtypeLike = ...,
+ dtype: DTypeLike = ...,
buffer: _BufferType = ...,
offset: int = ...,
strides: _ShapeLike = ...,
- order: Optional[str] = ...,
+ order: _OrderKACF = ...,
) -> _ArraySelf: ...
- @property
- def dtype(self) -> _Dtype: ...
+ @overload
+ def __array__(self, __dtype: None = ...) -> ndarray[Any, _DType]: ...
+ @overload
+ def __array__(self, __dtype: DTypeLike) -> ndarray[Any, dtype[Any]]: ...
@property
def ctypes(self) -> _ctypes: ...
@property
@@ -287,82 +1484,161 @@ class ndarray(_ArrayOrScalarCommon, Iterable, Sized, Container):
@shape.setter
def shape(self, value: _ShapeLike): ...
@property
- def flat(self: _ArraySelf) -> flatiter[_ArraySelf]: ...
- @property
def strides(self) -> _Shape: ...
@strides.setter
def strides(self, value: _ShapeLike): ...
- # Array conversion
+ def byteswap(self: _ArraySelf, inplace: bool = ...) -> _ArraySelf: ...
+ def fill(self, value: Any) -> None: ...
+ @property
+ def flat(self: _NdArraySubClass) -> flatiter[_NdArraySubClass]: ...
@overload
def item(self, *args: int) -> Any: ...
@overload
- def item(self, args: Tuple[int, ...]) -> Any: ...
- def tolist(self) -> List[Any]: ...
+ def item(self, __args: Tuple[int, ...]) -> Any: ...
@overload
def itemset(self, __value: Any) -> None: ...
@overload
def itemset(self, __item: _ShapeLike, __value: Any) -> None: ...
- def tobytes(self, order: Optional[str] = ...) -> bytes: ...
- def tofile(
- self, fid: Union[IO[bytes], str], sep: str = ..., format: str = ...
- ) -> None: ...
- def dump(self, file: str) -> None: ...
- def dumps(self) -> bytes: ...
- def astype(
- self: _ArraySelf,
- dtype: DtypeLike,
- order: str = ...,
- casting: str = ...,
- subok: bool = ...,
- copy: bool = ...,
- ) -> _ArraySelf: ...
- def byteswap(self: _ArraySelf, inplace: bool = ...) -> _ArraySelf: ...
- def copy(self: _ArraySelf, order: str = ...) -> _ArraySelf: ...
- @overload
- def view(self, type: Type[_NdArraySubClass]) -> _NdArraySubClass: ...
@overload
- def view(self: _ArraySelf, dtype: DtypeLike = ...) -> _ArraySelf: ...
+ def resize(self, __new_shape: Sequence[int], *, refcheck: bool = ...) -> None: ...
@overload
- def view(
- self, dtype: DtypeLike, type: Type[_NdArraySubClass]
- ) -> _NdArraySubClass: ...
- def getfield(
- self: _ArraySelf, dtype: DtypeLike, offset: int = ...
- ) -> _ArraySelf: ...
+ def resize(self, *new_shape: int, refcheck: bool = ...) -> None: ...
def setflags(
self, write: bool = ..., align: bool = ..., uic: bool = ...
) -> None: ...
- def fill(self, value: Any) -> None: ...
- # Shape manipulation
- @overload
- def reshape(
- self: _ArraySelf, shape: Sequence[int], *, order: str = ...
+ def squeeze(
+ self: _ArraySelf, axis: Union[int, Tuple[int, ...]] = ...
) -> _ArraySelf: ...
+ def swapaxes(self: _ArraySelf, axis1: int, axis2: int) -> _ArraySelf: ...
@overload
- def reshape(self: _ArraySelf, *shape: int, order: str = ...) -> _ArraySelf: ...
+ def transpose(self: _ArraySelf, __axes: Sequence[int]) -> _ArraySelf: ...
@overload
- def resize(self, new_shape: Sequence[int], *, refcheck: bool = ...) -> None: ...
+ def transpose(self: _ArraySelf, *axes: int) -> _ArraySelf: ...
+ def argpartition(
+ self,
+ kth: _ArrayLikeIntOrBool,
+ axis: Optional[int] = ...,
+ kind: _PartitionKind = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+ ) -> ndarray: ...
+ def diagonal(
+ self: _ArraySelf, offset: int = ..., axis1: int = ..., axis2: int = ...
+ ) -> _ArraySelf: ...
@overload
- def resize(self, *new_shape: int, refcheck: bool = ...) -> None: ...
+ def dot(self, b: ArrayLike, out: None = ...) -> Union[number, ndarray]: ...
+ @overload
+ def dot(self, b: ArrayLike, out: _NdArraySubClass = ...) -> _NdArraySubClass: ...
+ # `nonzero()` is deprecated for 0d arrays/generics
+ def nonzero(self) -> Tuple[ndarray, ...]: ...
+ def partition(
+ self,
+ kth: _ArrayLikeIntOrBool,
+ axis: int = ...,
+ kind: _PartitionKind = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+ ) -> None: ...
+ # `put` is technically available to `generic`,
+ # but is pointless as `generic`s are immutable
+ def put(
+ self, ind: _ArrayLikeIntOrBool, v: ArrayLike, mode: _ModeKind = ...
+ ) -> None: ...
+ def searchsorted(
+ self, # >= 1D array
+ v: ArrayLike,
+ side: _SortSide = ...,
+ sorter: Optional[_ArrayLikeIntOrBool] = ..., # 1D int array
+ ) -> ndarray: ...
+ def setfield(
+ self, val: ArrayLike, dtype: DTypeLike, offset: int = ...
+ ) -> None: ...
+ def sort(
+ self,
+ axis: int = ...,
+ kind: Optional[_SortKind] = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+ ) -> None: ...
@overload
- def transpose(self: _ArraySelf, axes: Sequence[int]) -> _ArraySelf: ...
+ def trace(
+ self, # >= 2D array
+ offset: int = ...,
+ axis1: int = ...,
+ axis2: int = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ ) -> Union[number, ndarray]: ...
@overload
- def transpose(self: _ArraySelf, *axes: int) -> _ArraySelf: ...
- def swapaxes(self: _ArraySelf, axis1: int, axis2: int) -> _ArraySelf: ...
- def flatten(self: _ArraySelf, order: str = ...) -> _ArraySelf: ...
- def ravel(self: _ArraySelf, order: str = ...) -> _ArraySelf: ...
- def squeeze(
- self: _ArraySelf, axis: Union[int, Tuple[int, ...]] = ...
- ) -> _ArraySelf: ...
+ def trace(
+ self, # >= 2D array
+ offset: int = ...,
+ axis1: int = ...,
+ axis2: int = ...,
+ dtype: DTypeLike = ...,
+ out: _NdArraySubClass = ...,
+ ) -> _NdArraySubClass: ...
# Many of these special methods are irrelevant currently, since protocols
# aren't supported yet. That said, I'm adding them for completeness.
# https://docs.python.org/3/reference/datamodel.html
+ def __int__(self) -> int: ...
+ def __float__(self) -> float: ...
+ def __complex__(self) -> complex: ...
def __len__(self) -> int: ...
- def __getitem__(self, key) -> Any: ...
def __setitem__(self, key, value): ...
def __iter__(self) -> Any: ...
def __contains__(self, key) -> bool: ...
def __index__(self) -> int: ...
+ def __lt__(self, other: ArrayLike) -> Union[ndarray, bool_]: ...
+ def __le__(self, other: ArrayLike) -> Union[ndarray, bool_]: ...
+ def __gt__(self, other: ArrayLike) -> Union[ndarray, bool_]: ...
+ def __ge__(self, other: ArrayLike) -> Union[ndarray, bool_]: ...
+ def __matmul__(self, other: ArrayLike) -> Any: ...
+ # NOTE: `ndarray` does not implement `__imatmul__`
+ def __rmatmul__(self, other: ArrayLike) -> Any: ...
+ def __neg__(self: _ArraySelf) -> Any: ...
+ def __pos__(self: _ArraySelf) -> Any: ...
+ def __abs__(self: _ArraySelf) -> Any: ...
+ def __mod__(self, other: ArrayLike) -> Any: ...
+ def __rmod__(self, other: ArrayLike) -> Any: ...
+ def __divmod__(self, other: ArrayLike) -> Tuple[Any, Any]: ...
+ def __rdivmod__(self, other: ArrayLike) -> Tuple[Any, Any]: ...
+ def __add__(self, other: ArrayLike) -> Any: ...
+ def __radd__(self, other: ArrayLike) -> Any: ...
+ def __sub__(self, other: ArrayLike) -> Any: ...
+ def __rsub__(self, other: ArrayLike) -> Any: ...
+ def __mul__(self, other: ArrayLike) -> Any: ...
+ def __rmul__(self, other: ArrayLike) -> Any: ...
+ def __floordiv__(self, other: ArrayLike) -> Any: ...
+ def __rfloordiv__(self, other: ArrayLike) -> Any: ...
+ def __pow__(self, other: ArrayLike) -> Any: ...
+ def __rpow__(self, other: ArrayLike) -> Any: ...
+ def __truediv__(self, other: ArrayLike) -> Any: ...
+ def __rtruediv__(self, other: ArrayLike) -> Any: ...
+ def __invert__(self: _ArraySelf) -> Any: ...
+ def __lshift__(self, other: ArrayLike) -> Any: ...
+ def __rlshift__(self, other: ArrayLike) -> Any: ...
+ def __rshift__(self, other: ArrayLike) -> Any: ...
+ def __rrshift__(self, other: ArrayLike) -> Any: ...
+ def __and__(self, other: ArrayLike) -> Any: ...
+ def __rand__(self, other: ArrayLike) -> Any: ...
+ def __xor__(self, other: ArrayLike) -> Any: ...
+ def __rxor__(self, other: ArrayLike) -> Any: ...
+ def __or__(self, other: ArrayLike) -> Any: ...
+ def __ror__(self, other: ArrayLike) -> Any: ...
+ # `np.generic` does not support inplace operations
+ def __iadd__(self: _ArraySelf, other: ArrayLike) -> _ArraySelf: ...
+ def __isub__(self: _ArraySelf, other: ArrayLike) -> _ArraySelf: ...
+ def __imul__(self: _ArraySelf, other: ArrayLike) -> _ArraySelf: ...
+ def __itruediv__(self: _ArraySelf, other: ArrayLike) -> _ArraySelf: ...
+ def __ifloordiv__(self: _ArraySelf, other: ArrayLike) -> _ArraySelf: ...
+ def __ipow__(self: _ArraySelf, other: ArrayLike) -> _ArraySelf: ...
+ def __imod__(self: _ArraySelf, other: ArrayLike) -> _ArraySelf: ...
+ def __ilshift__(self: _ArraySelf, other: ArrayLike) -> _ArraySelf: ...
+ def __irshift__(self: _ArraySelf, other: ArrayLike) -> _ArraySelf: ...
+ def __iand__(self: _ArraySelf, other: ArrayLike) -> _ArraySelf: ...
+ def __ixor__(self: _ArraySelf, other: ArrayLike) -> _ArraySelf: ...
+ def __ior__(self: _ArraySelf, other: ArrayLike) -> _ArraySelf: ...
+ # Keep `dtype` at the bottom to avoid name conflicts with `np.dtype`
+ @property
+ def dtype(self) -> _DType: ...
# NOTE: while `np.generic` is not technically an instance of `ABCMeta`,
# the `@abstractmethod` decorator is herein used to (forcefully) deny
@@ -372,142 +1648,364 @@ class ndarray(_ArrayOrScalarCommon, Iterable, Sized, Container):
# See https://github.com/numpy/numpy-stubs/pull/80 for more details.
+_ScalarType = TypeVar("_ScalarType", bound=generic)
+_NBit_co = TypeVar("_NBit_co", covariant=True, bound=NBitBase)
+_NBit_co2 = TypeVar("_NBit_co2", covariant=True, bound=NBitBase)
+
class generic(_ArrayOrScalarCommon):
@abstractmethod
def __init__(self, *args: Any, **kwargs: Any) -> None: ...
+ @overload
+ def __array__(self: _ScalarType, __dtype: None = ...) -> ndarray[Any, dtype[_ScalarType]]: ...
+ @overload
+ def __array__(self, __dtype: DTypeLike) -> ndarray[Any, dtype[Any]]: ...
@property
def base(self) -> None: ...
+ @property
+ def ndim(self) -> Literal[0]: ...
+ @property
+ def size(self) -> Literal[1]: ...
+ @property
+ def shape(self) -> Tuple[()]: ...
+ @property
+ def strides(self) -> Tuple[()]: ...
+ def byteswap(self: _ScalarType, inplace: Literal[False] = ...) -> _ScalarType: ...
+ @property
+ def flat(self: _ScalarType) -> flatiter[ndarray[Any, dtype[_ScalarType]]]: ...
+ def item(
+ self: _ScalarType,
+ __args: Union[Literal[0], Tuple[()], Tuple[Literal[0]]] = ...,
+ ) -> Any: ...
+ def squeeze(
+ self: _ScalarType, axis: Union[Literal[0], Tuple[()]] = ...
+ ) -> _ScalarType: ...
+ def transpose(self: _ScalarType, __axes: Tuple[()] = ...) -> _ScalarType: ...
+ # Keep `dtype` at the bottom to avoid name conflicts with `np.dtype`
+ @property
+ def dtype(self: _ScalarType) -> dtype[_ScalarType]: ...
-class _real_generic(generic): # type: ignore
+class number(generic, Generic[_NBit_co]): # type: ignore
@property
def real(self: _ArraySelf) -> _ArraySelf: ...
@property
def imag(self: _ArraySelf) -> _ArraySelf: ...
-
-class number(generic): ... # type: ignore
-
-class bool_(_real_generic):
+ def __int__(self) -> int: ...
+ def __float__(self) -> float: ...
+ def __complex__(self) -> complex: ...
+ def __neg__(self: _ArraySelf) -> _ArraySelf: ...
+ def __pos__(self: _ArraySelf) -> _ArraySelf: ...
+ def __abs__(self: _ArraySelf) -> _ArraySelf: ...
+ # Ensure that objects annotated as `number` support arithmetic operations
+ __add__: _NumberOp
+ __radd__: _NumberOp
+ __sub__: _NumberOp
+ __rsub__: _NumberOp
+ __mul__: _NumberOp
+ __rmul__: _NumberOp
+ __floordiv__: _NumberOp
+ __rfloordiv__: _NumberOp
+ __pow__: _NumberOp
+ __rpow__: _NumberOp
+ __truediv__: _NumberOp
+ __rtruediv__: _NumberOp
+ __lt__: _ComparisonOp[_NumberLike]
+ __le__: _ComparisonOp[_NumberLike]
+ __gt__: _ComparisonOp[_NumberLike]
+ __ge__: _ComparisonOp[_NumberLike]
+
+class bool_(generic):
def __init__(self, __value: object = ...) -> None: ...
+ @property
+ def real(self: _ArraySelf) -> _ArraySelf: ...
+ @property
+ def imag(self: _ArraySelf) -> _ArraySelf: ...
+ def __int__(self) -> int: ...
+ def __float__(self) -> float: ...
+ def __complex__(self) -> complex: ...
+ def __abs__(self: _ArraySelf) -> _ArraySelf: ...
+ __add__: _BoolOp[bool_]
+ __radd__: _BoolOp[bool_]
+ __sub__: _BoolSub
+ __rsub__: _BoolSub
+ __mul__: _BoolOp[bool_]
+ __rmul__: _BoolOp[bool_]
+ __floordiv__: _BoolOp[int8]
+ __rfloordiv__: _BoolOp[int8]
+ __pow__: _BoolOp[int8]
+ __rpow__: _BoolOp[int8]
+ __truediv__: _BoolTrueDiv
+ __rtruediv__: _BoolTrueDiv
+ def __invert__(self) -> bool_: ...
+ __lshift__: _BoolBitOp[int8]
+ __rlshift__: _BoolBitOp[int8]
+ __rshift__: _BoolBitOp[int8]
+ __rrshift__: _BoolBitOp[int8]
+ __and__: _BoolBitOp[bool_]
+ __rand__: _BoolBitOp[bool_]
+ __xor__: _BoolBitOp[bool_]
+ __rxor__: _BoolBitOp[bool_]
+ __or__: _BoolBitOp[bool_]
+ __ror__: _BoolBitOp[bool_]
+ __mod__: _BoolMod
+ __rmod__: _BoolMod
+ __divmod__: _BoolDivMod
+ __rdivmod__: _BoolDivMod
+ __lt__: _ComparisonOp[_NumberLike]
+ __le__: _ComparisonOp[_NumberLike]
+ __gt__: _ComparisonOp[_NumberLike]
+ __ge__: _ComparisonOp[_NumberLike]
class object_(generic):
def __init__(self, __value: object = ...) -> None: ...
+ @property
+ def real(self: _ArraySelf) -> _ArraySelf: ...
+ @property
+ def imag(self: _ArraySelf) -> _ArraySelf: ...
-class datetime64:
+class datetime64(generic):
@overload
def __init__(
self,
- __value: Union[None, datetime64, str, dt.datetime] = ...,
- __format: str = ...
+ __value: Union[None, datetime64, _CharLike, dt.datetime] = ...,
+ __format: Union[_CharLike, Tuple[_CharLike, _IntLike]] = ...,
) -> None: ...
@overload
- def __init__(self, __value: int, __format: str) -> None: ...
- def __add__(self, other: Union[timedelta64, int]) -> datetime64: ...
- def __sub__(self, other: Union[timedelta64, datetime64, int]) -> timedelta64: ...
-
-class integer(number, _real_generic): ... # type: ignore
-class signedinteger(integer): ... # type: ignore
-
-class int8(signedinteger):
- def __init__(self, __value: SupportsInt = ...) -> None: ...
-
-class int16(signedinteger):
- def __init__(self, __value: SupportsInt = ...) -> None: ...
-
-class int32(signedinteger):
- def __init__(self, __value: SupportsInt = ...) -> None: ...
-
-class int64(signedinteger):
- def __init__(self, __value: SupportsInt = ...) -> None: ...
-
-class timedelta64(signedinteger):
- def __init__(self, __value: Any = ..., __format: str = ...) -> None: ...
- @overload
- def __add__(self, other: Union[timedelta64, int]) -> timedelta64: ...
- @overload
- def __add__(self, other: datetime64) -> datetime64: ...
- def __sub__(self, other: Union[timedelta64, int]) -> timedelta64: ...
- if sys.version_info[0] < 3:
- @overload
- def __div__(self, other: timedelta64) -> float: ...
- @overload
- def __div__(self, other: float) -> timedelta64: ...
- @overload
- def __truediv__(self, other: timedelta64) -> float: ...
- @overload
- def __truediv__(self, other: float) -> timedelta64: ...
- def __mod__(self, other: timedelta64) -> timedelta64: ...
-
-class unsignedinteger(integer): ... # type: ignore
-
-class uint8(unsignedinteger):
- def __init__(self, __value: SupportsInt = ...) -> None: ...
-
-class uint16(unsignedinteger):
- def __init__(self, __value: SupportsInt = ...) -> None: ...
-
-class uint32(unsignedinteger):
- def __init__(self, __value: SupportsInt = ...) -> None: ...
-
-class uint64(unsignedinteger):
- def __init__(self, __value: SupportsInt = ...) -> None: ...
-
-class inexact(number): ... # type: ignore
-class floating(inexact, _real_generic): ... # type: ignore
-
-class float16(floating):
- def __init__(self, __value: Optional[SupportsFloat] = ...) -> None: ...
-
-class float32(floating):
- def __init__(self, __value: Optional[SupportsFloat] = ...) -> None: ...
-
-class float64(floating):
- def __init__(self, __value: Optional[SupportsFloat] = ...) -> None: ...
-
-class complexfloating(inexact): ... # type: ignore
-
-class complex64(complexfloating):
def __init__(
self,
- __value: Union[None, SupportsInt, SupportsFloat, SupportsComplex] = ...
+ __value: int,
+ __format: Union[_CharLike, Tuple[_CharLike, _IntLike]]
) -> None: ...
- @property
- def real(self) -> float32: ...
- @property
- def imag(self) -> float32: ...
+ def __add__(self, other: _TD64Like) -> datetime64: ...
+ def __radd__(self, other: _TD64Like) -> datetime64: ...
+ @overload
+ def __sub__(self, other: datetime64) -> timedelta64: ...
+ @overload
+ def __sub__(self, other: _TD64Like) -> datetime64: ...
+ def __rsub__(self, other: datetime64) -> timedelta64: ...
+ __lt__: _ComparisonOp[datetime64]
+ __le__: _ComparisonOp[datetime64]
+ __gt__: _ComparisonOp[datetime64]
+ __ge__: _ComparisonOp[datetime64]
+
+# Support for `__index__` was added in python 3.8 (bpo-20092)
+if sys.version_info >= (3, 8):
+ _IntValue = Union[SupportsInt, _CharLike, SupportsIndex]
+ _FloatValue = Union[None, _CharLike, SupportsFloat, SupportsIndex]
+ _ComplexValue = Union[None, _CharLike, SupportsFloat, SupportsComplex, SupportsIndex]
+else:
+ _IntValue = Union[SupportsInt, _CharLike]
+ _FloatValue = Union[None, _CharLike, SupportsFloat]
+ _ComplexValue = Union[None, _CharLike, SupportsFloat, SupportsComplex]
-class complex128(complexfloating):
+class integer(number[_NBit_co]): # type: ignore
+ # NOTE: `__index__` is technically defined in the bottom-most
+ # sub-classes (`int64`, `uint32`, etc)
+ def __index__(self) -> int: ...
+ __truediv__: _IntTrueDiv[_NBit_co]
+ __rtruediv__: _IntTrueDiv[_NBit_co]
+ def __mod__(self, value: _IntLike) -> integer: ...
+ def __rmod__(self, value: _IntLike) -> integer: ...
+ def __invert__(self: _IntType) -> _IntType: ...
+ # Ensure that objects annotated as `integer` support bit-wise operations
+ def __lshift__(self, other: _IntLike) -> integer: ...
+ def __rlshift__(self, other: _IntLike) -> integer: ...
+ def __rshift__(self, other: _IntLike) -> integer: ...
+ def __rrshift__(self, other: _IntLike) -> integer: ...
+ def __and__(self, other: _IntLike) -> integer: ...
+ def __rand__(self, other: _IntLike) -> integer: ...
+ def __or__(self, other: _IntLike) -> integer: ...
+ def __ror__(self, other: _IntLike) -> integer: ...
+ def __xor__(self, other: _IntLike) -> integer: ...
+ def __rxor__(self, other: _IntLike) -> integer: ...
+
+class signedinteger(integer[_NBit_co]):
+ def __init__(self, __value: _IntValue = ...) -> None: ...
+ __add__: _SignedIntOp[_NBit_co]
+ __radd__: _SignedIntOp[_NBit_co]
+ __sub__: _SignedIntOp[_NBit_co]
+ __rsub__: _SignedIntOp[_NBit_co]
+ __mul__: _SignedIntOp[_NBit_co]
+ __rmul__: _SignedIntOp[_NBit_co]
+ __floordiv__: _SignedIntOp[_NBit_co]
+ __rfloordiv__: _SignedIntOp[_NBit_co]
+ __pow__: _SignedIntOp[_NBit_co]
+ __rpow__: _SignedIntOp[_NBit_co]
+ __lshift__: _SignedIntBitOp[_NBit_co]
+ __rlshift__: _SignedIntBitOp[_NBit_co]
+ __rshift__: _SignedIntBitOp[_NBit_co]
+ __rrshift__: _SignedIntBitOp[_NBit_co]
+ __and__: _SignedIntBitOp[_NBit_co]
+ __rand__: _SignedIntBitOp[_NBit_co]
+ __xor__: _SignedIntBitOp[_NBit_co]
+ __rxor__: _SignedIntBitOp[_NBit_co]
+ __or__: _SignedIntBitOp[_NBit_co]
+ __ror__: _SignedIntBitOp[_NBit_co]
+ __mod__: _SignedIntMod[_NBit_co]
+ __rmod__: _SignedIntMod[_NBit_co]
+ __divmod__: _SignedIntDivMod[_NBit_co]
+ __rdivmod__: _SignedIntDivMod[_NBit_co]
+
+int8 = signedinteger[_8Bit]
+int16 = signedinteger[_16Bit]
+int32 = signedinteger[_32Bit]
+int64 = signedinteger[_64Bit]
+
+class timedelta64(generic):
def __init__(
self,
- __value: Union[None, SupportsInt, SupportsFloat, SupportsComplex] = ...
+ __value: Union[None, int, _CharLike, dt.timedelta, timedelta64] = ...,
+ __format: Union[_CharLike, Tuple[_CharLike, _IntLike]] = ...,
) -> None: ...
+ def __int__(self) -> int: ...
+ def __float__(self) -> float: ...
+ def __complex__(self) -> complex: ...
+ def __neg__(self: _ArraySelf) -> _ArraySelf: ...
+ def __pos__(self: _ArraySelf) -> _ArraySelf: ...
+ def __abs__(self: _ArraySelf) -> _ArraySelf: ...
+ def __add__(self, other: _TD64Like) -> timedelta64: ...
+ def __radd__(self, other: _TD64Like) -> timedelta64: ...
+ def __sub__(self, other: _TD64Like) -> timedelta64: ...
+ def __rsub__(self, other: _TD64Like) -> timedelta64: ...
+ def __mul__(self, other: _FloatLike) -> timedelta64: ...
+ def __rmul__(self, other: _FloatLike) -> timedelta64: ...
+ __truediv__: _TD64Div[float64]
+ __floordiv__: _TD64Div[int64]
+ def __rtruediv__(self, other: timedelta64) -> float64: ...
+ def __rfloordiv__(self, other: timedelta64) -> int64: ...
+ def __mod__(self, other: timedelta64) -> timedelta64: ...
+ def __rmod__(self, other: timedelta64) -> timedelta64: ...
+ def __divmod__(self, other: timedelta64) -> Tuple[int64, timedelta64]: ...
+ def __rdivmod__(self, other: timedelta64) -> Tuple[int64, timedelta64]: ...
+ __lt__: _ComparisonOp[Union[timedelta64, _IntLike, _BoolLike]]
+ __le__: _ComparisonOp[Union[timedelta64, _IntLike, _BoolLike]]
+ __gt__: _ComparisonOp[Union[timedelta64, _IntLike, _BoolLike]]
+ __ge__: _ComparisonOp[Union[timedelta64, _IntLike, _BoolLike]]
+
+class unsignedinteger(integer[_NBit_co]):
+ # NOTE: `uint64 + signedinteger -> float64`
+ def __init__(self, __value: _IntValue = ...) -> None: ...
+ __add__: _UnsignedIntOp[_NBit_co]
+ __radd__: _UnsignedIntOp[_NBit_co]
+ __sub__: _UnsignedIntOp[_NBit_co]
+ __rsub__: _UnsignedIntOp[_NBit_co]
+ __mul__: _UnsignedIntOp[_NBit_co]
+ __rmul__: _UnsignedIntOp[_NBit_co]
+ __floordiv__: _UnsignedIntOp[_NBit_co]
+ __rfloordiv__: _UnsignedIntOp[_NBit_co]
+ __pow__: _UnsignedIntOp[_NBit_co]
+ __rpow__: _UnsignedIntOp[_NBit_co]
+ __lshift__: _UnsignedIntBitOp[_NBit_co]
+ __rlshift__: _UnsignedIntBitOp[_NBit_co]
+ __rshift__: _UnsignedIntBitOp[_NBit_co]
+ __rrshift__: _UnsignedIntBitOp[_NBit_co]
+ __and__: _UnsignedIntBitOp[_NBit_co]
+ __rand__: _UnsignedIntBitOp[_NBit_co]
+ __xor__: _UnsignedIntBitOp[_NBit_co]
+ __rxor__: _UnsignedIntBitOp[_NBit_co]
+ __or__: _UnsignedIntBitOp[_NBit_co]
+ __ror__: _UnsignedIntBitOp[_NBit_co]
+ __mod__: _UnsignedIntMod[_NBit_co]
+ __rmod__: _UnsignedIntMod[_NBit_co]
+ __divmod__: _UnsignedIntDivMod[_NBit_co]
+ __rdivmod__: _UnsignedIntDivMod[_NBit_co]
+
+uint8 = unsignedinteger[_8Bit]
+uint16 = unsignedinteger[_16Bit]
+uint32 = unsignedinteger[_32Bit]
+uint64 = unsignedinteger[_64Bit]
+
+class inexact(number[_NBit_co]): ... # type: ignore
+
+_IntType = TypeVar("_IntType", bound=integer)
+_FloatType = TypeVar('_FloatType', bound=floating)
+
+class floating(inexact[_NBit_co]):
+ def __init__(self, __value: _FloatValue = ...) -> None: ...
+ __add__: _FloatOp[_NBit_co]
+ __radd__: _FloatOp[_NBit_co]
+ __sub__: _FloatOp[_NBit_co]
+ __rsub__: _FloatOp[_NBit_co]
+ __mul__: _FloatOp[_NBit_co]
+ __rmul__: _FloatOp[_NBit_co]
+ __truediv__: _FloatOp[_NBit_co]
+ __rtruediv__: _FloatOp[_NBit_co]
+ __floordiv__: _FloatOp[_NBit_co]
+ __rfloordiv__: _FloatOp[_NBit_co]
+ __pow__: _FloatOp[_NBit_co]
+ __rpow__: _FloatOp[_NBit_co]
+ __mod__: _FloatMod[_NBit_co]
+ __rmod__: _FloatMod[_NBit_co]
+ __divmod__: _FloatDivMod[_NBit_co]
+ __rdivmod__: _FloatDivMod[_NBit_co]
+
+float16 = floating[_16Bit]
+float32 = floating[_32Bit]
+float64 = floating[_64Bit]
+
+# The main reason for `complexfloating` having two typevars is cosmetic.
+# It is used to clarify why `complex128`s precision is `_64Bit`, the latter
+# describing the two 64 bit floats representing its real and imaginary component
+
+class complexfloating(inexact[_NBit_co], Generic[_NBit_co, _NBit_co2]):
+ def __init__(self, __value: _ComplexValue = ...) -> None: ...
+ @property
+ def real(self) -> floating[_NBit_co]: ... # type: ignore[override]
+ @property
+ def imag(self) -> floating[_NBit_co2]: ... # type: ignore[override]
+ def __abs__(self) -> floating[_NBit_co]: ... # type: ignore[override]
+ __add__: _ComplexOp[_NBit_co]
+ __radd__: _ComplexOp[_NBit_co]
+ __sub__: _ComplexOp[_NBit_co]
+ __rsub__: _ComplexOp[_NBit_co]
+ __mul__: _ComplexOp[_NBit_co]
+ __rmul__: _ComplexOp[_NBit_co]
+ __truediv__: _ComplexOp[_NBit_co]
+ __rtruediv__: _ComplexOp[_NBit_co]
+ __floordiv__: _ComplexOp[_NBit_co]
+ __rfloordiv__: _ComplexOp[_NBit_co]
+ __pow__: _ComplexOp[_NBit_co]
+ __rpow__: _ComplexOp[_NBit_co]
+
+complex64 = complexfloating[_32Bit, _32Bit]
+complex128 = complexfloating[_64Bit, _64Bit]
+
+class flexible(generic): ... # type: ignore
+
+class void(flexible):
+ def __init__(self, __value: Union[_IntLike, bytes]): ...
@property
- def real(self) -> float64: ...
+ def real(self: _ArraySelf) -> _ArraySelf: ...
@property
- def imag(self) -> float64: ...
-
-class flexible(_real_generic): ... # type: ignore
+ def imag(self: _ArraySelf) -> _ArraySelf: ...
+ def setfield(
+ self, val: ArrayLike, dtype: DTypeLike, offset: int = ...
+ ) -> None: ...
-class void(flexible):
- def __init__(self, __value: Union[int, integer, bool_, bytes, bytes_]): ...
+class character(flexible): # type: ignore
+ def __int__(self) -> int: ...
+ def __float__(self) -> float: ...
-class character(_real_generic): ... # type: ignore
+# NOTE: Most `np.bytes_` / `np.str_` methods return their
+# builtin `bytes` / `str` counterpart
-class bytes_(character):
+class bytes_(character, bytes):
@overload
def __init__(self, __value: object = ...) -> None: ...
@overload
def __init__(
- self, __value: Union[str, str_], encoding: str = ..., errors: str = ...
+ self, __value: str, encoding: str = ..., errors: str = ...
) -> None: ...
-class str_(character):
+class str_(character, str):
@overload
def __init__(self, __value: object = ...) -> None: ...
@overload
def __init__(
- self, __value: Union[bytes, bytes_], encoding: str = ..., errors: str = ...
+ self, __value: bytes, encoding: str = ..., errors: str = ...
) -> None: ...
+unicode_ = str0 = str_
+
# TODO(alan): Platform dependent types
# longcomplex, longdouble, longfloat
# bytes, short, intc, intp, longlong
@@ -518,158 +2016,81 @@ class str_(character):
def array(
object: object,
- dtype: DtypeLike = ...,
+ dtype: DTypeLike = ...,
+ *,
copy: bool = ...,
+ order: _OrderKACF = ...,
subok: bool = ...,
ndmin: int = ...,
+ like: ArrayLike = ...,
) -> ndarray: ...
def zeros(
- shape: _ShapeLike, dtype: DtypeLike = ..., order: Optional[str] = ...
-) -> ndarray: ...
-def ones(
- shape: _ShapeLike, dtype: DtypeLike = ..., order: Optional[str] = ...
+ shape: _ShapeLike,
+ dtype: DTypeLike = ...,
+ order: _OrderCF = ...,
+ *,
+ like: ArrayLike = ...,
) -> ndarray: ...
def empty(
- shape: _ShapeLike, dtype: DtypeLike = ..., order: Optional[str] = ...
-) -> ndarray: ...
-def zeros_like(
- a: ArrayLike,
- dtype: DtypeLike = ...,
- order: str = ...,
- subok: bool = ...,
- shape: Optional[Union[int, Sequence[int]]] = ...,
-) -> ndarray: ...
-def ones_like(
- a: ArrayLike,
- dtype: DtypeLike = ...,
- order: str = ...,
- subok: bool = ...,
- shape: Optional[_ShapeLike] = ...,
-) -> ndarray: ...
-def empty_like(
- a: ArrayLike,
- dtype: DtypeLike = ...,
- order: str = ...,
- subok: bool = ...,
- shape: Optional[_ShapeLike] = ...,
-) -> ndarray: ...
-def full(
- shape: _ShapeLike, fill_value: Any, dtype: DtypeLike = ..., order: str = ...
-) -> ndarray: ...
-def full_like(
- a: ArrayLike,
- fill_value: Any,
- dtype: DtypeLike = ...,
- order: str = ...,
- subok: bool = ...,
- shape: Optional[_ShapeLike] = ...,
-) -> ndarray: ...
-def count_nonzero(
- a: ArrayLike, axis: Optional[Union[int, Tuple[int], Tuple[int, int]]] = ...
-) -> Union[int, ndarray]: ...
-def isfortran(a: ndarray) -> bool: ...
-def argwhere(a: ArrayLike) -> ndarray: ...
-def flatnonzero(a: ArrayLike) -> ndarray: ...
-def correlate(a: ArrayLike, v: ArrayLike, mode: str = ...) -> ndarray: ...
-def convolve(a: ArrayLike, v: ArrayLike, mode: str = ...) -> ndarray: ...
-def outer(a: ArrayLike, b: ArrayLike, out: ndarray = ...) -> ndarray: ...
-def tensordot(
- a: ArrayLike,
- b: ArrayLike,
- axes: Union[
- int, Tuple[int, int], Tuple[Tuple[int, int], ...], Tuple[List[int, int], ...]
- ] = ...,
-) -> ndarray: ...
-def roll(
- a: ArrayLike,
- shift: Union[int, Tuple[int, ...]],
- axis: Optional[Union[int, Tuple[int, ...]]] = ...,
+ shape: _ShapeLike,
+ dtype: DTypeLike = ...,
+ order: _OrderCF = ...,
+ *,
+ like: ArrayLike = ...,
) -> ndarray: ...
-def rollaxis(a: ArrayLike, axis: int, start: int = ...) -> ndarray: ...
-def moveaxis(
- a: ndarray,
- source: Union[int, Sequence[int]],
- destination: Union[int, Sequence[int]],
-) -> ndarray: ...
-def cross(
- a: ArrayLike,
- b: ArrayLike,
- axisa: int = ...,
- axisb: int = ...,
- axisc: int = ...,
- axis: Optional[int] = ...,
-) -> ndarray: ...
-def indices(
- dimensions: Sequence[int], dtype: dtype = ..., sparse: bool = ...
-) -> Union[ndarray, Tuple[ndarray, ...]]: ...
-def fromfunction(function: Callable, shape: Tuple[int, int], **kwargs) -> Any: ...
-def isscalar(element: Any) -> bool: ...
-def binary_repr(num: int, width: Optional[int] = ...) -> str: ...
-def base_repr(number: int, base: int = ..., padding: int = ...) -> str: ...
-def identity(n: int, dtype: DtypeLike = ...) -> ndarray: ...
-def allclose(
- a: ArrayLike,
- b: ArrayLike,
- rtol: float = ...,
- atol: float = ...,
- equal_nan: bool = ...,
-) -> bool: ...
-def isclose(
- a: ArrayLike,
- b: ArrayLike,
- rtol: float = ...,
- atol: float = ...,
- equal_nan: bool = ...,
-) -> Union[bool_, ndarray]: ...
-def array_equal(a1: ArrayLike, a2: ArrayLike) -> bool: ...
-def array_equiv(a1: ArrayLike, a2: ArrayLike) -> bool: ...
+
+def broadcast_shapes(*args: _ShapeLike) -> _Shape: ...
#
# Constants
#
-Inf: float
-Infinity: float
-NAN: float
-NINF: float
-NZERO: float
-NaN: float
-PINF: float
-PZERO: float
-e: float
-euler_gamma: float
-inf: float
-infty: float
-nan: float
-pi: float
-
-ALLOW_THREADS: int
-BUFSIZE: int
-CLIP: int
-ERR_CALL: int
-ERR_DEFAULT: int
-ERR_IGNORE: int
-ERR_LOG: int
-ERR_PRINT: int
-ERR_RAISE: int
-ERR_WARN: int
-FLOATING_POINT_SUPPORT: int
-FPE_DIVIDEBYZERO: int
-FPE_INVALID: int
-FPE_OVERFLOW: int
-FPE_UNDERFLOW: int
-MAXDIMS: int
-MAY_SHARE_BOUNDS: int
-MAY_SHARE_EXACT: int
-RAISE: int
-SHIFT_DIVIDEBYZERO: int
-SHIFT_INVALID: int
-SHIFT_OVERFLOW: int
-SHIFT_UNDERFLOW: int
-UFUNC_BUFSIZE_DEFAULT: int
-WRAP: int
-little_endian: int
-tracemalloc_domain: int
+Inf: Final[float]
+Infinity: Final[float]
+NAN: Final[float]
+NINF: Final[float]
+NZERO: Final[float]
+NaN: Final[float]
+PINF: Final[float]
+PZERO: Final[float]
+e: Final[float]
+euler_gamma: Final[float]
+inf: Final[float]
+infty: Final[float]
+nan: Final[float]
+pi: Final[float]
+ALLOW_THREADS: Final[int]
+BUFSIZE: Final[int]
+CLIP: Final[int]
+ERR_CALL: Final[int]
+ERR_DEFAULT: Final[int]
+ERR_IGNORE: Final[int]
+ERR_LOG: Final[int]
+ERR_PRINT: Final[int]
+ERR_RAISE: Final[int]
+ERR_WARN: Final[int]
+FLOATING_POINT_SUPPORT: Final[int]
+FPE_DIVIDEBYZERO: Final[int]
+FPE_INVALID: Final[int]
+FPE_OVERFLOW: Final[int]
+FPE_UNDERFLOW: Final[int]
+MAXDIMS: Final[int]
+MAY_SHARE_BOUNDS: Final[int]
+MAY_SHARE_EXACT: Final[int]
+RAISE: Final[int]
+SHIFT_DIVIDEBYZERO: Final[int]
+SHIFT_INVALID: Final[int]
+SHIFT_OVERFLOW: Final[int]
+SHIFT_UNDERFLOW: Final[int]
+UFUNC_BUFSIZE_DEFAULT: Final[int]
+WRAP: Final[int]
+tracemalloc_domain: Final[int]
+
+little_endian: Final[bool]
+True_: Final[bool_]
+False_: Final[bool_]
+
+UFUNC_PYVALS_NAME: Final[str]
class ufunc:
@property
@@ -687,18 +2108,16 @@ class ufunc:
axes: List[Any] = ...,
axis: int = ...,
keepdims: bool = ...,
- # TODO: make this precise when we can use Literal.
- casting: str = ...,
- # TODO: make this precise when we can use Literal.
- order: Optional[str] = ...,
- dtype: DtypeLike = ...,
+ casting: _Casting = ...,
+ order: _OrderKACF = ...,
+ dtype: DTypeLike = ...,
subok: bool = ...,
signature: Union[str, Tuple[str]] = ...,
# In reality this should be a length of list 3 containing an
# int, an int, and a callable, but there's no way to express
# that.
extobj: List[Union[int, Callable]] = ...,
- ) -> Union[ndarray, generic]: ...
+ ) -> Any: ...
@property
def nin(self) -> int: ...
@property
@@ -844,394 +2263,27 @@ class AxisError(ValueError, IndexError):
self, axis: int, ndim: Optional[int] = ..., msg_prefix: Optional[str] = ...
) -> None: ...
-# Functions from np.core.numerictypes
-_DefaultType = TypeVar("_DefaultType")
-
-def maximum_sctype(t: DtypeLike) -> dtype: ...
-def issctype(rep: object) -> bool: ...
-@overload
-def obj2sctype(rep: object) -> Optional[generic]: ...
-@overload
-def obj2sctype(rep: object, default: None) -> Optional[generic]: ...
-@overload
-def obj2sctype(
- rep: object, default: Type[_DefaultType]
-) -> Union[generic, Type[_DefaultType]]: ...
-def issubclass_(arg1: object, arg2: Union[object, Tuple[object, ...]]) -> bool: ...
-def issubsctype(
- arg1: Union[ndarray, DtypeLike], arg2: Union[ndarray, DtypeLike]
-) -> bool: ...
-def issubdtype(arg1: DtypeLike, arg2: DtypeLike) -> bool: ...
-def sctype2char(sctype: object) -> str: ...
-def find_common_type(
- array_types: Sequence[DtypeLike], scalar_types: Sequence[DtypeLike]
-) -> dtype: ...
-
-# Functions from np.core.fromnumeric
-_Mode = Literal["raise", "wrap", "clip"]
-_Order = Literal["C", "F", "A"]
-_PartitionKind = Literal["introselect"]
-_SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"]
-_Side = Literal["left", "right"]
-
-# Various annotations for scalars
-
-# While dt.datetime and dt.timedelta are not technically part of NumPy,
-# they are one of the rare few builtin scalars which serve as valid return types.
-# See https://github.com/numpy/numpy-stubs/pull/67#discussion_r412604113.
-_ScalarNumpy = Union[generic, dt.datetime, dt.timedelta]
-_ScalarBuiltin = Union[str, bytes, dt.date, dt.timedelta, bool, int, float, complex]
-_Scalar = Union[_ScalarBuiltin, _ScalarNumpy]
-
-# Integers and booleans can generally be used interchangeably
-_ScalarIntOrBool = TypeVar("_ScalarIntOrBool", bound=Union[integer, bool_])
-_ScalarGeneric = TypeVar("_ScalarGeneric", bound=generic)
-_ScalarGenericDT = TypeVar(
- "_ScalarGenericDT", bound=Union[dt.datetime, dt.timedelta, generic]
-)
-
-_Number = TypeVar('_Number', bound=number)
-_NumberLike = Union[int, float, complex, number, bool_]
+_CallType = TypeVar("_CallType", bound=Union[_ErrFunc, _SupportsWrite])
-# An array-like object consisting of integers
-_Int = Union[int, integer]
-_Bool = Union[bool, bool_]
-_IntOrBool = Union[_Int, _Bool]
-_ArrayLikeIntNested = ArrayLike # TODO: wait for support for recursive types
-_ArrayLikeBoolNested = ArrayLike # TODO: wait for support for recursive types
+class errstate(Generic[_CallType], ContextDecorator):
+ call: _CallType
+ kwargs: _ErrDictOptional
-# Integers and booleans can generally be used interchangeably
-_ArrayLikeIntOrBool = Union[
- _IntOrBool,
- ndarray,
- Sequence[_IntOrBool],
- Sequence[_ArrayLikeIntNested],
- Sequence[_ArrayLikeBoolNested],
-]
-_ArrayLikeBool = Union[
- _Bool,
- Sequence[_Bool],
- ndarray
-]
-
-# The signature of take() follows a common theme with its overloads:
-# 1. A generic comes in; the same generic comes out
-# 2. A scalar comes in; a generic comes out
-# 3. An array-like object comes in; some keyword ensures that a generic comes out
-# 4. An array-like object comes in; an ndarray or generic comes out
-@overload
-def take(
- a: _ScalarGenericDT,
- indices: int,
- axis: Optional[int] = ...,
- out: Optional[ndarray] = ...,
- mode: _Mode = ...,
-) -> _ScalarGenericDT: ...
-@overload
-def take(
- a: _Scalar,
- indices: int,
- axis: Optional[int] = ...,
- out: Optional[ndarray] = ...,
- mode: _Mode = ...,
-) -> _ScalarNumpy: ...
-@overload
-def take(
- a: ArrayLike,
- indices: int,
- axis: Optional[int] = ...,
- out: Optional[ndarray] = ...,
- mode: _Mode = ...,
-) -> _ScalarNumpy: ...
-@overload
-def take(
- a: ArrayLike,
- indices: _ArrayLikeIntOrBool,
- axis: Optional[int] = ...,
- out: Optional[ndarray] = ...,
- mode: _Mode = ...,
-) -> Union[_ScalarNumpy, ndarray]: ...
-def reshape(a: ArrayLike, newshape: _ShapeLike, order: _Order = ...) -> ndarray: ...
-@overload
-def choose(
- a: _ScalarIntOrBool,
- choices: ArrayLike,
- out: Optional[ndarray] = ...,
- mode: _Mode = ...,
-) -> _ScalarIntOrBool: ...
-@overload
-def choose(
- a: _IntOrBool, choices: ArrayLike, out: Optional[ndarray] = ..., mode: _Mode = ...
-) -> Union[integer, bool_]: ...
-@overload
-def choose(
- a: _ArrayLikeIntOrBool,
- choices: ArrayLike,
- out: Optional[ndarray] = ...,
- mode: _Mode = ...,
-) -> ndarray: ...
-def repeat(
- a: ArrayLike, repeats: _ArrayLikeIntOrBool, axis: Optional[int] = ...
-) -> ndarray: ...
-def put(
- a: ndarray, ind: _ArrayLikeIntOrBool, v: ArrayLike, mode: _Mode = ...
-) -> None: ...
-def swapaxes(a: ArrayLike, axis1: int, axis2: int) -> ndarray: ...
-def transpose(
- a: ArrayLike, axes: Union[None, Sequence[int], ndarray] = ...
-) -> ndarray: ...
-def partition(
- a: ArrayLike,
- kth: _ArrayLikeIntOrBool,
- axis: Optional[int] = ...,
- kind: _PartitionKind = ...,
- order: Union[None, str, Sequence[str]] = ...,
-) -> ndarray: ...
-@overload
-def argpartition(
- a: generic,
- kth: _ArrayLikeIntOrBool,
- axis: Optional[int] = ...,
- kind: _PartitionKind = ...,
- order: Union[None, str, Sequence[str]] = ...,
-) -> integer: ...
-@overload
-def argpartition(
- a: _ScalarBuiltin,
- kth: _ArrayLikeIntOrBool,
- axis: Optional[int] = ...,
- kind: _PartitionKind = ...,
- order: Union[None, str, Sequence[str]] = ...,
-) -> ndarray: ...
-@overload
-def argpartition(
- a: ArrayLike,
- kth: _ArrayLikeIntOrBool,
- axis: Optional[int] = ...,
- kind: _PartitionKind = ...,
- order: Union[None, str, Sequence[str]] = ...,
-) -> ndarray: ...
-def sort(
- a: ArrayLike,
- axis: Optional[int] = ...,
- kind: Optional[_SortKind] = ...,
- order: Union[None, str, Sequence[str]] = ...,
-) -> ndarray: ...
-def argsort(
- a: ArrayLike,
- axis: Optional[int] = ...,
- kind: Optional[_SortKind] = ...,
- order: Union[None, str, Sequence[str]] = ...,
-) -> ndarray: ...
-@overload
-def argmax(a: ArrayLike, axis: None = ..., out: Optional[ndarray] = ...) -> integer: ...
-@overload
-def argmax(
- a: ArrayLike, axis: int = ..., out: Optional[ndarray] = ...
-) -> Union[integer, ndarray]: ...
-@overload
-def argmin(a: ArrayLike, axis: None = ..., out: Optional[ndarray] = ...) -> integer: ...
-@overload
-def argmin(
- a: ArrayLike, axis: int = ..., out: Optional[ndarray] = ...
-) -> Union[integer, ndarray]: ...
-@overload
-def searchsorted(
- a: ArrayLike,
- v: _Scalar,
- side: _Side = ...,
- sorter: Optional[_ArrayLikeIntOrBool] = ..., # 1D int array
-) -> integer: ...
-@overload
-def searchsorted(
- a: ArrayLike,
- v: ArrayLike,
- side: _Side = ...,
- sorter: Optional[_ArrayLikeIntOrBool] = ..., # 1D int array
-) -> ndarray: ...
-def resize(a: ArrayLike, new_shape: _ShapeLike) -> ndarray: ...
-@overload
-def squeeze(a: _ScalarGeneric, axis: Optional[_ShapeLike] = ...) -> _ScalarGeneric: ...
-@overload
-def squeeze(a: ArrayLike, axis: Optional[_ShapeLike] = ...) -> ndarray: ...
-def diagonal(
- a: ArrayLike, offset: int = ..., axis1: int = ..., axis2: int = ... # >= 2D array
-) -> ndarray: ...
-def trace(
- a: ArrayLike, # >= 2D array
- offset: int = ...,
- axis1: int = ...,
- axis2: int = ...,
- dtype: DtypeLike = ...,
- out: Optional[ndarray] = ...,
-) -> Union[number, ndarray]: ...
-def ravel(a: ArrayLike, order: _Order = ...) -> ndarray: ...
-def nonzero(a: ArrayLike) -> Tuple[ndarray, ...]: ...
-def shape(a: ArrayLike) -> _Shape: ...
-def compress(
- condition: ArrayLike, # 1D bool array
- a: ArrayLike,
- axis: Optional[int] = ...,
- out: Optional[ndarray] = ...,
-) -> ndarray: ...
-@overload
-def clip(
- a: _Number,
- a_min: ArrayLike,
- a_max: Optional[ArrayLike],
- out: Optional[ndarray] = ...,
- **kwargs: Any,
-) -> _Number: ...
-@overload
-def clip(
- a: _Number,
- a_min: None,
- a_max: ArrayLike,
- out: Optional[ndarray] = ...,
- **kwargs: Any,
-) -> _Number: ...
-@overload
-def clip(
- a: ArrayLike,
- a_min: ArrayLike,
- a_max: Optional[ArrayLike],
- out: Optional[ndarray] = ...,
- **kwargs: Any,
-) -> Union[number, ndarray]: ...
-@overload
-def clip(
- a: ArrayLike,
- a_min: None,
- a_max: ArrayLike,
- out: Optional[ndarray] = ...,
- **kwargs: Any,
-) -> Union[number, ndarray]: ...
-@overload
-def sum(
- a: _Number,
- axis: Optional[_ShapeLike] = ...,
- dtype: DtypeLike = ...,
- out: Optional[ndarray] = ...,
- keepdims: bool = ...,
- initial: _NumberLike = ...,
- where: _ArrayLikeBool = ...,
-) -> _Number: ...
-@overload
-def sum(
- a: ArrayLike,
- axis: _ShapeLike = ...,
- dtype: DtypeLike = ...,
- out: Optional[ndarray] = ...,
- keepdims: bool = ...,
- initial: _NumberLike = ...,
- where: _ArrayLikeBool = ...,
-) -> Union[number, ndarray]: ...
-@overload
-def all(
- a: ArrayLike,
- axis: None = ...,
- out: Optional[ndarray] = ...,
- keepdims: Literal[False] = ...,
-) -> bool_: ...
-@overload
-def all(
- a: ArrayLike,
- axis: Optional[_ShapeLike] = ...,
- out: Optional[ndarray] = ...,
- keepdims: bool = ...,
-) -> Union[bool_, ndarray]: ...
-@overload
-def any(
- a: ArrayLike,
- axis: None = ...,
- out: Optional[ndarray] = ...,
- keepdims: Literal[False] = ...,
-) -> bool_: ...
-@overload
-def any(
- a: ArrayLike,
- axis: Optional[_ShapeLike] = ...,
- out: Optional[ndarray] = ...,
- keepdims: bool = ...,
-) -> Union[bool_, ndarray]: ...
-def cumsum(
- a: ArrayLike,
- axis: Optional[int] = ...,
- dtype: DtypeLike = ...,
- out: Optional[ndarray] = ...,
-) -> ndarray: ...
-@overload
-def ptp(
- a: _Number,
- axis: Optional[_ShapeLike] = ...,
- out: Optional[ndarray] = ...,
- keepdims: bool = ...,
-) -> _Number: ...
-@overload
-def ptp(
- a: ArrayLike,
- axis: None = ...,
- out: Optional[ndarray] = ...,
- keepdims: Literal[False] = ...,
-) -> number: ...
-@overload
-def ptp(
- a: ArrayLike,
- axis: Optional[_ShapeLike] = ...,
- out: Optional[ndarray] = ...,
- keepdims: bool = ...,
-) -> Union[number, ndarray]: ...
-@overload
-def amax(
- a: _Number,
- axis: Optional[_ShapeLike] = ...,
- out: Optional[ndarray] = ...,
- keepdims: bool = ...,
- initial: _NumberLike = ...,
- where: _ArrayLikeBool = ...,
-) -> _Number: ...
-@overload
-def amax(
- a: ArrayLike,
- axis: None = ...,
- out: Optional[ndarray] = ...,
- keepdims: Literal[False] = ...,
- initial: _NumberLike = ...,
- where: _ArrayLikeBool = ...,
-) -> number: ...
-@overload
-def amax(
- a: ArrayLike,
- axis: Optional[_ShapeLike] = ...,
- out: Optional[ndarray] = ...,
- keepdims: bool = ...,
- initial: _NumberLike = ...,
- where: _ArrayLikeBool = ...,
-) -> Union[number, ndarray]: ...
-@overload
-def amin(
- a: _Number,
- axis: Optional[_ShapeLike] = ...,
- out: Optional[ndarray] = ...,
- keepdims: bool = ...,
- initial: _NumberLike = ...,
- where: _ArrayLikeBool = ...,
-) -> _Number: ...
-@overload
-def amin(
- a: ArrayLike,
- axis: None = ...,
- out: Optional[ndarray] = ...,
- keepdims: Literal[False] = ...,
- initial: _NumberLike = ...,
- where: _ArrayLikeBool = ...,
-) -> number: ...
-@overload
-def amin(
- a: ArrayLike,
- axis: Optional[_ShapeLike] = ...,
- out: Optional[ndarray] = ...,
- keepdims: bool = ...,
- initial: _NumberLike = ...,
- where: _ArrayLikeBool = ...,
-) -> Union[number, ndarray]: ...
+ # Expand `**kwargs` into explicit keyword-only arguments
+ def __init__(
+ self,
+ *,
+ call: _CallType = ...,
+ all: Optional[_ErrKind] = ...,
+ divide: Optional[_ErrKind] = ...,
+ over: Optional[_ErrKind] = ...,
+ under: Optional[_ErrKind] = ...,
+ invalid: Optional[_ErrKind] = ...,
+ ) -> None: ...
+ def __enter__(self) -> None: ...
+ def __exit__(
+ self,
+ __exc_type: Optional[Type[BaseException]],
+ __exc_value: Optional[BaseException],
+ __traceback: Optional[TracebackType],
+ ) -> None: ...
diff --git a/numpy/_pytesttester.py b/numpy/_pytesttester.py
index 1c32367f3..813e069a4 100644
--- a/numpy/_pytesttester.py
+++ b/numpy/_pytesttester.py
@@ -6,7 +6,7 @@ boiler plate for doing that is to put the following in the module
``__init__.py`` file::
from numpy._pytesttester import PytestTester
- test = PytestTester(__name__).test
+ test = PytestTester(__name__)
del PytestTester
@@ -202,7 +202,6 @@ class PytestTester:
pytest_args += ["--pyargs"] + list(tests)
-
# run tests.
_show_numpy_info()
diff --git a/numpy/_version.py b/numpy/_version.py
new file mode 100644
index 000000000..6605bf4af
--- /dev/null
+++ b/numpy/_version.py
@@ -0,0 +1,525 @@
+
+# This file helps to compute a version number in source trees obtained from
+# git-archive tarball (such as those provided by github's download-from-tag
+# feature). Distribution tarballs (built by setup.py sdist) and build
+# directories (produced by setup.py build) will contain a much shorter file
+# that just contains the computed version number.
+
+# This file is released into the public domain. Generated by
+# versioneer-0.19 (https://github.com/python-versioneer/python-versioneer)
+
+"""Git implementation of _version.py."""
+
+import errno
+import os
+import re
+import subprocess
+import sys
+
+
+def get_keywords():
+ """Get the keywords needed to look up the version information."""
+ # these strings will be replaced by git during git-archive.
+ # setup.py/versioneer.py will grep for the variable names, so they must
+ # each be defined on a line of their own. _version.py will just call
+ # get_keywords().
+ git_refnames = "$Format:%d$"
+ git_full = "$Format:%H$"
+ git_date = "$Format:%ci$"
+ keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
+ return keywords
+
+
+class VersioneerConfig:
+ """Container for Versioneer configuration parameters."""
+
+
+def get_config():
+ """Create, populate and return the VersioneerConfig() object."""
+ # these strings are filled in when 'setup.py versioneer' creates
+ # _version.py
+ cfg = VersioneerConfig()
+ cfg.VCS = "git"
+ cfg.style = "pep440"
+ cfg.tag_prefix = "v"
+ cfg.parentdir_prefix = "numpy-"
+ cfg.versionfile_source = "numpy/_version.py"
+ cfg.verbose = False
+ return cfg
+
+
+class NotThisMethod(Exception):
+ """Exception raised if a method is not valid for the current scenario."""
+
+
+LONG_VERSION_PY = {}
+HANDLERS = {}
+
+
+def register_vcs_handler(vcs, method): # decorator
+ """Create decorator to mark a method as the handler of a VCS."""
+ def decorate(f):
+ """Store f in HANDLERS[vcs][method]."""
+ if vcs not in HANDLERS:
+ HANDLERS[vcs] = {}
+ HANDLERS[vcs][method] = f
+ return f
+ return decorate
+
+
+def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
+ env=None):
+ """Call the given command(s)."""
+ assert isinstance(commands, list)
+ p = None
+ for c in commands:
+ try:
+ dispcmd = str([c] + args)
+ # remember shell=False, so use git.cmd on windows, not just git
+ p = subprocess.Popen([c] + args, cwd=cwd, env=env,
+ stdout=subprocess.PIPE,
+ stderr=(subprocess.PIPE if hide_stderr
+ else None))
+ break
+ except EnvironmentError:
+ e = sys.exc_info()[1]
+ if e.errno == errno.ENOENT:
+ continue
+ if verbose:
+ print("unable to run %s" % dispcmd)
+ print(e)
+ return None, None
+ else:
+ if verbose:
+ print("unable to find command, tried %s" % (commands,))
+ return None, None
+ stdout = p.communicate()[0].strip().decode()
+ if p.returncode != 0:
+ if verbose:
+ print("unable to run %s (error)" % dispcmd)
+ print("stdout was %s" % stdout)
+ return None, p.returncode
+ return stdout, p.returncode
+
+
+def versions_from_parentdir(parentdir_prefix, root, verbose):
+ """Try to determine the version from the parent directory name.
+
+ Source tarballs conventionally unpack into a directory that includes both
+ the project name and a version string. We will also support searching up
+ two directory levels for an appropriately named parent directory
+ """
+ rootdirs = []
+
+ for i in range(3):
+ dirname = os.path.basename(root)
+ if dirname.startswith(parentdir_prefix):
+ return {"version": dirname[len(parentdir_prefix):],
+ "full-revisionid": None,
+ "dirty": False, "error": None, "date": None}
+ else:
+ rootdirs.append(root)
+ root = os.path.dirname(root) # up a level
+
+ if verbose:
+ print("Tried directories %s but none started with prefix %s" %
+ (str(rootdirs), parentdir_prefix))
+ raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
+
+
+@register_vcs_handler("git", "get_keywords")
+def git_get_keywords(versionfile_abs):
+ """Extract version information from the given file."""
+ # the code embedded in _version.py can just fetch the value of these
+ # keywords. When used from setup.py, we don't want to import _version.py,
+ # so we do it with a regexp instead. This function is not used from
+ # _version.py.
+ keywords = {}
+ try:
+ f = open(versionfile_abs, "r")
+ for line in f.readlines():
+ if line.strip().startswith("git_refnames ="):
+ mo = re.search(r'=\s*"(.*)"', line)
+ if mo:
+ keywords["refnames"] = mo.group(1)
+ if line.strip().startswith("git_full ="):
+ mo = re.search(r'=\s*"(.*)"', line)
+ if mo:
+ keywords["full"] = mo.group(1)
+ if line.strip().startswith("git_date ="):
+ mo = re.search(r'=\s*"(.*)"', line)
+ if mo:
+ keywords["date"] = mo.group(1)
+ f.close()
+ except EnvironmentError:
+ pass
+ return keywords
+
+
+@register_vcs_handler("git", "keywords")
+def git_versions_from_keywords(keywords, tag_prefix, verbose):
+ """Get version information from git keywords."""
+ if not keywords:
+ raise NotThisMethod("no keywords at all, weird")
+ date = keywords.get("date")
+ if date is not None:
+ # Use only the last line. Previous lines may contain GPG signature
+ # information.
+ date = date.splitlines()[-1]
+
+    # git-2.2.0 added "%cI", which expands to an ISO-8601-compliant
+    # datestamp. However we prefer "%ci" (which expands to an "ISO-8601-
+    # like" string, which we must then edit to make compliant), because
+ # it's been around since git-1.5.3, and it's too difficult to
+ # discover which version we're using, or to work around using an
+ # older one.
+ date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
+ refnames = keywords["refnames"].strip()
+ if refnames.startswith("$Format"):
+ if verbose:
+ print("keywords are unexpanded, not using")
+ raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
+ refs = set([r.strip() for r in refnames.strip("()").split(",")])
+ # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
+ # just "foo-1.0". If we see a "tag: " prefix, prefer those.
+ TAG = "tag: "
+ tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
+ if not tags:
+ # Either we're using git < 1.8.3, or there really are no tags. We use
+ # a heuristic: assume all version tags have a digit. The old git %d
+ # expansion behaves like git log --decorate=short and strips out the
+ # refs/heads/ and refs/tags/ prefixes that would let us distinguish
+ # between branches and tags. By ignoring refnames without digits, we
+ # filter out many common branch names like "release" and
+ # "stabilization", as well as "HEAD" and "master".
+ tags = set([r for r in refs if re.search(r'\d', r)])
+ if verbose:
+ print("discarding '%s', no digits" % ",".join(refs - tags))
+ if verbose:
+ print("likely tags: %s" % ",".join(sorted(tags)))
+ for ref in sorted(tags):
+ # sorting will prefer e.g. "2.0" over "2.0rc1"
+ if ref.startswith(tag_prefix):
+ r = ref[len(tag_prefix):]
+ if verbose:
+ print("picking %s" % r)
+ return {"version": r,
+ "full-revisionid": keywords["full"].strip(),
+ "dirty": False, "error": None,
+ "date": date}
+ # no suitable tags, so version is "0+unknown", but full hex is still there
+ if verbose:
+ print("no suitable tags, using unknown + full revision id")
+ return {"version": "0+unknown",
+ "full-revisionid": keywords["full"].strip(),
+ "dirty": False, "error": "no suitable tags", "date": None}
+
+
+@register_vcs_handler("git", "pieces_from_vcs")
+def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
+ """Get version from 'git describe' in the root of the source tree.
+
+ This only gets called if the git-archive 'subst' keywords were *not*
+ expanded, and _version.py hasn't already been rewritten with a short
+ version string, meaning we're inside a checked out source tree.
+ """
+ GITS = ["git"]
+ if sys.platform == "win32":
+ GITS = ["git.cmd", "git.exe"]
+
+ out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
+ hide_stderr=True)
+ if rc != 0:
+ if verbose:
+ print("Directory %s not under git control" % root)
+ raise NotThisMethod("'git rev-parse --git-dir' returned error")
+
+ # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
+ # if there isn't one, this yields HEX[-dirty] (no NUM)
+ describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty=",
+ "--always", "--long",
+ "--match", "%s*" % tag_prefix],
+ cwd=root)
+ # --long was added in git-1.5.5
+ if describe_out is None:
+ raise NotThisMethod("'git describe' failed")
+ describe_out = describe_out.strip()
+ full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root)
+ if full_out is None:
+ raise NotThisMethod("'git rev-parse' failed")
+ full_out = full_out.strip()
+
+ pieces = {}
+ pieces["long"] = full_out
+ pieces["short"] = full_out[:7] # maybe improved later
+ pieces["error"] = None
+
+ # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
+ # TAG might have hyphens.
+ git_describe = describe_out
+
+ # look for -dirty suffix
+ dirty = git_describe.endswith("-dirty")
+ pieces["dirty"] = dirty
+ if dirty:
+ git_describe = git_describe[:git_describe.rindex("-dirty")]
+
+ # now we have TAG-NUM-gHEX or HEX
+
+ if "-" in git_describe:
+ # TAG-NUM-gHEX
+ mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
+ if not mo:
+ # unparseable. Maybe git-describe is misbehaving?
+ pieces["error"] = ("unable to parse git-describe output: '%s'"
+ % describe_out)
+ return pieces
+
+ # tag
+ full_tag = mo.group(1)
+ if not full_tag.startswith(tag_prefix):
+ if verbose:
+ fmt = "tag '%s' doesn't start with prefix '%s'"
+ print(fmt % (full_tag, tag_prefix))
+ pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
+ % (full_tag, tag_prefix))
+ return pieces
+ pieces["closest-tag"] = full_tag[len(tag_prefix):]
+
+ # distance: number of commits since tag
+ pieces["distance"] = int(mo.group(2))
+
+ # commit: short hex revision ID
+ pieces["short"] = mo.group(3)
+
+ else:
+ # HEX: no tags
+ pieces["closest-tag"] = None
+ count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
+ cwd=root)
+ pieces["distance"] = int(count_out) # total number of commits
+
+ # commit date: see ISO-8601 comment in git_versions_from_keywords()
+ date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"],
+ cwd=root)[0].strip()
+ # Use only the last line. Previous lines may contain GPG signature
+ # information.
+ date = date.splitlines()[-1]
+ pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
+
+ return pieces
+
+
+def plus_or_dot(pieces):
+ """Return a + if we don't already have one, else return a ."""
+ if "+" in pieces.get("closest-tag", ""):
+ return "."
+ return "+"
+
+
+def render_pep440(pieces):
+ """Build up version string, with post-release "local version identifier".
+
+ Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
+ get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
+
+ Exceptions:
+ 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
+ """
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"] or pieces["dirty"]:
+ rendered += plus_or_dot(pieces)
+ rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
+ if pieces["dirty"]:
+ rendered += ".dirty"
+ else:
+ # exception #1
+ rendered = "0+untagged.%d.g%s" % (pieces["distance"],
+ pieces["short"])
+ if pieces["dirty"]:
+ rendered += ".dirty"
+ return rendered
+
+
+def render_pep440_pre(pieces):
+ """TAG[.post0.devDISTANCE] -- No -dirty.
+
+ Exceptions:
+ 1: no tags. 0.post0.devDISTANCE
+ """
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"]:
+ rendered += ".post0.dev%d" % pieces["distance"]
+ else:
+ # exception #1
+ rendered = "0.post0.dev%d" % pieces["distance"]
+ return rendered
+
+
+def render_pep440_post(pieces):
+ """TAG[.postDISTANCE[.dev0]+gHEX] .
+
+ The ".dev0" means dirty. Note that .dev0 sorts backwards
+ (a dirty tree will appear "older" than the corresponding clean one),
+ but you shouldn't be releasing software with -dirty anyways.
+
+ Exceptions:
+ 1: no tags. 0.postDISTANCE[.dev0]
+ """
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"] or pieces["dirty"]:
+ rendered += ".post%d" % pieces["distance"]
+ if pieces["dirty"]:
+ rendered += ".dev0"
+ rendered += plus_or_dot(pieces)
+ rendered += "g%s" % pieces["short"]
+ else:
+ # exception #1
+ rendered = "0.post%d" % pieces["distance"]
+ if pieces["dirty"]:
+ rendered += ".dev0"
+ rendered += "+g%s" % pieces["short"]
+ return rendered
+
+
+def render_pep440_old(pieces):
+ """TAG[.postDISTANCE[.dev0]] .
+
+ The ".dev0" means dirty.
+
+ Exceptions:
+ 1: no tags. 0.postDISTANCE[.dev0]
+ """
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"] or pieces["dirty"]:
+ rendered += ".post%d" % pieces["distance"]
+ if pieces["dirty"]:
+ rendered += ".dev0"
+ else:
+ # exception #1
+ rendered = "0.post%d" % pieces["distance"]
+ if pieces["dirty"]:
+ rendered += ".dev0"
+ return rendered
+
+
+def render_git_describe(pieces):
+ """TAG[-DISTANCE-gHEX][-dirty].
+
+ Like 'git describe --tags --dirty --always'.
+
+ Exceptions:
+ 1: no tags. HEX[-dirty] (note: no 'g' prefix)
+ """
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ if pieces["distance"]:
+ rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
+ else:
+ # exception #1
+ rendered = pieces["short"]
+ if pieces["dirty"]:
+ rendered += "-dirty"
+ return rendered
+
+
+def render_git_describe_long(pieces):
+ """TAG-DISTANCE-gHEX[-dirty].
+
+    Like 'git describe --tags --dirty --always --long'.
+ The distance/hash is unconditional.
+
+ Exceptions:
+ 1: no tags. HEX[-dirty] (note: no 'g' prefix)
+ """
+ if pieces["closest-tag"]:
+ rendered = pieces["closest-tag"]
+ rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
+ else:
+ # exception #1
+ rendered = pieces["short"]
+ if pieces["dirty"]:
+ rendered += "-dirty"
+ return rendered
+
+
+def render(pieces, style):
+ """Render the given version pieces into the requested style."""
+ if pieces["error"]:
+ return {"version": "unknown",
+ "full-revisionid": pieces.get("long"),
+ "dirty": None,
+ "error": pieces["error"],
+ "date": None}
+
+ if not style or style == "default":
+ style = "pep440" # the default
+
+ if style == "pep440":
+ rendered = render_pep440(pieces)
+ elif style == "pep440-pre":
+ rendered = render_pep440_pre(pieces)
+ elif style == "pep440-post":
+ rendered = render_pep440_post(pieces)
+ elif style == "pep440-old":
+ rendered = render_pep440_old(pieces)
+ elif style == "git-describe":
+ rendered = render_git_describe(pieces)
+ elif style == "git-describe-long":
+ rendered = render_git_describe_long(pieces)
+ else:
+ raise ValueError("unknown style '%s'" % style)
+
+ return {"version": rendered, "full-revisionid": pieces["long"],
+ "dirty": pieces["dirty"], "error": None,
+ "date": pieces.get("date")}
+
+
+def get_versions():
+ """Get version information or return default if unable to do so."""
+ # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
+ # __file__, we can work backwards from there to the root. Some
+ # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
+ # case we can only use expanded keywords.
+
+ cfg = get_config()
+ verbose = cfg.verbose
+
+ try:
+ return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
+ verbose)
+ except NotThisMethod:
+ pass
+
+ try:
+ root = os.path.realpath(__file__)
+ # versionfile_source is the relative path from the top of the source
+ # tree (where the .git directory might live) to this file. Invert
+ # this to find the root from __file__.
+ for i in cfg.versionfile_source.split('/'):
+ root = os.path.dirname(root)
+ except NameError:
+ return {"version": "0+unknown", "full-revisionid": None,
+ "dirty": None,
+ "error": "unable to find root of source tree",
+ "date": None}
+
+ try:
+ pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
+ return render(pieces, cfg.style)
+ except NotThisMethod:
+ pass
+
+ try:
+ if cfg.parentdir_prefix:
+ return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
+ except NotThisMethod:
+ pass
+
+ return {"version": "0+unknown", "full-revisionid": None,
+ "dirty": None,
+ "error": "unable to compute version", "date": None}
diff --git a/numpy/char.pyi b/numpy/char.pyi
new file mode 100644
index 000000000..0e7342c0b
--- /dev/null
+++ b/numpy/char.pyi
@@ -0,0 +1,53 @@
+from typing import Any
+
+equal: Any
+not_equal: Any
+greater_equal: Any
+less_equal: Any
+greater: Any
+less: Any
+str_len: Any
+add: Any
+multiply: Any
+mod: Any
+capitalize: Any
+center: Any
+count: Any
+decode: Any
+encode: Any
+endswith: Any
+expandtabs: Any
+find: Any
+index: Any
+isalnum: Any
+isalpha: Any
+isdigit: Any
+islower: Any
+isspace: Any
+istitle: Any
+isupper: Any
+join: Any
+ljust: Any
+lower: Any
+lstrip: Any
+partition: Any
+replace: Any
+rfind: Any
+rindex: Any
+rjust: Any
+rpartition: Any
+rsplit: Any
+rstrip: Any
+split: Any
+splitlines: Any
+startswith: Any
+strip: Any
+swapcase: Any
+title: Any
+translate: Any
+upper: Any
+zfill: Any
+isnumeric: Any
+isdecimal: Any
+array: Any
+asarray: Any
diff --git a/numpy/compat/py3k.py b/numpy/compat/py3k.py
index fd9f8bd42..e1e236d92 100644
--- a/numpy/compat/py3k.py
+++ b/numpy/compat/py3k.py
@@ -18,7 +18,7 @@ __all__ = ['bytes', 'asbytes', 'isfileobj', 'getexception', 'strchar',
import sys
import os
-from pathlib import Path, PurePath
+from pathlib import Path
import io
import abc
@@ -78,11 +78,11 @@ def asunicode_nested(x):
def is_pathlib_path(obj):
"""
- Check whether obj is a pathlib.Path object.
+ Check whether obj is a `pathlib.Path` object.
- Prefer using `isinstance(obj, os_PathLike)` instead of this function.
+ Prefer using ``isinstance(obj, os.PathLike)`` instead of this function.
"""
- return Path is not None and isinstance(obj, Path)
+ return isinstance(obj, Path)
# from Python 3.7
class contextlib_nullcontext:
@@ -94,6 +94,9 @@ class contextlib_nullcontext:
cm = optional_cm if condition else nullcontext()
with cm:
# Perform operation, using optional_cm if condition is True
+
+ .. note::
+ Prefer using `contextlib.nullcontext` instead of this context manager.
"""
def __init__(self, enter_result=None):
@@ -132,55 +135,5 @@ def npy_load_module(name, fn, info=None):
return SourceFileLoader(name, fn).load_module()
-# Backport os.fs_path, os.PathLike, and PurePath.__fspath__
-if sys.version_info[:2] >= (3, 6):
- os_fspath = os.fspath
- os_PathLike = os.PathLike
-else:
- def _PurePath__fspath__(self):
- return str(self)
-
- class os_PathLike(abc_ABC):
- """Abstract base class for implementing the file system path protocol."""
-
- @abc.abstractmethod
- def __fspath__(self):
- """Return the file system path representation of the object."""
- raise NotImplementedError
-
- @classmethod
- def __subclasshook__(cls, subclass):
- if PurePath is not None and issubclass(subclass, PurePath):
- return True
- return hasattr(subclass, '__fspath__')
-
-
- def os_fspath(path):
- """Return the path representation of a path-like object.
- If str or bytes is passed in, it is returned unchanged. Otherwise the
- os.PathLike interface is used to get the path representation. If the
- path representation is not str or bytes, TypeError is raised. If the
- provided path is not str, bytes, or os.PathLike, TypeError is raised.
- """
- if isinstance(path, (str, bytes)):
- return path
-
- # Work from the object's type to match method resolution of other magic
- # methods.
- path_type = type(path)
- try:
- path_repr = path_type.__fspath__(path)
- except AttributeError:
- if hasattr(path_type, '__fspath__'):
- raise
- elif PurePath is not None and issubclass(path_type, PurePath):
- return _PurePath__fspath__(path)
- else:
- raise TypeError("expected str, bytes or os.PathLike object, "
- "not " + path_type.__name__)
- if isinstance(path_repr, (str, bytes)):
- return path_repr
- else:
- raise TypeError("expected {}.__fspath__() to return str or bytes, "
- "not {}".format(path_type.__name__,
- type(path_repr).__name__))
+os_fspath = os.fspath
+os_PathLike = os.PathLike
diff --git a/numpy/conftest.py b/numpy/conftest.py
index 20c8a449e..e15ee0845 100644
--- a/numpy/conftest.py
+++ b/numpy/conftest.py
@@ -2,6 +2,7 @@
Pytest configuration and fixtures for the Numpy test suite.
"""
import os
+import tempfile
import hypothesis
import pytest
@@ -13,11 +14,31 @@ from numpy.core._multiarray_tests import get_fpu_mode
_old_fpu_mode = None
_collect_results = {}
-# See https://hypothesis.readthedocs.io/en/latest/settings.html
+# Use a known and persistent tmpdir for hypothesis' caches, which
+# can be automatically cleared by the OS or user.
+hypothesis.configuration.set_hypothesis_home_dir(
+ os.path.join(tempfile.gettempdir(), ".hypothesis")
+)
+
+# We register two custom profiles for Numpy - for details see
+# https://hypothesis.readthedocs.io/en/latest/settings.html
+# The first is designed for our own CI runs; the second also
+# forces determinism and is designed for use via np.test()
hypothesis.settings.register_profile(
name="numpy-profile", deadline=None, print_blob=True,
)
-hypothesis.settings.load_profile("numpy-profile")
+hypothesis.settings.register_profile(
+ name="np.test() profile",
+ deadline=None, print_blob=True, database=None, derandomize=True,
+ suppress_health_check=hypothesis.HealthCheck.all(),
+)
+# Note that the default profile is chosen based on the presence
+# of pytest.ini, but can be overridden by passing the
+# --hypothesis-profile=NAME argument to pytest.
+_pytest_ini = os.path.join(os.path.dirname(__file__), "..", "pytest.ini")
+hypothesis.settings.load_profile(
+ "numpy-profile" if os.path.isfile(_pytest_ini) else "np.test() profile"
+)
def pytest_configure(config):
@@ -92,3 +113,7 @@ def check_fpu_mode(request):
@pytest.fixture(autouse=True)
def add_np(doctest_namespace):
doctest_namespace['np'] = numpy
+
+@pytest.fixture(autouse=True)
+def env_setup(monkeypatch):
+ monkeypatch.setenv('PYTHONHASHSEED', '0')
diff --git a/numpy/core/__init__.py b/numpy/core/__init__.py
index c77885954..e8d3a381b 100644
--- a/numpy/core/__init__.py
+++ b/numpy/core/__init__.py
@@ -96,6 +96,7 @@ from .numeric import absolute as abs
# do this after everything else, to minimize the chance of this misleadingly
# appearing in an import-time traceback
from . import _add_newdocs
+from . import _add_newdocs_scalars
# add these for module-freeze analysis (like PyInstaller)
from . import _dtype_ctypes
from . import _internal
@@ -113,10 +114,9 @@ __all__ += getlimits.__all__
__all__ += shape_base.__all__
__all__ += einsumfunc.__all__
-# Make it possible so that ufuncs can be pickled
-# Here are the loading and unloading functions
-# The name numpy.core._ufunc_reconstruct must be
-# available for unpickling to work.
+# We used to use `np.core._ufunc_reconstruct` to unpickle. This is unnecessary,
+# but old pickles saved before 1.20 will be using it, and there is no reason
+# to break loading them.
def _ufunc_reconstruct(module, name):
# The `fromlist` kwarg is required to ensure that `mod` points to the
# inner-most module rather than the parent package when module name is
@@ -126,14 +126,17 @@ def _ufunc_reconstruct(module, name):
return getattr(mod, name)
def _ufunc_reduce(func):
- from pickle import whichmodule
- name = func.__name__
- return _ufunc_reconstruct, (whichmodule(func, name), name)
+ # Report the `__name__`. pickle will try to find the module. Note that
+ # pickle allows this `__name__` to be a `__qualname__`. It may
+ # make sense to add a `__qualname__` to ufuncs, to allow this more
+ # explicitly (Numba has ufuncs as attributes).
+ # See also: https://github.com/dask/distributed/issues/3450
+ return func.__name__
import copyreg
-copyreg.pickle(ufunc, _ufunc_reduce, _ufunc_reconstruct)
+copyreg.pickle(ufunc, _ufunc_reduce)
# Unclutter namespace (must keep _ufunc_reconstruct for unpickling)
del copyreg
del _ufunc_reduce
diff --git a/numpy/core/__init__.pyi b/numpy/core/__init__.pyi
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/numpy/core/__init__.pyi
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index a3c404a64..2cbfe52be 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -9,9 +9,8 @@ NOTE: Many of the methods of ndarray have corresponding functions.
"""
-from numpy.core import numerictypes as _numerictypes
-from numpy.core import dtype
from numpy.core.function_base import add_newdoc
+from numpy.core.overrides import array_function_like_doc
###############################################################################
#
@@ -606,6 +605,7 @@ add_newdoc('numpy.core', 'broadcast',
--------
broadcast_arrays
broadcast_to
+ broadcast_shapes
Examples
--------
@@ -786,7 +786,8 @@ add_newdoc('numpy.core', 'broadcast', ('reset',
add_newdoc('numpy.core.multiarray', 'array',
"""
- array(object, dtype=None, *, copy=True, order='K', subok=False, ndmin=0)
+ array(object, dtype=None, *, copy=True, order='K', subok=False, ndmin=0,
+ like=None)
Create an array.
@@ -829,6 +830,9 @@ add_newdoc('numpy.core.multiarray', 'array',
Specifies the minimum number of dimensions that the resulting
array should have. Ones will be pre-pended to the shape as
needed to meet this requirement.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -895,11 +899,14 @@ add_newdoc('numpy.core.multiarray', 'array',
matrix([[1, 2],
[3, 4]])
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core.multiarray', 'empty',
"""
- empty(shape, dtype=float, order='C')
+ empty(shape, dtype=float, order='C', *, like=None)
Return a new array of given shape and type, without initializing entries.
@@ -914,6 +921,9 @@ add_newdoc('numpy.core.multiarray', 'empty',
Whether to store multi-dimensional data in row-major
(C-style) or column-major (Fortran-style) order in
memory.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -946,7 +956,10 @@ add_newdoc('numpy.core.multiarray', 'empty',
array([[-1073741821, -1067949133],
[ 496041986, 19249760]]) #uninitialized
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core.multiarray', 'scalar',
"""
@@ -964,7 +977,7 @@ add_newdoc('numpy.core.multiarray', 'scalar',
add_newdoc('numpy.core.multiarray', 'zeros',
"""
- zeros(shape, dtype=float, order='C')
+ zeros(shape, dtype=float, order='C', *, like=None)
Return a new array of given shape and type, filled with zeros.
@@ -979,6 +992,9 @@ add_newdoc('numpy.core.multiarray', 'zeros',
Whether to store multi-dimensional data in row-major
(C-style) or column-major (Fortran-style) order in
memory.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -1013,7 +1029,10 @@ add_newdoc('numpy.core.multiarray', 'zeros',
array([(0, 0), (0, 0)],
dtype=[('x', '<i4'), ('y', '<i4')])
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core.multiarray', 'set_typeDict',
"""set_typeDict(dict)
@@ -1025,7 +1044,7 @@ add_newdoc('numpy.core.multiarray', 'set_typeDict',
add_newdoc('numpy.core.multiarray', 'fromstring',
"""
- fromstring(string, dtype=float, count=-1, sep='')
+ fromstring(string, dtype=float, count=-1, sep='', *, like=None)
A new 1-D array initialized from text data in a string.
@@ -1058,6 +1077,9 @@ add_newdoc('numpy.core.multiarray', 'fromstring',
text, the binary mode of `fromstring` will first encode it into
bytes using either utf-8 (python 3) or the default encoding
(python 2), neither of which produce sane results.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -1081,7 +1103,10 @@ add_newdoc('numpy.core.multiarray', 'fromstring',
>>> np.fromstring('1, 2', dtype=int, sep=',')
array([1, 2])
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core.multiarray', 'compare_chararrays',
"""
@@ -1122,7 +1147,7 @@ add_newdoc('numpy.core.multiarray', 'compare_chararrays',
add_newdoc('numpy.core.multiarray', 'fromiter',
"""
- fromiter(iter, dtype, count=-1)
+ fromiter(iter, dtype, count=-1, *, like=None)
Create a new 1-dimensional array from an iterable object.
@@ -1135,6 +1160,9 @@ add_newdoc('numpy.core.multiarray', 'fromiter',
count : int, optional
The number of items to read from *iterable*. The default is -1,
which means all data is read.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -1152,11 +1180,14 @@ add_newdoc('numpy.core.multiarray', 'fromiter',
>>> np.fromiter(iterable, float)
array([ 0., 1., 4., 9., 16.])
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core.multiarray', 'fromfile',
"""
- fromfile(file, dtype=float, count=-1, sep='', offset=0)
+ fromfile(file, dtype=float, count=-1, sep='', offset=0, *, like=None)
Construct an array from data in a text or binary file.
@@ -1195,6 +1226,9 @@ add_newdoc('numpy.core.multiarray', 'fromfile',
Only permitted for binary files.
.. versionadded:: 1.17.0
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
See also
--------
@@ -1241,11 +1275,14 @@ add_newdoc('numpy.core.multiarray', 'fromfile',
array([((10, 0), 98.25)],
dtype=[('time', [('min', '<i8'), ('sec', '<i8')]), ('temp', '<f8')])
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core.multiarray', 'frombuffer',
"""
- frombuffer(buffer, dtype=float, count=-1, offset=0)
+ frombuffer(buffer, dtype=float, count=-1, offset=0, *, like=None)
Interpret a buffer as a 1-dimensional array.
@@ -1259,6 +1296,9 @@ add_newdoc('numpy.core.multiarray', 'frombuffer',
Number of items to read. ``-1`` means all data in the buffer.
offset : int, optional
Start reading the buffer from this offset (in bytes); default: 0.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Notes
-----
@@ -1283,7 +1323,10 @@ add_newdoc('numpy.core.multiarray', 'frombuffer',
>>> np.frombuffer(b'\\x01\\x02\\x03\\x04\\x05', dtype=np.uint8, count=3)
array([1, 2, 3], dtype=uint8)
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core', 'fastCopyAndTranspose',
"""_fastCopyAndTranspose(a)""")
@@ -1293,7 +1336,7 @@ add_newdoc('numpy.core.multiarray', 'correlate',
add_newdoc('numpy.core.multiarray', 'arange',
"""
- arange([start,] stop[, step,], dtype=None)
+ arange([start,] stop[, step,], dtype=None, *, like=None)
Return evenly spaced values within a given interval.
@@ -1322,6 +1365,9 @@ add_newdoc('numpy.core.multiarray', 'arange',
dtype : dtype
The type of the output array. If `dtype` is not given, infer the data
type from the other input arguments.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -1350,7 +1396,10 @@ add_newdoc('numpy.core.multiarray', 'arange',
>>> np.arange(3,7,2)
array([3, 5])
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core.multiarray', '_get_ndarray_c_version',
"""_get_ndarray_c_version()
@@ -2521,7 +2570,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('__setstate__',
add_newdoc('numpy.core.multiarray', 'ndarray', ('all',
"""
- a.all(axis=None, out=None, keepdims=False)
+ a.all(axis=None, out=None, keepdims=False, *, where=True)
Returns True if all elements evaluate to True.
@@ -2536,7 +2585,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('all',
add_newdoc('numpy.core.multiarray', 'ndarray', ('any',
"""
- a.any(axis=None, out=None, keepdims=False)
+ a.any(axis=None, out=None, keepdims=False, *, where=True)
Returns True if any of the elements of `a` evaluate to True.
@@ -2568,7 +2617,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('argmin',
"""
a.argmin(axis=None, out=None)
- Return indices of the minimum values along the given axis of `a`.
+ Return indices of the minimum values along the given axis.
Refer to `numpy.argmin` for detailed documentation.
@@ -3193,7 +3242,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('max',
add_newdoc('numpy.core.multiarray', 'ndarray', ('mean',
"""
- a.mean(axis=None, dtype=None, out=None, keepdims=False)
+ a.mean(axis=None, dtype=None, out=None, keepdims=False, *, where=True)
Returns the average of the array elements along given axis.
@@ -3223,7 +3272,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('min',
add_newdoc('numpy.core.multiarray', 'ndarray', ('newbyteorder',
"""
- arr.newbyteorder(new_order='S')
+ arr.newbyteorder(new_order='S', /)
Return the array with the same data viewed with a different byte order.
@@ -3764,7 +3813,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('squeeze',
add_newdoc('numpy.core.multiarray', 'ndarray', ('std',
"""
- a.std(axis=None, dtype=None, out=None, ddof=0, keepdims=False)
+ a.std(axis=None, dtype=None, out=None, ddof=0, keepdims=False, *, where=True)
Returns the standard deviation of the array elements along given axis.
@@ -4051,7 +4100,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('transpose',
add_newdoc('numpy.core.multiarray', 'ndarray', ('var',
"""
- a.var(axis=None, dtype=None, out=None, ddof=0, keepdims=False)
+ a.var(axis=None, dtype=None, out=None, ddof=0, keepdims=False, *, where=True)
Returns the variance of the array elements, along given axis.
@@ -4457,10 +4506,8 @@ add_newdoc('numpy.core', 'ufunc',
A detailed explanation of ufuncs can be found in the docs for :ref:`ufuncs`.
- Calling ufuncs:
- ===============
+ **Calling ufuncs:** ``op(*x[, out], where=True, **kwargs)``
- op(*x[, out], where=True, **kwargs)
Apply `op` to the arguments `*x` elementwise, broadcasting the arguments.
The broadcasting rules are:
@@ -4691,14 +4738,14 @@ add_newdoc('numpy.core', 'ufunc', ('signature',
add_newdoc('numpy.core', 'ufunc', ('reduce',
"""
- reduce(a, axis=0, dtype=None, out=None, keepdims=False, initial=<no value>, where=True)
+ reduce(array, axis=0, dtype=None, out=None, keepdims=False, initial=<no value>, where=True)
- Reduces `a`'s dimension by one, by applying ufunc along one axis.
+ Reduces `array`'s dimension by one, by applying ufunc along one axis.
- Let :math:`a.shape = (N_0, ..., N_i, ..., N_{M-1})`. Then
- :math:`ufunc.reduce(a, axis=i)[k_0, ..,k_{i-1}, k_{i+1}, .., k_{M-1}]` =
+ Let :math:`array.shape = (N_0, ..., N_i, ..., N_{M-1})`. Then
+ :math:`ufunc.reduce(array, axis=i)[k_0, ..,k_{i-1}, k_{i+1}, .., k_{M-1}]` =
the result of iterating `j` over :math:`range(N_i)`, cumulatively applying
- ufunc to each :math:`a[k_0, ..,k_{i-1}, j, k_{i+1}, .., k_{M-1}]`.
+ ufunc to each :math:`array[k_0, ..,k_{i-1}, j, k_{i+1}, .., k_{M-1}]`.
For a one-dimensional array, reduce produces results equivalent to:
::
@@ -4711,7 +4758,7 @@ add_newdoc('numpy.core', 'ufunc', ('reduce',
Parameters
----------
- a : array_like
+ array : array_like
The array to act on.
axis : None or int or tuple of ints, optional
Axis or axes along which a reduction is performed.
@@ -4744,7 +4791,7 @@ add_newdoc('numpy.core', 'ufunc', ('reduce',
keepdims : bool, optional
If this is set to True, the axes which are reduced are left
in the result as dimensions with size one. With this option,
- the result will broadcast correctly against the original `arr`.
+ the result will broadcast correctly against the original `array`.
.. versionadded:: 1.7.0
initial : scalar, optional
@@ -4758,7 +4805,7 @@ add_newdoc('numpy.core', 'ufunc', ('reduce',
where : array_like of bool, optional
A boolean array which is broadcasted to match the dimensions
- of `a`, and selects elements to include in the reduction. Note
+ of `array`, and selects elements to include in the reduction. Note
that for ufuncs like ``minimum`` that do not have an identity
defined, one has to pass in also ``initial``.
@@ -4900,28 +4947,28 @@ add_newdoc('numpy.core', 'ufunc', ('accumulate',
add_newdoc('numpy.core', 'ufunc', ('reduceat',
"""
- reduceat(a, indices, axis=0, dtype=None, out=None)
+ reduceat(array, indices, axis=0, dtype=None, out=None)
Performs a (local) reduce with specified slices over a single axis.
For i in ``range(len(indices))``, `reduceat` computes
- ``ufunc.reduce(a[indices[i]:indices[i+1]])``, which becomes the i-th
+ ``ufunc.reduce(array[indices[i]:indices[i+1]])``, which becomes the i-th
generalized "row" parallel to `axis` in the final result (i.e., in a
2-D array, for example, if `axis = 0`, it becomes the i-th row, but if
`axis = 1`, it becomes the i-th column). There are three exceptions to this:
* when ``i = len(indices) - 1`` (so for the last index),
- ``indices[i+1] = a.shape[axis]``.
+ ``indices[i+1] = array.shape[axis]``.
* if ``indices[i] >= indices[i + 1]``, the i-th generalized "row" is
- simply ``a[indices[i]]``.
- * if ``indices[i] >= len(a)`` or ``indices[i] < 0``, an error is raised.
+ simply ``array[indices[i]]``.
+ * if ``indices[i] >= len(array)`` or ``indices[i] < 0``, an error is raised.
The shape of the output depends on the size of `indices`, and may be
- larger than `a` (this happens if ``len(indices) > a.shape[axis]``).
+ larger than `array` (this happens if ``len(indices) > array.shape[axis]``).
Parameters
----------
- a : array_like
+ array : array_like
The array to act on.
indices : array_like
Paired indices, comma separated (not colon), specifying slices to
@@ -4951,14 +4998,15 @@ add_newdoc('numpy.core', 'ufunc', ('reduceat',
-----
A descriptive example:
- If `a` is 1-D, the function `ufunc.accumulate(a)` is the same as
- ``ufunc.reduceat(a, indices)[::2]`` where `indices` is
+ If `array` is 1-D, the function `ufunc.accumulate(array)` is the same as
+ ``ufunc.reduceat(array, indices)[::2]`` where `indices` is
``range(len(array) - 1)`` with a zero placed
in every other element:
- ``indices = zeros(2 * len(a) - 1)``, ``indices[1::2] = range(1, len(a))``.
+ ``indices = zeros(2 * len(array) - 1)``,
+ ``indices[1::2] = range(1, len(array))``.
- Don't be fooled by this attribute's name: `reduceat(a)` is not
- necessarily smaller than `a`.
+ Don't be fooled by this attribute's name: `reduceat(array)` is not
+ necessarily smaller than `array`.
Examples
--------
@@ -5007,7 +5055,7 @@ add_newdoc('numpy.core', 'ufunc', ('reduceat',
add_newdoc('numpy.core', 'ufunc', ('outer',
r"""
- outer(A, B, **kwargs)
+ outer(A, B, /, **kwargs)
Apply the ufunc `op` to all pairs (a, b) with a in `A` and b in `B`.
@@ -5077,7 +5125,7 @@ add_newdoc('numpy.core', 'ufunc', ('outer',
add_newdoc('numpy.core', 'ufunc', ('at',
"""
- at(a, indices, b=None)
+ at(a, indices, b=None, /)
Performs unbuffered in place operation on operand 'a' for elements
specified by 'indices'. For addition ufunc, this method is equivalent to
@@ -5493,6 +5541,45 @@ add_newdoc('numpy.core.multiarray', 'dtype', ('kind',
"""))
+add_newdoc('numpy.core.multiarray', 'dtype', ('metadata',
+ """
+ Either ``None`` or a readonly dictionary of metadata (mappingproxy).
+
+ The metadata field can be set using any dictionary at data-type
+ creation. NumPy currently has no uniform approach to propagating
+ metadata; although some array operations preserve it, there is no
+ guarantee that others will.
+
+ .. warning::
+
+ Although used in certain projects, this feature was long undocumented
+ and is not well supported. Some aspects of metadata propagation
+ are expected to change in the future.
+
+ Examples
+ --------
+
+ >>> dt = np.dtype(float, metadata={"key": "value"})
+ >>> dt.metadata["key"]
+ 'value'
+ >>> arr = np.array([1, 2, 3], dtype=dt)
+ >>> arr.dtype.metadata
+ mappingproxy({'key': 'value'})
+
+ Adding arrays with identical datatypes currently preserves the metadata:
+
+ >>> (arr + arr).dtype.metadata
+ mappingproxy({'key': 'value'})
+
+ But if the arrays have different dtype metadata, the metadata may be
+ dropped:
+
+ >>> dt2 = np.dtype(float, metadata={"key2": "value2"})
+ >>> arr2 = np.array([3, 2, 1], dtype=dt2)
+ >>> (arr + arr2).dtype.metadata is None
+ True # The metadata field is cleared so None is returned
+ """))
+
add_newdoc('numpy.core.multiarray', 'dtype', ('name',
"""
A bit-width name for this data-type.
@@ -5649,7 +5736,7 @@ add_newdoc('numpy.core.multiarray', 'dtype', ('type',
add_newdoc('numpy.core.multiarray', 'dtype', ('newbyteorder',
"""
- newbyteorder(new_order='S')
+ newbyteorder(new_order='S', /)
Return a new dtype with a different byte order.
@@ -6025,7 +6112,7 @@ add_newdoc('numpy.core.numerictypes', 'generic',
add_newdoc('numpy.core.numerictypes', 'generic', ('newbyteorder',
"""
- newbyteorder(new_order='S')
+ newbyteorder(new_order='S', /)
Return a new `dtype` with a different byte order.
@@ -6195,183 +6282,3 @@ add_newdoc('numpy.core.numerictypes', 'character',
Abstract base class of all character string scalar types.
""")
-
-
-##############################################################################
-#
-# Documentation for concrete scalar classes
-#
-##############################################################################
-
-def numeric_type_aliases(aliases):
- def type_aliases_gen():
- for alias, doc in aliases:
- try:
- alias_type = getattr(_numerictypes, alias)
- except AttributeError:
- # The set of aliases that actually exist varies between platforms
- pass
- else:
- yield (alias_type, alias, doc)
- return list(type_aliases_gen())
-
-
-possible_aliases = numeric_type_aliases([
- ('int8', '8-bit signed integer (-128 to 127)'),
- ('int16', '16-bit signed integer (-32768 to 32767)'),
- ('int32', '32-bit signed integer (-2147483648 to 2147483647)'),
- ('int64', '64-bit signed integer (-9223372036854775808 to 9223372036854775807)'),
- ('intp', 'Signed integer large enough to fit pointer, compatible with C ``intptr_t``'),
- ('uint8', '8-bit unsigned integer (0 to 255)'),
- ('uint16', '16-bit unsigned integer (0 to 65535)'),
- ('uint32', '32-bit unsigned integer (0 to 4294967295)'),
- ('uint64', '64-bit unsigned integer (0 to 18446744073709551615)'),
- ('uintp', 'Unsigned integer large enough to fit pointer, compatible with C ``uintptr_t``'),
- ('float16', '16-bit-precision floating-point number type: sign bit, 5 bits exponent, 10 bits mantissa'),
- ('float32', '32-bit-precision floating-point number type: sign bit, 8 bits exponent, 23 bits mantissa'),
- ('float64', '64-bit precision floating-point number type: sign bit, 11 bits exponent, 52 bits mantissa'),
- ('float96', '96-bit extended-precision floating-point number type'),
- ('float128', '128-bit extended-precision floating-point number type'),
- ('complex64', 'Complex number type composed of 2 32-bit-precision floating-point numbers'),
- ('complex128', 'Complex number type composed of 2 64-bit-precision floating-point numbers'),
- ('complex192', 'Complex number type composed of 2 96-bit extended-precision floating-point numbers'),
- ('complex256', 'Complex number type composed of 2 128-bit extended-precision floating-point numbers'),
- ])
-
-
-def add_newdoc_for_scalar_type(obj, fixed_aliases, doc):
- o = getattr(_numerictypes, obj)
-
- character_code = dtype(o).char
- canonical_name_doc = "" if obj == o.__name__ else "Canonical name: ``np.{}``.\n ".format(obj)
- alias_doc = ''.join("Alias: ``np.{}``.\n ".format(alias) for alias in fixed_aliases)
- alias_doc += ''.join("Alias *on this platform*: ``np.{}``: {}.\n ".format(alias, doc)
- for (alias_type, alias, doc) in possible_aliases if alias_type is o)
-
- docstring = """
- {doc}
- Character code: ``'{character_code}'``.
- {canonical_name_doc}{alias_doc}
- """.format(doc=doc.strip(), character_code=character_code,
- canonical_name_doc=canonical_name_doc, alias_doc=alias_doc)
-
- add_newdoc('numpy.core.numerictypes', obj, docstring)
-
-
-add_newdoc_for_scalar_type('bool_', ['bool8'],
- """
- Boolean type (True or False), stored as a byte.
- """)
-
-add_newdoc_for_scalar_type('byte', [],
- """
- Signed integer type, compatible with C ``char``.
- """)
-
-add_newdoc_for_scalar_type('short', [],
- """
- Signed integer type, compatible with C ``short``.
- """)
-
-add_newdoc_for_scalar_type('intc', [],
- """
- Signed integer type, compatible with C ``int``.
- """)
-
-add_newdoc_for_scalar_type('int_', [],
- """
- Signed integer type, compatible with Python `int` anc C ``long``.
- """)
-
-add_newdoc_for_scalar_type('longlong', [],
- """
- Signed integer type, compatible with C ``long long``.
- """)
-
-add_newdoc_for_scalar_type('ubyte', [],
- """
- Unsigned integer type, compatible with C ``unsigned char``.
- """)
-
-add_newdoc_for_scalar_type('ushort', [],
- """
- Unsigned integer type, compatible with C ``unsigned short``.
- """)
-
-add_newdoc_for_scalar_type('uintc', [],
- """
- Unsigned integer type, compatible with C ``unsigned int``.
- """)
-
-add_newdoc_for_scalar_type('uint', [],
- """
- Unsigned integer type, compatible with C ``unsigned long``.
- """)
-
-add_newdoc_for_scalar_type('ulonglong', [],
- """
- Signed integer type, compatible with C ``unsigned long long``.
- """)
-
-add_newdoc_for_scalar_type('half', [],
- """
- Half-precision floating-point number type.
- """)
-
-add_newdoc_for_scalar_type('single', [],
- """
- Single-precision floating-point number type, compatible with C ``float``.
- """)
-
-add_newdoc_for_scalar_type('double', ['float_'],
- """
- Double-precision floating-point number type, compatible with Python `float`
- and C ``double``.
- """)
-
-add_newdoc_for_scalar_type('longdouble', ['longfloat'],
- """
- Extended-precision floating-point number type, compatible with C
- ``long double`` but not necessarily with IEEE 754 quadruple-precision.
- """)
-
-add_newdoc_for_scalar_type('csingle', ['singlecomplex'],
- """
- Complex number type composed of two single-precision floating-point
- numbers.
- """)
-
-add_newdoc_for_scalar_type('cdouble', ['cfloat', 'complex_'],
- """
- Complex number type composed of two double-precision floating-point
- numbers, compatible with Python `complex`.
- """)
-
-add_newdoc_for_scalar_type('clongdouble', ['clongfloat', 'longcomplex'],
- """
- Complex number type composed of two extended-precision floating-point
- numbers.
- """)
-
-add_newdoc_for_scalar_type('object_', [],
- """
- Any Python object.
- """)
-
-# TODO: work out how to put this on the base class, np.floating
-for float_name in ('half', 'single', 'double', 'longdouble'):
- add_newdoc('numpy.core.numerictypes', float_name, ('as_integer_ratio',
- """
- {ftype}.as_integer_ratio() -> (int, int)
-
- Return a pair of integers, whose ratio is exactly equal to the original
- floating point number, and with a positive denominator.
- Raise OverflowError on infinities and a ValueError on NaNs.
-
- >>> np.{ftype}(10.0).as_integer_ratio()
- (10, 1)
- >>> np.{ftype}(0.0).as_integer_ratio()
- (0, 1)
- >>> np.{ftype}(-.25).as_integer_ratio()
- (-1, 4)
- """.format(ftype=float_name)))
diff --git a/numpy/core/_add_newdocs_scalars.py b/numpy/core/_add_newdocs_scalars.py
new file mode 100644
index 000000000..b9b151224
--- /dev/null
+++ b/numpy/core/_add_newdocs_scalars.py
@@ -0,0 +1,251 @@
+"""
+This file is separate from ``_add_newdocs.py`` so that it can be mocked out by
+our sphinx ``conf.py`` during doc builds, where we want to avoid showing
+platform-dependent information.
+"""
+from numpy.core import dtype
+from numpy.core import numerictypes as _numerictypes
+from numpy.core.function_base import add_newdoc
+
+##############################################################################
+#
+# Documentation for concrete scalar classes
+#
+##############################################################################
+
+def numeric_type_aliases(aliases):
+ def type_aliases_gen():
+ for alias, doc in aliases:
+ try:
+ alias_type = getattr(_numerictypes, alias)
+ except AttributeError:
+ # The set of aliases that actually exist varies between platforms
+ pass
+ else:
+ yield (alias_type, alias, doc)
+ return list(type_aliases_gen())
+
+
+possible_aliases = numeric_type_aliases([
+ ('int8', '8-bit signed integer (``-128`` to ``127``)'),
+ ('int16', '16-bit signed integer (``-32_768`` to ``32_767``)'),
+ ('int32', '32-bit signed integer (``-2_147_483_648`` to ``2_147_483_647``)'),
+ ('int64', '64-bit signed integer (``-9_223_372_036_854_775_808`` to ``9_223_372_036_854_775_807``)'),
+ ('intp', 'Signed integer large enough to fit pointer, compatible with C ``intptr_t``'),
+ ('uint8', '8-bit unsigned integer (``0`` to ``255``)'),
+ ('uint16', '16-bit unsigned integer (``0`` to ``65_535``)'),
+ ('uint32', '32-bit unsigned integer (``0`` to ``4_294_967_295``)'),
+ ('uint64', '64-bit unsigned integer (``0`` to ``18_446_744_073_709_551_615``)'),
+ ('uintp', 'Unsigned integer large enough to fit pointer, compatible with C ``uintptr_t``'),
+ ('float16', '16-bit-precision floating-point number type: sign bit, 5 bits exponent, 10 bits mantissa'),
+ ('float32', '32-bit-precision floating-point number type: sign bit, 8 bits exponent, 23 bits mantissa'),
+ ('float64', '64-bit precision floating-point number type: sign bit, 11 bits exponent, 52 bits mantissa'),
+ ('float96', '96-bit extended-precision floating-point number type'),
+ ('float128', '128-bit extended-precision floating-point number type'),
+ ('complex64', 'Complex number type composed of 2 32-bit-precision floating-point numbers'),
+ ('complex128', 'Complex number type composed of 2 64-bit-precision floating-point numbers'),
+ ('complex192', 'Complex number type composed of 2 96-bit extended-precision floating-point numbers'),
+ ('complex256', 'Complex number type composed of 2 128-bit extended-precision floating-point numbers'),
+ ])
+
+
+def add_newdoc_for_scalar_type(obj, fixed_aliases, doc):
+ # note: `:field: value` is rST syntax which renders as field lists.
+ o = getattr(_numerictypes, obj)
+
+ character_code = dtype(o).char
+ canonical_name_doc = "" if obj == o.__name__ else ":Canonical name: `numpy.{}`\n ".format(obj)
+ alias_doc = ''.join(":Alias: `numpy.{}`\n ".format(alias) for alias in fixed_aliases)
+ alias_doc += ''.join(":Alias on this platform: `numpy.{}`: {}.\n ".format(alias, doc)
+ for (alias_type, alias, doc) in possible_aliases if alias_type is o)
+ docstring = """
+ {doc}
+
+ :Character code: ``'{character_code}'``
+ {canonical_name_doc}{alias_doc}
+ """.format(doc=doc.strip(), character_code=character_code,
+ canonical_name_doc=canonical_name_doc, alias_doc=alias_doc)
+
+ add_newdoc('numpy.core.numerictypes', obj, docstring)
+
+
+add_newdoc_for_scalar_type('bool_', ['bool8'],
+ """
+ Boolean type (True or False), stored as a byte.
+
+ .. warning::
+
+ The :class:`bool_` type is not a subclass of the :class:`int_` type
+ (the :class:`bool_` is not even a number type). This is different
+ than Python's default implementation of :class:`bool` as a
+ sub-class of :class:`int`.
+ """)
+
+add_newdoc_for_scalar_type('byte', [],
+ """
+ Signed integer type, compatible with C ``char``.
+ """)
+
+add_newdoc_for_scalar_type('short', [],
+ """
+ Signed integer type, compatible with C ``short``.
+ """)
+
+add_newdoc_for_scalar_type('intc', [],
+ """
+ Signed integer type, compatible with C ``int``.
+ """)
+
+add_newdoc_for_scalar_type('int_', [],
+ """
+ Signed integer type, compatible with Python `int` and C ``long``.
+ """)
+
+add_newdoc_for_scalar_type('longlong', [],
+ """
+ Signed integer type, compatible with C ``long long``.
+ """)
+
+add_newdoc_for_scalar_type('ubyte', [],
+ """
+ Unsigned integer type, compatible with C ``unsigned char``.
+ """)
+
+add_newdoc_for_scalar_type('ushort', [],
+ """
+ Unsigned integer type, compatible with C ``unsigned short``.
+ """)
+
+add_newdoc_for_scalar_type('uintc', [],
+ """
+ Unsigned integer type, compatible with C ``unsigned int``.
+ """)
+
+add_newdoc_for_scalar_type('uint', [],
+ """
+ Unsigned integer type, compatible with C ``unsigned long``.
+ """)
+
+add_newdoc_for_scalar_type('ulonglong', [],
+ """
+ Unsigned integer type, compatible with C ``unsigned long long``.
+ """)
+
+add_newdoc_for_scalar_type('half', [],
+ """
+ Half-precision floating-point number type.
+ """)
+
+add_newdoc_for_scalar_type('single', [],
+ """
+ Single-precision floating-point number type, compatible with C ``float``.
+ """)
+
+add_newdoc_for_scalar_type('double', ['float_'],
+ """
+ Double-precision floating-point number type, compatible with Python `float`
+ and C ``double``.
+ """)
+
+add_newdoc_for_scalar_type('longdouble', ['longfloat'],
+ """
+ Extended-precision floating-point number type, compatible with C
+ ``long double`` but not necessarily with IEEE 754 quadruple-precision.
+ """)
+
+add_newdoc_for_scalar_type('csingle', ['singlecomplex'],
+ """
+ Complex number type composed of two single-precision floating-point
+ numbers.
+ """)
+
+add_newdoc_for_scalar_type('cdouble', ['cfloat', 'complex_'],
+ """
+ Complex number type composed of two double-precision floating-point
+ numbers, compatible with Python `complex`.
+ """)
+
+add_newdoc_for_scalar_type('clongdouble', ['clongfloat', 'longcomplex'],
+ """
+ Complex number type composed of two extended-precision floating-point
+ numbers.
+ """)
+
+add_newdoc_for_scalar_type('object_', [],
+ """
+ Any Python object.
+ """)
+
+add_newdoc_for_scalar_type('str_', ['unicode_'],
+ r"""
+ A unicode string.
+
+ When used in arrays, this type strips trailing null codepoints.
+
+ Unlike the builtin `str`, this supports the :ref:`python:bufferobjects`, exposing its
+ contents as UCS4:
+
+ >>> m = memoryview(np.str_("abc"))
+ >>> m.format
+ '3w'
+ >>> m.tobytes()
+ b'a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00'
+ """)
+
+add_newdoc_for_scalar_type('bytes_', ['string_'],
+ r"""
+ A byte string.
+
+ When used in arrays, this type strips trailing null bytes.
+ """)
+
+add_newdoc_for_scalar_type('void', [],
+ r"""
+ Either an opaque sequence of bytes, or a structure.
+
+ >>> np.void(b'abcd')
+ void(b'\x61\x62\x63\x64')
+
+ Structured `void` scalars can only be constructed via extraction from :ref:`structured_arrays`:
+
+ >>> arr = np.array((1, 2), dtype=[('x', np.int8), ('y', np.int8)])
+ >>> arr[()]
+ (1, 2) # looks like a tuple, but is `np.void`
+ """)
+
+add_newdoc_for_scalar_type('datetime64', [],
+ """
+ A datetime stored as a 64-bit integer, counting from ``1970-01-01T00:00:00``.
+
+ >>> np.datetime64(10, 'Y')
+ numpy.datetime64('1980')
+ >>> np.datetime64(10, 'D')
+ numpy.datetime64('1970-01-11')
+
+ See :ref:`arrays.datetime` for more information.
+ """)
+
+add_newdoc_for_scalar_type('timedelta64', [],
+ """
+ A timedelta stored as a 64-bit integer.
+
+ See :ref:`arrays.datetime` for more information.
+ """)
+
+# TODO: work out how to put this on the base class, np.floating
+for float_name in ('half', 'single', 'double', 'longdouble'):
+ add_newdoc('numpy.core.numerictypes', float_name, ('as_integer_ratio',
+ """
+ {ftype}.as_integer_ratio() -> (int, int)
+
+ Return a pair of integers, whose ratio is exactly equal to the original
+ floating point number, and with a positive denominator.
+ Raise `OverflowError` on infinities and a `ValueError` on NaNs.
+
+ >>> np.{ftype}(10.0).as_integer_ratio()
+ (10, 1)
+ >>> np.{ftype}(0.0).as_integer_ratio()
+ (0, 1)
+ >>> np.{ftype}(-.25).as_integer_ratio()
+ (-1, 4)
+ """.format(ftype=float_name)))
diff --git a/numpy/core/_asarray.py b/numpy/core/_asarray.py
index df569f22d..a406308f3 100644
--- a/numpy/core/_asarray.py
+++ b/numpy/core/_asarray.py
@@ -3,7 +3,11 @@ Functions in the ``as*array`` family that promote array-likes into arrays.
`require` fits this category despite its name not matching this pattern.
"""
-from .overrides import set_module
+from .overrides import (
+ array_function_dispatch,
+ set_array_function_like_doc,
+ set_module,
+)
from .multiarray import array
@@ -11,8 +15,14 @@ __all__ = [
"asarray", "asanyarray", "ascontiguousarray", "asfortranarray", "require",
]
+
+def _asarray_dispatcher(a, dtype=None, order=None, *, like=None):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def asarray(a, dtype=None, order=None):
+def asarray(a, dtype=None, order=None, *, like=None):
"""Convert the input to an array.
Parameters
@@ -23,10 +33,16 @@ def asarray(a, dtype=None, order=None):
of lists and ndarrays.
dtype : data-type, optional
By default, the data-type is inferred from the input data.
- order : {'C', 'F'}, optional
- Whether to use row-major (C-style) or
- column-major (Fortran-style) memory representation.
+ order : {'C', 'F', 'A', 'K'}, optional
+ Memory layout. 'A' and 'K' depend on the order of input array a.
+ 'C' row-major (C-style),
+ 'F' column-major (Fortran-style) memory representation.
+ 'A' (any) means 'F' if `a` is Fortran contiguous, 'C' otherwise
+ 'K' (keep) preserve input order
Defaults to 'C'.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -80,11 +96,20 @@ def asarray(a, dtype=None, order=None):
True
"""
+ if like is not None:
+ return _asarray_with_like(a, dtype=dtype, order=order, like=like)
+
return array(a, dtype, copy=False, order=order)
+_asarray_with_like = array_function_dispatch(
+ _asarray_dispatcher
+)(asarray)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def asanyarray(a, dtype=None, order=None):
+def asanyarray(a, dtype=None, order=None, *, like=None):
"""Convert the input to an ndarray, but pass ndarray subclasses through.
Parameters
@@ -95,9 +120,16 @@ def asanyarray(a, dtype=None, order=None):
tuples of lists, and ndarrays.
dtype : data-type, optional
By default, the data-type is inferred from the input data.
- order : {'C', 'F'}, optional
- Whether to use row-major (C-style) or column-major
- (Fortran-style) memory representation. Defaults to 'C'.
+ order : {'C', 'F', 'A', 'K'}, optional
+ Memory layout. 'A' and 'K' depend on the order of input array a.
+ 'C' row-major (C-style),
+ 'F' column-major (Fortran-style) memory representation.
+ 'A' (any) means 'F' if `a` is Fortran contiguous, 'C' otherwise
+ 'K' (keep) preserve input order
+ Defaults to 'C'.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -133,11 +165,24 @@ def asanyarray(a, dtype=None, order=None):
True
"""
+ if like is not None:
+ return _asanyarray_with_like(a, dtype=dtype, order=order, like=like)
+
return array(a, dtype, copy=False, order=order, subok=True)
+_asanyarray_with_like = array_function_dispatch(
+ _asarray_dispatcher
+)(asanyarray)
+
+
+def _asarray_contiguous_fortran_dispatcher(a, dtype=None, *, like=None):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def ascontiguousarray(a, dtype=None):
+def ascontiguousarray(a, dtype=None, *, like=None):
"""
Return a contiguous array (ndim >= 1) in memory (C order).
@@ -147,6 +192,9 @@ def ascontiguousarray(a, dtype=None):
Input array.
dtype : str or dtype object, optional
Data-type of returned array.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -174,11 +222,20 @@ def ascontiguousarray(a, dtype=None):
so it will not preserve 0-d arrays.
"""
+ if like is not None:
+ return _ascontiguousarray_with_like(a, dtype=dtype, like=like)
+
return array(a, dtype, copy=False, order='C', ndmin=1)
+_ascontiguousarray_with_like = array_function_dispatch(
+ _asarray_contiguous_fortran_dispatcher
+)(ascontiguousarray)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def asfortranarray(a, dtype=None):
+def asfortranarray(a, dtype=None, *, like=None):
"""
Return an array (ndim >= 1) laid out in Fortran order in memory.
@@ -188,6 +245,9 @@ def asfortranarray(a, dtype=None):
Input array.
dtype : str or dtype object, optional
By default, the data-type is inferred from the input data.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -215,11 +275,24 @@ def asfortranarray(a, dtype=None):
so it will not preserve 0-d arrays.
"""
+ if like is not None:
+ return _asfortranarray_with_like(a, dtype=dtype, like=like)
+
return array(a, dtype, copy=False, order='F', ndmin=1)
+_asfortranarray_with_like = array_function_dispatch(
+ _asarray_contiguous_fortran_dispatcher
+)(asfortranarray)
+
+
+def _require_dispatcher(a, dtype=None, requirements=None, *, like=None):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def require(a, dtype=None, requirements=None):
+def require(a, dtype=None, requirements=None, *, like=None):
"""
Return an ndarray of the provided type that satisfies requirements.
@@ -243,6 +316,9 @@ def require(a, dtype=None, requirements=None):
* 'WRITEABLE' ('W') - ensure a writable array
* 'OWNDATA' ('O') - ensure an array that owns its own data
* 'ENSUREARRAY', ('E') - ensure a base array, instead of a subclass
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -286,6 +362,14 @@ def require(a, dtype=None, requirements=None):
UPDATEIFCOPY : False
"""
+ if like is not None:
+ return _require_with_like(
+ a,
+ dtype=dtype,
+ requirements=requirements,
+ like=like,
+ )
+
possible_flags = {'C': 'C', 'C_CONTIGUOUS': 'C', 'CONTIGUOUS': 'C',
'F': 'F', 'F_CONTIGUOUS': 'F', 'FORTRAN': 'F',
'A': 'A', 'ALIGNED': 'A',
@@ -320,3 +404,8 @@ def require(a, dtype=None, requirements=None):
arr = arr.copy(order)
break
return arr
+
+
+_require_with_like = array_function_dispatch(
+ _require_dispatcher
+)(require)
diff --git a/numpy/core/_asarray.pyi b/numpy/core/_asarray.pyi
new file mode 100644
index 000000000..8c200ba22
--- /dev/null
+++ b/numpy/core/_asarray.pyi
@@ -0,0 +1,77 @@
+import sys
+from typing import TypeVar, Union, Iterable, overload
+
+from numpy import ndarray, _OrderKACF
+from numpy.typing import ArrayLike, DTypeLike
+
+if sys.version_info >= (3, 8):
+ from typing import Literal
+else:
+ from typing_extensions import Literal
+
+_ArrayType = TypeVar("_ArrayType", bound=ndarray)
+
+def asarray(
+ a: object,
+ dtype: DTypeLike = ...,
+ order: _OrderKACF = ...,
+ *,
+ like: ArrayLike = ...
+) -> ndarray: ...
+@overload
+def asanyarray(
+ a: _ArrayType,
+ dtype: None = ...,
+ order: _OrderKACF = ...,
+ *,
+ like: ArrayLike = ...
+) -> _ArrayType: ...
+@overload
+def asanyarray(
+ a: object,
+ dtype: DTypeLike = ...,
+ order: _OrderKACF = ...,
+ *,
+ like: ArrayLike = ...
+) -> ndarray: ...
+def ascontiguousarray(
+ a: object, dtype: DTypeLike = ..., *, like: ArrayLike = ...
+) -> ndarray: ...
+def asfortranarray(
+ a: object, dtype: DTypeLike = ..., *, like: ArrayLike = ...
+) -> ndarray: ...
+
+_Requirements = Literal[
+ "C", "C_CONTIGUOUS", "CONTIGUOUS",
+ "F", "F_CONTIGUOUS", "FORTRAN",
+ "A", "ALIGNED",
+ "W", "WRITEABLE",
+ "O", "OWNDATA"
+]
+_E = Literal["E", "ENSUREARRAY"]
+_RequirementsWithE = Union[_Requirements, _E]
+
+@overload
+def require(
+ a: _ArrayType,
+ dtype: None = ...,
+ requirements: Union[None, _Requirements, Iterable[_Requirements]] = ...,
+ *,
+ like: ArrayLike = ...
+) -> _ArrayType: ...
+@overload
+def require(
+ a: object,
+ dtype: DTypeLike = ...,
+ requirements: Union[_E, Iterable[_RequirementsWithE]] = ...,
+ *,
+ like: ArrayLike = ...
+) -> ndarray: ...
+@overload
+def require(
+ a: object,
+ dtype: DTypeLike = ...,
+ requirements: Union[None, _Requirements, Iterable[_Requirements]] = ...,
+ *,
+ like: ArrayLike = ...
+) -> ndarray: ...
diff --git a/numpy/core/_dtype.py b/numpy/core/_dtype.py
index 50aeeb5bc..4249071ff 100644
--- a/numpy/core/_dtype.py
+++ b/numpy/core/_dtype.py
@@ -176,7 +176,7 @@ def _byte_order_str(dtype):
def _datetime_metadata_str(dtype):
- # TODO: this duplicates the C append_metastr_to_string
+ # TODO: this duplicates the C metastr_to_unicode functionality
unit, count = np.datetime_data(dtype)
if unit == 'generic':
return ''
diff --git a/numpy/core/_exceptions.py b/numpy/core/_exceptions.py
index 99172e23d..5e17ed3b2 100644
--- a/numpy/core/_exceptions.py
+++ b/numpy/core/_exceptions.py
@@ -26,8 +26,6 @@ def _display_as_base(cls):
"""
assert issubclass(cls, Exception)
cls.__name__ = cls.__base__.__name__
- cls.__qualname__ = cls.__base__.__qualname__
- set_module(cls.__base__.__module__)(cls)
return cls
diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py
index 85853622a..449926f58 100644
--- a/numpy/core/_internal.py
+++ b/numpy/core/_internal.py
@@ -17,26 +17,25 @@ except ImportError:
IS_PYPY = platform.python_implementation() == 'PyPy'
-if (sys.byteorder == 'little'):
+if sys.byteorder == 'little':
_nbo = '<'
else:
_nbo = '>'
def _makenames_list(adict, align):
allfields = []
- fnames = list(adict.keys())
- for fname in fnames:
- obj = adict[fname]
+
+ for fname, obj in adict.items():
n = len(obj)
- if not isinstance(obj, tuple) or n not in [2, 3]:
+ if not isinstance(obj, tuple) or n not in (2, 3):
raise ValueError("entry not a 2- or 3- tuple")
- if (n > 2) and (obj[2] == fname):
+ if n > 2 and obj[2] == fname:
continue
num = int(obj[1])
- if (num < 0):
+ if num < 0:
raise ValueError("invalid offset.")
format = dtype(obj[0], align=align)
- if (n > 2):
+ if n > 2:
title = obj[2]
else:
title = None
@@ -68,7 +67,7 @@ def _usefields(adict, align):
res = adict[name]
formats.append(res[0])
offsets.append(res[1])
- if (len(res) > 2):
+ if len(res) > 2:
titles.append(res[2])
else:
titles.append(None)
@@ -108,7 +107,7 @@ def _array_descr(descriptor):
for field in ordered_fields:
if field[1] > offset:
num = field[1] - offset
- result.append(('', '|V%d' % num))
+ result.append(('', f'|V{num}'))
offset += num
elif field[1] < offset:
raise ValueError(
@@ -128,7 +127,7 @@ def _array_descr(descriptor):
if descriptor.itemsize > offset:
num = descriptor.itemsize - offset
- result.append(('', '|V%d' % num))
+ result.append(('', f'|V{num}'))
return result
@@ -191,7 +190,7 @@ def _commastring(astr):
(order1, order2))
order = order1
- if order in ['|', '=', _nbo]:
+ if order in ('|', '=', _nbo):
order = ''
dtype = order + dtype
if (repeats == ''):
@@ -223,7 +222,7 @@ def _getintp_ctype():
val = dummy_ctype(np.intp)
else:
char = dtype('p').char
- if (char == 'i'):
+ if char == 'i':
val = ctypes.c_int
elif char == 'l':
val = ctypes.c_long
@@ -379,7 +378,7 @@ def _newnames(datatype, order):
raise ValueError(f"unknown field name: {name}") from None
seen.add(name)
return tuple(list(order) + nameslist)
- raise ValueError("unsupported order value: %s" % (order,))
+ raise ValueError(f"unsupported order value: {order}")
def _copy_fields(ary):
"""Return copy of structured array with padding between fields removed.
@@ -680,8 +679,7 @@ def __dtype_from_pep3118(stream, is_subdtype):
if not (is_padding and name is None):
if name is not None and name in field_spec['names']:
- raise RuntimeError("Duplicate field name '%s' in PEP3118 format"
- % name)
+ raise RuntimeError(f"Duplicate field name '{name}' in PEP3118 format")
field_spec['names'].append(name)
field_spec['formats'].append(value)
field_spec['offsets'].append(offset)
@@ -717,7 +715,7 @@ def _fix_names(field_spec):
j = 0
while True:
- name = 'f{}'.format(j)
+ name = f'f{j}'
if name not in names:
break
j = j + 1
@@ -790,7 +788,7 @@ def _ufunc_doc_signature_formatter(ufunc):
if ufunc.nin == 1:
in_args = 'x'
else:
- in_args = ', '.join('x{}'.format(i+1) for i in range(ufunc.nin))
+ in_args = ', '.join(f'x{i+1}' for i in range(ufunc.nin))
# output arguments are both keyword or positional
if ufunc.nout == 0:
diff --git a/numpy/core/_methods.py b/numpy/core/_methods.py
index 86ddf4d17..1867ba68c 100644
--- a/numpy/core/_methods.py
+++ b/numpy/core/_methods.py
@@ -4,6 +4,7 @@ and the Python code for the NumPy-namespace function
"""
import warnings
+from contextlib import nullcontext
from numpy.core import multiarray as mu
from numpy.core import umath as um
@@ -11,7 +12,7 @@ from numpy.core._asarray import asanyarray
from numpy.core import numerictypes as nt
from numpy.core import _exceptions
from numpy._globals import _NoValue
-from numpy.compat import pickle, os_fspath, contextlib_nullcontext
+from numpy.compat import pickle, os_fspath
# save those O(100) nanoseconds!
umr_maximum = um.maximum.reduce
@@ -50,20 +51,38 @@ def _prod(a, axis=None, dtype=None, out=None, keepdims=False,
initial=_NoValue, where=True):
return umr_prod(a, axis, dtype, out, keepdims, initial, where)
-def _any(a, axis=None, dtype=None, out=None, keepdims=False):
- return umr_any(a, axis, dtype, out, keepdims)
-
-def _all(a, axis=None, dtype=None, out=None, keepdims=False):
- return umr_all(a, axis, dtype, out, keepdims)
-
-def _count_reduce_items(arr, axis):
- if axis is None:
- axis = tuple(range(arr.ndim))
- if not isinstance(axis, tuple):
- axis = (axis,)
- items = 1
- for ax in axis:
- items *= arr.shape[mu.normalize_axis_index(ax, arr.ndim)]
+def _any(a, axis=None, dtype=None, out=None, keepdims=False, *, where=True):
+ # Parsing keyword arguments is currently fairly slow, so avoid it for now
+ if where is True:
+ return umr_any(a, axis, dtype, out, keepdims)
+ return umr_any(a, axis, dtype, out, keepdims, where=where)
+
+def _all(a, axis=None, dtype=None, out=None, keepdims=False, *, where=True):
+ # Parsing keyword arguments is currently fairly slow, so avoid it for now
+ if where is True:
+ return umr_all(a, axis, dtype, out, keepdims)
+ return umr_all(a, axis, dtype, out, keepdims, where=where)
+
+def _count_reduce_items(arr, axis, keepdims=False, where=True):
+ # fast-path for the default case
+ if where is True:
+ # no boolean mask given, calculate items according to axis
+ if axis is None:
+ axis = tuple(range(arr.ndim))
+ elif not isinstance(axis, tuple):
+ axis = (axis,)
+ items = nt.intp(1)
+ for ax in axis:
+ items *= arr.shape[mu.normalize_axis_index(ax, arr.ndim)]
+ else:
+ # TODO: Optimize case when `where` is broadcast along a non-reduction
+ # axis and full sum is more excessive than needed.
+
+ # guarded to protect circular imports
+ from numpy.lib.stride_tricks import broadcast_to
+ # count True values in (potentially broadcasted) boolean mask
+ items = umr_sum(broadcast_to(where, arr.shape), axis, nt.intp, None,
+ keepdims)
return items
# Numpy 1.17.0, 2019-02-24
@@ -140,13 +159,13 @@ def _clip(a, min=None, max=None, out=None, *, casting=None, **kwargs):
return _clip_dep_invoke_with_casting(
um.clip, a, min, max, out=out, casting=casting, **kwargs)
-def _mean(a, axis=None, dtype=None, out=None, keepdims=False):
+def _mean(a, axis=None, dtype=None, out=None, keepdims=False, *, where=True):
arr = asanyarray(a)
is_float16_result = False
- rcount = _count_reduce_items(arr, axis)
- # Make this warning show up first
- if rcount == 0:
+
+ rcount = _count_reduce_items(arr, axis, keepdims=keepdims, where=where)
+ if rcount == 0 if where is True else umr_any(rcount == 0):
warnings.warn("Mean of empty slice.", RuntimeWarning, stacklevel=2)
# Cast bool, unsigned int, and int to float64 by default
@@ -157,7 +176,7 @@ def _mean(a, axis=None, dtype=None, out=None, keepdims=False):
dtype = mu.dtype('f4')
is_float16_result = True
- ret = umr_sum(arr, axis, dtype, out, keepdims)
+ ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
if isinstance(ret, mu.ndarray):
ret = um.true_divide(
ret, rcount, out=ret, casting='unsafe', subok=False)
@@ -173,12 +192,13 @@ def _mean(a, axis=None, dtype=None, out=None, keepdims=False):
return ret
-def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
+def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, *,
+ where=True):
arr = asanyarray(a)
- rcount = _count_reduce_items(arr, axis)
+ rcount = _count_reduce_items(arr, axis, keepdims=keepdims, where=where)
# Make this warning show up on top.
- if ddof >= rcount:
+ if ddof >= rcount if where is True else umr_any(ddof >= rcount):
warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning,
stacklevel=2)
@@ -189,10 +209,18 @@ def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
# Compute the mean.
# Note that if dtype is not of inexact type then arraymean will
# not be either.
- arrmean = umr_sum(arr, axis, dtype, keepdims=True)
+ arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
+ # The shape of rcount has to match arrmean to not change the shape of out
+ # in broadcasting. Otherwise, it cannot be stored back to arrmean.
+ if rcount.ndim == 0:
+ # fast-path for default case when where is True
+ div = rcount
+ else:
+ # matching rcount to arrmean when where is specified as array
+ div = rcount.reshape(arrmean.shape)
if isinstance(arrmean, mu.ndarray):
- arrmean = um.true_divide(
- arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
+ arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
+ subok=False)
else:
arrmean = arrmean.dtype.type(arrmean / rcount)
@@ -213,10 +241,10 @@ def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
else:
x = um.multiply(x, um.conjugate(x), out=x).real
- ret = umr_sum(x, axis, dtype, out, keepdims)
+ ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
# Compute degrees of freedom and make sure it is not negative.
- rcount = max([rcount - ddof, 0])
+ rcount = um.maximum(rcount - ddof, 0)
# divide by degrees of freedom
if isinstance(ret, mu.ndarray):
@@ -229,9 +257,10 @@ def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
return ret
-def _std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
+def _std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, *,
+ where=True):
ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
- keepdims=keepdims)
+ keepdims=keepdims, where=where)
if isinstance(ret, mu.ndarray):
ret = um.sqrt(ret, out=ret)
@@ -251,7 +280,7 @@ def _ptp(a, axis=None, out=None, keepdims=False):
def _dump(self, file, protocol=2):
if hasattr(file, 'write'):
- ctx = contextlib_nullcontext(file)
+ ctx = nullcontext(file)
else:
ctx = open(os_fspath(file), "wb")
with ctx as f:
diff --git a/numpy/core/_type_aliases.pyi b/numpy/core/_type_aliases.pyi
new file mode 100644
index 000000000..6a1099cd3
--- /dev/null
+++ b/numpy/core/_type_aliases.pyi
@@ -0,0 +1,19 @@
+import sys
+from typing import Dict, Union, Type, List
+
+from numpy import generic, signedinteger, unsignedinteger, floating, complexfloating
+
+if sys.version_info >= (3, 8):
+ from typing import TypedDict
+else:
+ from typing_extensions import TypedDict
+
+class _SCTypes(TypedDict):
+ int: List[Type[signedinteger]]
+ uint: List[Type[unsignedinteger]]
+ float: List[Type[floating]]
+ complex: List[Type[complexfloating]]
+ others: List[type]
+
+sctypeDict: Dict[Union[int, str], Type[generic]]
+sctypes: _SCTypes
diff --git a/numpy/core/_ufunc_config.pyi b/numpy/core/_ufunc_config.pyi
new file mode 100644
index 000000000..e90f1c510
--- /dev/null
+++ b/numpy/core/_ufunc_config.pyi
@@ -0,0 +1,43 @@
+import sys
+from typing import Optional, Union, Callable, Any
+
+if sys.version_info >= (3, 8):
+ from typing import Literal, Protocol, TypedDict
+else:
+ from typing_extensions import Literal, Protocol, TypedDict
+
+_ErrKind = Literal["ignore", "warn", "raise", "call", "print", "log"]
+_ErrFunc = Callable[[str, int], Any]
+
+class _SupportsWrite(Protocol):
+ def write(self, __msg: str) -> Any: ...
+
+class _ErrDict(TypedDict):
+ divide: _ErrKind
+ over: _ErrKind
+ under: _ErrKind
+ invalid: _ErrKind
+
+class _ErrDictOptional(TypedDict, total=False):
+ all: Optional[_ErrKind]
+ divide: Optional[_ErrKind]
+ over: Optional[_ErrKind]
+ under: Optional[_ErrKind]
+ invalid: Optional[_ErrKind]
+
+def seterr(
+ all: Optional[_ErrKind] = ...,
+ divide: Optional[_ErrKind] = ...,
+ over: Optional[_ErrKind] = ...,
+ under: Optional[_ErrKind] = ...,
+ invalid: Optional[_ErrKind] = ...,
+) -> _ErrDict: ...
+def geterr() -> _ErrDict: ...
+def setbufsize(size: int) -> int: ...
+def getbufsize() -> int: ...
+def seterrcall(
+ func: Union[None, _ErrFunc, _SupportsWrite]
+) -> Union[None, _ErrFunc, _SupportsWrite]: ...
+def geterrcall() -> Union[None, _ErrFunc, _SupportsWrite]: ...
+
+# See `numpy/__init__.pyi` for the `errstate` class
diff --git a/numpy/core/arrayprint.py b/numpy/core/arrayprint.py
index 5d9642ea8..ad1530419 100644
--- a/numpy/core/arrayprint.py
+++ b/numpy/core/arrayprint.py
@@ -1628,6 +1628,3 @@ def set_string_function(f, repr=True):
return multiarray.set_string_function(_default_array_str, 0)
else:
return multiarray.set_string_function(f, repr)
-
-set_string_function(_default_array_str, False)
-set_string_function(_default_array_repr, True)
diff --git a/numpy/core/code_generators/cversions.txt b/numpy/core/code_generators/cversions.txt
index 1868610f4..2d3a65391 100644
--- a/numpy/core/code_generators/cversions.txt
+++ b/numpy/core/code_generators/cversions.txt
@@ -50,8 +50,9 @@
# Version 13 (NumPy 1.17) No change.
# Version 13 (NumPy 1.18) No change.
# Version 13 (NumPy 1.19) No change.
-# Version 13 (NumPy 1.20) No change.
0x0000000d = 5b0e8bbded00b166125974fc71e80a33
-# Version 14 (NumPy 1.19) DType related API additions
+# Version 14 (NumPy 1.20)
+# DType related API additions.
+# A new field was added to the end of PyArrayObject_fields.
0x0000000e = 17a0f366e55ec05e5c5c149123478452
diff --git a/numpy/core/code_generators/genapi.py b/numpy/core/code_generators/genapi.py
index 856db0410..ca6a22828 100644
--- a/numpy/core/code_generators/genapi.py
+++ b/numpy/core/code_generators/genapi.py
@@ -26,6 +26,7 @@ API_FILES = [join('multiarray', 'alloc.c'),
join('multiarray', 'array_assign_array.c'),
join('multiarray', 'array_assign_scalar.c'),
join('multiarray', 'array_coercion.c'),
+ join('multiarray', 'array_method.c'),
join('multiarray', 'arrayobject.c'),
join('multiarray', 'arraytypes.c.src'),
join('multiarray', 'buffer.c'),
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 2ce2fdb55..cb1147b93 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -8,12 +8,12 @@ sys.path.insert(0, os.path.dirname(__file__))
import ufunc_docstrings as docstrings
sys.path.pop(0)
-Zero = "PyInt_FromLong(0)"
-One = "PyInt_FromLong(1)"
+Zero = "PyLong_FromLong(0)"
+One = "PyLong_FromLong(1)"
True_ = "(Py_INCREF(Py_True), Py_True)"
False_ = "(Py_INCREF(Py_False), Py_False)"
None_ = object()
-AllOnes = "PyInt_FromLong(-1)"
+AllOnes = "PyLong_FromLong(-1)"
MinusInfinity = 'PyFloat_FromDouble(-NPY_INFINITY)'
ReorderableNone = "(Py_INCREF(Py_None), Py_None)"
@@ -48,8 +48,11 @@ class TypeDescription:
simd: list
Available SIMD ufunc loops, dispatched at runtime in specified order
Currently only supported for simples types (see make_arrays)
+ dispatch: list
+ Available SIMD ufunc loops, dispatched at runtime in specified order
+        Currently only supported for simple types (see make_arrays)
"""
- def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None):
+ def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dispatch=None):
self.type = type
self.func_data = f
if astype is None:
@@ -62,6 +65,7 @@ class TypeDescription:
out = out.replace('P', type)
self.out = out
self.simd = simd
+ self.dispatch = dispatch
def finish_signature(self, nin, nout):
if self.in_ is None:
@@ -86,7 +90,7 @@ def build_func_data(types, f):
func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
return func_data
-def TD(types, f=None, astype=None, in_=None, out=None, simd=None):
+def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None):
if f is not None:
if isinstance(f, str):
func_data = build_func_data(types, f)
@@ -115,7 +119,14 @@ def TD(types, f=None, astype=None, in_=None, out=None, simd=None):
simdt = [k for k, v in simd if t in v]
else:
simdt = []
- tds.append(TypeDescription(t, f=fd, in_=i, out=o, astype=astype, simd=simdt))
+ # [(dispatch file name without extension '.dispatch.c*', list of types)]
+ if dispatch:
+ dispt = [k for k, v in dispatch if t in v]
+ else:
+ dispt = []
+ tds.append(TypeDescription(
+ t, f=fd, in_=i, out=o, astype=astype, simd=simdt, dispatch=dispt
+ ))
return tds
class Ufunc:
@@ -341,14 +352,14 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.square'),
None,
- TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'FDfd')]),
+ TD(ints+inexact, simd=[('avx2', ints), ('avx512f', 'FD')], dispatch=[('loops_unary_fp', 'fd')]),
TD(O, f='Py_square'),
),
'reciprocal':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.reciprocal'),
None,
- TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
+ TD(ints+inexact, simd=[('avx2', ints)], dispatch=[('loops_unary_fp', 'fd')]),
TD(O, f='Py_reciprocal'),
),
# This is no longer used as numpy.ones_like, however it is
@@ -378,7 +389,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.absolute'),
'PyUFunc_AbsoluteTypeResolver',
- TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+ TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd')]),
TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')),
TD(O, f='PyNumber_Absolute'),
),
@@ -726,6 +737,7 @@ defdict = {
None,
TD('e', f='log', astype={'e':'f'}),
TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
+ TD('d', simd=[('avx512f', 'd')]),
TD('fdg' + cmplx, f='log'),
TD(P, f='log'),
),
@@ -755,7 +767,7 @@ defdict = {
docstrings.get('numpy.core.umath.sqrt'),
None,
TD('e', f='sqrt', astype={'e':'f'}),
- TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+ TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
TD('fdg' + cmplx, f='sqrt'),
TD(P, f='sqrt'),
),
@@ -1024,6 +1036,16 @@ def make_arrays(funcdict):
ISA=vt.upper(), isa=vt,
fname=name, type=tname, idx=k
))
+ if t.dispatch is not None:
+ for dname in t.dispatch:
+ code2list.append(textwrap.dedent("""\
+ #ifndef NPY_DISABLE_OPTIMIZATION
+ #include "{dname}.dispatch.h"
+ #endif
+ NPY_CPU_DISPATCH_CALL_XB({name}_functions[{k}] = {tname}_{name});
+ """).format(
+ dname=dname, name=name, tname=tname, k=k
+ ))
else:
funclist.append('NULL')
try:
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index 82cd6fb27..b7edd2834 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -107,6 +107,13 @@ add_newdoc('numpy.core.umath', 'absolute',
>>> plt.imshow(np.abs(xx), extent=[-10, 10, -10, 10], cmap='gray')
>>> plt.show()
+ The `abs` function can be used as a shorthand for ``np.absolute`` on
+ ndarrays.
+
+ >>> x = np.array([-1.2, 1.2])
+ >>> abs(x)
+ array([1.2, 1.2])
+
""")
add_newdoc('numpy.core.umath', 'add',
@@ -141,6 +148,14 @@ add_newdoc('numpy.core.umath', 'add',
[ 3., 5., 7.],
[ 6., 8., 10.]])
+ The ``+`` operator can be used as a shorthand for ``np.add`` on ndarrays.
+
+ >>> x1 = np.arange(9.0).reshape((3, 3))
+ >>> x2 = np.arange(3.0)
+ >>> x1 + x2
+ array([[ 0., 2., 4.],
+ [ 3., 5., 7.],
+ [ 6., 8., 10.]])
""")
add_newdoc('numpy.core.umath', 'arccos',
@@ -608,6 +623,14 @@ add_newdoc('numpy.core.umath', 'bitwise_and',
>>> np.bitwise_and([True, True], [False, True])
array([False, True])
+ The ``&`` operator can be used as a shorthand for ``np.bitwise_and`` on
+ ndarrays.
+
+ >>> x1 = np.array([2, 5, 255])
+ >>> x2 = np.array([3, 14, 16])
+ >>> x1 & x2
+ array([ 2, 4, 16])
+
""")
add_newdoc('numpy.core.umath', 'bitwise_or',
@@ -667,6 +690,14 @@ add_newdoc('numpy.core.umath', 'bitwise_or',
>>> np.bitwise_or([True, True], [False, True])
array([ True, True])
+ The ``|`` operator can be used as a shorthand for ``np.bitwise_or`` on
+ ndarrays.
+
+ >>> x1 = np.array([2, 5, 255])
+ >>> x2 = np.array([4, 4, 4])
+ >>> x1 | x2
+ array([ 6, 5, 255])
+
""")
add_newdoc('numpy.core.umath', 'bitwise_xor',
@@ -719,6 +750,14 @@ add_newdoc('numpy.core.umath', 'bitwise_xor',
>>> np.bitwise_xor([True, True], [False, True])
array([ True, False])
+ The ``^`` operator can be used as a shorthand for ``np.bitwise_xor`` on
+ ndarrays.
+
+ >>> x1 = np.array([True, True])
+ >>> x2 = np.array([False, True])
+ >>> x1 ^ x2
+ array([ True, False])
+
""")
add_newdoc('numpy.core.umath', 'ceil',
@@ -1088,6 +1127,16 @@ add_newdoc('numpy.core.umath', 'divide',
>>> np.divide(1, 0)
0
+ The ``/`` operator can be used as a shorthand for ``np.divide`` on
+ ndarrays.
+
+ >>> x1 = np.arange(9.0).reshape((3, 3))
+ >>> x2 = 2 * np.ones(3)
+ >>> x1 / x2
+ array([[0. , 0.5, 1. ],
+ [1.5, 2. , 2.5],
+ [3. , 3.5, 4. ]])
+
""")
add_newdoc('numpy.core.umath', 'equal',
@@ -1123,6 +1172,14 @@ add_newdoc('numpy.core.umath', 'equal',
>>> np.equal(1, np.ones(1))
array([ True])
+ The ``==`` operator can be used as a shorthand for ``np.equal`` on
+ ndarrays.
+
+ >>> a = np.array([2, 4, 6])
+ >>> b = np.array([2, 4, 2])
+ >>> a == b
+ array([ True, True, False])
+
""")
add_newdoc('numpy.core.umath', 'exp',
@@ -1370,6 +1427,13 @@ add_newdoc('numpy.core.umath', 'floor_divide',
>>> np.floor_divide([1., 2., 3., 4.], 2.5)
array([ 0., 0., 1., 1.])
+ The ``//`` operator can be used as a shorthand for ``np.floor_divide``
+ on ndarrays.
+
+ >>> x1 = np.array([1., 2., 3., 4.])
+ >>> x1 // 2.5
+ array([0., 0., 1., 1.])
+
""")
add_newdoc('numpy.core.umath', 'fmod',
@@ -1458,10 +1522,11 @@ add_newdoc('numpy.core.umath', 'greater',
>>> np.greater([4,2],[2,2])
array([ True, False])
- If the inputs are ndarrays, then np.greater is equivalent to '>'.
+ The ``>`` operator can be used as a shorthand for ``np.greater`` on
+ ndarrays.
- >>> a = np.array([4,2])
- >>> b = np.array([2,2])
+ >>> a = np.array([4, 2])
+ >>> b = np.array([2, 2])
>>> a > b
array([ True, False])
@@ -1494,6 +1559,14 @@ add_newdoc('numpy.core.umath', 'greater_equal',
>>> np.greater_equal([4, 2, 1], [2, 2, 2])
array([ True, True, False])
+ The ``>=`` operator can be used as a shorthand for ``np.greater_equal``
+ on ndarrays.
+
+ >>> a = np.array([4, 2, 1])
+ >>> b = np.array([2, 2, 2])
+ >>> a >= b
+ array([ True, True, False])
+
""")
add_newdoc('numpy.core.umath', 'hypot',
@@ -1612,6 +1685,13 @@ add_newdoc('numpy.core.umath', 'invert',
>>> np.invert(np.array([True, False]))
array([False, True])
+ The ``~`` operator can be used as a shorthand for ``np.invert`` on
+ ndarrays.
+
+ >>> x1 = np.array([True, False])
+ >>> ~x1
+ array([False, True])
+
""")
add_newdoc('numpy.core.umath', 'isfinite',
@@ -1846,6 +1926,14 @@ add_newdoc('numpy.core.umath', 'left_shift',
>>> print(b, type(b))
254 <class 'numpy.uint8'>
+ The ``<<`` operator can be used as a shorthand for ``np.left_shift`` on
+ ndarrays.
+
+ >>> x1 = 5
+ >>> x2 = np.array([1, 2, 3])
+ >>> x1 << x2
+ array([10, 20, 40])
+
""")
add_newdoc('numpy.core.umath', 'less',
@@ -1875,11 +1963,18 @@ add_newdoc('numpy.core.umath', 'less',
>>> np.less([1, 2], [2, 2])
array([ True, False])
+ The ``<`` operator can be used as a shorthand for ``np.less`` on ndarrays.
+
+ >>> a = np.array([1, 2])
+ >>> b = np.array([2, 2])
+ >>> a < b
+ array([ True, False])
+
""")
add_newdoc('numpy.core.umath', 'less_equal',
"""
- Return the truth value of (x1 =< x2) element-wise.
+ Return the truth value of (x1 <= x2) element-wise.
Parameters
----------
@@ -1904,6 +1999,14 @@ add_newdoc('numpy.core.umath', 'less_equal',
>>> np.less_equal([4, 2, 1], [2, 2, 2])
array([False, True, True])
+ The ``<=`` operator can be used as a shorthand for ``np.less_equal`` on
+ ndarrays.
+
+ >>> a = np.array([4, 2, 1])
+ >>> b = np.array([2, 2, 2])
+ >>> a <= b
+ array([False, True, True])
+
""")
add_newdoc('numpy.core.umath', 'log',
@@ -2231,6 +2334,15 @@ add_newdoc('numpy.core.umath', 'logical_and',
>>> np.logical_and(x>1, x<4)
array([False, False, True, True, False])
+
+ The ``&`` operator can be used as a shorthand for ``np.logical_and`` on
+ boolean ndarrays.
+
+ >>> a = np.array([True, False])
+ >>> b = np.array([False, False])
+ >>> a & b
+ array([False, False])
+
""")
add_newdoc('numpy.core.umath', 'logical_not',
@@ -2301,6 +2413,14 @@ add_newdoc('numpy.core.umath', 'logical_or',
>>> np.logical_or(x < 1, x > 3)
array([ True, False, False, False, True])
+ The ``|`` operator can be used as a shorthand for ``np.logical_or`` on
+ boolean ndarrays.
+
+ >>> a = np.array([True, False])
+ >>> b = np.array([False, False])
+ >>> a | b
+ array([ True, False])
+
""")
add_newdoc('numpy.core.umath', 'logical_xor',
@@ -2646,8 +2766,8 @@ add_newdoc('numpy.core.umath', 'matmul',
Raises
------
ValueError
- If the last dimension of `a` is not the same size as
- the second-to-last dimension of `b`.
+ If the last dimension of `x1` is not the same size as
+ the second-to-last dimension of `x2`.
If a scalar value is passed in.
@@ -2738,6 +2858,14 @@ add_newdoc('numpy.core.umath', 'matmul',
...
ValueError: matmul: Input operand 1 does not have enough dimensions ...
+ The ``@`` operator can be used as a shorthand for ``np.matmul`` on
+ ndarrays.
+
+ >>> x1 = np.array([2j, 3j])
+ >>> x2 = np.array([2j, 3j])
+ >>> x1 @ x2
+ (-13+0j)
+
.. versionadded:: 1.10.0
""")
@@ -2814,6 +2942,16 @@ add_newdoc('numpy.core.umath', 'multiply',
[ 0., 4., 10.],
[ 0., 7., 16.]])
+ The ``*`` operator can be used as a shorthand for ``np.multiply`` on
+ ndarrays.
+
+ >>> x1 = np.arange(9.0).reshape((3, 3))
+ >>> x2 = np.arange(3.0)
+ >>> x1 * x2
+ array([[ 0., 1., 4.],
+ [ 0., 4., 10.],
+ [ 0., 7., 16.]])
+
""")
add_newdoc('numpy.core.umath', 'negative',
@@ -2837,6 +2975,13 @@ add_newdoc('numpy.core.umath', 'negative',
>>> np.negative([1.,-1.])
array([-1., 1.])
+ The unary ``-`` operator can be used as a shorthand for ``np.negative`` on
+ ndarrays.
+
+ >>> x1 = np.array(([1., -1.]))
+ >>> -x1
+ array([-1., 1.])
+
""")
add_newdoc('numpy.core.umath', 'positive',
@@ -2861,6 +3006,20 @@ add_newdoc('numpy.core.umath', 'positive',
Equivalent to `x.copy()`, but only defined for types that support
arithmetic.
+ Examples
+ --------
+
+ >>> x1 = np.array(([1., -1.]))
+ >>> np.positive(x1)
+ array([ 1., -1.])
+
+ The unary ``+`` operator can be used as a shorthand for ``np.positive`` on
+ ndarrays.
+
+ >>> x1 = np.array(([1., -1.]))
+ >>> +x1
+ array([ 1., -1.])
+
""")
add_newdoc('numpy.core.umath', 'not_equal',
@@ -2893,6 +3052,15 @@ add_newdoc('numpy.core.umath', 'not_equal',
array([[False, True],
[False, True]])
+ The ``!=`` operator can be used as a shorthand for ``np.not_equal`` on
+ ndarrays.
+
+ >>> a = np.array([1., 2.])
+ >>> b = np.array([1., 3.])
+ >>> a != b
+ array([False, True])
+
+
""")
add_newdoc('numpy.core.umath', '_ones_like',
@@ -2936,9 +3104,9 @@ add_newdoc('numpy.core.umath', 'power',
Examples
--------
- Cube each element in a list.
+ Cube each element in an array.
- >>> x1 = range(6)
+ >>> x1 = np.arange(6)
>>> x1
- [0, 1, 2, 3, 4, 5]
+ array([0, 1, 2, 3, 4, 5])
>>> np.power(x1, 3)
@@ -2960,6 +3128,14 @@ add_newdoc('numpy.core.umath', 'power',
array([[ 0, 1, 8, 27, 16, 5],
[ 0, 1, 8, 27, 16, 5]])
+ The ``**`` operator can be used as a shorthand for ``np.power`` on
+ ndarrays.
+
+ >>> x2 = np.array([1, 2, 3, 3, 2, 1])
+ >>> x1 = np.arange(6)
+ >>> x1 ** x2
+ array([ 0, 1, 8, 27, 16, 5])
+
""")
add_newdoc('numpy.core.umath', 'float_power',
@@ -3183,6 +3359,13 @@ add_newdoc('numpy.core.umath', 'remainder',
>>> np.remainder(np.arange(7), 5)
array([0, 1, 2, 3, 4, 0, 1])
+ The ``%`` operator can be used as a shorthand for ``np.remainder`` on
+ ndarrays.
+
+ >>> x1 = np.arange(7)
+ >>> x1 % 5
+ array([0, 1, 2, 3, 4, 0, 1])
+
""")
add_newdoc('numpy.core.umath', 'divmod',
@@ -3225,6 +3408,13 @@ add_newdoc('numpy.core.umath', 'divmod',
>>> np.divmod(np.arange(5), 3)
(array([0, 0, 0, 1, 1]), array([0, 1, 2, 0, 1]))
+ The `divmod` function can be used as a shorthand for ``np.divmod`` on
+ ndarrays.
+
+ >>> x = np.arange(5)
+ >>> divmod(x, 3)
+ (array([0, 0, 0, 1, 1]), array([0, 1, 2, 0, 1]))
+
""")
add_newdoc('numpy.core.umath', 'right_shift',
@@ -3268,6 +3458,14 @@ add_newdoc('numpy.core.umath', 'right_shift',
>>> np.right_shift(10, [1,2,3])
array([5, 2, 1])
+ The ``>>`` operator can be used as a shorthand for ``np.right_shift`` on
+ ndarrays.
+
+ >>> x1 = 10
+ >>> x2 = np.array([1,2,3])
+ >>> x1 >> x2
+ array([5, 2, 1])
+
""")
add_newdoc('numpy.core.umath', 'rint',
@@ -3709,6 +3907,16 @@ add_newdoc('numpy.core.umath', 'subtract',
[ 3., 3., 3.],
[ 6., 6., 6.]])
+ The ``-`` operator can be used as a shorthand for ``np.subtract`` on
+ ndarrays.
+
+ >>> x1 = np.arange(9.0).reshape((3, 3))
+ >>> x2 = np.arange(3.0)
+ >>> x1 - x2
+ array([[0., 0., 0.],
+ [3., 3., 3.],
+ [6., 6., 6.]])
+
""")
add_newdoc('numpy.core.umath', 'tan',
@@ -3851,6 +4059,14 @@ add_newdoc('numpy.core.umath', 'true_divide',
>>> x//4
array([0, 0, 0, 0, 1])
+
+ The ``/`` operator can be used as a shorthand for ``np.true_divide`` on
+ ndarrays.
+
+ >>> x = np.arange(5)
+ >>> x / 4
+ array([0. , 0.25, 0.5 , 0.75, 1. ])
+
""")
add_newdoc('numpy.core.umath', 'frexp',
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index 1d447b86a..9d7b54a1a 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -114,8 +114,8 @@ def equal(x1, x2):
Returns
-------
- out : ndarray or bool
- Output array of bools, or a single bool if x1 and x2 are scalars.
+ out : ndarray
+ Output array of bools.
See Also
--------
@@ -140,8 +140,8 @@ def not_equal(x1, x2):
Returns
-------
- out : ndarray or bool
- Output array of bools, or a single bool if x1 and x2 are scalars.
+ out : ndarray
+ Output array of bools.
See Also
--------
@@ -167,8 +167,8 @@ def greater_equal(x1, x2):
Returns
-------
- out : ndarray or bool
- Output array of bools, or a single bool if x1 and x2 are scalars.
+ out : ndarray
+ Output array of bools.
See Also
--------
@@ -193,8 +193,8 @@ def less_equal(x1, x2):
Returns
-------
- out : ndarray or bool
- Output array of bools, or a single bool if x1 and x2 are scalars.
+ out : ndarray
+ Output array of bools.
See Also
--------
@@ -219,8 +219,8 @@ def greater(x1, x2):
Returns
-------
- out : ndarray or bool
- Output array of bools, or a single bool if x1 and x2 are scalars.
+ out : ndarray
+ Output array of bools.
See Also
--------
@@ -245,8 +245,8 @@ def less(x1, x2):
Returns
-------
- out : ndarray or bool
- Output array of bools, or a single bool if x1 and x2 are scalars.
+ out : ndarray
+ Output array of bools.
See Also
--------
diff --git a/numpy/core/einsumfunc.py b/numpy/core/einsumfunc.py
index f65f4015c..e0942beca 100644
--- a/numpy/core/einsumfunc.py
+++ b/numpy/core/einsumfunc.py
@@ -1062,6 +1062,17 @@ def einsum(*operands, out=None, optimize=False, **kwargs):
--------
einsum_path, dot, inner, outer, tensordot, linalg.multi_dot
+ einops:
+ a similar verbose interface is provided by the
+ `einops <https://github.com/arogozhnikov/einops>`_ package to cover
+ additional operations: transpose, reshape/flatten, repeat/tile,
+ squeeze/unsqueeze and reductions.
+
+ opt_einsum:
+ `opt_einsum <https://optimized-einsum.readthedocs.io/en/stable/>`_
+ optimizes contraction order for einsum-like expressions
+ in a backend-agnostic manner.
+
Notes
-----
.. versionadded:: 1.6.0
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index f8c11c015..efb052bc2 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -460,7 +460,7 @@ def repeat(a, repeats, axis=None):
--------
tile : Tile an array.
unique : Find the unique elements of an array.
-
+
Examples
--------
>>> np.repeat(3, 4)
@@ -1375,7 +1375,7 @@ def resize(a, new_shape):
reshaped_array : ndarray
The new array is formed from the data in the old array, repeated
if necessary to fill out the required number of elements. The
- data are repeated in the order that they are stored in memory.
+ data are repeated iterating over the array in C-order.
See Also
--------
@@ -1392,11 +1392,11 @@ def resize(a, new_shape):
Warning: This functionality does **not** consider axes separately,
i.e. it does not apply interpolation/extrapolation.
- It fills the return array with the required number of elements, taken
- from `a` as they are laid out in memory, disregarding strides and axes.
- (This is in case the new shape is smaller. For larger, see above.)
- This functionality is therefore not suitable to resize images,
- or data where each axis represents a separate and distinct entity.
+ It fills the return array with the required number of elements, iterating
+ over `a` in C-order, disregarding axes (and cycling back from the start if
+ the new shape is larger). This functionality is therefore not suitable to
+ resize images, or data where each axis represents a separate and distinct
+ entity.
Examples
--------
@@ -1840,11 +1840,11 @@ def nonzero(a):
.. note::
When called on a zero-d array or scalar, ``nonzero(a)`` is treated
- as ``nonzero(atleast1d(a))``.
+ as ``nonzero(atleast_1d(a))``.
.. deprecated:: 1.17.0
- Use `atleast1d` explicitly if this behavior is deliberate.
+ Use `atleast_1d` explicitly if this behavior is deliberate.
Parameters
----------
@@ -1941,7 +1941,7 @@ def shape(a):
See Also
--------
- alen
+ len
ndarray.shape : Equivalent array method.
Examples
@@ -2007,8 +2007,8 @@ def compress(condition, a, axis=None, out=None):
--------
take, choose, diag, diagonal, select
ndarray.compress : Equivalent method in ndarray
- np.extract: Equivalent method when working on 1-D arrays
- ufuncs-output-type
+ extract: Equivalent method when working on 1-D arrays
+ :ref:`ufuncs-output-type`
Examples
--------
@@ -2059,15 +2059,10 @@ def clip(a, a_min, a_max, out=None, **kwargs):
----------
a : array_like
Array containing elements to clip.
- a_min : scalar or array_like or None
- Minimum value. If None, clipping is not performed on lower
- interval edge. Not more than one of `a_min` and `a_max` may be
- None.
- a_max : scalar or array_like or None
- Maximum value. If None, clipping is not performed on upper
- interval edge. Not more than one of `a_min` and `a_max` may be
- None. If `a_min` or `a_max` are array_like, then the three
- arrays will be broadcasted to match their shapes.
+ a_min, a_max : array_like or None
+ Minimum and maximum value. If ``None``, clipping is not performed on
+ the corresponding edge. Only one of `a_min` and `a_max` may be
+ ``None``. Both are broadcast against `a`.
out : ndarray, optional
The results will be placed in this array. It may be the input
array for in-place clipping. `out` must be of the right shape
@@ -2087,7 +2082,7 @@ def clip(a, a_min, a_max, out=None, **kwargs):
See Also
--------
- ufuncs-output-type
+ :ref:`ufuncs-output-type`
Examples
--------
@@ -2253,12 +2248,13 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue,
initial=initial, where=where)
-def _any_dispatcher(a, axis=None, out=None, keepdims=None):
- return (a, out)
+def _any_dispatcher(a, axis=None, out=None, keepdims=None, *,
+ where=np._NoValue):
+ return (a, where, out)
@array_function_dispatch(_any_dispatcher)
-def any(a, axis=None, out=None, keepdims=np._NoValue):
+def any(a, axis=None, out=None, keepdims=np._NoValue, *, where=np._NoValue):
"""
Test whether any array element along a given axis evaluates to True.
@@ -2283,7 +2279,7 @@ def any(a, axis=None, out=None, keepdims=np._NoValue):
the same shape as the expected output and its type is preserved
(e.g., if it is of type float, then it will remain so, returning
1.0 for True and 0.0 for False, regardless of the type of `a`).
- See `ufuncs-output-type` for more details.
+ See :ref:`ufuncs-output-type` for more details.
keepdims : bool, optional
If this is set to True, the axes which are reduced are left
@@ -2296,6 +2292,12 @@ def any(a, axis=None, out=None, keepdims=np._NoValue):
sub-class' method does not implement `keepdims` any
exceptions will be raised.
+ where : array_like of bool, optional
+ Elements to include in checking for any `True` values.
+ See `~numpy.ufunc.reduce` for details.
+
+ .. versionadded:: 1.20.0
+
Returns
-------
any : bool or ndarray
@@ -2327,6 +2329,9 @@ def any(a, axis=None, out=None, keepdims=np._NoValue):
>>> np.any(np.nan)
True
+ >>> np.any([[True, False], [False, False]], where=[[False], [True]])
+ False
+
>>> o=np.array(False)
>>> z=np.any([-1, 4, 5], out=o)
>>> z, o
@@ -2338,15 +2343,17 @@ def any(a, axis=None, out=None, keepdims=np._NoValue):
(191614240, 191614240)
"""
- return _wrapreduction(a, np.logical_or, 'any', axis, None, out, keepdims=keepdims)
+ return _wrapreduction(a, np.logical_or, 'any', axis, None, out,
+ keepdims=keepdims, where=where)
-def _all_dispatcher(a, axis=None, out=None, keepdims=None):
- return (a, out)
+def _all_dispatcher(a, axis=None, out=None, keepdims=None, *,
+ where=None):
+ return (a, where, out)
@array_function_dispatch(_all_dispatcher)
-def all(a, axis=None, out=None, keepdims=np._NoValue):
+def all(a, axis=None, out=None, keepdims=np._NoValue, *, where=np._NoValue):
"""
Test whether all array elements along a given axis evaluate to True.
@@ -2368,7 +2375,7 @@ def all(a, axis=None, out=None, keepdims=np._NoValue):
Alternate output array in which to place the result.
It must have the same shape as the expected output and its
type is preserved (e.g., if ``dtype(out)`` is float, the result
- will consist of 0.0's and 1.0's). See `ufuncs-output-type` for more
+ will consist of 0.0's and 1.0's). See :ref:`ufuncs-output-type` for more
details.
keepdims : bool, optional
@@ -2382,6 +2389,12 @@ def all(a, axis=None, out=None, keepdims=np._NoValue):
sub-class' method does not implement `keepdims` any
exceptions will be raised.
+ where : array_like of bool, optional
+ Elements to include in checking for all `True` values.
+ See `~numpy.ufunc.reduce` for details.
+
+ .. versionadded:: 1.20.0
+
Returns
-------
all : ndarray, bool
@@ -2413,13 +2426,17 @@ def all(a, axis=None, out=None, keepdims=np._NoValue):
>>> np.all([1.0, np.nan])
True
+ >>> np.all([[True, True], [False, True]], where=[[True], [False]])
+ True
+
>>> o=np.array(False)
>>> z=np.all([-1, 4, 5], out=o)
>>> id(z), id(o), z
(28293632, 28293632, array(True)) # may vary
"""
- return _wrapreduction(a, np.logical_and, 'all', axis, None, out, keepdims=keepdims)
+ return _wrapreduction(a, np.logical_and, 'all', axis, None, out,
+ keepdims=keepdims, where=where)
def _cumsum_dispatcher(a, axis=None, dtype=None, out=None):
@@ -2447,7 +2464,7 @@ def cumsum(a, axis=None, dtype=None, out=None):
out : ndarray, optional
Alternative output array in which to place the result. It must
have the same shape and buffer length as the expected output
- but the type will be cast if necessary. See `ufuncs-output-type` for
+ but the type will be cast if necessary. See :ref:`ufuncs-output-type` for
more details.
Returns
@@ -2618,7 +2635,7 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
out : ndarray, optional
Alternative output array in which to place the result. Must
be of the same shape and buffer length as the expected output.
- See `ufuncs-output-type` for more details.
+ See :ref:`ufuncs-output-type` for more details.
keepdims : bool, optional
If this is set to True, the axes which are reduced are left
@@ -2743,7 +2760,7 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
out : ndarray, optional
Alternative output array in which to place the result. Must
be of the same shape and buffer length as the expected output.
- See `ufuncs-output-type` for more details.
+ See :ref:`ufuncs-output-type` for more details.
keepdims : bool, optional
If this is set to True, the axes which are reduced are left
@@ -2953,7 +2970,7 @@ def prod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue,
See Also
--------
ndarray.prod : equivalent method
- ufuncs-output-type
+ :ref:`ufuncs-output-type`
Notes
-----
@@ -3049,7 +3066,7 @@ def cumprod(a, axis=None, dtype=None, out=None):
See Also
--------
- ufuncs-output-type
+ :ref:`ufuncs-output-type`
Notes
-----
@@ -3195,7 +3212,7 @@ def around(a, decimals=0, out=None):
out : ndarray, optional
Alternative output array in which to place the result. It must have
the same shape as the expected output, but the type of the output
- values will be cast if necessary. See `ufuncs-output-type` for more
+ values will be cast if necessary. See :ref:`ufuncs-output-type` for more
details.
Returns
@@ -3276,12 +3293,14 @@ def around(a, decimals=0, out=None):
return _wrapfunc(a, 'round', decimals=decimals, out=out)
-def _mean_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None):
- return (a, out)
+def _mean_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None, *,
+ where=None):
+ return (a, where, out)
@array_function_dispatch(_mean_dispatcher)
-def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue):
+def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, *,
+ where=np._NoValue):
"""
Compute the arithmetic mean along the specified axis.
@@ -3310,7 +3329,7 @@ def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue):
Alternate output array in which to place the result. The default
is ``None``; if provided, it must have the same shape as the
expected output, but the type will be cast if necessary.
- See `ufuncs-output-type` for more details.
+ See :ref:`ufuncs-output-type` for more details.
keepdims : bool, optional
If this is set to True, the axes which are reduced are left
@@ -3323,6 +3342,11 @@ def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue):
sub-class' method does not implement `keepdims` any
exceptions will be raised.
+ where : array_like of bool, optional
+ Elements to include in the mean. See `~numpy.ufunc.reduce` for details.
+
+ .. versionadded:: 1.20.0
+
Returns
-------
m : ndarray, see dtype parameter above
@@ -3371,10 +3395,19 @@ def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue):
>>> np.mean(a, dtype=np.float64)
0.55000000074505806 # may vary
+ Specifying a where argument:
+ >>> a = np.array([[5, 9, 13], [14, 10, 12], [11, 15, 19]])
+ >>> np.mean(a)
+ 12.0
+ >>> np.mean(a, where=[[True], [False], [False]])
+ 9.0
+
"""
kwargs = {}
if keepdims is not np._NoValue:
kwargs['keepdims'] = keepdims
+ if where is not np._NoValue:
+ kwargs['where'] = where
if type(a) is not mu.ndarray:
try:
mean = a.mean
@@ -3387,13 +3420,14 @@ def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue):
out=out, **kwargs)
-def _std_dispatcher(
- a, axis=None, dtype=None, out=None, ddof=None, keepdims=None):
- return (a, out)
+def _std_dispatcher(a, axis=None, dtype=None, out=None, ddof=None,
+ keepdims=None, *, where=None):
+ return (a, where, out)
@array_function_dispatch(_std_dispatcher)
-def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
+def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue, *,
+ where=np._NoValue):
"""
Compute the standard deviation along the specified axis.
@@ -3436,6 +3470,12 @@ def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
sub-class' method does not implement `keepdims` any
exceptions will be raised.
+ where : array_like of bool, optional
+ Elements to include in the standard deviation.
+ See `~numpy.ufunc.reduce` for details.
+
+ .. versionadded:: 1.20.0
+
Returns
-------
standard_deviation : ndarray, see dtype parameter above.
@@ -3445,12 +3485,12 @@ def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
See Also
--------
var, mean, nanmean, nanstd, nanvar
- ufuncs-output-type
+ :ref:`ufuncs-output-type`
Notes
-----
The standard deviation is the square root of the average of the squared
- deviations from the mean, i.e., ``std = sqrt(mean(x))``, where
+ deviations from the mean, i.e., ``std = sqrt(mean(x))``, where
``x = abs(a - a.mean())**2``.
The average squared deviation is typically calculated as ``x.sum() / N``,
@@ -3495,11 +3535,20 @@ def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
>>> np.std(a, dtype=np.float64)
0.44999999925494177 # may vary
+ Specifying a where argument:
+
+ >>> a = np.array([[14, 8, 11, 10], [7, 9, 10, 11], [10, 15, 5, 10]])
+ >>> np.std(a)
+ 2.614064523559687 # may vary
+ >>> np.std(a, where=[[True], [True], [False]])
+ 2.0
+
"""
kwargs = {}
if keepdims is not np._NoValue:
kwargs['keepdims'] = keepdims
-
+ if where is not np._NoValue:
+ kwargs['where'] = where
if type(a) is not mu.ndarray:
try:
std = a.std
@@ -3512,13 +3561,14 @@ def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
**kwargs)
-def _var_dispatcher(
- a, axis=None, dtype=None, out=None, ddof=None, keepdims=None):
- return (a, out)
+def _var_dispatcher(a, axis=None, dtype=None, out=None, ddof=None,
+ keepdims=None, *, where=None):
+ return (a, where, out)
@array_function_dispatch(_var_dispatcher)
-def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
+def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue, *,
+ where=np._NoValue):
"""
Compute the variance along the specified axis.
@@ -3562,6 +3612,12 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
sub-class' method does not implement `keepdims` any
exceptions will be raised.
+ where : array_like of bool, optional
+ Elements to include in the variance. See `~numpy.ufunc.reduce` for
+ details.
+
+ .. versionadded:: 1.20.0
+
Returns
-------
variance : ndarray, see dtype parameter above
@@ -3571,7 +3627,7 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
See Also
--------
std, mean, nanmean, nanstd, nanvar
- ufuncs-output-type
+ :ref:`ufuncs-output-type`
Notes
-----
@@ -3619,10 +3675,20 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
>>> ((1-0.55)**2 + (0.1-0.55)**2)/2
0.2025
+ Specifying a where argument:
+
+ >>> a = np.array([[14, 8, 11, 10], [7, 9, 10, 11], [10, 15, 5, 10]])
+ >>> np.var(a)
+ 6.833333333333333 # may vary
+ >>> np.var(a, where=[[True], [True], [False]])
+ 4.0
+
"""
kwargs = {}
if keepdims is not np._NoValue:
kwargs['keepdims'] = keepdims
+ if where is not np._NoValue:
+ kwargs['where'] = where
if type(a) is not mu.ndarray:
try:
diff --git a/numpy/core/fromnumeric.pyi b/numpy/core/fromnumeric.pyi
new file mode 100644
index 000000000..66eb3bfb8
--- /dev/null
+++ b/numpy/core/fromnumeric.pyi
@@ -0,0 +1,483 @@
+import sys
+import datetime as dt
+from typing import Optional, Union, Sequence, Tuple, Any, overload, TypeVar
+
+from numpy import (
+ ndarray,
+ number,
+ integer,
+ bool_,
+ generic,
+ _OrderKACF,
+ _OrderACF,
+ _ArrayLikeBool,
+ _ArrayLikeIntOrBool,
+ _ModeKind,
+ _PartitionKind,
+ _SortKind,
+ _SortSide,
+)
+from numpy.typing import (
+ DTypeLike,
+ ArrayLike,
+ _ShapeLike,
+ _Shape,
+ _IntLike,
+ _BoolLike,
+ _NumberLike,
+)
+
+if sys.version_info >= (3, 8):
+ from typing import Literal
+else:
+ from typing_extensions import Literal
+
+# Various annotations for scalars
+
+# While dt.datetime and dt.timedelta are not technically part of NumPy,
+# they are one of the rare few builtin scalars which serve as valid return types.
+# See https://github.com/numpy/numpy-stubs/pull/67#discussion_r412604113.
+_ScalarNumpy = Union[generic, dt.datetime, dt.timedelta]
+_ScalarBuiltin = Union[str, bytes, dt.date, dt.timedelta, bool, int, float, complex]
+_Scalar = Union[_ScalarBuiltin, _ScalarNumpy]
+
+# Integers and booleans can generally be used interchangeably
+_ScalarIntOrBool = TypeVar("_ScalarIntOrBool", bound=Union[integer, bool_])
+_ScalarGeneric = TypeVar("_ScalarGeneric", bound=generic)
+_ScalarGenericDT = TypeVar(
+ "_ScalarGenericDT", bound=Union[dt.datetime, dt.timedelta, generic]
+)
+
+_Number = TypeVar("_Number", bound=number)
+
+# The signature of take() follows a common theme with its overloads:
+# 1. A generic comes in; the same generic comes out
+# 2. A scalar comes in; a generic comes out
+# 3. An array-like object comes in; some keyword ensures that a generic comes out
+# 4. An array-like object comes in; an ndarray or generic comes out
+@overload
+def take(
+ a: _ScalarGenericDT,
+ indices: int,
+ axis: Optional[int] = ...,
+ out: Optional[ndarray] = ...,
+ mode: _ModeKind = ...,
+) -> _ScalarGenericDT: ...
+@overload
+def take(
+ a: _Scalar,
+ indices: int,
+ axis: Optional[int] = ...,
+ out: Optional[ndarray] = ...,
+ mode: _ModeKind = ...,
+) -> _ScalarNumpy: ...
+@overload
+def take(
+ a: ArrayLike,
+ indices: int,
+ axis: Optional[int] = ...,
+ out: Optional[ndarray] = ...,
+ mode: _ModeKind = ...,
+) -> _ScalarNumpy: ...
+@overload
+def take(
+ a: ArrayLike,
+ indices: _ArrayLikeIntOrBool,
+ axis: Optional[int] = ...,
+ out: Optional[ndarray] = ...,
+ mode: _ModeKind = ...,
+) -> Union[_ScalarNumpy, ndarray]: ...
+def reshape(a: ArrayLike, newshape: _ShapeLike, order: _OrderACF = ...) -> ndarray: ...
+@overload
+def choose(
+ a: _ScalarIntOrBool,
+ choices: ArrayLike,
+ out: Optional[ndarray] = ...,
+ mode: _ModeKind = ...,
+) -> _ScalarIntOrBool: ...
+@overload
+def choose(
+ a: Union[_IntLike, _BoolLike], choices: ArrayLike, out: Optional[ndarray] = ..., mode: _ModeKind = ...
+) -> Union[integer, bool_]: ...
+@overload
+def choose(
+ a: _ArrayLikeIntOrBool,
+ choices: ArrayLike,
+ out: Optional[ndarray] = ...,
+ mode: _ModeKind = ...,
+) -> ndarray: ...
+def repeat(
+ a: ArrayLike, repeats: _ArrayLikeIntOrBool, axis: Optional[int] = ...
+) -> ndarray: ...
+def put(
+ a: ndarray, ind: _ArrayLikeIntOrBool, v: ArrayLike, mode: _ModeKind = ...
+) -> None: ...
+def swapaxes(a: ArrayLike, axis1: int, axis2: int) -> ndarray: ...
+def transpose(
+ a: ArrayLike, axes: Union[None, Sequence[int], ndarray] = ...
+) -> ndarray: ...
+def partition(
+ a: ArrayLike,
+ kth: _ArrayLikeIntOrBool,
+ axis: Optional[int] = ...,
+ kind: _PartitionKind = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+) -> ndarray: ...
+@overload
+def argpartition(
+ a: generic,
+ kth: _ArrayLikeIntOrBool,
+ axis: Optional[int] = ...,
+ kind: _PartitionKind = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+) -> integer: ...
+@overload
+def argpartition(
+ a: _ScalarBuiltin,
+ kth: _ArrayLikeIntOrBool,
+ axis: Optional[int] = ...,
+ kind: _PartitionKind = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+) -> ndarray: ...
+@overload
+def argpartition(
+ a: ArrayLike,
+ kth: _ArrayLikeIntOrBool,
+ axis: Optional[int] = ...,
+ kind: _PartitionKind = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+) -> ndarray: ...
+def sort(
+ a: ArrayLike,
+ axis: Optional[int] = ...,
+ kind: Optional[_SortKind] = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+) -> ndarray: ...
+def argsort(
+ a: ArrayLike,
+ axis: Optional[int] = ...,
+ kind: Optional[_SortKind] = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+) -> ndarray: ...
+@overload
+def argmax(a: ArrayLike, axis: None = ..., out: Optional[ndarray] = ...) -> integer: ...
+@overload
+def argmax(
+ a: ArrayLike, axis: int = ..., out: Optional[ndarray] = ...
+) -> Union[integer, ndarray]: ...
+@overload
+def argmin(a: ArrayLike, axis: None = ..., out: Optional[ndarray] = ...) -> integer: ...
+@overload
+def argmin(
+ a: ArrayLike, axis: int = ..., out: Optional[ndarray] = ...
+) -> Union[integer, ndarray]: ...
+@overload
+def searchsorted(
+ a: ArrayLike,
+ v: _Scalar,
+ side: _SortSide = ...,
+ sorter: Optional[_ArrayLikeIntOrBool] = ..., # 1D int array
+) -> integer: ...
+@overload
+def searchsorted(
+ a: ArrayLike,
+ v: ArrayLike,
+ side: _SortSide = ...,
+ sorter: Optional[_ArrayLikeIntOrBool] = ..., # 1D int array
+) -> ndarray: ...
+def resize(a: ArrayLike, new_shape: _ShapeLike) -> ndarray: ...
+@overload
+def squeeze(a: _ScalarGeneric, axis: Optional[_ShapeLike] = ...) -> _ScalarGeneric: ...
+@overload
+def squeeze(a: ArrayLike, axis: Optional[_ShapeLike] = ...) -> ndarray: ...
+def diagonal(
+ a: ArrayLike, offset: int = ..., axis1: int = ..., axis2: int = ... # >= 2D array
+) -> ndarray: ...
+def trace(
+ a: ArrayLike, # >= 2D array
+ offset: int = ...,
+ axis1: int = ...,
+ axis2: int = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+) -> Union[number, ndarray]: ...
+def ravel(a: ArrayLike, order: _OrderKACF = ...) -> ndarray: ...
+def nonzero(a: ArrayLike) -> Tuple[ndarray, ...]: ...
+def shape(a: ArrayLike) -> _Shape: ...
+def compress(
+ condition: ArrayLike, # 1D bool array
+ a: ArrayLike,
+ axis: Optional[int] = ...,
+ out: Optional[ndarray] = ...,
+) -> ndarray: ...
+@overload
+def clip(
+ a: _Number,
+ a_min: ArrayLike,
+ a_max: Optional[ArrayLike],
+ out: Optional[ndarray] = ...,
+ **kwargs: Any,
+) -> _Number: ...
+@overload
+def clip(
+ a: _Number,
+ a_min: None,
+ a_max: ArrayLike,
+ out: Optional[ndarray] = ...,
+ **kwargs: Any,
+) -> _Number: ...
+@overload
+def clip(
+ a: ArrayLike,
+ a_min: ArrayLike,
+ a_max: Optional[ArrayLike],
+ out: Optional[ndarray] = ...,
+ **kwargs: Any,
+) -> Union[number, ndarray]: ...
+@overload
+def clip(
+ a: ArrayLike,
+ a_min: None,
+ a_max: ArrayLike,
+ out: Optional[ndarray] = ...,
+ **kwargs: Any,
+) -> Union[number, ndarray]: ...
+@overload
+def sum(
+ a: _Number,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> _Number: ...
+@overload
+def sum(
+ a: ArrayLike,
+ axis: _ShapeLike = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> Union[number, ndarray]: ...
+@overload
+def all(
+ a: ArrayLike,
+ axis: None = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: Literal[False] = ...,
+) -> bool_: ...
+@overload
+def all(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+) -> Union[bool_, ndarray]: ...
+@overload
+def any(
+ a: ArrayLike,
+ axis: None = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: Literal[False] = ...,
+) -> bool_: ...
+@overload
+def any(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+) -> Union[bool_, ndarray]: ...
+def cumsum(
+ a: ArrayLike,
+ axis: Optional[int] = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+) -> ndarray: ...
+@overload
+def ptp(
+ a: _Number,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+) -> _Number: ...
+@overload
+def ptp(
+ a: ArrayLike,
+ axis: None = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: Literal[False] = ...,
+) -> number: ...
+@overload
+def ptp(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+) -> Union[number, ndarray]: ...
+@overload
+def amax(
+ a: _Number,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> _Number: ...
+@overload
+def amax(
+ a: ArrayLike,
+ axis: None = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: Literal[False] = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> number: ...
+@overload
+def amax(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> Union[number, ndarray]: ...
+@overload
+def amin(
+ a: _Number,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> _Number: ...
+@overload
+def amin(
+ a: ArrayLike,
+ axis: None = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: Literal[False] = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> number: ...
+@overload
+def amin(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> Union[number, ndarray]: ...
+
+# TODO: `np.prod()``: For object arrays `initial` does not necessarily
+# have to be a numerical scalar.
+# The only requirement is that it is compatible
+# with the `.__mul__()` method(s) of the passed array's elements.
+
+# Note that the same situation holds for all wrappers around
+# `np.ufunc.reduce`, e.g. `np.sum()` (`.__add__()`).
+@overload
+def prod(
+ a: _Number,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> _Number: ...
+@overload
+def prod(
+ a: ArrayLike,
+ axis: None = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ keepdims: Literal[False] = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> number: ...
+@overload
+def prod(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> Union[number, ndarray]: ...
+def cumprod(
+ a: ArrayLike,
+ axis: Optional[int] = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+) -> ndarray: ...
+def ndim(a: ArrayLike) -> int: ...
+def size(a: ArrayLike, axis: Optional[int] = ...) -> int: ...
+@overload
+def around(
+ a: _Number, decimals: int = ..., out: Optional[ndarray] = ...
+) -> _Number: ...
+@overload
+def around(
+ a: _NumberLike, decimals: int = ..., out: Optional[ndarray] = ...
+) -> number: ...
+@overload
+def around(
+ a: ArrayLike, decimals: int = ..., out: Optional[ndarray] = ...
+) -> ndarray: ...
+@overload
+def mean(
+ a: ArrayLike,
+ axis: None = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ keepdims: Literal[False] = ...,
+) -> number: ...
+@overload
+def mean(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+) -> Union[number, ndarray]: ...
+@overload
+def std(
+ a: ArrayLike,
+ axis: None = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ ddof: int = ...,
+ keepdims: Literal[False] = ...,
+) -> number: ...
+@overload
+def std(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+ ddof: int = ...,
+ keepdims: bool = ...,
+) -> Union[number, ndarray]: ...
+@overload
+def var(
+ a: ArrayLike,
+ axis: None = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ ddof: int = ...,
+ keepdims: Literal[False] = ...,
+) -> number: ...
+@overload
+def var(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+ ddof: int = ...,
+ keepdims: bool = ...,
+) -> Union[number, ndarray]: ...
diff --git a/numpy/core/function_base.py b/numpy/core/function_base.py
index f57e95742..e940ac230 100644
--- a/numpy/core/function_base.py
+++ b/numpy/core/function_base.py
@@ -34,6 +34,11 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None,
.. versionchanged:: 1.16.0
Non-scalar `start` and `stop` are now supported.
+ .. versionchanged:: 1.20.0
+ Values are rounded towards ``-inf`` instead of ``0`` when an
+ integer ``dtype`` is specified. The old behavior can
+ still be obtained with ``np.linspace(start, stop, num).astype(int)``
+
Parameters
----------
start : array_like
@@ -161,6 +166,9 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None,
if axis != 0:
y = _nx.moveaxis(y, 0, axis)
+ if _nx.issubdtype(dtype, _nx.integer):
+ _nx.floor(y, out=y)
+
if retstep:
return y.astype(dtype, copy=False), step
else:
@@ -199,7 +207,7 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,
endpoint : boolean, optional
If true, `stop` is the last sample. Otherwise, it is not included.
Default is True.
- base : float, optional
+ base : array_like, optional
The base of the log space. The step size between the elements in
``ln(samples) / ln(base)`` (or ``log_base(samples)``) is uniform.
Default is 10.0.
@@ -363,7 +371,7 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None, axis=0):
6.12323400e-17+1.00000000e+00j, 7.07106781e-01+7.07106781e-01j,
1.00000000e+00+0.00000000e+00j])
- Graphical illustration of ``endpoint`` parameter:
+ Graphical illustration of `endpoint` parameter:
>>> import matplotlib.pyplot as plt
>>> N = 10
diff --git a/numpy/core/function_base.pyi b/numpy/core/function_base.pyi
new file mode 100644
index 000000000..1490bed4a
--- /dev/null
+++ b/numpy/core/function_base.pyi
@@ -0,0 +1,56 @@
+import sys
+from typing import overload, Tuple, Union, Sequence, Any
+
+from numpy import ndarray, inexact
+from numpy.typing import ArrayLike, DTypeLike, _SupportsArray, _NumberLike
+
+if sys.version_info >= (3, 8):
+ from typing import SupportsIndex, Literal
+else:
+ from typing_extensions import Literal, Protocol
+
+ class SupportsIndex(Protocol):
+ def __index__(self) -> int: ...
+
+# TODO: wait for support for recursive types
+_ArrayLikeNested = Sequence[Sequence[Any]]
+_ArrayLikeNumber = Union[
+ _NumberLike, Sequence[_NumberLike], ndarray, _SupportsArray, _ArrayLikeNested
+]
+@overload
+def linspace(
+ start: _ArrayLikeNumber,
+ stop: _ArrayLikeNumber,
+ num: SupportsIndex = ...,
+ endpoint: bool = ...,
+ retstep: Literal[False] = ...,
+ dtype: DTypeLike = ...,
+ axis: SupportsIndex = ...,
+) -> ndarray: ...
+@overload
+def linspace(
+ start: _ArrayLikeNumber,
+ stop: _ArrayLikeNumber,
+ num: SupportsIndex = ...,
+ endpoint: bool = ...,
+ retstep: Literal[True] = ...,
+ dtype: DTypeLike = ...,
+ axis: SupportsIndex = ...,
+) -> Tuple[ndarray, inexact]: ...
+def logspace(
+ start: _ArrayLikeNumber,
+ stop: _ArrayLikeNumber,
+ num: SupportsIndex = ...,
+ endpoint: bool = ...,
+ base: _ArrayLikeNumber = ...,
+ dtype: DTypeLike = ...,
+ axis: SupportsIndex = ...,
+) -> ndarray: ...
+def geomspace(
+ start: _ArrayLikeNumber,
+ stop: _ArrayLikeNumber,
+ num: SupportsIndex = ...,
+ endpoint: bool = ...,
+ dtype: DTypeLike = ...,
+ axis: SupportsIndex = ...,
+) -> ndarray: ...
diff --git a/numpy/core/include/numpy/arrayscalars.h b/numpy/core/include/numpy/arrayscalars.h
index 6dce88df3..14a31988f 100644
--- a/numpy/core/include/numpy/arrayscalars.h
+++ b/numpy/core/include/numpy/arrayscalars.h
@@ -134,8 +134,7 @@ typedef struct {
char obval;
} PyScalarObject;
-#define PyStringScalarObject PyStringObject
-#define PyStringScalarObject PyStringObject
+#define PyStringScalarObject PyBytesObject
typedef struct {
/* note that the PyObject_HEAD macro lives right here */
PyUnicodeObject base;
@@ -150,6 +149,7 @@ typedef struct {
PyArray_Descr *descr;
int flags;
PyObject *base;
+ void *_buffer_info; /* private buffer info, tagged to allow warning */
} PyVoidScalarObject;
/* Macros
diff --git a/numpy/core/include/numpy/libdivide/LICENSE.txt b/numpy/core/include/numpy/libdivide/LICENSE.txt
new file mode 100644
index 000000000..d72a7c388
--- /dev/null
+++ b/numpy/core/include/numpy/libdivide/LICENSE.txt
@@ -0,0 +1,21 @@
+ zlib License
+ ------------
+
+ Copyright (C) 2010 - 2019 ridiculous_fish, <libdivide@ridiculousfish.com>
+ Copyright (C) 2016 - 2019 Kim Walisch, <kim.walisch@gmail.com>
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
diff --git a/numpy/core/include/numpy/libdivide/libdivide.h b/numpy/core/include/numpy/libdivide/libdivide.h
new file mode 100644
index 000000000..81057b7b4
--- /dev/null
+++ b/numpy/core/include/numpy/libdivide/libdivide.h
@@ -0,0 +1,2079 @@
+// libdivide.h - Optimized integer division
+// https://libdivide.com
+//
+// Copyright (C) 2010 - 2019 ridiculous_fish, <libdivide@ridiculousfish.com>
+// Copyright (C) 2016 - 2019 Kim Walisch, <kim.walisch@gmail.com>
+//
+// libdivide is dual-licensed under the Boost or zlib licenses.
+// You may use libdivide under the terms of either of these.
+// See LICENSE.txt for more details.
+
+#ifndef LIBDIVIDE_H
+#define LIBDIVIDE_H
+
+#define LIBDIVIDE_VERSION "3.0"
+#define LIBDIVIDE_VERSION_MAJOR 3
+#define LIBDIVIDE_VERSION_MINOR 0
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+ #include <cstdlib>
+ #include <cstdio>
+ #include <type_traits>
+#else
+ #include <stdlib.h>
+ #include <stdio.h>
+#endif
+
+#if defined(LIBDIVIDE_AVX512)
+ #include <immintrin.h>
+#elif defined(LIBDIVIDE_AVX2)
+ #include <immintrin.h>
+#elif defined(LIBDIVIDE_SSE2)
+ #include <emmintrin.h>
+#endif
+
+#if defined(_MSC_VER)
+ #include <intrin.h>
+ // disable warning C4146: unary minus operator applied
+ // to unsigned type, result still unsigned
+ #pragma warning(disable: 4146)
+ #define LIBDIVIDE_VC
+#endif
+
+#if !defined(__has_builtin)
+ #define __has_builtin(x) 0
+#endif
+
+#if defined(__SIZEOF_INT128__)
+ #define HAS_INT128_T
+ // clang-cl on Windows does not yet support 128-bit division
+ #if !(defined(__clang__) && defined(LIBDIVIDE_VC))
+ #define HAS_INT128_DIV
+ #endif
+#endif
+
+#if defined(__x86_64__) || defined(_M_X64)
+ #define LIBDIVIDE_X86_64
+#endif
+
+#if defined(__i386__)
+ #define LIBDIVIDE_i386
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+ #define LIBDIVIDE_GCC_STYLE_ASM
+#endif
+
+#if defined(__cplusplus) || defined(LIBDIVIDE_VC)
+ #define LIBDIVIDE_FUNCTION __FUNCTION__
+#else
+ #define LIBDIVIDE_FUNCTION __func__
+#endif
+
+#define LIBDIVIDE_ERROR(msg) \
+ do { \
+ fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", \
+ __LINE__, LIBDIVIDE_FUNCTION, msg); \
+ abort(); \
+ } while (0)
+
+#if defined(LIBDIVIDE_ASSERTIONS_ON)
+ #define LIBDIVIDE_ASSERT(x) \
+ do { \
+ if (!(x)) { \
+ fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", \
+ __LINE__, LIBDIVIDE_FUNCTION, #x); \
+ abort(); \
+ } \
+ } while (0)
+#else
+ #define LIBDIVIDE_ASSERT(x)
+#endif
+
+#ifdef __cplusplus
+namespace libdivide {
+#endif
+
+// pack divider structs to prevent compilers from padding.
+// This reduces memory usage by up to 43% when using a large
+// array of libdivide dividers and improves performance
+// by up to 10% because of reduced memory bandwidth.
+#pragma pack(push, 1)
+
+struct libdivide_u32_t {
+ uint32_t magic;
+ uint8_t more;
+};
+
+struct libdivide_s32_t {
+ int32_t magic;
+ uint8_t more;
+};
+
+struct libdivide_u64_t {
+ uint64_t magic;
+ uint8_t more;
+};
+
+struct libdivide_s64_t {
+ int64_t magic;
+ uint8_t more;
+};
+
+struct libdivide_u32_branchfree_t {
+ uint32_t magic;
+ uint8_t more;
+};
+
+struct libdivide_s32_branchfree_t {
+ int32_t magic;
+ uint8_t more;
+};
+
+struct libdivide_u64_branchfree_t {
+ uint64_t magic;
+ uint8_t more;
+};
+
+struct libdivide_s64_branchfree_t {
+ int64_t magic;
+ uint8_t more;
+};
+
+#pragma pack(pop)
+
+// Explanation of the "more" field:
+//
+// * Bits 0-5 is the shift value (for shift path or mult path).
+// * Bit 6 is the add indicator for mult path.
+// * Bit 7 is set if the divisor is negative. We use bit 7 as the negative
+// divisor indicator so that we can efficiently use sign extension to
+// create a bitmask with all bits set to 1 (if the divisor is negative)
+// or 0 (if the divisor is positive).
+//
+// u32: [0-4] shift value
+// [5] ignored
+// [6] add indicator
+// magic number of 0 indicates shift path
+//
+// s32: [0-4] shift value
+// [5] ignored
+// [6] add indicator
+// [7] indicates negative divisor
+// magic number of 0 indicates shift path
+//
+// u64: [0-5] shift value
+// [6] add indicator
+// magic number of 0 indicates shift path
+//
+// s64: [0-5] shift value
+// [6] add indicator
+// [7] indicates negative divisor
+// magic number of 0 indicates shift path
+//
+// In s32 and s64 branchfull (default) modes, the magic number is negated
+// according to whether the divisor is negated. In branchfree strategy, it is not negated.
+
+enum {
+ LIBDIVIDE_32_SHIFT_MASK = 0x1F,
+ LIBDIVIDE_64_SHIFT_MASK = 0x3F,
+ LIBDIVIDE_ADD_MARKER = 0x40,
+ LIBDIVIDE_NEGATIVE_DIVISOR = 0x80
+};
+
+static inline struct libdivide_s32_t libdivide_s32_gen(int32_t d);
+static inline struct libdivide_u32_t libdivide_u32_gen(uint32_t d);
+static inline struct libdivide_s64_t libdivide_s64_gen(int64_t d);
+static inline struct libdivide_u64_t libdivide_u64_gen(uint64_t d);
+
+static inline struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d);
+static inline struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d);
+static inline struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d);
+static inline struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d);
+
+static inline int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom);
+static inline uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom);
+static inline int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom);
+static inline uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom);
+
+static inline int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom);
+static inline uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom);
+static inline int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom);
+static inline uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom);
+
+static inline int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
+static inline uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
+static inline int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
+static inline uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);
+
+static inline int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom);
+static inline uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom);
+static inline int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom);
+static inline uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom);
+
+//////// Internal Utility Functions
+
+static inline uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) {
+ uint64_t xl = x, yl = y;
+ uint64_t rl = xl * yl;
+ return (uint32_t)(rl >> 32);
+}
+
+static inline int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {
+ int64_t xl = x, yl = y;
+ int64_t rl = xl * yl;
+ // needs to be arithmetic shift
+ return (int32_t)(rl >> 32);
+}
+
+static inline uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
+#if defined(LIBDIVIDE_VC) && \
+ defined(LIBDIVIDE_X86_64)
+ return __umulh(x, y);
+#elif defined(HAS_INT128_T)
+ __uint128_t xl = x, yl = y;
+ __uint128_t rl = xl * yl;
+ return (uint64_t)(rl >> 64);
+#else
+ // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
+ uint32_t mask = 0xFFFFFFFF;
+ uint32_t x0 = (uint32_t)(x & mask);
+ uint32_t x1 = (uint32_t)(x >> 32);
+ uint32_t y0 = (uint32_t)(y & mask);
+ uint32_t y1 = (uint32_t)(y >> 32);
+ uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
+ uint64_t x0y1 = x0 * (uint64_t)y1;
+ uint64_t x1y0 = x1 * (uint64_t)y0;
+ uint64_t x1y1 = x1 * (uint64_t)y1;
+ uint64_t temp = x1y0 + x0y0_hi;
+ uint64_t temp_lo = temp & mask;
+ uint64_t temp_hi = temp >> 32;
+
+ return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);
+#endif
+}
+
+static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
+#if defined(LIBDIVIDE_VC) && \
+ defined(LIBDIVIDE_X86_64)
+ return __mulh(x, y);
+#elif defined(HAS_INT128_T)
+ __int128_t xl = x, yl = y;
+ __int128_t rl = xl * yl;
+ return (int64_t)(rl >> 64);
+#else
+ // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
+ uint32_t mask = 0xFFFFFFFF;
+ uint32_t x0 = (uint32_t)(x & mask);
+ uint32_t y0 = (uint32_t)(y & mask);
+ int32_t x1 = (int32_t)(x >> 32);
+ int32_t y1 = (int32_t)(y >> 32);
+ uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
+ int64_t t = x1 * (int64_t)y0 + x0y0_hi;
+ int64_t w1 = x0 * (int64_t)y1 + (t & mask);
+
+ return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32);
+#endif
+}
+
+static inline int32_t libdivide_count_leading_zeros32(uint32_t val) {
+#if defined(__GNUC__) || \
+ __has_builtin(__builtin_clz)
+ // Fast way to count leading zeros
+ return __builtin_clz(val);
+#elif defined(LIBDIVIDE_VC)
+ unsigned long result;
+ if (_BitScanReverse(&result, val)) {
+ return 31 - result;
+ }
+ return 0;
+#else
+ if (val == 0)
+ return 32;
+ int32_t result = 8;
+ uint32_t hi = 0xFFU << 24;
+ while ((val & hi) == 0) {
+ hi >>= 8;
+ result += 8;
+ }
+ while (val & hi) {
+ result -= 1;
+ hi <<= 1;
+ }
+ return result;
+#endif
+}
+
+static inline int32_t libdivide_count_leading_zeros64(uint64_t val) {
+#if defined(__GNUC__) || \
+ __has_builtin(__builtin_clzll)
+ // Fast way to count leading zeros
+ return __builtin_clzll(val);
+#elif defined(LIBDIVIDE_VC) && defined(_WIN64)
+ unsigned long result;
+ if (_BitScanReverse64(&result, val)) {
+ return 63 - result;
+ }
+ return 0;
+#else
+ uint32_t hi = val >> 32;
+ uint32_t lo = val & 0xFFFFFFFF;
+ if (hi != 0) return libdivide_count_leading_zeros32(hi);
+ return 32 + libdivide_count_leading_zeros32(lo);
+#endif
+}
+
+// libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit
+// uint {v}. The result must fit in 32 bits.
+// Returns the quotient directly and the remainder in *r
+static inline uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
+#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && \
+ defined(LIBDIVIDE_GCC_STYLE_ASM)
+ uint32_t result;
+ __asm__("divl %[v]"
+ : "=a"(result), "=d"(*r)
+ : [v] "r"(v), "a"(u0), "d"(u1)
+ );
+ return result;
+#else
+ uint64_t n = ((uint64_t)u1 << 32) | u0;
+ uint32_t result = (uint32_t)(n / v);
+ *r = (uint32_t)(n - result * (uint64_t)v);
+ return result;
+#endif
+}
+
+// libdivide_128_div_64_to_64: divides a 128-bit uint {u1, u0} by a 64-bit
+// uint {v}. The result must fit in 64 bits.
+// Returns the quotient directly and the remainder in *r
+static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
+#if defined(LIBDIVIDE_X86_64) && \
+ defined(LIBDIVIDE_GCC_STYLE_ASM)
+ uint64_t result;
+ __asm__("divq %[v]"
+ : "=a"(result), "=d"(*r)
+ : [v] "r"(v), "a"(u0), "d"(u1)
+ );
+ return result;
+#elif defined(HAS_INT128_T) && \
+ defined(HAS_INT128_DIV)
+ __uint128_t n = ((__uint128_t)u1 << 64) | u0;
+ uint64_t result = (uint64_t)(n / v);
+ *r = (uint64_t)(n - result * (__uint128_t)v);
+ return result;
+#else
+ // Code taken from Hacker's Delight:
+ // http://www.hackersdelight.org/HDcode/divlu.c.
+ // License permits inclusion here per:
+ // http://www.hackersdelight.org/permissions.htm
+
+ const uint64_t b = (1ULL << 32); // Number base (32 bits)
+ uint64_t un1, un0; // Norm. dividend LSD's
+ uint64_t vn1, vn0; // Norm. divisor digits
+ uint64_t q1, q0; // Quotient digits
+ uint64_t un64, un21, un10; // Dividend digit pairs
+ uint64_t rhat; // A remainder
+ int32_t s; // Shift amount for norm
+
+ // If overflow, set rem. to an impossible value,
+ // and return the largest possible quotient
+ if (u1 >= v) {
+ *r = (uint64_t) -1;
+ return (uint64_t) -1;
+ }
+
+ // count leading zeros
+ s = libdivide_count_leading_zeros64(v);
+ if (s > 0) {
+ // Normalize divisor
+ v = v << s;
+ un64 = (u1 << s) | (u0 >> (64 - s));
+ un10 = u0 << s; // Shift dividend left
+ } else {
+ // Avoid undefined behavior of (u0 >> 64).
+ // The behavior is undefined if the right operand is
+ // negative, or greater than or equal to the length
+ // in bits of the promoted left operand.
+ un64 = u1;
+ un10 = u0;
+ }
+
+ // Break divisor up into two 32-bit digits
+ vn1 = v >> 32;
+ vn0 = v & 0xFFFFFFFF;
+
+ // Break right half of dividend into two digits
+ un1 = un10 >> 32;
+ un0 = un10 & 0xFFFFFFFF;
+
+ // Compute the first quotient digit, q1
+ q1 = un64 / vn1;
+ rhat = un64 - q1 * vn1;
+
+ while (q1 >= b || q1 * vn0 > b * rhat + un1) {
+ q1 = q1 - 1;
+ rhat = rhat + vn1;
+ if (rhat >= b)
+ break;
+ }
+
+ // Multiply and subtract
+ un21 = un64 * b + un1 - q1 * v;
+
+ // Compute the second quotient digit
+ q0 = un21 / vn1;
+ rhat = un21 - q0 * vn1;
+
+ while (q0 >= b || q0 * vn0 > b * rhat + un0) {
+ q0 = q0 - 1;
+ rhat = rhat + vn1;
+ if (rhat >= b)
+ break;
+ }
+
+ *r = (un21 * b + un0 - q0 * v) >> s;
+ return q1 * b + q0;
+#endif
+}
+
+// Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
+static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
+ if (signed_shift > 0) {
+ uint32_t shift = signed_shift;
+ *u1 <<= shift;
+ *u1 |= *u0 >> (64 - shift);
+ *u0 <<= shift;
+ }
+ else if (signed_shift < 0) {
+ uint32_t shift = -signed_shift;
+ *u0 >>= shift;
+ *u0 |= *u1 << (64 - shift);
+ *u1 >>= shift;
+ }
+}
+
+// Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
+static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
+#if defined(HAS_INT128_T) && \
+ defined(HAS_INT128_DIV)
+ __uint128_t ufull = u_hi;
+ __uint128_t vfull = v_hi;
+ ufull = (ufull << 64) | u_lo;
+ vfull = (vfull << 64) | v_lo;
+ uint64_t res = (uint64_t)(ufull / vfull);
+ __uint128_t remainder = ufull - (vfull * res);
+ *r_lo = (uint64_t)remainder;
+ *r_hi = (uint64_t)(remainder >> 64);
+ return res;
+#else
+ // Adapted from "Unsigned Doubleword Division" in Hacker's Delight
+ // We want to compute u / v
+ typedef struct { uint64_t hi; uint64_t lo; } u128_t;
+ u128_t u = {u_hi, u_lo};
+ u128_t v = {v_hi, v_lo};
+
+ if (v.hi == 0) {
+ // divisor v is a 64 bit value, so we just need one 128/64 division
+ // Note that we are simpler than Hacker's Delight here, because we know
+ // the quotient fits in 64 bits whereas Hacker's Delight demands a full
+ // 128 bit quotient
+ *r_hi = 0;
+ return libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo);
+ }
+ // Here v >= 2**64
+ // We know that v.hi != 0, so count leading zeros is OK
+ // We have 0 <= n <= 63
+ uint32_t n = libdivide_count_leading_zeros64(v.hi);
+
+ // Normalize the divisor so its MSB is 1
+ u128_t v1t = v;
+ libdivide_u128_shift(&v1t.hi, &v1t.lo, n);
+ uint64_t v1 = v1t.hi; // i.e. v1 = v1t >> 64
+
+ // To ensure no overflow
+ u128_t u1 = u;
+ libdivide_u128_shift(&u1.hi, &u1.lo, -1);
+
+ // Get quotient from divide unsigned insn.
+ uint64_t rem_ignored;
+ uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored);
+
+ // Undo normalization and division of u by 2.
+ u128_t q0 = {0, q1};
+ libdivide_u128_shift(&q0.hi, &q0.lo, n);
+ libdivide_u128_shift(&q0.hi, &q0.lo, -63);
+
+ // Make q0 correct or too small by 1
+ // Equivalent to `if (q0 != 0) q0 = q0 - 1;`
+ if (q0.hi != 0 || q0.lo != 0) {
+ q0.hi -= (q0.lo == 0); // borrow
+ q0.lo -= 1;
+ }
+
+ // Now q0 is correct.
+ // Compute q0 * v as q0v
+ // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo)
+ // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) +
+ // (q0.lo * v.hi << 64) + q0.lo * v.lo)
+ // Each term is 128 bit
+ // High half of full product (upper 128 bits!) are dropped
+ u128_t q0v = {0, 0};
+ q0v.hi = q0.hi*v.lo + q0.lo*v.hi + libdivide_mullhi_u64(q0.lo, v.lo);
+ q0v.lo = q0.lo*v.lo;
+
+ // Compute u - q0v as u_q0v
+ // This is the remainder
+ u128_t u_q0v = u;
+ u_q0v.hi -= q0v.hi + (u.lo < q0v.lo); // second term is borrow
+ u_q0v.lo -= q0v.lo;
+
+ // Check if u_q0v >= v
+ // This checks if our remainder is larger than the divisor
+ if ((u_q0v.hi > v.hi) ||
+ (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) {
+ // Increment q0
+ q0.lo += 1;
+ q0.hi += (q0.lo == 0); // carry
+
+ // Subtract v from remainder
+ u_q0v.hi -= v.hi + (u_q0v.lo < v.lo);
+ u_q0v.lo -= v.lo;
+ }
+
+ *r_hi = u_q0v.hi;
+ *r_lo = u_q0v.lo;
+
+ LIBDIVIDE_ASSERT(q0.hi == 0);
+ return q0.lo;
+#endif
+}
+
+////////// UINT32
+
+static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int branchfree) {
+ if (d == 0) {
+ LIBDIVIDE_ERROR("divider must be != 0");
+ }
+
+ struct libdivide_u32_t result;
+ uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d);
+
+ // Power of 2
+ if ((d & (d - 1)) == 0) {
+ // We need to subtract 1 from the shift value in case of an unsigned
+ // branchfree divider because there is a hardcoded right shift by 1
+ // in its division algorithm. Because of this we also need to add back
+ // 1 in its recovery algorithm.
+ result.magic = 0;
+ result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
+ } else {
+ uint8_t more;
+ uint32_t rem, proposed_m;
+ proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem);
+
+ LIBDIVIDE_ASSERT(rem > 0 && rem < d);
+ const uint32_t e = d - rem;
+
+ // This power works if e < 2**floor_log_2_d.
+ if (!branchfree && (e < (1U << floor_log_2_d))) {
+ // This power works
+ more = floor_log_2_d;
+ } else {
+ // We have to use the general 33-bit algorithm. We need to compute
+ // (2**power) / d. However, we already have (2**(power-1))/d and
+ // its remainder. By doubling both, and then correcting the
+ // remainder, we can compute the larger division.
+ // don't care about overflow here - in fact, we expect it
+ proposed_m += proposed_m;
+ const uint32_t twice_rem = rem + rem;
+ if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
+ more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
+ }
+ result.magic = 1 + proposed_m;
+ result.more = more;
+ // result.more's shift should in general be ceil_log_2_d. But if we
+ // used the smaller power, we subtract one from the shift because we're
+ // using the smaller power. If we're using the larger power, we
+ // subtract one from the shift because it's taken care of by the add
+ // indicator. So floor_log_2_d happens to be correct in both cases.
+ }
+ return result;
+}
+
+struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
+ return libdivide_internal_u32_gen(d, 0);
+}
+
+struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
+ if (d == 1) {
+ LIBDIVIDE_ERROR("branchfree divider must be != 1");
+ }
+ struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1);
+ struct libdivide_u32_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)};
+ return ret;
+}
+
+uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ return numer >> more;
+ }
+ else {
+ uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ uint32_t t = ((numer - q) >> 1) + q;
+ return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
+ }
+ else {
+ // All upper bits are 0,
+ // don't need to mask them off.
+ return q >> more;
+ }
+ }
+}
+
+uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
+ uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
+ uint32_t t = ((numer - q) >> 1) + q;
+ return t >> denom->more;
+}
+
+uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) {
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+
+ if (!denom->magic) {
+ return 1U << shift;
+ } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
+ // We compute q = n/d = n*m / 2^(32 + shift)
+ // Therefore we have d = 2^(32 + shift) / m
+ // We need to ceil it.
+ // We know d is not a power of 2, so m is not a power of 2,
+ // so we can just add 1 to the floor
+ uint32_t hi_dividend = 1U << shift;
+ uint32_t rem_ignored;
+ return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored);
+ } else {
+ // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
+ // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
+ // Also note that shift may be as high as 31, so shift + 1 will
+ // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
+ // then double the quotient and remainder.
+ uint64_t half_n = 1ULL << (32 + shift);
+ uint64_t d = (1ULL << 32) | denom->magic;
+ // Note that the quotient is guaranteed <= 32 bits, but the remainder
+ // may need 33!
+ uint32_t half_q = (uint32_t)(half_n / d);
+ uint64_t rem = half_n % d;
+ // We computed 2^(32+shift)/(m+2^32)
+ // Need to double it, and then add 1 to the quotient if doubling the
+ // remainder would increase the quotient.
+ // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
+ uint32_t full_q = half_q + half_q + ((rem<<1) >= d);
+
+ // We rounded down in gen (hence +1)
+ return full_q + 1;
+ }
+}
+
+uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) {
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+
+ if (!denom->magic) {
+ return 1U << (shift + 1);
+ } else {
+ // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
+ // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
+ // Also note that shift may be as high as 31, so shift + 1 will
+ // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
+ // then double the quotient and remainder.
+ uint64_t half_n = 1ULL << (32 + shift);
+ uint64_t d = (1ULL << 32) | denom->magic;
+ // Note that the quotient is guaranteed <= 32 bits, but the remainder
+ // may need 33!
+ uint32_t half_q = (uint32_t)(half_n / d);
+ uint64_t rem = half_n % d;
+ // We computed 2^(32+shift)/(m+2^32)
+ // Need to double it, and then add 1 to the quotient if doubling the
+ // remainder would increase the quotient.
+ // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
+ uint32_t full_q = half_q + half_q + ((rem<<1) >= d);
+
+ // We rounded down in gen (hence +1)
+ return full_q + 1;
+ }
+}
+
+/////////// UINT64
+
+static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int branchfree) {
+ if (d == 0) {
+ LIBDIVIDE_ERROR("divider must be != 0");
+ }
+
+ struct libdivide_u64_t result;
+ uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d);
+
+ // Power of 2
+ if ((d & (d - 1)) == 0) {
+ // We need to subtract 1 from the shift value in case of an unsigned
+ // branchfree divider because there is a hardcoded right shift by 1
+ // in its division algorithm. Because of this we also need to add back
+ // 1 in its recovery algorithm.
+ result.magic = 0;
+ result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
+ } else {
+ uint64_t proposed_m, rem;
+ uint8_t more;
+ // (1 << (64 + floor_log_2_d)) / d
+ proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem);
+
+ LIBDIVIDE_ASSERT(rem > 0 && rem < d);
+ const uint64_t e = d - rem;
+
+ // This power works if e < 2**floor_log_2_d.
+ if (!branchfree && e < (1ULL << floor_log_2_d)) {
+ // This power works
+ more = floor_log_2_d;
+ } else {
+ // We have to use the general 65-bit algorithm. We need to compute
+ // (2**power) / d. However, we already have (2**(power-1))/d and
+ // its remainder. By doubling both, and then correcting the
+ // remainder, we can compute the larger division.
+ // don't care about overflow here - in fact, we expect it
+ proposed_m += proposed_m;
+ const uint64_t twice_rem = rem + rem;
+ if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
+ more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
+ }
+ result.magic = 1 + proposed_m;
+ result.more = more;
+ // result.more's shift should in general be ceil_log_2_d. But if we
+ // used the smaller power, we subtract one from the shift because we're
+ // using the smaller power. If we're using the larger power, we
+ // subtract one from the shift because it's taken care of by the add
+ // indicator. So floor_log_2_d happens to be correct in both cases,
+ // which is why we do it outside of the if statement.
+ }
+ return result;
+}
+
+struct libdivide_u64_t libdivide_u64_gen(uint64_t d) {
+ return libdivide_internal_u64_gen(d, 0);
+}
+
+struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {
+ if (d == 1) {
+ LIBDIVIDE_ERROR("branchfree divider must be != 1");
+ }
+ struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1);
+ struct libdivide_u64_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)};
+ return ret;
+}
+
+uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ return numer >> more;
+ }
+ else {
+ uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ uint64_t t = ((numer - q) >> 1) + q;
+ return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
+ }
+ else {
+ // All upper bits are 0,
+ // don't need to mask them off.
+ return q >> more;
+ }
+ }
+}
+
+uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
+ uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
+ uint64_t t = ((numer - q) >> 1) + q;
+ return t >> denom->more;
+}
+
+// Recovers the original divisor d from the (magic, more) encoding in *denom
+// (inverse of the u64 generator).
+uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) {
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+
+    if (!denom->magic) {
+        // Power-of-2 divisor: encoded as a plain shift.
+        return 1ULL << shift;
+    } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
+        // We compute q = n/d = n*m / 2^(64 + shift)
+        // Therefore we have d = 2^(64 + shift) / m
+        // We need to ceil it.
+        // We know d is not a power of 2, so m is not a power of 2,
+        // so we can just add 1 to the floor
+        uint64_t hi_dividend = 1ULL << shift;
+        uint64_t rem_ignored;
+        return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored);
+    } else {
+        // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
+        // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
+        // libdivide_u32_recover for more on what we do here.
+        // TODO: do something better than 128 bit math
+
+        // Full n is a (potentially) 129 bit value
+        // half_n is a 128 bit value
+        // Compute the hi half of half_n. Low half is 0.
+        uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0;
+        // d is a 65 bit value. The high bit is always set to 1.
+        const uint64_t d_hi = 1, d_lo = denom->magic;
+        // Note that the quotient is guaranteed <= 64 bits,
+        // but the remainder may need 65!
+        uint64_t r_hi, r_lo;
+        uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
+        // We computed 2^(64+shift)/(m+2^64)
+        // Double the remainder ('dr') and check if that is larger than d
+        // Note that d is a 65 bit value, so r1 is small and so r1 + r1
+        // cannot overflow
+        uint64_t dr_lo = r_lo + r_lo;
+        uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry
+        int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
+        uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
+        return full_q + 1;
+    }
+}
+
+// Recovers the divisor from a branchfree u64 encoding.  The non-power-of-2
+// branch mirrors the ADD_MARKER case of libdivide_u64_recover; the
+// power-of-2 case differs because the branchfree encoding stores shift-1.
+uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) {
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+
+    if (!denom->magic) {
+        return 1ULL << (shift + 1);
+    } else {
+        // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
+        // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
+        // libdivide_u32_recover for more on what we do here.
+        // TODO: do something better than 128 bit math
+
+        // Full n is a (potentially) 129 bit value
+        // half_n is a 128 bit value
+        // Compute the hi half of half_n. Low half is 0.
+        uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0;
+        // d is a 65 bit value. The high bit is always set to 1.
+        const uint64_t d_hi = 1, d_lo = denom->magic;
+        // Note that the quotient is guaranteed <= 64 bits,
+        // but the remainder may need 65!
+        uint64_t r_hi, r_lo;
+        uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
+        // We computed 2^(64+shift)/(m+2^64)
+        // Double the remainder ('dr') and check if that is larger than d
+        // Note that d is a 65 bit value, so r1 is small and so r1 + r1
+        // cannot overflow
+        uint64_t dr_lo = r_lo + r_lo;
+        uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry
+        int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
+        uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
+        return full_q + 1;
+    }
+}
+
+/////////// SINT32
+
+// Generates the (magic, more) encoding for a signed 32-bit divisor d.
+// branchfree != 0 selects the branchfree variant: the magic number is never
+// negated and ADD_MARKER is always set for non-power-of-2 divisors.
+static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int branchfree) {
+    if (d == 0) {
+        LIBDIVIDE_ERROR("divider must be != 0");
+    }
+
+    struct libdivide_s32_t result;
+
+    // If d is a power of 2, or negative a power of 2, we have to use a shift.
+    // This is especially important because the magic algorithm fails for -1.
+    // To check if d is a power of 2 or its inverse, it suffices to check
+    // whether its absolute value has exactly one bit set. This works even for
+    // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
+    // and is a power of 2.
+    uint32_t ud = (uint32_t)d;
+    uint32_t absD = (d < 0) ? -ud : ud;
+    uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD);
+    // check if exactly one bit is set,
+    // don't care if absD is 0 since that's divide by zero
+    if ((absD & (absD - 1)) == 0) {
+        // Branchfree and normal paths are exactly the same
+        result.magic = 0;
+        result.more = floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0);
+    } else {
+        LIBDIVIDE_ASSERT(floor_log_2_d >= 1);
+
+        uint8_t more;
+        // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word
+        // is 0 and the high word is floor_log_2_d - 1
+        uint32_t rem, proposed_m;
+        proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem);
+        const uint32_t e = absD - rem;
+
+        // We are going to start with a power of floor_log_2_d - 1.
+        // This works if e < 2**floor_log_2_d.
+        if (!branchfree && e < (1U << floor_log_2_d)) {
+            // This power works
+            more = floor_log_2_d - 1;
+        } else {
+            // We need to go one higher. This should not make proposed_m
+            // overflow, but it will make it negative when interpreted as an
+            // int32_t.
+            proposed_m += proposed_m;
+            const uint32_t twice_rem = rem + rem;
+            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
+            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
+        }
+
+        proposed_m += 1;
+        int32_t magic = (int32_t)proposed_m;
+
+        // Mark if we are negative. Note we only negate the magic number in the
+        // branchfull case.
+        if (d < 0) {
+            more |= LIBDIVIDE_NEGATIVE_DIVISOR;
+            if (!branchfree) {
+                magic = -magic;
+            }
+        }
+
+        result.more = more;
+        result.magic = magic;
+    }
+    return result;
+}
+
+// Generates a (branching) signed 32-bit divider for d.
+struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
+    return libdivide_internal_s32_gen(d, 0);
+}
+
+// Generates a branchfree signed 32-bit divider for d.
+struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
+    struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1);
+    struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more};
+    return result;
+}
+
+// Computes numer / d for the signed 32-bit divider encoded in *denom.
+int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+
+    if (!denom->magic) {
+        // Power-of-2 path: bias negative numerators by (2^shift - 1) so the
+        // arithmetic shift rounds toward zero, then flip the sign if needed.
+        uint32_t sign = (int8_t)more >> 7;
+        uint32_t mask = (1U << shift) - 1;
+        uint32_t uq = numer + ((numer >> 31) & mask);
+        int32_t q = (int32_t)uq;
+        q >>= shift;
+        q = (q ^ sign) - sign;
+        return q;
+    } else {
+        uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer);
+        if (more & LIBDIVIDE_ADD_MARKER) {
+            // must be arithmetic shift and then sign extend
+            int32_t sign = (int8_t)more >> 7;
+            // q += (more < 0 ? -numer : numer)
+            // cast required to avoid UB
+            uq += ((uint32_t)numer ^ sign) - sign;
+        }
+        int32_t q = (int32_t)uq;
+        q >>= shift;
+        q += (q < 0);
+        return q;
+    }
+}
+
+// Branchfree signed 32-bit divide: same result as libdivide_s32_do but with
+// no data-dependent branches (the power-of-2 adjustment is done via masks).
+int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+    // must be arithmetic shift and then sign extend
+    int32_t sign = (int8_t)more >> 7;
+    int32_t magic = denom->magic;
+    int32_t q = libdivide_mullhi_s32(magic, numer);
+    q += numer;
+
+    // If q is non-negative, we have nothing to do
+    // If q is negative, we want to add either (2**shift)-1 if d is a power of
+    // 2, or (2**shift) if it is not a power of 2
+    uint32_t is_power_of_2 = (magic == 0);
+    uint32_t q_sign = (uint32_t)(q >> 31);
+    q += q_sign & ((1U << shift) - is_power_of_2);
+
+    // Now arithmetic right shift
+    q >>= shift;
+    // Negate if needed
+    q = (q ^ sign) - sign;
+
+    return q;
+}
+
+// Recovers the original (possibly negative) divisor from *denom
+// (inverse of libdivide_s32_gen / libdivide_s32_branchfree_gen).
+int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) {
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+    if (!denom->magic) {
+        uint32_t absD = 1U << shift;
+        if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
+            absD = -absD;
+        }
+        return (int32_t)absD;
+    } else {
+        // Unsigned math is much easier
+        // We negate the magic number only in the branchfull case, and we don't
+        // know which case we're in. However we have enough information to
+        // determine the correct sign of the magic number. The divisor was
+        // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
+        // the magic number's sign is opposite that of the divisor.
+        // We want to compute the positive magic number.
+        int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
+        int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER)
+            ? denom->magic > 0 : denom->magic < 0;
+
+        // Handle the power of 2 case (including branchfree)
+        if (denom->magic == 0) {
+            int32_t result = 1U << shift;
+            return negative_divisor ? -result : result;
+        }
+
+        uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic);
+        uint64_t n = 1ULL << (32 + shift); // this shift cannot exceed 30
+        uint32_t q = (uint32_t)(n / d);
+        int32_t result = (int32_t)q;
+        result += 1;
+        return negative_divisor ? -result : result;
+    }
+}
+
+// The branchfree struct shares the s32 field layout, so plain recover works.
+int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) {
+    return libdivide_s32_recover((const struct libdivide_s32_t *)denom);
+}
+
+///////////// SINT64
+
+// Generates the (magic, more) encoding for a signed 64-bit divisor d.
+// branchfree != 0 selects the branchfree variant: the magic number is never
+// negated and ADD_MARKER is always set for non-power-of-2 divisors.
+static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int branchfree) {
+    if (d == 0) {
+        LIBDIVIDE_ERROR("divider must be != 0");
+    }
+
+    struct libdivide_s64_t result;
+
+    // If d is a power of 2, or negative a power of 2, we have to use a shift.
+    // This is especially important because the magic algorithm fails for -1.
+    // To check if d is a power of 2 or its inverse, it suffices to check
+    // whether its absolute value has exactly one bit set. This works even for
+    // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
+    // and is a power of 2.
+    uint64_t ud = (uint64_t)d;
+    uint64_t absD = (d < 0) ? -ud : ud;
+    uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD);
+    // check if exactly one bit is set,
+    // don't care if absD is 0 since that's divide by zero
+    if ((absD & (absD - 1)) == 0) {
+        // Branchfree and non-branchfree cases are the same
+        result.magic = 0;
+        result.more = floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0);
+    } else {
+        // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word
+        // is 0 and the high word is floor_log_2_d - 1
+        uint8_t more;
+        uint64_t rem, proposed_m;
+        proposed_m = libdivide_128_div_64_to_64(1ULL << (floor_log_2_d - 1), 0, absD, &rem);
+        const uint64_t e = absD - rem;
+
+        // We are going to start with a power of floor_log_2_d - 1.
+        // This works if e < 2**floor_log_2_d.
+        if (!branchfree && e < (1ULL << floor_log_2_d)) {
+            // This power works
+            more = floor_log_2_d - 1;
+        } else {
+            // We need to go one higher. This should not make proposed_m
+            // overflow, but it will make it negative when interpreted as an
+            // int64_t.
+            proposed_m += proposed_m;
+            const uint64_t twice_rem = rem + rem;
+            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
+            // note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we
+            // also set ADD_MARKER this is an annoying optimization that
+            // enables algorithm #4 to avoid the mask. However we always set it
+            // in the branchfree case
+            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
+        }
+        proposed_m += 1;
+        int64_t magic = (int64_t)proposed_m;
+
+        // Mark if we are negative
+        if (d < 0) {
+            more |= LIBDIVIDE_NEGATIVE_DIVISOR;
+            if (!branchfree) {
+                magic = -magic;
+            }
+        }
+
+        result.more = more;
+        result.magic = magic;
+    }
+    return result;
+}
+
+// Generates a (branching) signed 64-bit divider for d.
+struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
+    return libdivide_internal_s64_gen(d, 0);
+}
+
+// Generates a branchfree signed 64-bit divider for d.
+struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
+    struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1);
+    struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more};
+    return ret;
+}
+
+// Computes numer / d for the signed 64-bit divider encoded in *denom.
+int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+
+    if (!denom->magic) { // shift path
+        // Bias negative numerators by (2^shift - 1) so the arithmetic shift
+        // rounds toward zero.
+        uint64_t mask = (1ULL << shift) - 1;
+        uint64_t uq = numer + ((numer >> 63) & mask);
+        int64_t q = (int64_t)uq;
+        q >>= shift;
+        // must be arithmetic shift and then sign-extend
+        int64_t sign = (int8_t)more >> 7;
+        q = (q ^ sign) - sign;
+        return q;
+    } else {
+        uint64_t uq = (uint64_t)libdivide_mullhi_s64(denom->magic, numer);
+        if (more & LIBDIVIDE_ADD_MARKER) {
+            // must be arithmetic shift and then sign extend
+            int64_t sign = (int8_t)more >> 7;
+            // q += (more < 0 ? -numer : numer)
+            // cast required to avoid UB
+            uq += ((uint64_t)numer ^ sign) - sign;
+        }
+        int64_t q = (int64_t)uq;
+        q >>= shift;
+        q += (q < 0);
+        return q;
+    }
+}
+
+// Branchfree signed 64-bit divide: same result as libdivide_s64_do but with
+// no data-dependent branches (the power-of-2 adjustment is done via masks).
+int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+    // must be arithmetic shift and then sign extend
+    int64_t sign = (int8_t)more >> 7;
+    int64_t magic = denom->magic;
+    int64_t q = libdivide_mullhi_s64(magic, numer);
+    q += numer;
+
+    // If q is non-negative, we have nothing to do.
+    // If q is negative, we want to add either (2**shift)-1 if d is a power of
+    // 2, or (2**shift) if it is not a power of 2.
+    uint64_t is_power_of_2 = (magic == 0);
+    uint64_t q_sign = (uint64_t)(q >> 63);
+    q += q_sign & ((1ULL << shift) - is_power_of_2);
+
+    // Arithmetic right shift
+    q >>= shift;
+    // Negate if needed
+    q = (q ^ sign) - sign;
+
+    return q;
+}
+
+// Recovers the original (possibly negative) divisor from *denom
+// (inverse of libdivide_s64_gen / libdivide_s64_branchfree_gen).
+int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) {
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+    if (denom->magic == 0) { // shift path
+        uint64_t absD = 1ULL << shift;
+        if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
+            absD = -absD;
+        }
+        return (int64_t)absD;
+    } else {
+        // Unsigned math is much easier
+        // The magic number was only negated in the branchfull case; the flag
+        // combination below determines whether that negation happened.
+        int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
+        int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER)
+            ? denom->magic > 0 : denom->magic < 0;
+
+        uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic);
+        uint64_t n_hi = 1ULL << shift, n_lo = 0;
+        uint64_t rem_ignored;
+        uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored);
+        int64_t result = (int64_t)(q + 1);
+        if (negative_divisor) {
+            result = -result;
+        }
+        return result;
+    }
+}
+
+// The branchfree struct shares the s64 field layout, so plain recover works.
+int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) {
+    return libdivide_s64_recover((const struct libdivide_s64_t *)denom);
+}
+
+#if defined(LIBDIVIDE_AVX512)
+
+static inline __m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom);
+static inline __m512i libdivide_s32_do_vector(__m512i numers, const struct libdivide_s32_t *denom);
+static inline __m512i libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom);
+static inline __m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom);
+
+static inline __m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom);
+static inline __m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom);
+static inline __m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom);
+static inline __m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom);
+
+//////// Internal Utility Functions
+
+// Arithmetic right shift by 63: fills each 64-bit lane with its sign bit.
+// (Fix: removed a stray empty statement ';' that followed the opening brace.)
+static inline __m512i libdivide_s64_signbits(__m512i v) {
+    return _mm512_srai_epi64(v, 63);
+}
+
+// Arithmetic right shift of each 64-bit lane by amt (native in AVX-512).
+static inline __m512i libdivide_s64_shift_right_vector(__m512i v, int amt) {
+    return _mm512_srai_epi64(v, amt);
+}
+
+// Here, b is assumed to contain one 32-bit value repeated.
+// High 32 bits of each unsigned 32x32->64 product, interleaved from the
+// even (0,2,...) and odd (1,3,...) lane products.
+static inline __m512i libdivide_mullhi_u32_vector(__m512i a, __m512i b) {
+    __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32);
+    __m512i a1X3X = _mm512_srli_epi64(a, 32);
+    __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
+    __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask);
+    return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
+}
+
+// b is one 32-bit value repeated.
+// Signed counterpart of the above, using _mm512_mul_epi32.
+static inline __m512i libdivide_mullhi_s32_vector(__m512i a, __m512i b) {
+    __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32);
+    __m512i a1X3X = _mm512_srli_epi64(a, 32);
+    __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
+    __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask);
+    return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
+}
+
+// Here, y is assumed to contain one 64-bit value repeated.
+// https://stackoverflow.com/a/28827013
+// Schoolbook 64x64 multiply from four 32x32 partial products; returns the
+// high 64 bits of each lane's product.
+static inline __m512i libdivide_mullhi_u64_vector(__m512i x, __m512i y) {
+    __m512i lomask = _mm512_set1_epi64(0xffffffff);
+    __m512i xh = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM) 0xB1);
+    __m512i yh = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM) 0xB1);
+    __m512i w0 = _mm512_mul_epu32(x, y);
+    __m512i w1 = _mm512_mul_epu32(x, yh);
+    __m512i w2 = _mm512_mul_epu32(xh, y);
+    __m512i w3 = _mm512_mul_epu32(xh, yh);
+    __m512i w0h = _mm512_srli_epi64(w0, 32);
+    __m512i s1 = _mm512_add_epi64(w1, w0h);
+    __m512i s1l = _mm512_and_si512(s1, lomask);
+    __m512i s1h = _mm512_srli_epi64(s1, 32);
+    __m512i s2 = _mm512_add_epi64(w2, s1l);
+    __m512i s2h = _mm512_srli_epi64(s2, 32);
+    __m512i hi = _mm512_add_epi64(w3, s1h);
+    hi = _mm512_add_epi64(hi, s2h);
+
+    return hi;
+}
+
+// y is one 64-bit value repeated.
+// Signed high multiply derived from the unsigned one by subtracting the
+// sign-bit corrections: hi_s = hi_u - (x<0 ? y : 0) - (y<0 ? x : 0).
+static inline __m512i libdivide_mullhi_s64_vector(__m512i x, __m512i y) {
+    __m512i p = libdivide_mullhi_u64_vector(x, y);
+    __m512i t1 = _mm512_and_si512(libdivide_s64_signbits(x), y);
+    __m512i t2 = _mm512_and_si512(libdivide_s64_signbits(y), x);
+    p = _mm512_sub_epi64(p, t1);
+    p = _mm512_sub_epi64(p, t2);
+    return p;
+}
+
+////////// UINT32
+
+// Vector form of libdivide_u32_do: divides each 32-bit lane by the divisor
+// encoded in *denom.
+__m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom) {
+    uint8_t more = denom->more;
+    if (!denom->magic) {
+        return _mm512_srli_epi32(numers, more);
+    }
+    else {
+        __m512i q = libdivide_mullhi_u32_vector(numers, _mm512_set1_epi32(denom->magic));
+        if (more & LIBDIVIDE_ADD_MARKER) {
+            // uint32_t t = ((numer - q) >> 1) + q;
+            // return t >> denom->shift;
+            uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+            __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
+            return _mm512_srli_epi32(t, shift);
+        }
+        else {
+            return _mm512_srli_epi32(q, more);
+        }
+    }
+}
+
+// Vector form of libdivide_u32_branchfree_do.
+__m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom) {
+    __m512i q = libdivide_mullhi_u32_vector(numers, _mm512_set1_epi32(denom->magic));
+    __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
+    return _mm512_srli_epi32(t, denom->more);
+}
+
+////////// UINT64
+
+// Vector form of libdivide_u64_do: divides each 64-bit lane by the divisor
+// encoded in *denom.
+__m512i libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom) {
+    uint8_t more = denom->more;
+    if (!denom->magic) {
+        return _mm512_srli_epi64(numers, more);
+    }
+    else {
+        __m512i q = libdivide_mullhi_u64_vector(numers, _mm512_set1_epi64(denom->magic));
+        if (more & LIBDIVIDE_ADD_MARKER) {
+            // uint32_t t = ((numer - q) >> 1) + q;
+            // return t >> denom->shift;
+            uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+            __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
+            return _mm512_srli_epi64(t, shift);
+        }
+        else {
+            return _mm512_srli_epi64(q, more);
+        }
+    }
+}
+
+// Vector form of libdivide_u64_branchfree_do.
+__m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom) {
+    __m512i q = libdivide_mullhi_u64_vector(numers, _mm512_set1_epi64(denom->magic));
+    __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
+    return _mm512_srli_epi64(t, denom->more);
+}
+
+////////// SINT32
+
+// Vector form of libdivide_s32_do: signed divide of each 32-bit lane.
+__m512i libdivide_s32_do_vector(__m512i numers, const struct libdivide_s32_t *denom) {
+    uint8_t more = denom->more;
+    if (!denom->magic) {
+        uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+        uint32_t mask = (1U << shift) - 1;
+        __m512i roundToZeroTweak = _mm512_set1_epi32(mask);
+        // q = numer + ((numer >> 31) & roundToZeroTweak);
+        __m512i q = _mm512_add_epi32(numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak));
+        q = _mm512_srai_epi32(q, shift);
+        __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
+        // q = (q ^ sign) - sign;
+        q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);
+        return q;
+    }
+    else {
+        __m512i q = libdivide_mullhi_s32_vector(numers, _mm512_set1_epi32(denom->magic));
+        if (more & LIBDIVIDE_ADD_MARKER) {
+            // must be arithmetic shift
+            __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
+            // q += ((numer ^ sign) - sign);
+            q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign));
+        }
+        // q >>= shift
+        q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
+        q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0)
+        return q;
+    }
+}
+
+// Vector form of libdivide_s32_branchfree_do.
+__m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom) {
+    int32_t magic = denom->magic;
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+    // must be arithmetic shift
+    __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
+    __m512i q = libdivide_mullhi_s32_vector(numers, _mm512_set1_epi32(magic));
+    q = _mm512_add_epi32(q, numers); // q += numers
+
+    // If q is non-negative, we have nothing to do
+    // If q is negative, we want to add either (2**shift)-1 if d is
+    // a power of 2, or (2**shift) if it is not a power of 2
+    uint32_t is_power_of_2 = (magic == 0);
+    __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31
+    __m512i mask = _mm512_set1_epi32((1U << shift) - is_power_of_2);
+    q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask)
+    q = _mm512_srai_epi32(q, shift); // q >>= shift
+    q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign
+    return q;
+}
+
+////////// SINT64
+
+// Vector form of libdivide_s64_do: signed divide of each 64-bit lane.
+__m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom) {
+    uint8_t more = denom->more;
+    int64_t magic = denom->magic;
+    if (magic == 0) { // shift path
+        uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+        uint64_t mask = (1ULL << shift) - 1;
+        __m512i roundToZeroTweak = _mm512_set1_epi64(mask);
+        // q = numer + ((numer >> 63) & roundToZeroTweak);
+        __m512i q = _mm512_add_epi64(numers, _mm512_and_si512(libdivide_s64_signbits(numers), roundToZeroTweak));
+        q = libdivide_s64_shift_right_vector(q, shift);
+        __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
+        // q = (q ^ sign) - sign;
+        q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);
+        return q;
+    }
+    else {
+        __m512i q = libdivide_mullhi_s64_vector(numers, _mm512_set1_epi64(magic));
+        if (more & LIBDIVIDE_ADD_MARKER) {
+            // must be arithmetic shift
+            __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
+            // q += ((numer ^ sign) - sign);
+            q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign));
+        }
+        // q >>= denom->mult_path.shift
+        q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK);
+        q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0)
+        return q;
+    }
+}
+
+// Vector form of libdivide_s64_branchfree_do.
+__m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom) {
+    int64_t magic = denom->magic;
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+    // must be arithmetic shift
+    __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
+
+    // libdivide_mullhi_s64(numers, magic);
+    __m512i q = libdivide_mullhi_s64_vector(numers, _mm512_set1_epi64(magic));
+    q = _mm512_add_epi64(q, numers); // q += numers
+
+    // If q is non-negative, we have nothing to do.
+    // If q is negative, we want to add either (2**shift)-1 if d is
+    // a power of 2, or (2**shift) if it is not a power of 2.
+    uint32_t is_power_of_2 = (magic == 0);
+    __m512i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63
+    __m512i mask = _mm512_set1_epi64((1ULL << shift) - is_power_of_2);
+    q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask)
+    q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift
+    q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign
+    return q;
+}
+
+#elif defined(LIBDIVIDE_AVX2)
+
+static inline __m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom);
+static inline __m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom);
+static inline __m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom);
+static inline __m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom);
+
+static inline __m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom);
+static inline __m256i libdivide_s32_branchfree_do_vector(__m256i numers, const struct libdivide_s32_branchfree_t *denom);
+static inline __m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom);
+static inline __m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom);
+
+//////// Internal Utility Functions
+
+// Implementation of _mm256_srai_epi64(v, 63) (from AVX512).
+// Duplicates each lane's high 32 bits, then arithmetic-shifts by 31 so every
+// lane becomes all-ones (negative) or all-zeros (non-negative).
+static inline __m256i libdivide_s64_signbits(__m256i v) {
+    __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
+    __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31);
+    return signBits;
+}
+
+// Implementation of _mm256_srai_epi64 (from AVX512).
+// Logical shift then (x ^ m) - m, which sign-extends the shifted-in bits.
+static inline __m256i libdivide_s64_shift_right_vector(__m256i v, int amt) {
+    const int b = 64 - amt;
+    __m256i m = _mm256_set1_epi64x(1ULL << (b - 1));
+    __m256i x = _mm256_srli_epi64(v, amt);
+    __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m);
+    return result;
+}
+
+// Here, b is assumed to contain one 32-bit value repeated.
+// High 32 bits of each unsigned 32x32->64 product, interleaved from the
+// even and odd lane products.
+static inline __m256i libdivide_mullhi_u32_vector(__m256i a, __m256i b) {
+    __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32);
+    __m256i a1X3X = _mm256_srli_epi64(a, 32);
+    __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
+    __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epu32(a1X3X, b), mask);
+    return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);
+}
+
+// b is one 32-bit value repeated.
+// Signed counterpart of the above, using _mm256_mul_epi32.
+static inline __m256i libdivide_mullhi_s32_vector(__m256i a, __m256i b) {
+    __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32);
+    __m256i a1X3X = _mm256_srli_epi64(a, 32);
+    __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
+    __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epi32(a1X3X, b), mask);
+    return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);
+}
+
+// Here, y is assumed to contain one 64-bit value repeated.
+// https://stackoverflow.com/a/28827013
+// Schoolbook 64x64 multiply from four 32x32 partial products; returns the
+// high 64 bits of each lane's product.
+static inline __m256i libdivide_mullhi_u64_vector(__m256i x, __m256i y) {
+    __m256i lomask = _mm256_set1_epi64x(0xffffffff);
+    __m256i xh = _mm256_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h
+    __m256i yh = _mm256_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h
+    __m256i w0 = _mm256_mul_epu32(x, y); // x0l*y0l, x1l*y1l
+    __m256i w1 = _mm256_mul_epu32(x, yh); // x0l*y0h, x1l*y1h
+    __m256i w2 = _mm256_mul_epu32(xh, y); // x0h*y0l, x1h*y1l
+    __m256i w3 = _mm256_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h
+    __m256i w0h = _mm256_srli_epi64(w0, 32);
+    __m256i s1 = _mm256_add_epi64(w1, w0h);
+    __m256i s1l = _mm256_and_si256(s1, lomask);
+    __m256i s1h = _mm256_srli_epi64(s1, 32);
+    __m256i s2 = _mm256_add_epi64(w2, s1l);
+    __m256i s2h = _mm256_srli_epi64(s2, 32);
+    __m256i hi = _mm256_add_epi64(w3, s1h);
+    hi = _mm256_add_epi64(hi, s2h);
+
+    return hi;
+}
+
+// y is one 64-bit value repeated.
+// Signed high multiply derived from the unsigned one by subtracting the
+// sign-bit corrections: hi_s = hi_u - (x<0 ? y : 0) - (y<0 ? x : 0).
+static inline __m256i libdivide_mullhi_s64_vector(__m256i x, __m256i y) {
+    __m256i p = libdivide_mullhi_u64_vector(x, y);
+    __m256i t1 = _mm256_and_si256(libdivide_s64_signbits(x), y);
+    __m256i t2 = _mm256_and_si256(libdivide_s64_signbits(y), x);
+    p = _mm256_sub_epi64(p, t1);
+    p = _mm256_sub_epi64(p, t2);
+    return p;
+}
+
+////////// UINT32
+
+// Vector form of libdivide_u32_do: divides each 32-bit lane by the divisor
+// encoded in *denom.
+__m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom) {
+    uint8_t more = denom->more;
+    if (!denom->magic) {
+        return _mm256_srli_epi32(numers, more);
+    }
+    else {
+        __m256i q = libdivide_mullhi_u32_vector(numers, _mm256_set1_epi32(denom->magic));
+        if (more & LIBDIVIDE_ADD_MARKER) {
+            // uint32_t t = ((numer - q) >> 1) + q;
+            // return t >> denom->shift;
+            uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+            __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
+            return _mm256_srli_epi32(t, shift);
+        }
+        else {
+            return _mm256_srli_epi32(q, more);
+        }
+    }
+}
+
+// Vector form of libdivide_u32_branchfree_do.
+__m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom) {
+    __m256i q = libdivide_mullhi_u32_vector(numers, _mm256_set1_epi32(denom->magic));
+    __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
+    return _mm256_srli_epi32(t, denom->more);
+}
+
+////////// UINT64
+
+// Vector form of libdivide_u64_do: divides each 64-bit lane by the divisor
+// encoded in *denom.
+__m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom) {
+    uint8_t more = denom->more;
+    if (!denom->magic) {
+        return _mm256_srli_epi64(numers, more);
+    }
+    else {
+        __m256i q = libdivide_mullhi_u64_vector(numers, _mm256_set1_epi64x(denom->magic));
+        if (more & LIBDIVIDE_ADD_MARKER) {
+            // uint32_t t = ((numer - q) >> 1) + q;
+            // return t >> denom->shift;
+            uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+            __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
+            return _mm256_srli_epi64(t, shift);
+        }
+        else {
+            return _mm256_srli_epi64(q, more);
+        }
+    }
+}
+
+// Vector form of libdivide_u64_branchfree_do.
+__m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom) {
+    __m256i q = libdivide_mullhi_u64_vector(numers, _mm256_set1_epi64x(denom->magic));
+    __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
+    return _mm256_srli_epi64(t, denom->more);
+}
+
+////////// SINT32
+
+// Vector form of libdivide_s32_do: signed divide of each 32-bit lane.
+__m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom) {
+    uint8_t more = denom->more;
+    if (!denom->magic) {
+        uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+        uint32_t mask = (1U << shift) - 1;
+        __m256i roundToZeroTweak = _mm256_set1_epi32(mask);
+        // q = numer + ((numer >> 31) & roundToZeroTweak);
+        __m256i q = _mm256_add_epi32(numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak));
+        q = _mm256_srai_epi32(q, shift);
+        __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
+        // q = (q ^ sign) - sign;
+        q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign);
+        return q;
+    }
+    else {
+        __m256i q = libdivide_mullhi_s32_vector(numers, _mm256_set1_epi32(denom->magic));
+        if (more & LIBDIVIDE_ADD_MARKER) {
+            // must be arithmetic shift
+            __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
+            // q += ((numer ^ sign) - sign);
+            q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign));
+        }
+        // q >>= shift
+        q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
+        q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0)
+        return q;
+    }
+}
+
+// Vector form of libdivide_s32_branchfree_do.
+__m256i libdivide_s32_branchfree_do_vector(__m256i numers, const struct libdivide_s32_branchfree_t *denom) {
+    int32_t magic = denom->magic;
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+    // must be arithmetic shift
+    __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
+    __m256i q = libdivide_mullhi_s32_vector(numers, _mm256_set1_epi32(magic));
+    q = _mm256_add_epi32(q, numers); // q += numers
+
+    // If q is non-negative, we have nothing to do
+    // If q is negative, we want to add either (2**shift)-1 if d is
+    // a power of 2, or (2**shift) if it is not a power of 2
+    uint32_t is_power_of_2 = (magic == 0);
+    __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31
+    __m256i mask = _mm256_set1_epi32((1U << shift) - is_power_of_2);
+    q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask)
+    q = _mm256_srai_epi32(q, shift); // q >>= shift
+    q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign
+    return q;
+}
+
+////////// SINT64
+
+// Vector form of libdivide_s64_do: signed divide of each 64-bit lane.
+__m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom) {
+    uint8_t more = denom->more;
+    int64_t magic = denom->magic;
+    if (magic == 0) { // shift path
+        uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+        uint64_t mask = (1ULL << shift) - 1;
+        __m256i roundToZeroTweak = _mm256_set1_epi64x(mask);
+        // q = numer + ((numer >> 63) & roundToZeroTweak);
+        __m256i q = _mm256_add_epi64(numers, _mm256_and_si256(libdivide_s64_signbits(numers), roundToZeroTweak));
+        q = libdivide_s64_shift_right_vector(q, shift);
+        __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
+        // q = (q ^ sign) - sign;
+        q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign);
+        return q;
+    }
+    else {
+        __m256i q = libdivide_mullhi_s64_vector(numers, _mm256_set1_epi64x(magic));
+        if (more & LIBDIVIDE_ADD_MARKER) {
+            // must be arithmetic shift
+            __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
+            // q += ((numer ^ sign) - sign);
+            q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign));
+        }
+        // q >>= denom->mult_path.shift
+        q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK);
+        q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0)
+        return q;
+    }
+}
+
+// Vector form of libdivide_s64_branchfree_do.
+__m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom) {
+    int64_t magic = denom->magic;
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+    // must be arithmetic shift
+    __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
+
+    // libdivide_mullhi_s64(numers, magic);
+    __m256i q = libdivide_mullhi_s64_vector(numers, _mm256_set1_epi64x(magic));
+    q = _mm256_add_epi64(q, numers); // q += numers
+
+    // If q is non-negative, we have nothing to do.
+    // If q is negative, we want to add either (2**shift)-1 if d is
+    // a power of 2, or (2**shift) if it is not a power of 2.
+    uint32_t is_power_of_2 = (magic == 0);
+    __m256i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63
+    __m256i mask = _mm256_set1_epi64x((1ULL << shift) - is_power_of_2);
+    q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask)
+    q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift
+    q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign
+    return q;
+}
+
+#elif defined(LIBDIVIDE_SSE2)
+
+static inline __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom);
+static inline __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom);
+static inline __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom);
+static inline __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom);
+
+static inline __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom);
+static inline __m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom);
+static inline __m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom);
+static inline __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom);
+
+//////// Internal Utility Functions
+
+// Implementation of _mm_srai_epi64(v, 63) (from AVX512).
+static inline __m128i libdivide_s64_signbits(__m128i v) {
+ __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
+ __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31);
+ return signBits;
+}
+
+// Implementation of _mm_srai_epi64 (from AVX512).
+static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) {
+ const int b = 64 - amt;
+ __m128i m = _mm_set1_epi64x(1ULL << (b - 1));
+ __m128i x = _mm_srli_epi64(v, amt);
+ __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m);
+ return result;
+}
+
+// Here, b is assumed to contain one 32-bit value repeated.
+static inline __m128i libdivide_mullhi_u32_vector(__m128i a, __m128i b) {
+ __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32);
+ __m128i a1X3X = _mm_srli_epi64(a, 32);
+ __m128i mask = _mm_set_epi32(-1, 0, -1, 0);
+ __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask);
+ return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3);
+}
+
+// SSE2 does not have a signed multiplication instruction, but we can convert
+// unsigned to signed pretty efficiently. Again, b is just a 32 bit value
+// repeated four times.
+static inline __m128i libdivide_mullhi_s32_vector(__m128i a, __m128i b) {
+ __m128i p = libdivide_mullhi_u32_vector(a, b);
+ // t1 = (a >> 31) & y, arithmetic shift
+ __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b);
+ __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a);
+ p = _mm_sub_epi32(p, t1);
+ p = _mm_sub_epi32(p, t2);
+ return p;
+}
+
+// Here, y is assumed to contain one 64-bit value repeated.
+// https://stackoverflow.com/a/28827013
+static inline __m128i libdivide_mullhi_u64_vector(__m128i x, __m128i y) {
+ __m128i lomask = _mm_set1_epi64x(0xffffffff);
+ __m128i xh = _mm_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h
+ __m128i yh = _mm_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h
+ __m128i w0 = _mm_mul_epu32(x, y); // x0l*y0l, x1l*y1l
+ __m128i w1 = _mm_mul_epu32(x, yh); // x0l*y0h, x1l*y1h
+ __m128i w2 = _mm_mul_epu32(xh, y); // x0h*y0l, x1h*y0l
+ __m128i w3 = _mm_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h
+ __m128i w0h = _mm_srli_epi64(w0, 32);
+ __m128i s1 = _mm_add_epi64(w1, w0h);
+ __m128i s1l = _mm_and_si128(s1, lomask);
+ __m128i s1h = _mm_srli_epi64(s1, 32);
+ __m128i s2 = _mm_add_epi64(w2, s1l);
+ __m128i s2h = _mm_srli_epi64(s2, 32);
+ __m128i hi = _mm_add_epi64(w3, s1h);
+ hi = _mm_add_epi64(hi, s2h);
+
+ return hi;
+}
+
+// y is one 64-bit value repeated.
+static inline __m128i libdivide_mullhi_s64_vector(__m128i x, __m128i y) {
+ __m128i p = libdivide_mullhi_u64_vector(x, y);
+ __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y);
+ __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x);
+ p = _mm_sub_epi64(p, t1);
+ p = _mm_sub_epi64(p, t2);
+ return p;
+}
+
+////////// UINT32
+
+__m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ return _mm_srli_epi32(numers, more);
+ }
+ else {
+ __m128i q = libdivide_mullhi_u32_vector(numers, _mm_set1_epi32(denom->magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // uint32_t t = ((numer - q) >> 1) + q;
+ // return t >> denom->shift;
+ uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+ __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
+ return _mm_srli_epi32(t, shift);
+ }
+ else {
+ return _mm_srli_epi32(q, more);
+ }
+ }
+}
+
+__m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom) {
+ __m128i q = libdivide_mullhi_u32_vector(numers, _mm_set1_epi32(denom->magic));
+ __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
+ return _mm_srli_epi32(t, denom->more);
+}
+
+////////// UINT64
+
+__m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ return _mm_srli_epi64(numers, more);
+ }
+ else {
+ __m128i q = libdivide_mullhi_u64_vector(numers, _mm_set1_epi64x(denom->magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // uint32_t t = ((numer - q) >> 1) + q;
+ // return t >> denom->shift;
+ uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+ __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
+ return _mm_srli_epi64(t, shift);
+ }
+ else {
+ return _mm_srli_epi64(q, more);
+ }
+ }
+}
+
+__m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom) {
+ __m128i q = libdivide_mullhi_u64_vector(numers, _mm_set1_epi64x(denom->magic));
+ __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
+ return _mm_srli_epi64(t, denom->more);
+}
+
+////////// SINT32
+
+__m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+ uint32_t mask = (1U << shift) - 1;
+ __m128i roundToZeroTweak = _mm_set1_epi32(mask);
+ // q = numer + ((numer >> 31) & roundToZeroTweak);
+ __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
+ q = _mm_srai_epi32(q, shift);
+ __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
+ // q = (q ^ sign) - sign;
+ q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign);
+ return q;
+ }
+ else {
+ __m128i q = libdivide_mullhi_s32_vector(numers, _mm_set1_epi32(denom->magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // must be arithmetic shift
+ __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
+ // q += ((numer ^ sign) - sign);
+ q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign));
+ }
+ // q >>= shift
+ q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
+ q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0)
+ return q;
+ }
+}
+
+__m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom) {
+ int32_t magic = denom->magic;
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+ // must be arithmetic shift
+ __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
+ __m128i q = libdivide_mullhi_s32_vector(numers, _mm_set1_epi32(magic));
+ q = _mm_add_epi32(q, numers); // q += numers
+
+ // If q is non-negative, we have nothing to do
+ // If q is negative, we want to add either (2**shift)-1 if d is
+ // a power of 2, or (2**shift) if it is not a power of 2
+ uint32_t is_power_of_2 = (magic == 0);
+ __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31
+ __m128i mask = _mm_set1_epi32((1U << shift) - is_power_of_2);
+ q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
+ q = _mm_srai_epi32(q, shift); // q >>= shift
+ q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
+ return q;
+}
+
+////////// SINT64
+
+__m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom) {
+ uint8_t more = denom->more;
+ int64_t magic = denom->magic;
+ if (magic == 0) { // shift path
+ uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+ uint64_t mask = (1ULL << shift) - 1;
+ __m128i roundToZeroTweak = _mm_set1_epi64x(mask);
+ // q = numer + ((numer >> 63) & roundToZeroTweak);
+ __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
+ q = libdivide_s64_shift_right_vector(q, shift);
+ __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
+ // q = (q ^ sign) - sign;
+ q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign);
+ return q;
+ }
+ else {
+ __m128i q = libdivide_mullhi_s64_vector(numers, _mm_set1_epi64x(magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // must be arithmetic shift
+ __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
+ // q += ((numer ^ sign) - sign);
+ q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign));
+ }
+ // q >>= denom->mult_path.shift
+ q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK);
+ q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
+ return q;
+ }
+}
+
+__m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom) {
+ int64_t magic = denom->magic;
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+ // must be arithmetic shift
+ __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
+
+ // libdivide_mullhi_s64(numers, magic);
+ __m128i q = libdivide_mullhi_s64_vector(numers, _mm_set1_epi64x(magic));
+ q = _mm_add_epi64(q, numers); // q += numers
+
+ // If q is non-negative, we have nothing to do.
+ // If q is negative, we want to add either (2**shift)-1 if d is
+ // a power of 2, or (2**shift) if it is not a power of 2.
+ uint32_t is_power_of_2 = (magic == 0);
+ __m128i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63
+ __m128i mask = _mm_set1_epi64x((1ULL << shift) - is_power_of_2);
+ q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
+ q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift
+ q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
+ return q;
+}
+
+#endif
+
+/////////// C++ stuff
+
+#ifdef __cplusplus
+
+// The C++ divider class is templated on both an integer type
+// (like uint64_t) and an algorithm type.
+// * BRANCHFULL is the default algorithm type.
+// * BRANCHFREE is the branchfree algorithm type.
+enum {
+ BRANCHFULL,
+ BRANCHFREE
+};
+
+#if defined(LIBDIVIDE_AVX512)
+ #define LIBDIVIDE_VECTOR_TYPE __m512i
+#elif defined(LIBDIVIDE_AVX2)
+ #define LIBDIVIDE_VECTOR_TYPE __m256i
+#elif defined(LIBDIVIDE_SSE2)
+ #define LIBDIVIDE_VECTOR_TYPE __m128i
+#endif
+
+#if !defined(LIBDIVIDE_VECTOR_TYPE)
+ #define LIBDIVIDE_DIVIDE_VECTOR(ALGO)
+#else
+ #define LIBDIVIDE_DIVIDE_VECTOR(ALGO) \
+ LIBDIVIDE_VECTOR_TYPE divide(LIBDIVIDE_VECTOR_TYPE n) const { \
+ return libdivide_##ALGO##_do_vector(n, &denom); \
+ }
+#endif
+
+// The DISPATCHER_GEN() macro generates C++ methods (for the given integer
+// and algorithm types) that redirect to libdivide's C API.
+#define DISPATCHER_GEN(T, ALGO) \
+ libdivide_##ALGO##_t denom; \
+ dispatcher() { } \
+ dispatcher(T d) \
+ : denom(libdivide_##ALGO##_gen(d)) \
+ { } \
+ T divide(T n) const { \
+ return libdivide_##ALGO##_do(n, &denom); \
+ } \
+ LIBDIVIDE_DIVIDE_VECTOR(ALGO) \
+ T recover() const { \
+ return libdivide_##ALGO##_recover(&denom); \
+ }
+
+// The dispatcher selects a specific division algorithm for a given
+// type and ALGO using partial template specialization.
+template<bool IS_INTEGRAL, bool IS_SIGNED, int SIZEOF, int ALGO> struct dispatcher { };
+
+template<> struct dispatcher<true, true, sizeof(int32_t), BRANCHFULL> { DISPATCHER_GEN(int32_t, s32) };
+template<> struct dispatcher<true, true, sizeof(int32_t), BRANCHFREE> { DISPATCHER_GEN(int32_t, s32_branchfree) };
+template<> struct dispatcher<true, false, sizeof(uint32_t), BRANCHFULL> { DISPATCHER_GEN(uint32_t, u32) };
+template<> struct dispatcher<true, false, sizeof(uint32_t), BRANCHFREE> { DISPATCHER_GEN(uint32_t, u32_branchfree) };
+template<> struct dispatcher<true, true, sizeof(int64_t), BRANCHFULL> { DISPATCHER_GEN(int64_t, s64) };
+template<> struct dispatcher<true, true, sizeof(int64_t), BRANCHFREE> { DISPATCHER_GEN(int64_t, s64_branchfree) };
+template<> struct dispatcher<true, false, sizeof(uint64_t), BRANCHFULL> { DISPATCHER_GEN(uint64_t, u64) };
+template<> struct dispatcher<true, false, sizeof(uint64_t), BRANCHFREE> { DISPATCHER_GEN(uint64_t, u64_branchfree) };
+
+// This is the main divider class for use by the user (C++ API).
+// The actual division algorithm is selected using the dispatcher struct
+// based on the integer and algorithm template parameters.
+template<typename T, int ALGO = BRANCHFULL>
+class divider {
+public:
+ // We leave the default constructor empty so that creating
+ // an array of dividers and then initializing them
+ // later doesn't slow us down.
+ divider() { }
+
+ // Constructor that takes the divisor as a parameter
+ divider(T d) : div(d) { }
+
+ // Divides n by the divisor
+ T divide(T n) const {
+ return div.divide(n);
+ }
+
+ // Recovers the divisor, returns the value that was
+ // used to initialize this divider object.
+ T recover() const {
+ return div.recover();
+ }
+
+ bool operator==(const divider<T, ALGO>& other) const {
+ return div.denom.magic == other.div.denom.magic &&
+ div.denom.more == other.div.denom.more;
+ }
+
+ bool operator!=(const divider<T, ALGO>& other) const {
+ return !(*this == other);
+ }
+
+#if defined(LIBDIVIDE_VECTOR_TYPE)
+ // Treats the vector as packed integer values with the same type as
+ // the divider (e.g. s32, u32, s64, u64) and divides each of
+ // them by the divider, returning the packed quotients.
+ LIBDIVIDE_VECTOR_TYPE divide(LIBDIVIDE_VECTOR_TYPE n) const {
+ return div.divide(n);
+ }
+#endif
+
+private:
+ // Storage for the actual divisor
+ dispatcher<std::is_integral<T>::value,
+ std::is_signed<T>::value, sizeof(T), ALGO> div;
+};
+
+// Overload of operator / for scalar division
+template<typename T, int ALGO>
+T operator/(T n, const divider<T, ALGO>& div) {
+ return div.divide(n);
+}
+
+// Overload of operator /= for scalar division
+template<typename T, int ALGO>
+T& operator/=(T& n, const divider<T, ALGO>& div) {
+ n = div.divide(n);
+ return n;
+}
+
+#if defined(LIBDIVIDE_VECTOR_TYPE)
+ // Overload of operator / for vector division
+ template<typename T, int ALGO>
+ LIBDIVIDE_VECTOR_TYPE operator/(LIBDIVIDE_VECTOR_TYPE n, const divider<T, ALGO>& div) {
+ return div.divide(n);
+ }
+ // Overload of operator /= for vector division
+ template<typename T, int ALGO>
+ LIBDIVIDE_VECTOR_TYPE& operator/=(LIBDIVIDE_VECTOR_TYPE& n, const divider<T, ALGO>& div) {
+ n = div.divide(n);
+ return n;
+ }
+#endif
+
+// libdivide::branchfree_divider<T>
+template <typename T>
+using branchfree_divider = divider<T, BRANCHFREE>;
+
+} // namespace libdivide
+
+#endif // __cplusplus
+
+#endif // LIBDIVIDE_H
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index bbcf468c1..63e8bf974 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -210,6 +210,7 @@ typedef enum {
/* For specifying allowed casting in operations which support it */
typedef enum {
+ _NPY_ERROR_OCCURRED_IN_CAST = -1,
/* Only allow identical types */
NPY_NO_CASTING=0,
/* Allow identical and byte swapped types */
@@ -219,7 +220,14 @@ typedef enum {
/* Allow safe casts or casts within the same kind */
NPY_SAME_KIND_CASTING=3,
/* Allow any casts */
- NPY_UNSAFE_CASTING=4
+ NPY_UNSAFE_CASTING=4,
+ /*
+ * Flag to allow signalling that a cast is a view, this flag is not
+ * valid when requesting a cast of specific safety.
+ * _NPY_CAST_IS_VIEW|NPY_EQUIV_CASTING means the same as NPY_NO_CASTING.
+ */
+ // TODO-DTYPES: Needs to be documented.
+ _NPY_CAST_IS_VIEW = 1 << 16,
} NPY_CASTING;
typedef enum {
@@ -701,6 +709,7 @@ typedef struct tagPyArrayObject_fields {
int flags;
/* For weak references */
PyObject *weakreflist;
+ void *_buffer_info; /* private buffer info, tagged to allow warning */
} PyArrayObject_fields;
/*
@@ -720,7 +729,18 @@ typedef struct tagPyArrayObject {
} PyArrayObject;
#endif
-#define NPY_SIZEOF_PYARRAYOBJECT (sizeof(PyArrayObject_fields))
+/*
+ * Removed 2020-Nov-25, NumPy 1.20
+ * #define NPY_SIZEOF_PYARRAYOBJECT (sizeof(PyArrayObject_fields))
+ *
+ * The above macro was removed as it gave a false sense of a stable ABI
+ * with respect to the structures size. If you require a runtime constant,
+ * you can use `PyArray_Type.tp_basicsize` instead. Otherwise, please
+ * see the PyArrayObject documentation or ask the NumPy developers for
+ * information on how to correctly replace the macro in a way that is
+ * compatible with multiple NumPy versions.
+ */
+
/* Array Flags Object */
typedef struct PyArrayFlagsObject {
@@ -1759,8 +1779,8 @@ typedef struct {
} npy_stride_sort_item;
/************************************************************
- * This is the form of the struct that's returned pointed by the
- * PyCObject attribute of an array __array_struct__. See
+ * This is the form of the struct that's stored in the
+ * PyCapsule returned by an array's __array_struct__ attribute. See
* https://docs.scipy.org/doc/numpy/reference/arrays.interface.html for the full
* documentation.
************************************************************/
@@ -1839,6 +1859,10 @@ typedef void (PyDataMem_EventHookFunc)(void *inp, void *outp, size_t size,
PyArray_DTypeMeta *cls, PyTypeObject *obj);
typedef PyArray_Descr *(default_descr_function)(PyArray_DTypeMeta *cls);
+ typedef PyArray_DTypeMeta *(common_dtype_function)(
+ PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2);
+ typedef PyArray_Descr *(common_instance_function)(
+ PyArray_Descr *dtype1, PyArray_Descr *dtype2);
/*
* While NumPy DTypes would not need to be heap types the plan is to
@@ -1894,6 +1918,14 @@ typedef void (PyDataMem_EventHookFunc)(void *inp, void *outp, size_t size,
discover_descr_from_pyobject_function *discover_descr_from_pyobject;
is_known_scalar_type_function *is_known_scalar_type;
default_descr_function *default_descr;
+ common_dtype_function *common_dtype;
+ common_instance_function *common_instance;
+ /*
+ * Dictionary of ArrayMethods representing most possible casts
+ * (structured and object are exceptions).
+ * This should potentially become a weak mapping in the future.
+ */
+ PyObject *castingimpls;
};
#endif /* NPY_INTERNAL_BUILD */
diff --git a/numpy/core/include/numpy/npy_3kcompat.h b/numpy/core/include/numpy/npy_3kcompat.h
index 7e6b01924..191cd244f 100644
--- a/numpy/core/include/numpy/npy_3kcompat.h
+++ b/numpy/core/include/numpy/npy_3kcompat.h
@@ -28,6 +28,30 @@ extern "C" {
* PyInt -> PyLong
*/
+
+/*
+ * This is a renamed copy of the Python non-limited API function _PyLong_AsInt. It is
+ * included here because it is missing from the PyPy API. It completes the PyLong_As*
+ * group of functions and can be useful in replacing PyInt_Check.
+ */
+static NPY_INLINE int
+Npy__PyLong_AsInt(PyObject *obj)
+{
+ int overflow;
+ long result = PyLong_AsLongAndOverflow(obj, &overflow);
+
+ /* INT_MAX and INT_MIN are defined in Python.h */
+ if (overflow || result > INT_MAX || result < INT_MIN) {
+ /* XXX: could be cute and give a different
+ message for overflow == -1 */
+ PyErr_SetString(PyExc_OverflowError,
+ "Python int too large to convert to C int");
+ return -1;
+ }
+ return (int)result;
+}
+
+
#if defined(NPY_PY3K)
/* Return True only if the long fits in a C long */
static NPY_INLINE int PyInt_Check(PyObject *op) {
@@ -39,6 +63,7 @@ static NPY_INLINE int PyInt_Check(PyObject *op) {
return (overflow == 0);
}
+
#define PyInt_FromLong PyLong_FromLong
#define PyInt_AsLong PyLong_AsLong
#define PyInt_AS_LONG PyLong_AsLong
@@ -65,6 +90,8 @@ static NPY_INLINE int PyInt_Check(PyObject *op) {
#define Py_SET_TYPE(obj, type) ((Py_TYPE(obj) = (type)), (void)0)
/* Introduced in https://github.com/python/cpython/commit/b10dc3e7a11fcdb97e285882eba6da92594f90f9 */
#define Py_SET_SIZE(obj, size) ((Py_SIZE(obj) = (size)), (void)0)
+ /* Introduced in https://github.com/python/cpython/commit/c86a11221df7e37da389f9c6ce6e47ea22dc44ff */
+ #define Py_SET_REFCNT(obj, refcnt) ((Py_REFCNT(obj) = (refcnt)), (void)0)
#endif
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 5706e0576..c8495db8e 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -1,22 +1,34 @@
#ifndef _NPY_COMMON_H_
#define _NPY_COMMON_H_
+/* need Python.h for npy_intp, npy_uintp */
+#include <Python.h>
+
/* numpconfig.h is auto-generated */
#include "numpyconfig.h"
#ifdef HAVE_NPY_CONFIG_H
#include <npy_config.h>
#endif
-/* need Python.h for npy_intp, npy_uintp */
-#include <Python.h>
-
+// compile time environment variables
+#ifndef NPY_RELAXED_STRIDES_CHECKING
+ #define NPY_RELAXED_STRIDES_CHECKING 0
+#endif
+#ifndef NPY_RELAXED_STRIDES_DEBUG
+ #define NPY_RELAXED_STRIDES_DEBUG 0
+#endif
+#ifndef NPY_USE_NEW_CASTINGIMPL
+ #define NPY_USE_NEW_CASTINGIMPL 0
+#endif
/*
* using static inline modifiers when defining npy_math functions
* allows the compiler to make optimizations when possible
*/
-#if defined(NPY_INTERNAL_BUILD) && NPY_INTERNAL_BUILD
#ifndef NPY_INLINE_MATH
-#define NPY_INLINE_MATH 1
+#if defined(NPY_INTERNAL_BUILD) && NPY_INTERNAL_BUILD
+ #define NPY_INLINE_MATH 1
+#else
+ #define NPY_INLINE_MATH 0
#endif
#endif
@@ -262,11 +274,10 @@ typedef Py_uintptr_t npy_uintp;
#define constchar char
/* NPY_INTP_FMT Note:
- * Unlike the other NPY_*_FMT macros which are used with
- * PyOS_snprintf, NPY_INTP_FMT is used with PyErr_Format and
- * PyString_Format. These functions use different formatting
- * codes which are portably specified according to the Python
- * documentation. See ticket #1795.
+ * Unlike the other NPY_*_FMT macros, which are used with PyOS_snprintf,
+ * NPY_INTP_FMT is used with PyErr_Format and PyUnicode_FromFormat. Those
+ * functions use different formatting codes that are portably specified
+ * according to the Python documentation. See issue gh-2388.
*/
#if NPY_SIZEOF_PY_INTPTR_T == NPY_SIZEOF_INT
#define NPY_INTP NPY_INT
diff --git a/numpy/core/include/numpy/npy_cpu.h b/numpy/core/include/numpy/npy_cpu.h
index 509e23a51..4dbf9d84e 100644
--- a/numpy/core/include/numpy/npy_cpu.h
+++ b/numpy/core/include/numpy/npy_cpu.h
@@ -24,7 +24,6 @@
#define _NPY_CPUARCH_H_
#include "numpyconfig.h"
-#include <string.h> /* for memcpy */
#if defined( __i386__ ) || defined(i386) || defined(_M_IX86)
/*
@@ -111,8 +110,6 @@
information about your platform (OS, CPU and compiler)
#endif
-#define NPY_COPY_PYOBJECT_PTR(dst, src) memcpy(dst, src, sizeof(PyObject *))
-
#if (defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64))
#define NPY_CPU_HAVE_UNALIGNED_ACCESS 1
#else
diff --git a/numpy/core/include/numpy/npy_interrupt.h b/numpy/core/include/numpy/npy_interrupt.h
index 40cb7ac5e..bcb539326 100644
--- a/numpy/core/include/numpy/npy_interrupt.h
+++ b/numpy/core/include/numpy/npy_interrupt.h
@@ -1,79 +1,18 @@
-
-/* Signal handling:
-
-This header file defines macros that allow your code to handle
-interrupts received during processing. Interrupts that
-could reasonably be handled:
-
-SIGINT, SIGABRT, SIGALRM, SIGSEGV
-
-****Warning***************
-
-Do not allow code that creates temporary memory or increases reference
-counts of Python objects to be interrupted unless you handle it
-differently.
-
-**************************
-
-The mechanism for handling interrupts is conceptually simple:
-
- - replace the signal handler with our own home-grown version
- and store the old one.
- - run the code to be interrupted -- if an interrupt occurs
- the handler should basically just cause a return to the
- calling function for finish work.
- - restore the old signal handler
-
-Of course, every code that allows interrupts must account for
-returning via the interrupt and handle clean-up correctly. But,
-even still, the simple paradigm is complicated by at least three
-factors.
-
- 1) platform portability (i.e. Microsoft says not to use longjmp
- to return from signal handling. They have a __try and __except
- extension to C instead but what about mingw?).
-
- 2) how to handle threads: apparently whether signals are delivered to
- every thread of the process or the "invoking" thread is platform
- dependent. --- we don't handle threads for now.
-
- 3) do we need to worry about re-entrance. For now, assume the
- code will not call-back into itself.
-
-Ideas:
-
- 1) Start by implementing an approach that works on platforms that
- can use setjmp and longjmp functionality and does nothing
- on other platforms.
-
- 2) Ignore threads --- i.e. do not mix interrupt handling and threads
-
- 3) Add a default signal_handler function to the C-API but have the rest
- use macros.
-
-
-Simple Interface:
-
-
-In your C-extension: around a block of code you want to be interruptible
-with a SIGINT
-
-NPY_SIGINT_ON
-[code]
-NPY_SIGINT_OFF
-
-In order for this to work correctly, the
-[code] block must not allocate any memory or alter the reference count of any
-Python objects. In other words [code] must be interruptible so that continuation
-after NPY_SIGINT_OFF will only be "missing some computations"
-
-Interrupt handling does not work well with threads.
-
-*/
-
-/* Add signal handling macros
- Make the global variable and signal handler part of the C-API
-*/
+/*
+ * This API is only provided because it is part of publicly exported
+ * headers. Its use is considered DEPRECATED, and it will be removed
+ * eventually.
+ * (This includes the _PyArray_SigintHandler and _PyArray_GetSigintBuf
+ * functions which are however, public API, and not headers.)
+ *
+ * Instead of using these non-threadsafe macros, consider periodically
+ * querying `PyErr_CheckSignals()` or `PyOS_InterruptOccurred()` instead.
+ * Both of these require holding the GIL, although cpython could add a
+ * version of `PyOS_InterruptOccurred()` which does not. Such a version
+ * actually exists as private API in Python 3.10, and backported to 3.9 and 3.8
+ * (see also https://bugs.python.org/issue41037 and
+ * https://github.com/python/cpython/pull/20599).
+ */
#ifndef NPY_INTERRUPT_H
#define NPY_INTERRUPT_H
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index a07f49501..7d71c36cc 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -5,14 +5,12 @@
extern "C" {
#endif
+#include <numpy/npy_common.h>
+
#include <math.h>
#ifdef __SUNPRO_CC
#include <sunmath.h>
#endif
-#ifdef HAVE_NPY_CONFIG_H
-#include <npy_config.h>
-#endif
-#include <numpy/npy_common.h>
/* By adding static inline specifiers to npy_math function definitions when
appropriate, compiler is given the opportunity to optimize */
@@ -213,7 +211,7 @@ double npy_spacing(double x);
/* use builtins to avoid function calls in tight loops
* only available if npy_config.h is available (= numpys own build) */
-#if HAVE___BUILTIN_ISNAN
+#ifdef HAVE___BUILTIN_ISNAN
#define npy_isnan(x) __builtin_isnan(x)
#else
#ifndef NPY_HAVE_DECL_ISNAN
@@ -229,7 +227,7 @@ double npy_spacing(double x);
/* only available if npy_config.h is available (= numpys own build) */
-#if HAVE___BUILTIN_ISFINITE
+#ifdef HAVE___BUILTIN_ISFINITE
#define npy_isfinite(x) __builtin_isfinite(x)
#else
#ifndef NPY_HAVE_DECL_ISFINITE
@@ -244,7 +242,7 @@ double npy_spacing(double x);
#endif
/* only available if npy_config.h is available (= numpys own build) */
-#if HAVE___BUILTIN_ISINF
+#ifdef HAVE___BUILTIN_ISINF
#define npy_isinf(x) __builtin_isinf(x)
#else
#ifndef NPY_HAVE_DECL_ISINF
diff --git a/numpy/core/include/numpy/numpyconfig.h b/numpy/core/include/numpy/numpyconfig.h
index 8eaf446b7..a1b1de0ef 100644
--- a/numpy/core/include/numpy/numpyconfig.h
+++ b/numpy/core/include/numpy/numpyconfig.h
@@ -41,6 +41,7 @@
#define NPY_1_17_API_VERSION 0x00000008
#define NPY_1_18_API_VERSION 0x00000008
#define NPY_1_19_API_VERSION 0x00000008
-#define NPY_1_20_API_VERSION 0x00000008
+#define NPY_1_20_API_VERSION 0x0000000e
+#define NPY_1_21_API_VERSION 0x0000000e
#endif
diff --git a/numpy/core/memmap.py b/numpy/core/memmap.py
index cb025736e..892ad2540 100644
--- a/numpy/core/memmap.py
+++ b/numpy/core/memmap.py
@@ -1,8 +1,8 @@
+from contextlib import nullcontext
+
import numpy as np
from .numeric import uint8, ndarray, dtype
-from numpy.compat import (
- os_fspath, contextlib_nullcontext, is_pathlib_path
-)
+from numpy.compat import os_fspath, is_pathlib_path
from numpy.core.overrides import set_module
__all__ = ['memmap']
@@ -37,7 +37,10 @@ class memmap(ndarray):
This class may at some point be turned into a factory function
which returns a view into an mmap buffer.
- Delete the memmap instance to close the memmap file.
+ Flush the memmap instance to write the changes to the file. Currently there
+ is no API to close the underlying ``mmap``. It is tricky to ensure the
+ resource is actually closed, since it may be shared between different
+ memmap instances.
Parameters
@@ -97,7 +100,7 @@ class memmap(ndarray):
flush
Flush any changes in memory to file on disk.
When you delete a memmap object, flush is called first to write
- changes to disk before removing the object.
+ changes to disk.
See also
@@ -109,7 +112,7 @@ class memmap(ndarray):
The memmap object can be used anywhere an ndarray is accepted.
Given a memmap ``fp``, ``isinstance(fp, numpy.ndarray)`` returns
``True``.
-
+
Memory-mapped files cannot be larger than 2GB on 32-bit systems.
When a memmap causes a file to be created or extended beyond its
@@ -148,9 +151,9 @@ class memmap(ndarray):
>>> fp.filename == path.abspath(filename)
True
- Deletion flushes memory changes to disk before removing the object:
+ Flushes memory changes to disk in order to read them back
- >>> del fp
+ >>> fp.flush()
Load the memmap and verify data was stored:
@@ -220,7 +223,7 @@ class memmap(ndarray):
raise ValueError("shape must be given")
if hasattr(filename, 'read'):
- f_ctx = contextlib_nullcontext(filename)
+ f_ctx = nullcontext(filename)
else:
f_ctx = open(os_fspath(filename), ('r' if mode == 'c' else mode)+'b')
diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py
index 0becc2393..f736973de 100644
--- a/numpy/core/multiarray.py
+++ b/numpy/core/multiarray.py
@@ -38,7 +38,7 @@ __all__ = [
'nested_iters', 'normalize_axis_index', 'packbits',
'promote_types', 'putmask', 'ravel_multi_index', 'result_type', 'scalar',
'set_datetimeparse_function', 'set_legacy_print_mode', 'set_numeric_ops',
- 'set_string_function', 'set_typeDict', 'shares_memory', 'test_interrupt',
+ 'set_string_function', 'set_typeDict', 'shares_memory',
'tracemalloc_domain', 'typeinfo', 'unpackbits', 'unravel_index', 'vdot',
'where', 'zeros']
@@ -90,14 +90,14 @@ def empty_like(prototype, dtype=None, order=None, subok=None, shape=None):
.. versionadded:: 1.6.0
order : {'C', 'F', 'A', or 'K'}, optional
Overrides the memory layout of the result. 'C' means C-order,
- 'F' means F-order, 'A' means 'F' if ``prototype`` is Fortran
- contiguous, 'C' otherwise. 'K' means match the layout of ``prototype``
+ 'F' means F-order, 'A' means 'F' if `prototype` is Fortran
+ contiguous, 'C' otherwise. 'K' means match the layout of `prototype`
as closely as possible.
.. versionadded:: 1.6.0
subok : bool, optional.
If True, then the newly created array will use the sub-class
- type of 'a', otherwise it will be a base-class array. Defaults
+ type of `prototype`, otherwise it will be a base-class array. Defaults
to True.
shape : int or sequence of ints, optional.
Overrides the shape of the result. If order='K' and the number of
@@ -141,9 +141,9 @@ def empty_like(prototype, dtype=None, order=None, subok=None, shape=None):
@array_function_from_c_func_and_dispatcher(_multiarray_umath.concatenate)
-def concatenate(arrays, axis=None, out=None):
+def concatenate(arrays, axis=None, out=None, *, dtype=None, casting=None):
"""
- concatenate((a1, a2, ...), axis=0, out=None)
+ concatenate((a1, a2, ...), axis=0, out=None, dtype=None, casting="same_kind")
Join a sequence of arrays along an existing axis.
@@ -159,6 +159,16 @@ def concatenate(arrays, axis=None, out=None):
If provided, the destination to place the result. The shape must be
correct, matching that of what concatenate would have returned if no
out argument were specified.
+ dtype : str or dtype
+ If provided, the destination array will have this dtype. Cannot be
+ provided together with `out`.
+
+ .. versionadded:: 1.20.0
+
+ casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
+ Controls what kind of data casting may occur. Defaults to 'same_kind'.
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -396,7 +406,7 @@ def lexsort(keys, axis=None):
for the primary sort order, the second-to-last key for the secondary sort
order, and so on. The keys argument must be a sequence of objects that
can be converted to arrays of the same shape. If a 2D array is provided
- for the keys argument, it's rows are interpreted as the sorting keys and
+ for the keys argument, its rows are interpreted as the sorting keys and
sorting is according to the last row, second last row etc.
Parameters
@@ -989,7 +999,7 @@ def ravel_multi_index(multi_index, dims, mode=None, order=None):
@array_function_from_c_func_and_dispatcher(_multiarray_umath.unravel_index)
-def unravel_index(indices, shape=None, order=None, dims=None):
+def unravel_index(indices, shape=None, order=None):
"""
unravel_index(indices, shape, order='C')
@@ -1035,9 +1045,6 @@ def unravel_index(indices, shape=None, order=None, dims=None):
(3, 1, 4, 1)
"""
- if dims is not None:
- warnings.warn("'shape' argument should be used instead of 'dims'",
- DeprecationWarning, stacklevel=3)
return (indices,)
@@ -1090,7 +1097,7 @@ def putmask(a, mask, values):
Parameters
----------
- a : array_like
+ a : ndarray
Target array.
mask : array_like
Boolean mask array. It has to be the same shape as `a`.
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index e77161f59..c95c48d71 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -21,7 +21,7 @@ from .multiarray import (
from . import overrides
from . import umath
from . import shape_base
-from .overrides import set_module
+from .overrides import set_array_function_like_doc, set_module
from .umath import (multiply, invert, sin, PINF, NAN)
from . import numerictypes
from .numerictypes import longlong, intc, int_, float_, complex_, bool_
@@ -95,7 +95,7 @@ def zeros_like(a, dtype=None, order='K', subok=True, shape=None):
.. versionadded:: 1.6.0
subok : bool, optional.
If True, then the newly created array will use the sub-class
- type of 'a', otherwise it will be a base-class array. Defaults
+ type of `a`, otherwise it will be a base-class array. Defaults
to True.
shape : int or sequence of ints, optional.
Overrides the shape of the result. If order='K' and the number of
@@ -141,8 +141,13 @@ def zeros_like(a, dtype=None, order='K', subok=True, shape=None):
return res
+def _ones_dispatcher(shape, dtype=None, order=None, *, like=None):
+ return(like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def ones(shape, dtype=None, order='C'):
+def ones(shape, dtype=None, order='C', *, like=None):
"""
Return a new array of given shape and type, filled with ones.
@@ -157,6 +162,9 @@ def ones(shape, dtype=None, order='C'):
Whether to store multi-dimensional data in row-major
(C-style) or column-major (Fortran-style) order in
memory.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -189,11 +197,19 @@ def ones(shape, dtype=None, order='C'):
[1., 1.]])
"""
+ if like is not None:
+ return _ones_with_like(shape, dtype=dtype, order=order, like=like)
+
a = empty(shape, dtype, order)
multiarray.copyto(a, 1, casting='unsafe')
return a
+_ones_with_like = array_function_dispatch(
+ _ones_dispatcher
+)(ones)
+
+
def _ones_like_dispatcher(a, dtype=None, order=None, subok=None, shape=None):
return (a,)
@@ -221,7 +237,7 @@ def ones_like(a, dtype=None, order='K', subok=True, shape=None):
.. versionadded:: 1.6.0
subok : bool, optional.
If True, then the newly created array will use the sub-class
- type of 'a', otherwise it will be a base-class array. Defaults
+ type of `a`, otherwise it will be a base-class array. Defaults
to True.
shape : int or sequence of ints, optional.
Overrides the shape of the result. If order='K' and the number of
@@ -265,8 +281,13 @@ def ones_like(a, dtype=None, order='K', subok=True, shape=None):
return res
+def _full_dispatcher(shape, fill_value, dtype=None, order=None, *, like=None):
+ return(like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def full(shape, fill_value, dtype=None, order='C'):
+def full(shape, fill_value, dtype=None, order='C', *, like=None):
"""
Return a new array of given shape and type, filled with `fill_value`.
@@ -282,6 +303,9 @@ def full(shape, fill_value, dtype=None, order='C'):
order : {'C', 'F'}, optional
Whether to store multidimensional data in C- or Fortran-contiguous
(row- or column-wise) order in memory.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -309,6 +333,9 @@ def full(shape, fill_value, dtype=None, order='C'):
[1, 2]])
"""
+ if like is not None:
+ return _full_with_like(shape, fill_value, dtype=dtype, order=order, like=like)
+
if dtype is None:
fill_value = asarray(fill_value)
dtype = fill_value.dtype
@@ -317,6 +344,11 @@ def full(shape, fill_value, dtype=None, order='C'):
return a
+_full_with_like = array_function_dispatch(
+ _full_dispatcher
+)(full)
+
+
def _full_like_dispatcher(a, fill_value, dtype=None, order=None, subok=None, shape=None):
return (a,)
@@ -342,7 +374,7 @@ def full_like(a, fill_value, dtype=None, order='K', subok=True, shape=None):
as possible.
subok : bool, optional.
If True, then the newly created array will use the sub-class
- type of 'a', otherwise it will be a base-class array. Defaults
+ type of `a`, otherwise it will be a base-class array. Defaults
to True.
shape : int or sequence of ints, optional.
Overrides the shape of the result. If order='K' and the number of
@@ -377,7 +409,7 @@ def full_like(a, fill_value, dtype=None, order='K', subok=True, shape=None):
>>> y = np.arange(6, dtype=np.double)
>>> np.full_like(y, 0.1)
- array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
+ array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
"""
res = empty_like(a, dtype=dtype, order=order, subok=subok, shape=shape)
@@ -1754,8 +1786,13 @@ def indices(dimensions, dtype=int, sparse=False):
return res
+def _fromfunction_dispatcher(function, shape, *, dtype=None, like=None, **kwargs):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def fromfunction(function, shape, *, dtype=float, **kwargs):
+def fromfunction(function, shape, *, dtype=float, like=None, **kwargs):
"""
Construct an array by executing a function over each coordinate.
@@ -1776,6 +1813,9 @@ def fromfunction(function, shape, *, dtype=float, **kwargs):
dtype : data-type, optional
Data-type of the coordinate arrays passed to `function`.
By default, `dtype` is float.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -1806,10 +1846,18 @@ def fromfunction(function, shape, *, dtype=float, **kwargs):
[2, 3, 4]])
"""
+ if like is not None:
+ return _fromfunction_with_like(function, shape, dtype=dtype, like=like, **kwargs)
+
args = indices(shape, dtype=dtype)
return function(*args, **kwargs)
+_fromfunction_with_like = array_function_dispatch(
+ _fromfunction_dispatcher
+)(fromfunction)
+
+
def _frombuffer(buf, dtype, shape, order):
return frombuffer(buf, dtype=dtype).reshape(shape, order=order)
@@ -2082,8 +2130,13 @@ def _maketup(descr, val):
return tuple(res)
+def _identity_dispatcher(n, dtype=None, *, like=None):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def identity(n, dtype=None):
+def identity(n, dtype=None, *, like=None):
"""
Return the identity array.
@@ -2096,6 +2149,9 @@ def identity(n, dtype=None):
Number of rows (and columns) in `n` x `n` output.
dtype : data-type, optional
Data-type of the output. Defaults to ``float``.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -2111,8 +2167,16 @@ def identity(n, dtype=None):
[0., 0., 1.]])
"""
+ if like is not None:
+ return _identity_with_like(n, dtype=dtype, like=like)
+
from numpy import eye
- return eye(n, dtype=dtype)
+ return eye(n, dtype=dtype, like=like)
+
+
+_identity_with_like = array_function_dispatch(
+ _identity_dispatcher
+)(identity)
def _allclose_dispatcher(a, b, rtol=None, atol=None, equal_nan=None):
@@ -2173,6 +2237,8 @@ def allclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False):
``allclose(a, b)`` to evaluate to True. The same is true for
`equal` but not `array_equal`.
+ `allclose` is not defined for non-numeric data types.
+
Examples
--------
>>> np.allclose([1e10,1e-7], [1.00001e10,1e-8])
@@ -2252,6 +2318,8 @@ def isclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False):
`atol` should be carefully selected for the use case at hand. A zero value
for `atol` will result in `False` if either `a` or `b` is zero.
+ `isclose` is not defined for non-numeric data types.
+
Examples
--------
>>> np.isclose([1e10,1e-7], [1.00001e10,1e-8])
diff --git a/numpy/core/numeric.pyi b/numpy/core/numeric.pyi
new file mode 100644
index 000000000..d91cb31c2
--- /dev/null
+++ b/numpy/core/numeric.pyi
@@ -0,0 +1,189 @@
+import sys
+from typing import (
+ Any,
+ Optional,
+ Union,
+ Sequence,
+ Tuple,
+ Callable,
+ List,
+ overload,
+ TypeVar,
+ Iterable,
+)
+
+from numpy import ndarray, generic, dtype, bool_, signedinteger, _OrderKACF, _OrderCF
+from numpy.typing import ArrayLike, DTypeLike, _ShapeLike
+
+if sys.version_info >= (3, 8):
+ from typing import Literal
+else:
+ from typing_extensions import Literal
+
+_T = TypeVar("_T")
+_ArrayType = TypeVar("_ArrayType", bound=ndarray)
+
+_CorrelateMode = Literal["valid", "same", "full"]
+
+@overload
+def zeros_like(
+ a: _ArrayType,
+ dtype: None = ...,
+ order: _OrderKACF = ...,
+ subok: Literal[True] = ...,
+ shape: None = ...,
+) -> _ArrayType: ...
+@overload
+def zeros_like(
+ a: ArrayLike,
+ dtype: DTypeLike = ...,
+ order: _OrderKACF = ...,
+ subok: bool = ...,
+ shape: Optional[_ShapeLike] = ...,
+) -> ndarray: ...
+def ones(
+ shape: _ShapeLike,
+ dtype: DTypeLike = ...,
+ order: _OrderCF = ...,
+ *,
+ like: ArrayLike = ...,
+) -> ndarray: ...
+@overload
+def ones_like(
+ a: _ArrayType,
+ dtype: None = ...,
+ order: _OrderKACF = ...,
+ subok: Literal[True] = ...,
+ shape: None = ...,
+) -> _ArrayType: ...
+@overload
+def ones_like(
+ a: ArrayLike,
+ dtype: DTypeLike = ...,
+ order: _OrderKACF = ...,
+ subok: bool = ...,
+ shape: Optional[_ShapeLike] = ...,
+) -> ndarray: ...
+@overload
+def empty_like(
+ a: _ArrayType,
+ dtype: None = ...,
+ order: _OrderKACF = ...,
+ subok: Literal[True] = ...,
+ shape: None = ...,
+) -> _ArrayType: ...
+@overload
+def empty_like(
+ a: ArrayLike,
+ dtype: DTypeLike = ...,
+ order: _OrderKACF = ...,
+ subok: bool = ...,
+ shape: Optional[_ShapeLike] = ...,
+) -> ndarray: ...
+def full(
+ shape: _ShapeLike,
+ fill_value: Any,
+ dtype: DTypeLike = ...,
+ order: _OrderCF = ...,
+ *,
+ like: ArrayLike = ...,
+) -> ndarray: ...
+@overload
+def full_like(
+ a: _ArrayType,
+ fill_value: Any,
+ dtype: None = ...,
+ order: _OrderKACF = ...,
+ subok: Literal[True] = ...,
+ shape: None = ...,
+) -> _ArrayType: ...
+@overload
+def full_like(
+ a: ArrayLike,
+ fill_value: Any,
+ dtype: DTypeLike = ...,
+ order: _OrderKACF = ...,
+ subok: bool = ...,
+ shape: Optional[_ShapeLike] = ...,
+) -> ndarray: ...
+@overload
+def count_nonzero(
+ a: ArrayLike, axis: None = ..., *, keepdims: Literal[False] = ...
+) -> int: ...
+@overload
+def count_nonzero(
+ a: ArrayLike, axis: _ShapeLike = ..., *, keepdims: bool = ...
+) -> Union[signedinteger[Any], ndarray]: ... # TODO: np.intp
+def isfortran(a: Union[ndarray, generic]) -> bool: ...
+def argwhere(a: ArrayLike) -> ndarray: ...
+def flatnonzero(a: ArrayLike) -> ndarray: ...
+def correlate(a: ArrayLike, v: ArrayLike, mode: _CorrelateMode = ...) -> ndarray: ...
+def convolve(a: ArrayLike, v: ArrayLike, mode: _CorrelateMode = ...) -> ndarray: ...
+@overload
+def outer(a: ArrayLike, b: ArrayLike, out: None = ...) -> ndarray: ...
+@overload
+def outer(a: ArrayLike, b: ArrayLike, out: _ArrayType = ...) -> _ArrayType: ...
+def tensordot(
+ a: ArrayLike,
+ b: ArrayLike,
+ axes: Union[int, Tuple[_ShapeLike, _ShapeLike]] = ...,
+) -> ndarray: ...
+def roll(
+ a: ArrayLike,
+ shift: _ShapeLike,
+ axis: Optional[_ShapeLike] = ...,
+) -> ndarray: ...
+def rollaxis(a: ndarray, axis: int, start: int = ...) -> ndarray: ...
+def moveaxis(
+ a: ndarray,
+ source: _ShapeLike,
+ destination: _ShapeLike,
+) -> ndarray: ...
+def cross(
+ a: ArrayLike,
+ b: ArrayLike,
+ axisa: int = ...,
+ axisb: int = ...,
+ axisc: int = ...,
+ axis: Optional[int] = ...,
+) -> ndarray: ...
+@overload
+def indices(
+ dimensions: Sequence[int],
+ dtype: DTypeLike = ...,
+ sparse: Literal[False] = ...,
+) -> ndarray: ...
+@overload
+def indices(
+ dimensions: Sequence[int],
+ dtype: DTypeLike = ...,
+ sparse: Literal[True] = ...,
+) -> Tuple[ndarray, ...]: ...
+def fromfunction(
+ function: Callable[..., _T],
+ shape: Sequence[int],
+ *,
+ dtype: DTypeLike = ...,
+ like: ArrayLike = ...,
+ **kwargs: Any,
+) -> _T: ...
+def isscalar(element: Any) -> bool: ...
+def binary_repr(num: int, width: Optional[int] = ...) -> str: ...
+def base_repr(number: int, base: int = ..., padding: int = ...) -> str: ...
+def identity(n: int, dtype: DTypeLike = ..., *, like: ArrayLike = ...) -> ndarray: ...
+def allclose(
+ a: ArrayLike,
+ b: ArrayLike,
+ rtol: float = ...,
+ atol: float = ...,
+ equal_nan: bool = ...,
+) -> bool: ...
+def isclose(
+ a: ArrayLike,
+ b: ArrayLike,
+ rtol: float = ...,
+ atol: float = ...,
+ equal_nan: bool = ...,
+) -> Union[bool_, ndarray]: ...
+def array_equal(a1: ArrayLike, a2: ArrayLike) -> bool: ...
+def array_equiv(a1: ArrayLike, a2: ArrayLike) -> bool: ...
diff --git a/numpy/core/numerictypes.py b/numpy/core/numerictypes.py
index 2a015f48f..e705dd3ea 100644
--- a/numpy/core/numerictypes.py
+++ b/numpy/core/numerictypes.py
@@ -358,13 +358,15 @@ def issubsctype(arg1, arg2):
@set_module('numpy')
def issubdtype(arg1, arg2):
- """
+ r"""
Returns True if first argument is a typecode lower/equal in type hierarchy.
+ This is like the builtin :func:`issubclass`, but for `dtype`\ s.
+
Parameters
----------
arg1, arg2 : dtype_like
- dtype or string representing a typecode.
+ `dtype` or object coercible to one
Returns
-------
@@ -372,15 +374,45 @@ def issubdtype(arg1, arg2):
See Also
--------
+ :ref:`arrays.scalars` : Overview of the numpy type hierarchy.
issubsctype, issubclass_
- numpy.core.numerictypes : Overview of numpy type hierarchy.
Examples
--------
- >>> np.issubdtype('S1', np.string_)
+ `issubdtype` can be used to check the type of arrays:
+
+ >>> ints = np.array([1, 2, 3], dtype=np.int32)
+ >>> np.issubdtype(ints.dtype, np.integer)
+ True
+ >>> np.issubdtype(ints.dtype, np.floating)
+ False
+
+ >>> floats = np.array([1, 2, 3], dtype=np.float32)
+ >>> np.issubdtype(floats.dtype, np.integer)
+ False
+ >>> np.issubdtype(floats.dtype, np.floating)
True
+
+ Similar types of different sizes are not subdtypes of each other:
+
>>> np.issubdtype(np.float64, np.float32)
False
+ >>> np.issubdtype(np.float32, np.float64)
+ False
+
+ but both are subtypes of `floating`:
+
+ >>> np.issubdtype(np.float64, np.floating)
+ True
+ >>> np.issubdtype(np.float32, np.floating)
+ True
+
+ For convenience, dtype-like objects are allowed too:
+
+ >>> np.issubdtype('S1', np.string_)
+ True
+ >>> np.issubdtype('i4', np.signedinteger)
+ True
"""
if not issubclass_(arg1, generic):
diff --git a/numpy/core/numerictypes.pyi b/numpy/core/numerictypes.pyi
new file mode 100644
index 000000000..192015ff1
--- /dev/null
+++ b/numpy/core/numerictypes.pyi
@@ -0,0 +1,29 @@
+from typing import TypeVar, Optional, Type, Union, Tuple, Sequence, overload, Any
+
+from numpy import generic, ndarray, dtype
+from numpy.typing import DTypeLike
+
+_DefaultType = TypeVar("_DefaultType")
+
+def maximum_sctype(t: DTypeLike) -> dtype: ...
+def issctype(rep: object) -> bool: ...
+@overload
+def obj2sctype(rep: object) -> Optional[generic]: ...
+@overload
+def obj2sctype(rep: object, default: None) -> Optional[generic]: ...
+@overload
+def obj2sctype(
+ rep: object, default: Type[_DefaultType]
+) -> Union[generic, Type[_DefaultType]]: ...
+def issubclass_(arg1: object, arg2: Union[object, Tuple[object, ...]]) -> bool: ...
+def issubsctype(
+ arg1: Union[ndarray, DTypeLike], arg2: Union[ndarray, DTypeLike]
+) -> bool: ...
+def issubdtype(arg1: DTypeLike, arg2: DTypeLike) -> bool: ...
+def sctype2char(sctype: object) -> str: ...
+def find_common_type(
+ array_types: Sequence[DTypeLike], scalar_types: Sequence[DTypeLike]
+) -> dtype: ...
+
+# TODO: Add annotations for the following objects:
+# typeDict, nbytes, cast, ScalarType & typecodes
diff --git a/numpy/core/overrides.py b/numpy/core/overrides.py
index 816b11293..c2b5fb7fa 100644
--- a/numpy/core/overrides.py
+++ b/numpy/core/overrides.py
@@ -12,6 +12,27 @@ from numpy.compat._inspect import getargspec
ARRAY_FUNCTION_ENABLED = bool(
int(os.environ.get('NUMPY_EXPERIMENTAL_ARRAY_FUNCTION', 1)))
+array_function_like_doc = (
+ """like : array_like
+ Reference object to allow the creation of arrays which are not
+ NumPy arrays. If an array-like passed in as ``like`` supports
+ the ``__array_function__`` protocol, the result will be defined
+ by it. In this case, it ensures the creation of an array object
+ compatible with that passed in via this argument.
+
+ .. note::
+ The ``like`` keyword is an experimental feature pending on
+ acceptance of :ref:`NEP 35 <NEP35>`."""
+)
+
+def set_array_function_like_doc(public_api):
+ if public_api.__doc__ is not None:
+ public_api.__doc__ = public_api.__doc__.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ )
+ return public_api
+
add_docstring(
implement_array_function,
diff --git a/numpy/core/records.py b/numpy/core/records.py
index 0d3fd9118..00d456658 100644
--- a/numpy/core/records.py
+++ b/numpy/core/records.py
@@ -36,12 +36,11 @@ Record arrays allow us to access fields as properties::
import os
import warnings
from collections import Counter, OrderedDict
+from contextlib import nullcontext
from . import numeric as sb
from . import numerictypes as nt
-from numpy.compat import (
- isfileobj, os_fspath, contextlib_nullcontext
-)
+from numpy.compat import os_fspath
from numpy.core.overrides import set_module
from .arrayprint import get_printoptions
@@ -374,7 +373,7 @@ class recarray(ndarray):
See Also
--------
- rec.fromrecords : Construct a record array from data.
+ core.records.fromrecords : Construct a record array from data.
record : fundamental data-type for `recarray`.
format_parser : determine a data-type from formats, names, titles.
@@ -630,7 +629,7 @@ def fromarrays(arrayList, dtype=None, shape=None, formats=None,
>>> x1[1]=34
>>> r.a
array([1, 2, 3, 4])
-
+
>>> x1 = np.array([1, 2, 3, 4])
>>> x2 = np.array(['a', 'dd', 'xyz', '12'])
>>> x3 = np.array([1.1, 2, 3,4])
@@ -847,13 +846,12 @@ def fromstring(datastring, dtype=None, shape=None, offset=0, formats=None,
return _array
def get_remaining_size(fd):
+ pos = fd.tell()
try:
- fn = fd.fileno()
- except AttributeError:
- return os.path.getsize(fd.name) - fd.tell()
- st = os.fstat(fn)
- size = st.st_size - fd.tell()
- return size
+ fd.seek(0, 2)
+ return fd.tell() - pos
+ finally:
+ fd.seek(pos, 0)
def fromfile(fd, dtype=None, shape=None, offset=0, formats=None,
names=None, titles=None, aligned=False, byteorder=None):
@@ -911,9 +909,11 @@ def fromfile(fd, dtype=None, shape=None, offset=0, formats=None,
elif isinstance(shape, int):
shape = (shape,)
- if isfileobj(fd):
+ if hasattr(fd, 'readinto'):
+ # GH issue 2504. fd supports io.RawIOBase or io.BufferedIOBase interface.
+ # Example of fd: gzip, BytesIO, BufferedReader
# file already opened
- ctx = contextlib_nullcontext(fd)
+ ctx = nullcontext(fd)
else:
# open file
ctx = open(os_fspath(fd), 'rb')
@@ -957,7 +957,7 @@ def array(obj, dtype=None, shape=None, offset=0, strides=None, formats=None,
"""
Construct a record array from a wide-variety of objects.
- A general-purpose record array constructor that dispatches to the
+ A general-purpose record array constructor that dispatches to the
appropriate `recarray` creation function based on the inputs (see Notes).
Parameters
@@ -995,7 +995,7 @@ def array(obj, dtype=None, shape=None, offset=0, strides=None, formats=None,
`obj` is a string, then call the `fromstring` constructor. If `obj` is a
list or a tuple, then if the first object is an `~numpy.ndarray`, call
`fromarrays`, otherwise call `fromrecords`. If `obj` is a
- `~numpy.recarray`, then make a copy of the data in the recarray
+ `~numpy.recarray`, then make a copy of the data in the recarray
(if ``copy=True``) and use the new formats, names, and titles. If `obj`
is a file, then call `fromfile`. Finally, if obj is an `ndarray`, then
return ``obj.view(recarray)``, making a copy of the data if ``copy=True``.
@@ -1036,7 +1036,7 @@ def array(obj, dtype=None, shape=None, offset=0, strides=None, formats=None,
array('def', dtype='<U3')
"""
- if ((isinstance(obj, (type(None), str)) or isfileobj(obj)) and
+ if ((isinstance(obj, (type(None), str)) or hasattr(obj, 'readinto')) and
formats is None and dtype is None):
raise ValueError("Must define formats (or dtype) if object is "
"None, string, or an open file")
@@ -1078,7 +1078,7 @@ def array(obj, dtype=None, shape=None, offset=0, strides=None, formats=None,
new = new.copy()
return new
- elif isfileobj(obj):
+ elif hasattr(obj, 'readinto'):
return fromfile(obj, dtype=dtype, shape=shape, offset=offset)
elif isinstance(obj, ndarray):
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index aede12080..2ec5e1a64 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -9,7 +9,7 @@ from os.path import join
from numpy.distutils import log
from distutils.dep_util import newer
-from distutils.sysconfig import get_config_var
+from sysconfig import get_config_var
from numpy.compat import npy_load_module
from setup_common import * # noqa: F403
@@ -23,6 +23,11 @@ NPY_RELAXED_STRIDES_CHECKING = (os.environ.get('NPY_RELAXED_STRIDES_CHECKING', "
NPY_RELAXED_STRIDES_DEBUG = (os.environ.get('NPY_RELAXED_STRIDES_DEBUG', "0") != "0")
NPY_RELAXED_STRIDES_DEBUG = NPY_RELAXED_STRIDES_DEBUG and NPY_RELAXED_STRIDES_CHECKING
+# Set to True to use the new casting implementation as much as implemented.
+# Allows running the full test suit to exercise the new machinery until
+# it is used as default and the old version is eventually deleted.
+NPY_USE_NEW_CASTINGIMPL = os.environ.get('NPY_USE_NEW_CASTINGIMPL', "0") != "0"
+
# XXX: ugly, we use a class to avoid calling twice some expensive functions in
# config.h/numpyconfig.h. I don't see a better way because distutils force
# config.h generation inside an Extension class, and as such sharing
@@ -102,7 +107,7 @@ def win32_checks(deflist):
if a == "Intel" or a == "AMD64":
deflist.append('FORCE_NO_LONG_DOUBLE_FORMATTING')
-def check_math_capabilities(config, moredefs, mathlibs):
+def check_math_capabilities(config, ext, moredefs, mathlibs):
def check_func(func_name):
return config.check_func(func_name, libraries=mathlibs,
decl=True, call=True)
@@ -167,6 +172,14 @@ def check_math_capabilities(config, moredefs, mathlibs):
for dec, fn in OPTIONAL_FUNCTION_ATTRIBUTES:
if config.check_gcc_function_attribute(dec, fn):
moredefs.append((fname2def(fn), 1))
+ if fn == 'attribute_target_avx512f':
+ # GH-14787: Work around GCC<8.4 bug when compiling with AVX512
+ # support on Windows-based platforms
+ if (sys.platform in ('win32', 'cygwin') and
+ config.check_compiler_gcc() and
+ not config.check_gcc_version_at_least(8, 4)):
+ ext.extra_compile_args.extend(
+ ['-ffixed-xmm%s' % n for n in range(16, 32)])
for dec, fn, code, header in OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS:
if config.check_gcc_function_attribute_with_intrinsics(dec, fn, code,
@@ -434,7 +447,7 @@ def configuration(parent_package='',top_path=None):
mathlibs = check_mathlib(config_cmd)
moredefs.append(('MATHLIB', ','.join(mathlibs)))
- check_math_capabilities(config_cmd, moredefs, mathlibs)
+ check_math_capabilities(config_cmd, ext, moredefs, mathlibs)
moredefs.extend(cocache.check_ieee_macros(config_cmd)[0])
moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[0])
@@ -460,6 +473,10 @@ def configuration(parent_package='',top_path=None):
if NPY_RELAXED_STRIDES_DEBUG:
moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 1))
+ # Use the new experimental casting implementation in NumPy 1.20:
+ if NPY_USE_NEW_CASTINGIMPL:
+ moredefs.append(('NPY_USE_NEW_CASTINGIMPL', 1))
+
# Get long double representation
rep = check_long_double_representation(config_cmd)
moredefs.append(('HAVE_LDOUBLE_%s' % rep, 1))
@@ -618,6 +635,7 @@ def configuration(parent_package='',top_path=None):
config.add_include_dirs(join('src', 'multiarray'))
config.add_include_dirs(join('src', 'umath'))
config.add_include_dirs(join('src', 'npysort'))
+ config.add_include_dirs(join('src', '_simd'))
config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process
config.add_define_macros([("HAVE_NPY_CONFIG_H", "1")])
@@ -687,26 +705,6 @@ def configuration(parent_package='',top_path=None):
subst_dict)
#######################################################################
- # npysort library #
- #######################################################################
-
- # This library is created for the build but it is not installed
- npysort_sources = [join('src', 'common', 'npy_sort.h.src'),
- join('src', 'npysort', 'quicksort.c.src'),
- join('src', 'npysort', 'mergesort.c.src'),
- join('src', 'npysort', 'timsort.c.src'),
- join('src', 'npysort', 'heapsort.c.src'),
- join('src', 'npysort', 'radixsort.c.src'),
- join('src', 'common', 'npy_partition.h.src'),
- join('src', 'npysort', 'selection.c.src'),
- join('src', 'common', 'npy_binsearch.h.src'),
- join('src', 'npysort', 'binsearch.c.src'),
- ]
- config.add_library('npysort',
- sources=npysort_sources,
- include_dirs=[])
-
- #######################################################################
# multiarray_tests module #
#######################################################################
@@ -780,6 +778,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'arraytypes.h'),
join('src', 'multiarray', 'arrayfunction_override.h'),
join('src', 'multiarray', 'array_coercion.h'),
+ join('src', 'multiarray', 'array_method.h'),
join('src', 'multiarray', 'npy_buffer.h'),
join('src', 'multiarray', 'calculation.h'),
join('src', 'multiarray', 'common.h'),
@@ -790,9 +789,12 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'descriptor.h'),
join('src', 'multiarray', 'dtypemeta.h'),
join('src', 'multiarray', 'dragon4.h'),
+ join('src', 'multiarray', 'einsum_debug.h'),
+ join('src', 'multiarray', 'einsum_sumprod.h'),
join('src', 'multiarray', 'getset.h'),
join('src', 'multiarray', 'hashdescr.h'),
join('src', 'multiarray', 'iterators.h'),
+ join('src', 'multiarray', 'legacy_dtype_implementation.h'),
join('src', 'multiarray', 'mapping.h'),
join('src', 'multiarray', 'methods.h'),
join('src', 'multiarray', 'multiarraymodule.h'),
@@ -825,7 +827,7 @@ def configuration(parent_package='',top_path=None):
join('include', 'numpy', 'npy_1_7_deprecated_api.h'),
# add library sources as distuils does not consider libraries
# dependencies
- ] + npysort_sources + npymath_sources
+ ] + npymath_sources
multiarray_src = [
join('src', 'multiarray', 'abstractdtypes.c'),
@@ -833,6 +835,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'arrayobject.c'),
join('src', 'multiarray', 'arraytypes.c.src'),
join('src', 'multiarray', 'array_coercion.c'),
+ join('src', 'multiarray', 'array_method.c'),
join('src', 'multiarray', 'array_assign_scalar.c'),
join('src', 'multiarray', 'array_assign_array.c'),
join('src', 'multiarray', 'arrayfunction_override.c'),
@@ -853,11 +856,13 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'dragon4.c'),
join('src', 'multiarray', 'dtype_transfer.c'),
join('src', 'multiarray', 'einsum.c.src'),
+ join('src', 'multiarray', 'einsum_sumprod.c.src'),
join('src', 'multiarray', 'flagsobject.c'),
join('src', 'multiarray', 'getset.c'),
join('src', 'multiarray', 'hashdescr.c'),
join('src', 'multiarray', 'item_selection.c'),
join('src', 'multiarray', 'iterators.c'),
+ join('src', 'multiarray', 'legacy_dtype_implementation.c'),
join('src', 'multiarray', 'lowlevel_strided_loops.c.src'),
join('src', 'multiarray', 'mapping.c'),
join('src', 'multiarray', 'methods.c'),
@@ -877,6 +882,16 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'typeinfo.c'),
join('src', 'multiarray', 'usertypes.c'),
join('src', 'multiarray', 'vdot.c'),
+ join('src', 'common', 'npy_sort.h.src'),
+ join('src', 'npysort', 'quicksort.c.src'),
+ join('src', 'npysort', 'mergesort.c.src'),
+ join('src', 'npysort', 'timsort.c.src'),
+ join('src', 'npysort', 'heapsort.c.src'),
+ join('src', 'npysort', 'radixsort.c.src'),
+ join('src', 'common', 'npy_partition.h.src'),
+ join('src', 'npysort', 'selection.c.src'),
+ join('src', 'common', 'npy_binsearch.h.src'),
+ join('src', 'npysort', 'binsearch.c.src'),
]
#######################################################################
@@ -902,6 +917,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'loops.h.src'),
join('src', 'umath', 'loops.c.src'),
+ join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'clip.h.src'),
@@ -927,7 +943,7 @@ def configuration(parent_package='',top_path=None):
config.add_extension('_multiarray_umath',
sources=multiarray_src + umath_src +
- npymath_sources + common_src +
+ common_src +
[generate_config_h,
generate_numpyconfig_h,
generate_numpy_api,
@@ -938,7 +954,7 @@ def configuration(parent_package='',top_path=None):
],
depends=deps + multiarray_deps + umath_deps +
common_deps,
- libraries=['npymath', 'npysort'],
+ libraries=['npymath'],
extra_info=extra_info)
#######################################################################
@@ -973,6 +989,28 @@ def configuration(parent_package='',top_path=None):
config.add_extension('_operand_flag_tests',
sources=[join('src', 'umath', '_operand_flag_tests.c.src')])
+ #######################################################################
+ # SIMD module #
+ #######################################################################
+
+ config.add_extension('_simd', sources=[
+ join('src', 'common', 'npy_cpu_features.c.src'),
+ join('src', '_simd', '_simd.c'),
+ join('src', '_simd', '_simd_inc.h.src'),
+ join('src', '_simd', '_simd_data.inc.src'),
+ join('src', '_simd', '_simd.dispatch.c.src'),
+ ], depends=[
+ join('src', 'common', 'npy_cpu_dispatch.h'),
+ join('src', 'common', 'simd', 'simd.h'),
+ join('src', '_simd', '_simd.h'),
+ join('src', '_simd', '_simd_inc.h.src'),
+ join('src', '_simd', '_simd_data.inc.src'),
+ join('src', '_simd', '_simd_arg.inc'),
+ join('src', '_simd', '_simd_convert.inc'),
+ join('src', '_simd', '_simd_easyintrin.inc'),
+ join('src', '_simd', '_simd_vector.inc'),
+ ])
+
config.add_subpackage('tests')
config.add_data_dir('tests/data')
config.add_data_dir('tests/examples')
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 8c0149497..2d85e0718 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -40,7 +40,8 @@ C_ABI_VERSION = 0x01000009
# 0x0000000c - 1.14.x
# 0x0000000c - 1.15.x
# 0x0000000d - 1.16.x
-# 0x0000000e - 1.19.x
+# 0x0000000d - 1.19.x
+# 0x0000000e - 1.20.x
C_API_VERSION = 0x0000000e
class MismatchCAPIWarning(Warning):
@@ -50,7 +51,7 @@ def is_released(config):
"""Return True if a released version of numpy is detected."""
from distutils.version import LooseVersion
- v = config.get_version('../version.py')
+ v = config.get_version('../_version.py')
if v is None:
raise ValueError("Could not get version")
pv = LooseVersion(vstring=v).version
@@ -178,6 +179,9 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
# gcc 4.8.4 support attributes but not with intrisics
# tested via "#include<%s> int %s %s(void *){code; return 0;};" % (header, attribute, name, code)
# function name will be converted to HAVE_<upper-case-name> preprocessor macro
+# The _mm512_castps_si512 instruction is specific check for AVX-512F support
+# in gcc-4.9 which is missing a subset of intrinsics. See
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61878
OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2,fma")))',
'attribute_target_avx2_with_intrinsics',
'__m256 temp = _mm256_set1_ps(1.0); temp = \
@@ -185,11 +189,12 @@ OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2,fm
'immintrin.h'),
('__attribute__((target("avx512f")))',
'attribute_target_avx512f_with_intrinsics',
- '__m512 temp = _mm512_set1_ps(1.0)',
+ '__m512i temp = _mm512_castps_si512(_mm512_set1_ps(1.0))',
'immintrin.h'),
('__attribute__((target ("avx512f,avx512dq,avx512bw,avx512vl,avx512cd")))',
'attribute_target_avx512_skx_with_intrinsics',
'__mmask8 temp = _mm512_fpclass_pd_mask(_mm512_set1_pd(1.0), 0x01);\
+ __m512i temp = _mm512_castps_si512(_mm512_set1_ps(1.0));\
_mm_mask_storeu_epi8(NULL, 0xFF, _mm_broadcastmb_epi64(temp))',
'immintrin.h'),
]
diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py
index 7a76bbf9d..e4dc30d4c 100644
--- a/numpy/core/shape_base.py
+++ b/numpy/core/shape_base.py
@@ -539,7 +539,8 @@ def _accumulate(values):
def _concatenate_shapes(shapes, axis):
"""Given array shapes, return the resulting shape and slices prefixes.
- These help in nested concatation.
+ These help in nested concatenation.
+
Returns
-------
shape: tuple of int
diff --git a/numpy/core/shape_base.pyi b/numpy/core/shape_base.pyi
new file mode 100644
index 000000000..b20598b1a
--- /dev/null
+++ b/numpy/core/shape_base.pyi
@@ -0,0 +1,41 @@
+import sys
+from typing import TypeVar, overload, List, Sequence
+
+from numpy import ndarray
+from numpy.typing import ArrayLike
+
+if sys.version_info >= (3, 8):
+ from typing import SupportsIndex
+else:
+ from typing_extensions import Protocol
+ class SupportsIndex(Protocol):
+ def __index__(self) -> int: ...
+
+_ArrayType = TypeVar("_ArrayType", bound=ndarray)
+
+@overload
+def atleast_1d(__arys: ArrayLike) -> ndarray: ...
+@overload
+def atleast_1d(*arys: ArrayLike) -> List[ndarray]: ...
+
+@overload
+def atleast_2d(__arys: ArrayLike) -> ndarray: ...
+@overload
+def atleast_2d(*arys: ArrayLike) -> List[ndarray]: ...
+
+@overload
+def atleast_3d(__arys: ArrayLike) -> ndarray: ...
+@overload
+def atleast_3d(*arys: ArrayLike) -> List[ndarray]: ...
+
+def vstack(tup: Sequence[ArrayLike]) -> ndarray: ...
+def hstack(tup: Sequence[ArrayLike]) -> ndarray: ...
+@overload
+def stack(
+ arrays: Sequence[ArrayLike], axis: SupportsIndex = ..., out: None = ...
+) -> ndarray: ...
+@overload
+def stack(
+ arrays: Sequence[ArrayLike], axis: SupportsIndex = ..., out: _ArrayType = ...
+) -> _ArrayType: ...
+def block(arrays: ArrayLike) -> ndarray: ...
diff --git a/numpy/core/src/_simd/_simd.c b/numpy/core/src/_simd/_simd.c
new file mode 100644
index 000000000..b1fdd4478
--- /dev/null
+++ b/numpy/core/src/_simd/_simd.c
@@ -0,0 +1,73 @@
+#include "_simd.h"
+
+PyMODINIT_FUNC PyInit__simd(void)
+{
+ static struct PyModuleDef defs = {
+ .m_base = PyModuleDef_HEAD_INIT,
+ .m_name = "numpy.core._simd",
+ .m_size = -1
+ };
+ if (npy_cpu_init() < 0) {
+ return NULL;
+ }
+ PyObject *m = PyModule_Create(&defs);
+ if (m == NULL) {
+ return NULL;
+ }
+ PyObject *targets = PyDict_New();
+ if (targets == NULL) {
+ goto err;
+ }
+ if (PyModule_AddObject(m, "targets", targets) < 0) {
+ Py_DECREF(targets);
+ goto err;
+ }
+ // add keys for non-supported optimizations with None value
+ #define ATTACH_MODULE(TESTED_FEATURES, TARGET_NAME, MAKE_MSVC_HAPPY) \
+ { \
+ PyObject *simd_mod; \
+ if (!TESTED_FEATURES) { \
+ Py_INCREF(Py_None); \
+ simd_mod = Py_None; \
+ } else { \
+ simd_mod = NPY_CAT(simd_create_module_, TARGET_NAME)(); \
+ if (simd_mod == NULL) { \
+ goto err; \
+ } \
+ } \
+ const char *target_name = NPY_TOSTRING(TARGET_NAME); \
+ if (PyDict_SetItemString(targets, target_name, simd_mod) < 0) { \
+ Py_DECREF(simd_mod); \
+ goto err; \
+ } \
+ Py_INCREF(simd_mod); \
+ if (PyModule_AddObject(m, target_name, simd_mod) < 0) { \
+ Py_DECREF(simd_mod); \
+ goto err; \
+ } \
+ }
+
+ #define ATTACH_BASELINE_MODULE(MAKE_MSVC_HAPPY) \
+ { \
+ PyObject *simd_mod = simd_create_module(); \
+ if (simd_mod == NULL) { \
+ goto err; \
+ } \
+ if (PyDict_SetItemString(targets, "baseline", simd_mod) < 0) { \
+ Py_DECREF(simd_mod); \
+ goto err; \
+ } \
+ Py_INCREF(simd_mod); \
+ if (PyModule_AddObject(m, "baseline", simd_mod) < 0) { \
+ Py_DECREF(simd_mod); \
+ goto err; \
+ } \
+ }
+
+ NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY)
+ NPY__CPU_DISPATCH_BASELINE_CALL(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY)
+ return m;
+err:
+ Py_DECREF(m);
+ return NULL;
+}
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
new file mode 100644
index 000000000..e3dbcdece
--- /dev/null
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -0,0 +1,612 @@
+/*@targets $werror #simd_test*/
+#include "_simd.h"
+#include "_simd_inc.h"
+
+#if NPY_SIMD
+#include "_simd_data.inc"
+#include "_simd_convert.inc"
+#include "_simd_vector.inc"
+#include "_simd_arg.inc"
+#include "_simd_easyintrin.inc"
+
+//#########################################################################
+//## Defining NPYV intrinsics as module functions
+//#########################################################################
+/**begin repeat
+ * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
+ * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
+ * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
+ * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
+ * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
+ * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
+ * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
+ */
+#if @simd_sup@
+/***************************
+ * Memory
+ ***************************/
+/**begin repeat1
+ * # intrin = load, loada, loads, loadl#
+ */
+SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, q@sfx@)
+/**end repeat1**/
+/**begin repeat1
+ * # intrin = store, storea, stores, storel, storeh#
+ */
+// special definition due to the nature of @intrin@
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+ if (!PyArg_ParseTuple(
+ args, "O&O&:@intrin@_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &vec_arg
+ )) {
+ return NULL;
+ }
+ npyv_@intrin@_@sfx@(seq_arg.data.q@sfx@, vec_arg.data.v@sfx@);
+ // write-back
+ if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
+ simd_arg_free(&seq_arg);
+ return NULL;
+ }
+ simd_arg_free(&seq_arg);
+ Py_RETURN_NONE;
+}
+/**end repeat1**/
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if @ncont_sup@
+// Partial Load
+SIMD_IMPL_INTRIN_3(load_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@)
+SIMD_IMPL_INTRIN_2(load_tillz_@sfx@, v@sfx@, q@sfx@, u32)
+
+// Partial Store
+static PyObject *
+simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg nlane_arg = {.dtype = simd_data_u32};
+ simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+ if (!PyArg_ParseTuple(
+ args, "O&O&O&:store_till_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &nlane_arg,
+ simd_arg_converter, &vec_arg
+ )) {
+ return NULL;
+ }
+ npyv_store_till_@sfx@(
+ seq_arg.data.q@sfx@, nlane_arg.data.u32, vec_arg.data.v@sfx@
+ );
+ // write-back
+ if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
+ simd_arg_free(&seq_arg);
+ return NULL;
+ }
+ simd_arg_free(&seq_arg);
+ Py_RETURN_NONE;
+}
+
+// Non-contiguous Load
+/**begin repeat1
+ * #intrin = loadn, loadn_till, loadn_tillz#
+ * #till = 0, 1, 1#
+ * #fill = 0, 1, 0#
+ * #format = , O&O&, O&#
+ */
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg stride_arg = {.dtype = simd_data_s64};
+#if @till@
+ simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if @fill@
+ simd_arg fill_arg = {.dtype = simd_data_@sfx@};
+#endif
+ if (!PyArg_ParseTuple(
+ args, "@format@O&O&:@intrin@_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &stride_arg
+#if @till@
+ ,simd_arg_converter, &nlane_arg
+#endif
+#if @fill@
+ ,simd_arg_converter, &fill_arg
+#endif
+ )) {
+ return NULL;
+ }
+ npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@;
+ npy_intp stride = (npy_intp)stride_arg.data.s64;
+ Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+ Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
+ if (stride < 0) {
+ seq_ptr += cur_seq_len -1;
+ min_seq_len = -min_seq_len;
+ }
+ if (cur_seq_len < min_seq_len) {
+ PyErr_Format(PyExc_ValueError,
+ "@intrin@_@sfx@(), according to provided stride %d, the "
+ "minimum acceptable size of the required sequence is %d, given(%d)",
+ stride, min_seq_len, cur_seq_len
+ );
+ goto err;
+ }
+ npyv_@sfx@ rvec = npyv_@intrin@_@sfx@(
+ seq_ptr, stride
+ #if @till@
+ , nlane_arg.data.u32
+ #endif
+ #if @fill@
+ , fill_arg.data.@sfx@
+ #endif
+ );
+ simd_arg ret = {
+ .dtype = simd_data_v@sfx@, .data = {.v@sfx@=rvec}
+ };
+ simd_arg_free(&seq_arg);
+ return simd_arg_to_obj(&ret);
+err:
+ simd_arg_free(&seq_arg);
+ return NULL;
+}
+/**end repeat1**/
+
+// Non-contiguous Store
+/**begin repeat1
+ * #intrin = storen, storen_till#
+ * #till = 0, 1#
+ * #format = , O&#
+ */
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg stride_arg = {.dtype = simd_data_s64};
+ simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+#if @till@
+ simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+ if (!PyArg_ParseTuple(
+ args, "@format@O&O&O&:storen_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &stride_arg
+#if @till@
+ ,simd_arg_converter, &nlane_arg
+#endif
+ ,simd_arg_converter, &vec_arg
+ )) {
+ return NULL;
+ }
+ npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@;
+ npy_intp stride = (npy_intp)stride_arg.data.s64;
+ Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+ Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
+ if (stride < 0) {
+ seq_ptr += cur_seq_len -1;
+ min_seq_len = -min_seq_len;
+ }
+ // overflow guard
+ if (cur_seq_len < min_seq_len) {
+ PyErr_Format(PyExc_ValueError,
+            "@intrin@_@sfx@(), according to provided stride %d, the "
+ "minimum acceptable size of the required sequence is %d, given(%d)",
+ stride, min_seq_len, cur_seq_len
+ );
+ goto err;
+ }
+ npyv_@intrin@_@sfx@(
+ seq_ptr, stride
+ #if @till@
+ ,nlane_arg.data.u32
+ #endif
+ ,vec_arg.data.v@sfx@
+ );
+ // write-back
+ if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
+ goto err;
+ }
+ simd_arg_free(&seq_arg);
+ Py_RETURN_NONE;
+err:
+ simd_arg_free(&seq_arg);
+ return NULL;
+}
+/**end repeat1**/
+#endif // @ncont_sup@
+
+/***************************
+ * Misc
+ ***************************/
+SIMD_IMPL_INTRIN_0(zero_@sfx@, v@sfx@)
+SIMD_IMPL_INTRIN_1(setall_@sfx@, v@sfx@, @sfx@)
+SIMD_IMPL_INTRIN_3(select_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@)
+
+/**begin repeat1
+ * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ */
+#if @simd_sup2@
+SIMD_IMPL_INTRIN_1(reinterpret_@sfx_to@_@sfx@, v@sfx_to@, v@sfx@)
+#endif // simd_sup2
+/**end repeat1**/
+
+/**
+ * special definition due to the nature of intrinsics
+ * npyv_setf_@sfx@ and npy_set_@sfx@.
+*/
+/**begin repeat1
+ * #intrin = setf, set#
+ */
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ npyv_lanetype_@sfx@ *data = simd_sequence_from_iterable(args, simd_data_q@sfx@, npyv_nlanes_@sfx@);
+ if (data == NULL) {
+ return NULL;
+ }
+ simd_data r = {.v@sfx@ = npyv_@intrin@_@sfx@(
+ data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
+ data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15],
+ data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
+ data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
+ data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39],
+ data[40], data[41], data[42], data[43], data[44], data[45], data[46], data[47],
+ data[48], data[49], data[50], data[51], data[52], data[53], data[54], data[55],
+ data[56], data[57], data[58], data[59], data[60], data[61], data[62], data[63],
+ data[64] // for setf
+ )};
+ simd_sequence_free(data);
+ return (PyObject*)PySIMDVector_FromData(r, simd_data_v@sfx@);
+}
+/**end repeat1**/
+
+/***************************
+ * Reorder
+ ***************************/
+/**begin repeat1
+ * # intrin = combinel, combineh#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+/**begin repeat1
+ * # intrin = combine, zip#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+#if @rev64_sup@
+SIMD_IMPL_INTRIN_1(rev64_@sfx@, v@sfx@, v@sfx@)
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if @shl_imm@ > 0
+SIMD_IMPL_INTRIN_2(shl_@sfx@, v@sfx@, v@sfx@, u8)
+SIMD_IMPL_INTRIN_2(shr_@sfx@, v@sfx@, v@sfx@, u8)
+// immediate constant
+SIMD_IMPL_INTRIN_2IMM(shli_@sfx@, v@sfx@, v@sfx@, @shl_imm@)
+SIMD_IMPL_INTRIN_2IMM(shri_@sfx@, v@sfx@, v@sfx@, @shr_imm@)
+#endif // shl_imm
+
+/**begin repeat1
+ * #intrin = and, or, xor#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@)
+
+/**begin repeat1
+ * #intrin = cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_IMPL_INTRIN_1(cvt_@sfx@_@bsfx@, v@sfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_1(cvt_@bsfx@_@sfx@, v@bsfx@, v@sfx@)
+
+/***************************
+ * Arithmetic
+ ***************************/
+/**begin repeat1
+ * #intrin = add, sub#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+#if @sat_sup@
+/**begin repeat1
+ * #intrin = adds, subs#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+#endif // sat_sup
+
+#if @mul_sup@
+SIMD_IMPL_INTRIN_2(mul_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+#endif // mul_sup
+
+#if @div_sup@
+SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+#endif // div_sup
+
+#if @fused_sup@
+/**begin repeat1
+ * #intrin = muladd, mulsub, nmuladd, nmulsub#
+ */
+SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+#endif // fused_sup
+
+#if @sum_sup@
+SIMD_IMPL_INTRIN_1(sum_@sfx@, @sfx@, v@sfx@)
+#endif // sum_sup
+
+/***************************
+ * Math
+ ***************************/
+#if @fp_only@
+/**begin repeat1
+ * #intrin = sqrt, recip, abs, square#
+ */
+SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+#endif
+
+#endif // simd_sup
+/**end repeat**/
+/*************************************************************************
+ * Variant
+ ************************************************************************/
+SIMD_IMPL_INTRIN_0N(cleanup)
+/*************************************************************************
+ * A special section for boolean intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Conversions
+ ***************************/
+// Convert mask vector to integer bitfield
+/**begin repeat
+ * #bsfx = b8, b16, b32, b64#
+ */
+SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
+/**end repeat**/
+
+//#########################################################################
+//## Attach module functions
+//#########################################################################
+static PyMethodDef simd__intrinsics_methods[] = {
+/**begin repeat
+ * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
+ * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
+ * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
+ * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
+ * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
+ * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
+ * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
+ */
+#if @simd_sup@
+
+/***************************
+ * Memory
+ ***************************/
+/**begin repeat1
+ * # intrin = load, loada, loads, loadl, store, storea, stores, storel, storeh#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if @ncont_sup@
+/**begin repeat1
+ * #intrin = load_till, load_tillz, loadn, loadn_till, loadn_tillz,
+ * store_till, storen, storen_till#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // ncont_sup
+
+/***************************
+ * Misc
+ ***************************/
+/**begin repeat1
+ * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ */
+#if @simd_sup2@
+SIMD_INTRIN_DEF(reinterpret_@sfx_to@_@sfx@)
+#endif // simd_sup2
+/**end repeat1**/
+
+/**begin repeat1
+ * # intrin = set, setf, setall, zero, select#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Reorder
+ ***************************/
+/**begin repeat1
+ * # intrin = combinel, combineh, combine, zip#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+#if @rev64_sup@
+SIMD_INTRIN_DEF(rev64_@sfx@)
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if @shl_imm@ > 0
+/**begin repeat1
+ * # intrin = shl, shr, shli, shri#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // shl_imm
+
+/**begin repeat1
+ * #intrin = and, or, xor, not, cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@)
+SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@)
+
+/***************************
+ * Arithmetic
+ ***************************/
+/**begin repeat1
+ * #intrin = add, sub#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+#if @sat_sup@
+/**begin repeat1
+ * #intrin = adds, subs#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // sat_sup
+
+#if @mul_sup@
+SIMD_INTRIN_DEF(mul_@sfx@)
+#endif // mul_sup
+
+#if @div_sup@
+SIMD_INTRIN_DEF(div_@sfx@)
+#endif // div_sup
+
+#if @fused_sup@
+/**begin repeat1
+ * #intrin = muladd, mulsub, nmuladd, nmulsub#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // fused_sup
+
+#if @sum_sup@
+SIMD_INTRIN_DEF(sum_@sfx@)
+#endif // sum_sup
+
+/***************************
+ * Math
+ ***************************/
+#if @fp_only@
+/**begin repeat1
+ * #intrin = sqrt, recip, abs, square#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif
+#endif // simd_sup
+/**end repeat**/
+/*************************************************************************
+ * Variant
+ ************************************************************************/
+SIMD_INTRIN_DEF(cleanup)
+
+/*************************************************************************
+ * A special section for boolean intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Conversions
+ ***************************/
+// Convert mask vector to integer bitfield
+/**begin repeat
+ * #bsfx = b8, b16, b32, b64#
+ */
+SIMD_INTRIN_DEF(tobits_@bsfx@)
+/**end repeat**/
+
+/************************************************************************/
+{NULL, NULL, 0, NULL}
+}; // PyMethodDef
+
+#endif // NPY_SIMD
+
+//#########################################################################
+//## Defining a separate module for each target
+//#########################################################################
+NPY_VISIBILITY_HIDDEN PyObject *
+NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
+{
+ static struct PyModuleDef defs = {
+ .m_base = PyModuleDef_HEAD_INIT,
+ .m_size = -1,
+ #ifdef NPY__CPU_TARGET_CURRENT
+ .m_name = "numpy.core._simd." NPY_TOSTRING(NPY__CPU_TARGET_CURRENT),
+ #else
+ .m_name = "numpy.core._simd.baseline",
+ #endif
+ #if NPY_SIMD
+ .m_methods = simd__intrinsics_methods
+ #else
+ .m_methods = NULL
+ #endif
+ };
+ PyObject *m = PyModule_Create(&defs);
+ if (m == NULL) {
+ return NULL;
+ }
+ if (PyModule_AddIntConstant(m, "simd", NPY_SIMD)) {
+ goto err;
+ }
+ if (PyModule_AddIntConstant(m, "simd_f64", NPY_SIMD_F64)) {
+ goto err;
+ }
+ if (PyModule_AddIntConstant(m, "simd_width", NPY_SIMD_WIDTH)) {
+ goto err;
+ }
+#if NPY_SIMD
+ if (PySIMDVectorType_Init(m)) {
+ goto err;
+ }
+ /**begin repeat
+ * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ */
+ if (PyModule_AddIntConstant(m, "nlanes_@sfx@", npyv_nlanes_@sfx@)) {
+ goto err;
+ }
+ /**end repeat**/
+#endif // NPY_SIMD
+ return m;
+err:
+ Py_DECREF(m);
+ return NULL;
+}
diff --git a/numpy/core/src/_simd/_simd.h b/numpy/core/src/_simd/_simd.h
new file mode 100644
index 000000000..d9905c801
--- /dev/null
+++ b/numpy/core/src/_simd/_simd.h
@@ -0,0 +1,30 @@
+/**
+ * A module to expose the NumPy C SIMD vectorization interface "NPYV" for testing purposes.
+ *
+ * Please keep this module independent from other c-extension modules,
+ * since NPYV intrinsics may be involved in their functionality,
+ * which increases the degree of complexity in tracking and detecting errors.
+ *
+ * TODO: Add an independent sphinx doc.
+ *
+ * Please add any new NPYV intrinsics in '_simd.dispatch.c.src'.
+ */
+#ifndef _SIMD_SIMD_H_
+#define _SIMD_SIMD_H_
+
+#include <Python.h>
+#include "numpy/npy_common.h"
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+// autogenerated, required for CPU dispatch macros
+#include "_simd.dispatch.h"
+#endif
+/**
+ * Create a new module for each required optimization which contains all NPYV intrinsics,
+ *
+ * If required optimization is not supported by NPYV, the module will still provide
+ * access to NPYV constants NPY_SIMD, NPY_SIMD_F64, and NPY_SIMD_WIDTH but without
+ * any intrinsics.
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_VISIBILITY_HIDDEN PyObject *simd_create_module, (void))
+#endif // _SIMD_SIMD_H_
diff --git a/numpy/core/src/_simd/_simd_arg.inc b/numpy/core/src/_simd/_simd_arg.inc
new file mode 100644
index 000000000..f5bcf5487
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_arg.inc
@@ -0,0 +1,85 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+/************************************
+ ** Protected Definitions
+ ************************************/
+static int
+simd_arg_from_obj(PyObject *obj, simd_arg *arg)
+{
+ assert(arg->dtype != 0);
+ const simd_data_info *info = simd_data_getinfo(arg->dtype);
+ if (info->is_scalar) {
+ arg->data = simd_scalar_from_number(obj, arg->dtype);
+ }
+ else if (info->is_sequence) {
+ unsigned min_seq_size = simd_data_getinfo(info->to_vector)->nlanes;
+ arg->data.qu8 = simd_sequence_from_iterable(obj, arg->dtype, min_seq_size);
+ }
+ else if (info->is_vectorx) {
+ arg->data = simd_vectorx_from_tuple(obj, arg->dtype);
+ }
+ else if (info->is_vector) {
+ arg->data = PySIMDVector_AsData((PySIMDVectorObject*)obj, arg->dtype);
+ } else {
+ arg->data.u64 = 0;
+ PyErr_Format(PyExc_RuntimeError,
+ "unhandled arg from obj type id:%d, name:%s", arg->dtype, info->pyname
+ );
+ return -1;
+ }
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+ return 0;
+}
+
+static PyObject *
+simd_arg_to_obj(const simd_arg *arg)
+{
+ assert(arg->dtype != 0);
+ const simd_data_info *info = simd_data_getinfo(arg->dtype);
+ if (info->is_scalar) {
+ return simd_scalar_to_number(arg->data, arg->dtype);
+ }
+ if (info->is_sequence) {
+ return simd_sequence_to_list(arg->data.qu8, arg->dtype);
+ }
+ if (info->is_vectorx) {
+ return simd_vectorx_to_tuple(arg->data, arg->dtype);
+ }
+ if (info->is_vector) {
+ return (PyObject*)PySIMDVector_FromData(arg->data, arg->dtype);
+ }
+ PyErr_Format(PyExc_RuntimeError,
+ "unhandled arg to object type id:%d, name:%s", arg->dtype, info->pyname
+ );
+ return NULL;
+}
+
+static void
+simd_arg_free(simd_arg *arg)
+{
+ const simd_data_info *info = simd_data_getinfo(arg->dtype);
+ if (info->is_sequence) {
+ simd_sequence_free(arg->data.qu8);
+ }
+}
+
+static int
+simd_arg_converter(PyObject *obj, simd_arg *arg)
+{
+ if (obj != NULL) {
+ if (simd_arg_from_obj(obj, arg) < 0) {
+ return 0;
+ }
+ arg->obj = obj;
+ return Py_CLEANUP_SUPPORTED;
+ } else {
+ simd_arg_free(arg);
+ }
+ return 1;
+}
diff --git a/numpy/core/src/_simd/_simd_convert.inc b/numpy/core/src/_simd/_simd_convert.inc
new file mode 100644
index 000000000..73869ef1f
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_convert.inc
@@ -0,0 +1,210 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+/************************************
+ ** Protected Definitions
+ ************************************/
+static simd_data
+simd_scalar_from_number(PyObject *obj, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_scalar && info->lane_size > 0);
+ simd_data data;
+ if (info->is_float) {
+ data.f64 = PyFloat_AsDouble(obj);
+ if (dtype == simd_data_f32){
+ data.f32 = (float)data.f64;
+ }
+ } else {
+ data.u64 = PyLong_AsUnsignedLongLongMask(obj);
+ }
+ return data;
+}
+
+static PyObject *
+simd_scalar_to_number(simd_data data, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_scalar && info->lane_size > 0);
+ if (info->is_float) {
+ if (dtype == simd_data_f32) {
+ return PyFloat_FromDouble(data.f32);
+ }
+ return PyFloat_FromDouble(data.f64);
+ }
+ int leftb = (sizeof(npyv_lanetype_u64) - info->lane_size) * 8;
+ data.u64 <<= leftb;
+ if (info->is_signed) {
+ return PyLong_FromLongLong(data.s64 >> leftb);
+ }
+ return PyLong_FromUnsignedLongLong(data.u64 >> leftb);
+}
+
+typedef struct {
+ Py_ssize_t len;
+ void *ptr;
+} simd__alloc_data;
+
+static void *
+simd_sequence_new(Py_ssize_t len, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(len > 0 && info->is_sequence && info->lane_size > 0);
+ size_t size = sizeof(simd__alloc_data) + len * info->lane_size + NPY_SIMD_WIDTH;
+ void *ptr = malloc(size);
+ if (ptr == NULL) {
+ return PyErr_NoMemory();
+ }
+ // align the pointer
+ simd__alloc_data *a_ptr = (simd__alloc_data *)(
+ ((uintptr_t)ptr + sizeof(simd__alloc_data) + NPY_SIMD_WIDTH) & ~(uintptr_t)(NPY_SIMD_WIDTH-1)
+ );
+ a_ptr[-1].len = len;
+ a_ptr[-1].ptr = ptr;
+ return a_ptr;
+}
+
+static Py_ssize_t
+simd_sequence_len(void const *ptr)
+{
+ return ((simd__alloc_data const*)ptr)[-1].len;
+}
+
+static void
+simd_sequence_free(void *ptr)
+{
+ free(((simd__alloc_data *)ptr)[-1].ptr);
+}
+
+static void *
+simd_sequence_from_iterable(PyObject *obj, simd_data_type dtype, Py_ssize_t min_size)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_sequence && info->lane_size > 0);
+ PyObject *seq_obj = PySequence_Fast(obj, "expected a sequence");
+ if (seq_obj == NULL) {
+ return NULL;
+ }
+ Py_ssize_t seq_size = PySequence_Fast_GET_SIZE(seq_obj);
+ if (seq_size < min_size) {
+ PyErr_Format(PyExc_ValueError,
+ "minimum acceptable size of the required sequence is %d, given(%d)",
+ min_size, seq_size
+ );
+ return NULL;
+ }
+ npyv_lanetype_u8 *dst = simd_sequence_new(seq_size, dtype);
+ if (dst == NULL) {
+ return NULL;
+ }
+ PyObject **seq_items = PySequence_Fast_ITEMS(seq_obj);
+ for (Py_ssize_t i = 0; i < seq_size; ++i) {
+ simd_data data = simd_scalar_from_number(seq_items[i], info->to_scalar);
+ npyv_lanetype_u8 *sdst = dst + i * info->lane_size;
+ memcpy(sdst, &data.u64, info->lane_size);
+ }
+ Py_DECREF(seq_obj);
+
+ if (PyErr_Occurred()) {
+ simd_sequence_free(dst);
+ return NULL;
+ }
+ return dst;
+}
+
+static int
+simd_sequence_fill_iterable(PyObject *obj, const void *ptr, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ if (!PySequence_Check(obj)) {
+ PyErr_Format(PyExc_TypeError,
+ "a sequence object is required to fill %s", info->pyname
+ );
+ return -1;
+ }
+ const npyv_lanetype_u8 *src = ptr;
+ Py_ssize_t seq_len = simd_sequence_len(ptr);
+ for (Py_ssize_t i = 0; i < seq_len; ++i) {
+ const npyv_lanetype_u8 *ssrc = src + i * info->lane_size;
+ simd_data data;
+ memcpy(&data.u64, ssrc, info->lane_size);
+ PyObject *item = simd_scalar_to_number(data, info->to_scalar);
+ if (item == NULL) {
+ return -1;
+ }
+ int res = PySequence_SetItem(obj, i, item);
+ Py_DECREF(item);
+ if (res < 0) {
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static PyObject *
+simd_sequence_to_list(const void *ptr, simd_data_type dtype)
+{
+ PyObject *list = PyList_New(simd_sequence_len(ptr));
+ if (list == NULL) {
+ return NULL;
+ }
+ if (simd_sequence_fill_iterable(list, ptr, dtype) < 0) {
+ Py_DECREF(list);
+ return NULL;
+ }
+ return list;
+}
+
+static simd_data
+simd_vectorx_from_tuple(PyObject *obj, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ // NPYV currently only supports x2 and x3
+ assert(info->is_vectorx > 1 && info->is_vectorx < 4);
+
+ simd_data data = {.u64 = 0};
+ if (!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != info->is_vectorx) {
+ PyErr_Format(PyExc_TypeError,
+ "a tuple of %d vector type %s is required",
+ info->is_vectorx, simd_data_getinfo(info->to_vector)->pyname
+ );
+ return data;
+ }
+ for (int i = 0; i < info->is_vectorx; ++i) {
+ PyObject *item = PyTuple_GET_ITEM(obj, i);
+ // get the max multi-vec and let the compiler do the rest
+ data.vu64x3.val[i] = PySIMDVector_AsData((PySIMDVectorObject*)item, info->to_vector).vu64;
+ if (PyErr_Occurred()) {
+ return data;
+ }
+ }
+ return data;
+}
+
+static PyObject *
+simd_vectorx_to_tuple(simd_data data, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ // NPYV currently only supports x2 and x3
+ assert(info->is_vectorx > 1 && info->is_vectorx < 4);
+
+ PyObject *tuple = PyTuple_New(info->is_vectorx);
+ if (tuple == NULL) {
+ return NULL;
+ }
+ for (int i = 0; i < info->is_vectorx; ++i) {
+ // get the max multi-vector and let the compiler handle the rest
+ simd_data vdata = {.vu64 = data.vu64x3.val[i]};
+ PyObject *item = (PyObject*)PySIMDVector_FromData(vdata, info->to_vector);
+ if (item == NULL) {
+ // TODO: improve log add item number
+ Py_DECREF(tuple);
+ return NULL;
+ }
+ PyTuple_SET_ITEM(tuple, i, item);
+ }
+ return tuple;
+}
diff --git a/numpy/core/src/_simd/_simd_data.inc.src b/numpy/core/src/_simd/_simd_data.inc.src
new file mode 100644
index 000000000..5c796487c
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_data.inc.src
@@ -0,0 +1,93 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+/************************************
+ ** Private Definitions
+ ************************************/
+static simd_data_info simd__data_registry[simd_data_end] =
+{
+ [simd_data_none] = {.pyname="none"},
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ * #name = int*8, float, float#
+ */
+ [simd_data_@sfx@] = {
+ .pyname="@name@", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_scalar=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // sequences
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ * #name = int*8, float, float#
+ */
+ [simd_data_q@sfx@] = {
+ .pyname="[@name@]", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_sequence=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // vectors
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ */
+ [simd_data_v@sfx@] = {
+ .pyname="npyv_@sfx@", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_vector=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // boolean vectors, treated as unsigned and converted internally
+ // to add compatibility among all SIMD extensions
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64#
+ * #bsfx = b8, b16, b32, b64#
+ */
+ [simd_data_v@bsfx@] = {
+ .pyname="npyv_@bsfx@", .is_bool=1, .is_vector=1,
+ .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // multi-vectors x2
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ */
+ [simd_data_v@sfx@x2] = {
+ .pyname="npyv_@sfx@x2", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_vectorx=2, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=2, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // multi-vectors x3
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ */
+ [simd_data_v@sfx@x3] = {
+ .pyname="npyv_@sfx@x3", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_vectorx=3, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=3, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+};
+
+/************************************
+ ** Protected Definitions
+ ************************************/
+static const simd_data_info *
+simd_data_getinfo(simd_data_type dtype)
+{ return &simd__data_registry[dtype]; }
diff --git a/numpy/core/src/_simd/_simd_easyintrin.inc b/numpy/core/src/_simd/_simd_easyintrin.inc
new file mode 100644
index 000000000..54e7ccf01
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_easyintrin.inc
@@ -0,0 +1,214 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+#define SIMD_INTRIN_DEF(NAME) \
+ { NPY_TOSTRING(NAME), simd__intrin_##NAME, METH_VARARGS, NULL } , // comma
+
+#define SIMD_IMPL_INTRIN_0(NAME, RET) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ if (!PyArg_ParseTuple( \
+ args, ":" NPY_TOSTRING(NAME)) \
+ ) return NULL; \
+ simd_arg a = { \
+ .dtype = simd_data_##RET, \
+ .data = {.RET = npyv_##NAME()}, \
+ }; \
+ return simd_arg_to_obj(&a); \
+ }
+
+#define SIMD_IMPL_INTRIN_0N(NAME) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ if (!PyArg_ParseTuple( \
+ args, ":" NPY_TOSTRING(NAME)) \
+ ) return NULL; \
+ npyv_##NAME(); \
+ Py_RETURN_NONE; \
+ }
+
+#define SIMD_IMPL_INTRIN_1(NAME, RET, IN0) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg = {.dtype = simd_data_##IN0}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg \
+ )) return NULL; \
+ simd_data data = {.RET = npyv_##NAME( \
+ arg.data.IN0 \
+ )}; \
+ simd_arg_free(&arg); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+
+#define SIMD_IMPL_INTRIN_2(NAME, RET, IN0, IN1) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg1 = {.dtype = simd_data_##IN0}; \
+ simd_arg arg2 = {.dtype = simd_data_##IN1}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg1, \
+ simd_arg_converter, &arg2 \
+ )) return NULL; \
+ simd_data data = {.RET = npyv_##NAME( \
+ arg1.data.IN0, arg2.data.IN1 \
+ )}; \
+ simd_arg_free(&arg1); \
+ simd_arg_free(&arg2); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+
+#define SIMD__REPEAT_2IMM(C, NAME, IN0) \
+ C == arg2.data.u8 ? NPY_CAT(npyv_, NAME)(arg1.data.IN0, C) :
+
+#define SIMD_IMPL_INTRIN_2IMM(NAME, RET, IN0, CONST_RNG) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg1 = {.dtype = simd_data_##IN0}; \
+ simd_arg arg2 = {.dtype = simd_data_u8}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg1, \
+ simd_arg_converter, &arg2 \
+ )) return NULL; \
+ simd_data data; \
+ data.RET = NPY_CAT(SIMD__IMPL_COUNT_, CONST_RNG)( \
+ SIMD__REPEAT_2IMM, NAME, IN0 \
+ ) npyv_##NAME(arg1.data.IN0, 0); \
+ simd_arg_free(&arg1); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+
+#define SIMD_IMPL_INTRIN_3(NAME, RET, IN0, IN1, IN2) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg1 = {.dtype = simd_data_##IN0}; \
+ simd_arg arg2 = {.dtype = simd_data_##IN1}; \
+ simd_arg arg3 = {.dtype = simd_data_##IN2}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&O&O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg1, \
+ simd_arg_converter, &arg2, \
+ simd_arg_converter, &arg3 \
+ )) return NULL; \
+ simd_data data = {.RET = npyv_##NAME( \
+ arg1.data.IN0, arg2.data.IN1, \
+ arg3.data.IN2 \
+ )}; \
+ simd_arg_free(&arg1); \
+ simd_arg_free(&arg2); \
+ simd_arg_free(&arg3); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+/**
+ * Helper macros for repeating and expanding a certain macro.
+ * Mainly used for converting a scalar to an immediate constant.
+ */
+#define SIMD__IMPL_COUNT_7(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_8(FN, ...) \
+ SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(8, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_15(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_16(FN, ...) \
+ SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(16, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_31(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_32(FN, ...) \
+ SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(32, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_47(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_48(FN, ...) \
+ SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(48, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_63(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_64(FN, ...) \
+ SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(64, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_7_(FN, ...) \
+ NPY_EXPAND(FN(1, __VA_ARGS__)) \
+ NPY_EXPAND(FN(2, __VA_ARGS__)) NPY_EXPAND(FN(3, __VA_ARGS__)) \
+ NPY_EXPAND(FN(4, __VA_ARGS__)) NPY_EXPAND(FN(5, __VA_ARGS__)) \
+ NPY_EXPAND(FN(6, __VA_ARGS__)) NPY_EXPAND(FN(7, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_15_(FN, ...) \
+ SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(8, __VA_ARGS__)) NPY_EXPAND(FN(9, __VA_ARGS__)) \
+ NPY_EXPAND(FN(10, __VA_ARGS__)) NPY_EXPAND(FN(11, __VA_ARGS__)) \
+ NPY_EXPAND(FN(12, __VA_ARGS__)) NPY_EXPAND(FN(13, __VA_ARGS__)) \
+ NPY_EXPAND(FN(14, __VA_ARGS__)) NPY_EXPAND(FN(15, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_31_(FN, ...) \
+ SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(16, __VA_ARGS__)) NPY_EXPAND(FN(17, __VA_ARGS__)) \
+ NPY_EXPAND(FN(18, __VA_ARGS__)) NPY_EXPAND(FN(19, __VA_ARGS__)) \
+ NPY_EXPAND(FN(20, __VA_ARGS__)) NPY_EXPAND(FN(21, __VA_ARGS__)) \
+ NPY_EXPAND(FN(22, __VA_ARGS__)) NPY_EXPAND(FN(23, __VA_ARGS__)) \
+ NPY_EXPAND(FN(24, __VA_ARGS__)) NPY_EXPAND(FN(25, __VA_ARGS__)) \
+ NPY_EXPAND(FN(26, __VA_ARGS__)) NPY_EXPAND(FN(27, __VA_ARGS__)) \
+ NPY_EXPAND(FN(28, __VA_ARGS__)) NPY_EXPAND(FN(29, __VA_ARGS__)) \
+ NPY_EXPAND(FN(30, __VA_ARGS__)) NPY_EXPAND(FN(31, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_47_(FN, ...) \
+ SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(32, __VA_ARGS__)) NPY_EXPAND(FN(33, __VA_ARGS__)) \
+ NPY_EXPAND(FN(34, __VA_ARGS__)) NPY_EXPAND(FN(35, __VA_ARGS__)) \
+ NPY_EXPAND(FN(36, __VA_ARGS__)) NPY_EXPAND(FN(37, __VA_ARGS__)) \
+ NPY_EXPAND(FN(38, __VA_ARGS__)) NPY_EXPAND(FN(39, __VA_ARGS__)) \
+ NPY_EXPAND(FN(40, __VA_ARGS__)) NPY_EXPAND(FN(41, __VA_ARGS__)) \
+ NPY_EXPAND(FN(42, __VA_ARGS__)) NPY_EXPAND(FN(43, __VA_ARGS__)) \
+ NPY_EXPAND(FN(44, __VA_ARGS__)) NPY_EXPAND(FN(45, __VA_ARGS__)) \
+ NPY_EXPAND(FN(46, __VA_ARGS__)) NPY_EXPAND(FN(47, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_63_(FN, ...) \
+ SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(48, __VA_ARGS__)) NPY_EXPAND(FN(49, __VA_ARGS__)) \
+ NPY_EXPAND(FN(50, __VA_ARGS__)) NPY_EXPAND(FN(51, __VA_ARGS__)) \
+ NPY_EXPAND(FN(52, __VA_ARGS__)) NPY_EXPAND(FN(53, __VA_ARGS__)) \
+ NPY_EXPAND(FN(54, __VA_ARGS__)) NPY_EXPAND(FN(55, __VA_ARGS__)) \
+ NPY_EXPAND(FN(56, __VA_ARGS__)) NPY_EXPAND(FN(57, __VA_ARGS__)) \
+ NPY_EXPAND(FN(58, __VA_ARGS__)) NPY_EXPAND(FN(59, __VA_ARGS__)) \
+ NPY_EXPAND(FN(60, __VA_ARGS__)) NPY_EXPAND(FN(61, __VA_ARGS__)) \
+ NPY_EXPAND(FN(62, __VA_ARGS__)) NPY_EXPAND(FN(63, __VA_ARGS__))
diff --git a/numpy/core/src/_simd/_simd_inc.h.src b/numpy/core/src/_simd/_simd_inc.h.src
new file mode 100644
index 000000000..9858fc0dc
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_inc.h.src
@@ -0,0 +1,421 @@
+#ifndef _SIMD_SIMD_INC_H_
+#define _SIMD_SIMD_INC_H_
+
+#include <Python.h>
+#include "simd/simd.h"
+
+#if NPY_SIMD
+/************************************
+ ** Types
+ ************************************/
+/**
+ * Gather all data types supported by the module.
+*/
+typedef union
+{
+ // scalars
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ npyv_lanetype_@sfx@ @sfx@;
+ /**end repeat**/
+ // sequence
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ npyv_lanetype_@sfx@ *q@sfx@;
+ /**end repeat**/
+ // vectors
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, b8, b16, b32, b64#
+ */
+ npyv_@sfx@ v@sfx@;
+ /**end repeat**/
+ // multi-vectors x2
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32#
+ */
+ npyv_@sfx@x2 v@sfx@x2;
+ /**end repeat**/
+ // multi-vectors x3
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32#
+ */
+ npyv_@sfx@x3 v@sfx@x3;
+ /**end repeat**/
+#if NPY_SIMD_F64
+ npyv_f64 vf64;
+ npyv_f64x2 vf64x2;
+ npyv_f64x3 vf64x3;
+#endif
+} simd_data;
+
+/**
+ * Data types IDs and suffixes. Must be same data types as the ones
+ * in union 'simd_data' to fit the macros in '_simd_easyintrin.inc'.
+*/
+typedef enum
+{
+ simd_data_none = 0,
+ // scalars
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_@sfx@,
+ /**end repeat**/
+ // sequences
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_q@sfx@,
+ /**end repeat**/
+ // vectors
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64, b8, b16, b32, b64#
+ */
+ simd_data_v@sfx@,
+ /**end repeat**/
+ // multi-vectors x2
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_v@sfx@x2,
+ /**end repeat**/
+ // multi-vectors x3
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_v@sfx@x3,
+ /**end repeat**/
+ simd_data_end,
+} simd_data_type;
+/************************************
+ ** Declarations (inc_data)
+ ************************************/
+/**
+ * simd_data_type information
+ */
+typedef struct
+{
+ // type name compatible with python style
+ const char *pyname;
+    // returns '1' if the type represents an unsigned integer
+ int is_unsigned:1;
+    // returns '1' if the type represents a signed integer
+ int is_signed:1;
+    // returns '1' if the type represents single or double precision
+ int is_float:1;
+    // returns '1' if the type represents a boolean
+ int is_bool:1;
+    // returns '1' if the type represents a sequence
+ int is_sequence:1;
+    // returns '1' if the type represents a scalar
+ int is_scalar:1;
+    // returns '1' if the type represents a vector
+ int is_vector:1;
+    // returns the length of the multi-vector if the type represents an x2 or x3 vector
+ // otherwise returns 0, e.g. returns 2 if data type is simd_data_vu8x2
+ int is_vectorx;
+ // returns the equivalent scalar data type e.g. simd_data_vu8 -> simd_data_u8
+ simd_data_type to_scalar;
+    // returns the equivalent vector data type e.g. simd_data_s8 -> simd_data_vs8
+    // NOTE: returns the equivalent "unsigned" vector type in case of a "boolean" vector
+    // e.g. simd_data_vb8 -> simd_data_vu8
+ simd_data_type to_vector;
+ // number of vector lanes
+ int nlanes;
+ // sizeof lane type
+ int lane_size;
+} simd_data_info;
+
+/**
+ * Returns data info of certain dtype.
+ *
+ * Example:
+ ** const simd_data_info *info = simd_data_getinfo(simd_data_vu8);
+ ** if (info->is_vector && info->is_unsigned) {
+ ** ...
+ ** }
+ */
+static const simd_data_info *
+simd_data_getinfo(simd_data_type dtype);
+
+/************************************
+ ** Declarations (inc_vector)
+ ************************************/
+typedef struct
+{
+ PyObject_HEAD
+ // vector type id
+ simd_data_type dtype;
+ // vector data, aligned for safe casting
+ npyv_lanetype_u8 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) data[NPY_SIMD_WIDTH];
+} PySIMDVectorObject;
+/**
+ * Create a Python obj(PySIMDVectorObject) from a NPYV vector based on the contents
+ * of `data`(simd_data) and according to the vector data type `dtype`
+ * on range(simd_data_[vu8:vf64]).
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ *
+ * Example:
+ ** simd_data data = {.vu8 = npyv_setall_u8(0xff)};
+ ** PySIMDVectorObject *obj = PySIMDVector_FromData(data, simd_data_vu8);
+ ** if (obj != NULL) {
+ **    printf("I have a valid vector obj and first element is %d\n", obj->data[0]);
+ ** Py_DECREF(obj);
+ ** }
+ */
+static PySIMDVectorObject *
+PySIMDVector_FromData(simd_data data, simd_data_type dtype);
+/**
+ * Return a NPYV vector(simd_data) representation of `obj`(PySIMDVectorObject) and
+ * according to the vector data type `dtype` on range (simd_data_[vu8:vf64]).
+ * Raise a Python exception on failure.
+ *
+ * Example:
+ ** simd_data data = PySIMDVector_AsData(vec_obj, simd_data_vf32);
+ ** if (!PyErr_Occurred()) {
+ ** npyv_f32 add_1 = npyv_add_f32(data.vf32, npyv_setall_f32(1));
+ ** ...
+ ** }
+ */
+static simd_data
+PySIMDVector_AsData(PySIMDVectorObject *obj, simd_data_type dtype);
+/**
+ * Initialize and register PySIMDVectorType on a certain PyModule;
+ * PySIMDVectorType can be reached through attribute 'vector_type'.
+ * return -1 on error, 0 on success.
+ */
+static int
+PySIMDVectorType_Init(PyObject *module);
+
+/************************************
+ ** Declarations (inc_convert)
+ ************************************/
+/**
+ * Return a C scalar(simd_data) representation of `obj` and
+ * according to the scalar data type `dtype` on range (simd_data_[u8:f64]).
+ * Raise a Python exception on failure.
+ *
+ * Example:
+ ** simd_data data = simd_scalar_from_number(obj, simd_data_f32);
+ ** if (!PyErr_Occurred()) {
+ **     printf("I have a valid float %f\n", data.f32);
+ ** }
+ */
+static simd_data
+simd_scalar_from_number(PyObject *obj, simd_data_type dtype);
+/**
+ * Create a Python scalar from a C scalar based on the contents
+ * of `data`(simd_data) and according to the scalar data type `dtype`
+ * on range(simd_data_[u8:f64]).
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ *
+ * Example:
+ ** simd_data data = {.u32 = 0x7fffffff};
+ ** PyObject *obj = simd_scalar_to_number(data, simd_data_s32);
+ ** if (obj != NULL) {
+ ** printf("I have a valid Python integer %d\n", PyLong_AsLong(obj));
+ ** Py_DECREF(obj);
+ ** }
+ */
+static PyObject *
+simd_scalar_to_number(simd_data data, simd_data_type dtype);
+/**
+ * Allocate a C array in memory according to number of elements `len`
+ * and sequence data type `dtype` on range(simd_data_[qu8:qf64]).
+ *
+ * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL
+ * with a Python exception on failure.
+ *
+ * Example:
+ ** npyv_lanetype_f64 *aligned_ptr = simd_sequence_new(npyv_nlanes_f64, simd_data_f64);
+ ** if (aligned_ptr != NULL) {
+ ** // aligned store
+ ** npyv_storea_f64(aligned_ptr, npyv_setall_f64(1.0));
+ ** printf("The first element of my array %f\n", aligned_ptr[0]);
+ ** simd_sequence_free(aligned_ptr);
+ ** }
+ */
+static void *
+simd_sequence_new(Py_ssize_t len, simd_data_type dtype);
+/**
+ * Return the number of elements of the allocated C array `ptr`
+ * by `simd_sequence_new()` or `simd_sequence_from_iterable()`.
+ */
+static Py_ssize_t
+simd_sequence_len(const void *ptr);
+/**
+ * Free the allocated C array by `simd_sequence_new()` or
+ * `simd_sequence_from_iterable()`.
+ */
+static void
+simd_sequence_free(void *ptr);
+/**
+ * Return a C array representation of a PyObject sequence `obj` and
+ * according to the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Note: parameter `min_size` takes the number of minimum acceptable elements.
+ *
+ * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL
+ * with a Python exception on failure.
+ *
+ * Example:
+ ** npyv_lanetype_u32 *ptr = simd_sequence_from_iterable(seq_obj, simd_data_qu32, npyv_nlanes_u32);
+ ** if (ptr != NULL) {
+ ** npyv_u32 a = npyv_load_u32(ptr);
+ ** ...
+ ** simd_sequence_free(ptr);
+ ** }
+ **
+ */
+static void *
+simd_sequence_from_iterable(PyObject *obj, simd_data_type dtype, Py_ssize_t min_size);
+/**
+ * Fill a Python sequence object `obj` with a C array `ptr` allocated by
+ * `simd_sequence_new()` or `simd_sequence_from_iterable()` according to
+ * the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Return 0 on success or -1 with a Python exception on failure.
+ */
+static int
+simd_sequence_fill_iterable(PyObject *obj, const void *ptr, simd_data_type dtype);
+/**
+ * Create a Python list from a C array `ptr` allocated by
+ * `simd_sequence_new()` or `simd_sequence_from_iterable()` according to
+ * the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ */
+static PyObject *
+simd_sequence_to_list(const void *ptr, simd_data_type dtype);
+/**
+ * Return a SIMD multi-vector(simd_data) representation of Python tuple of
+ * (simd_vector*,) `obj` according to the multi-vector data type `dtype`
+ * on range (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]).
+ *
+ * Raise a Python exception on failure.
+ *
+ * Example:
+ ** simd_data data = simd_vectorx_from_tuple(tuple_obj, simd_data_vf32x2);
+ ** if (!PyErr_Occurred()) {
+ ** npyv_f32 sum = npyv_add_f32(data.vf32x2.val[0], data.vf32x2.val[1]);
+ ** ...
+ ** }
+ **
+ */
+static simd_data
+simd_vectorx_from_tuple(PyObject *obj, simd_data_type dtype);
+/**
+ * Create a Python tuple of 'simd_vector' from a SIMD multi-vector
+ * based on the contents of `data`(simd_data) and according to
+ * the multi-vector data type `dtype` on range
+ * (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]).
+ *
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ */
+static PyObject *
+simd_vectorx_to_tuple(simd_data data, simd_data_type dtype);
+
+/************************************
+ ** Declarations (inc_arg)
+ ************************************/
+typedef struct
+{
+ simd_data_type dtype;
+ simd_data data;
+ // set by simd_arg_converter()
+ PyObject *obj;
+} simd_arg;
+/**
+ * The following functions gather all conversions between all data types
+ * and they can be used instead of all the above functions.
+ */
+/**
+ * Convert a Python object `obj` into simd_data `arg->data` according to the
+ * required data type `arg->dtype`.
+ *
+ * Return -1 and raise Python exception on failure, otherwise return 0.
+ *
+ * Notes:
+ * - requires `simd_arg_free()` or `simd_sequence_free()`
+ * to free allocated C array, in case of sequence data types.
+ * - the number of minimum acceptable elements for sequence data
+ * types is the number of lanes of the equivalent vector data type.
+ *
+ * Example #1:
+ ** simd_arg arg = {.dtype = simd_data_qu8};
+ ** if (simd_arg_from_obj(seq_obj, &arg) < 0) {
+ ** // fails to convert a python sequence object to C array of uint8
+ ** return;
+ ** }
+ ** npyv_u8 v_u8 = npyv_load_u8(arg.data.qu8);
+ ** ...
+ ** simd_arg_free(&arg);
+ *
+ * Example #2:
+ ** simd_arg arg = {.dtype = simd_data_vf32};
+ ** if (simd_arg_from_obj(vector_obj, &arg) < 0) {
+ ** // fails to convert a python simd_vector to NPYV vector
+ ** return;
+ ** }
+ ** npyv_f32 add_one = npyv_add_f32(arg.data.vf32, npyv_setall_f32(1));
+ ** ...
+ */
+static int
+simd_arg_from_obj(PyObject *obj, simd_arg *arg);
+/**
+ * Convert a simd_data `arg->data` into a Python object according to the
+ * required data type `arg->dtype`.
+ *
+ * Return NULL and raise Python exception on failure, otherwise return
+ * new reference.
+ *
+ * Example:
+ ** simd_arg arg = {.dtype = simd_data_u32, .data = {.u32 = 0xffffffff}};
+ ** PyObject *obj = simd_arg_to_obj(&arg);
+ ** if (obj == NULL) {
+ ** // fails convert C uint32 to Python integer.
+ ** return;
+ ** }
+ **
+ */
+static PyObject *
+simd_arg_to_obj(const simd_arg *arg);
+/**
+ * Converter function used similar to simd_arg_from_obj() but
+ * used with PyArg_Parse*().
+ *
+ * Notes:
+ * - requires `simd_arg_free()` or `simd_sequence_free()`
+ * to free allocated C array, in case of sequence data types.
+ * - the number of minimum acceptable elements for sequence data
+ * types is the number of lanes of the equivalent vector data type.
+ * - use 'arg->obj' to retrieve the parameter obj.
+ *
+ * Example:
+ ** simd_arg seq_f32 = {.dtype = simd_data_qf32};
+ ** simd_arg vec_f32 = {.dtype = simd_data_vf32};
+ ** if (!PyArg_ParseTuple(
+ ** args, "O&O&:add_sum_f32",
+ ** simd_arg_converter, &seq_f32,
+ ** simd_arg_converter, &vec_f32
+ ** )) {
+ ** // fail
+ ** return;
+ ** }
+ ** npyv_f32 load_a = npyv_load_f32(seq_f32.data.qf32);
+ ** npyv_f32 sum = npyv_add_f32(load_a, vec_f32.data.vf32);
+ ** ...
+ ** simd_arg_free(&seq_f32);
+ */
+static int
+simd_arg_converter(PyObject *obj, simd_arg *arg);
+/**
+ * Free the allocated C array, if the arg hold sequence data type.
+ */
+static void
+simd_arg_free(simd_arg *arg);
+
+#endif // NPY_SIMD
+#endif // _SIMD_SIMD_INC_H_
diff --git a/numpy/core/src/_simd/_simd_vector.inc b/numpy/core/src/_simd/_simd_vector.inc
new file mode 100644
index 000000000..2a1378f22
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_vector.inc
@@ -0,0 +1,178 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+/************************************
+ ** Private Definitions
+ ************************************/
+static Py_ssize_t
+simd__vector_length(PySIMDVectorObject *self)
+{
+ return simd_data_getinfo(self->dtype)->nlanes;
+}
+static PyObject *
+simd__vector_item(PySIMDVectorObject *self, Py_ssize_t i)
+{
+ const simd_data_info *info = simd_data_getinfo(self->dtype);
+ int nlanes = info->nlanes;
+ if (i >= nlanes) {
+ PyErr_SetString(PyExc_IndexError, "vector index out of range");
+ return NULL;
+ }
+ npyv_lanetype_u8 *src = self->data + i * info->lane_size;
+ simd_data data;
+ memcpy(&data.u64, src, info->lane_size);
+ return simd_scalar_to_number(data, info->to_scalar);
+}
+
+static PySequenceMethods simd__vector_as_sequence = {
+ .sq_length = (lenfunc) simd__vector_length,
+ .sq_item = (ssizeargfunc) simd__vector_item
+};
+
+static PyObject *
+simd__vector_name(PySIMDVectorObject *self)
+{
+ return PyUnicode_FromString(simd_data_getinfo(self->dtype)->pyname);
+}
+static PyGetSetDef simd__vector_getset[] = {
+ { "__name__", (getter)simd__vector_name, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL, NULL }
+};
+
+static PyObject *
+simd__vector_repr(PySIMDVectorObject *self)
+{
+ PyObject *obj = PySequence_List((PyObject*)self);
+ if (obj != NULL) {
+ const char *type_name = simd_data_getinfo(self->dtype)->pyname;
+ PyObject *repr = PyUnicode_FromFormat("<%s of %R>", type_name, obj);
+ Py_DECREF(obj);
+ return repr;
+ }
+ return obj;
+}
+static PyObject *
+simd__vector_compare(PyObject *self, PyObject *other, int cmp_op)
+{
+ PyObject *obj;
+ if (PyTuple_Check(other)) {
+ obj = PySequence_Tuple(self);
+ } else if (PyList_Check(other)) {
+ obj = PySequence_List(self);
+ } else {
+ obj = PySequence_Fast(self, "invalid argument, expected a vector");
+ }
+ if (obj != NULL) {
+ PyObject *rich = PyObject_RichCompare(obj, other, cmp_op);
+ Py_DECREF(obj);
+ return rich;
+ }
+ return obj;
+}
+static PyTypeObject PySIMDVectorType = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ .tp_name = NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(VECTOR)),
+ .tp_basicsize = sizeof(PySIMDVectorObject),
+ .tp_repr = (reprfunc)simd__vector_repr,
+ .tp_as_sequence = &simd__vector_as_sequence,
+ .tp_flags = Py_TPFLAGS_DEFAULT,
+ .tp_richcompare = simd__vector_compare,
+ .tp_getset = simd__vector_getset
+};
+
+/************************************
+ ** Protected Definitions
+ ************************************/
+static PySIMDVectorObject *
+PySIMDVector_FromData(simd_data data, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_vector && info->nlanes > 0);
+
+ PySIMDVectorObject *vec = PyObject_New(PySIMDVectorObject, &PySIMDVectorType);
+ if (vec == NULL) {
+ return (PySIMDVectorObject*)PyErr_NoMemory();
+ }
+ vec->dtype = dtype;
+ if (info->is_bool) {
+ // boolean vectors are internally treated as unsigned
+ // vectors to add compatibility among all SIMD extensions
+ switch(dtype) {
+ case simd_data_vb8:
+ data.vu8 = npyv_cvt_u8_b8(data.vb8);
+ break;
+ case simd_data_vb16:
+ data.vu16 = npyv_cvt_u16_b16(data.vb16);
+ break;
+ case simd_data_vb32:
+ data.vu32 = npyv_cvt_u32_b32(data.vb32);
+ break;
+ default:
+ data.vu64 = npyv_cvt_u64_b64(data.vb64);
+ }
+ }
+ npyv_store_u8(vec->data, data.vu8);
+ return vec;
+}
+
+static simd_data
+PySIMDVector_AsData(PySIMDVectorObject *vec, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_vector && info->nlanes > 0);
+
+ simd_data data = {.u64 = 0};
+ if (!PyObject_IsInstance(
+ (PyObject *)vec, (PyObject *)&PySIMDVectorType
+ )) {
+ PyErr_Format(PyExc_TypeError,
+ "a vector type %s is required", info->pyname
+ );
+ return data;
+ }
+ if (vec->dtype != dtype) {
+ PyErr_Format(PyExc_TypeError,
+ "a vector type %s is required, got(%s)",
+ info->pyname, simd_data_getinfo(vec->dtype)->pyname
+ );
+ return data;
+ }
+
+ data.vu8 = npyv_load_u8(vec->data);
+ if (info->is_bool) {
+ // boolean vectors are internally treated as unsigned
+ // vectors to add compatibility among all SIMD extensions
+ switch(dtype) {
+ case simd_data_vb8:
+ data.vb8 = npyv_cvt_b8_u8(data.vu8);
+ break;
+ case simd_data_vb16:
+ data.vb16 = npyv_cvt_b16_u16(data.vu16);
+ break;
+ case simd_data_vb32:
+ data.vb32 = npyv_cvt_b32_u32(data.vu32);
+ break;
+ default:
+ data.vb64 = npyv_cvt_b64_u64(data.vu64);
+ }
+ }
+ return data;
+}
+
+static int
+PySIMDVectorType_Init(PyObject *module)
+{
+ Py_INCREF(&PySIMDVectorType);
+ if (PyType_Ready(&PySIMDVectorType)) {
+ return -1;
+ }
+ if (PyModule_AddObject(
+ module, "vector_type",(PyObject *)&PySIMDVectorType
+ )) {
+ return -1;
+ }
+ return 0;
+}
diff --git a/numpy/core/src/common/array_assign.c b/numpy/core/src/common/array_assign.c
index d626d1260..c55f6bdb4 100644
--- a/numpy/core/src/common/array_assign.c
+++ b/numpy/core/src/common/array_assign.c
@@ -14,7 +14,6 @@
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
#include <numpy/ndarraytypes.h>
-
#include "npy_config.h"
#include "npy_pycompat.h"
@@ -65,19 +64,22 @@ broadcast_strides(int ndim, npy_intp const *shape,
return 0;
broadcast_error: {
- PyObject *errmsg;
-
- errmsg = PyUString_FromFormat("could not broadcast %s from shape ",
- strides_name);
- PyUString_ConcatAndDel(&errmsg,
- build_shape_string(strides_ndim, strides_shape));
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromString(" into shape "));
- PyUString_ConcatAndDel(&errmsg,
- build_shape_string(ndim, shape));
- PyErr_SetObject(PyExc_ValueError, errmsg);
- Py_DECREF(errmsg);
+ PyObject *shape1 = convert_shape_to_string(strides_ndim,
+ strides_shape, "");
+ if (shape1 == NULL) {
+ return -1;
+ }
+ PyObject *shape2 = convert_shape_to_string(ndim, shape, "");
+ if (shape2 == NULL) {
+ Py_DECREF(shape1);
+ return -1;
+ }
+ PyErr_Format(PyExc_ValueError,
+ "could not broadcast %s from shape %S into shape %S",
+ strides_name, shape1, shape2);
+ Py_DECREF(shape1);
+ Py_DECREF(shape2);
return -1;
}
}
diff --git a/numpy/core/src/common/lowlevel_strided_loops.h b/numpy/core/src/common/lowlevel_strided_loops.h
index f2f12a55b..12aa61822 100644
--- a/numpy/core/src/common/lowlevel_strided_loops.h
+++ b/numpy/core/src/common/lowlevel_strided_loops.h
@@ -30,10 +30,9 @@
* Use NPY_AUXDATA_CLONE and NPY_AUXDATA_FREE to deal with this data.
*
*/
-typedef void (PyArray_StridedUnaryOp)(char *dst, npy_intp dst_stride,
- char *src, npy_intp src_stride,
- npy_intp N, npy_intp src_itemsize,
- NpyAuxData *transferdata);
+typedef int (PyArray_StridedUnaryOp)(
+ char *dst, npy_intp dst_stride, char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize, NpyAuxData *transferdata);
/*
* This is for pointers to functions which behave exactly as
@@ -43,31 +42,10 @@ typedef void (PyArray_StridedUnaryOp)(char *dst, npy_intp dst_stride,
* In particular, the 'i'-th element is operated on if and only if
* mask[i*mask_stride] is true.
*/
-typedef void (PyArray_MaskedStridedUnaryOp)(char *dst, npy_intp dst_stride,
- char *src, npy_intp src_stride,
- npy_bool *mask, npy_intp mask_stride,
- npy_intp N, npy_intp src_itemsize,
- NpyAuxData *transferdata);
-
-/*
- * This function pointer is for binary operations that input two
- * arbitrarily strided one-dimensional array segments and output
- * an arbitrarily strided array segment of the same size.
- * It may be a fully general function, or a specialized function
- * when the strides or item size have particular known values.
- *
- * Examples of binary operations are the basic arithmetic operations,
- * logical operators AND, OR, and many others.
- *
- * The 'transferdata' parameter is slightly special, following a
- * generic auxiliary data pattern defined in ndarraytypes.h
- * Use NPY_AUXDATA_CLONE and NPY_AUXDATA_FREE to deal with this data.
- *
- */
-typedef void (PyArray_StridedBinaryOp)(char *dst, npy_intp dst_stride,
- char *src0, npy_intp src0_stride,
- char *src1, npy_intp src1_stride,
- npy_intp N, NpyAuxData *transferdata);
+typedef int (PyArray_MaskedStridedUnaryOp)(
+ char *dst, npy_intp dst_stride, char *src, npy_intp src_stride,
+ npy_bool *mask, npy_intp mask_stride,
+ npy_intp N, npy_intp src_itemsize, NpyAuxData *transferdata);
/*
* Gives back a function pointer to a specialized function for copying
@@ -271,6 +249,7 @@ PyArray_CastRawArrays(npy_intp count,
* The return value is the number of elements it couldn't copy. A return value
* of 0 means all elements were copied, a larger value means the end of
* the n-dimensional array was reached before 'count' elements were copied.
+ * A negative return value indicates an error occurred.
*
* ndim:
* The number of dimensions of the n-dimensional array.
diff --git a/numpy/core/src/common/npy_binsearch.h.src b/numpy/core/src/common/npy_binsearch.h.src
index ce3b34b0e..052c44482 100644
--- a/numpy/core/src/common/npy_binsearch.h.src
+++ b/numpy/core/src/common/npy_binsearch.h.src
@@ -40,12 +40,12 @@ typedef struct {
* cfloat, cdouble, clongdouble, datetime, timedelta#
*/
-NPY_VISIBILITY_HIDDEN void
+NPY_NO_EXPORT void
binsearch_@side@_@suff@(const char *arr, const char *key, char *ret,
npy_intp arr_len, npy_intp key_len,
npy_intp arr_str, npy_intp key_str, npy_intp ret_str,
PyArrayObject *unused);
-NPY_VISIBILITY_HIDDEN int
+NPY_NO_EXPORT int
argbinsearch_@side@_@suff@(const char *arr, const char *key,
const char *sort, char *ret,
npy_intp arr_len, npy_intp key_len,
@@ -54,12 +54,12 @@ argbinsearch_@side@_@suff@(const char *arr, const char *key,
PyArrayObject *unused);
/**end repeat1**/
-NPY_VISIBILITY_HIDDEN void
+NPY_NO_EXPORT void
npy_binsearch_@side@(const char *arr, const char *key, char *ret,
npy_intp arr_len, npy_intp key_len,
npy_intp arr_str, npy_intp key_str,
npy_intp ret_str, PyArrayObject *cmp);
-NPY_VISIBILITY_HIDDEN int
+NPY_NO_EXPORT int
npy_argbinsearch_@side@(const char *arr, const char *key,
const char *sort, char *ret,
npy_intp arr_len, npy_intp key_len,
diff --git a/numpy/core/src/common/npy_cblas.h b/numpy/core/src/common/npy_cblas.h
index 97308238a..072993ec2 100644
--- a/numpy/core/src/common/npy_cblas.h
+++ b/numpy/core/src/common/npy_cblas.h
@@ -47,8 +47,10 @@ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
#ifdef HAVE_BLAS_ILP64
#define CBLAS_INT npy_int64
+#define CBLAS_INT_MAX NPY_MAX_INT64
#else
#define CBLAS_INT int
+#define CBLAS_INT_MAX INT_MAX
#endif
#define BLASNAME(name) CBLAS_FUNC(name)
@@ -59,6 +61,39 @@ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
#undef BLASINT
#undef BLASNAME
+
+/*
+ * Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done
+ * (BLAS won't handle negative or zero strides the way we want).
+ */
+static NPY_INLINE CBLAS_INT
+blas_stride(npy_intp stride, unsigned itemsize)
+{
+ /*
+ * Should probably check pointer alignment also, but this may cause
+ * problems if we require complex to be 16 byte aligned.
+ */
+ if (stride > 0 && (stride % itemsize) == 0) {
+ stride /= itemsize;
+ if (stride <= CBLAS_INT_MAX) {
+ return stride;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Define a chunksize for CBLAS.
+ *
+ * The chunksize is the greatest power of two less than CBLAS_INT_MAX.
+ */
+#if NPY_MAX_INTP > CBLAS_INT_MAX
+# define NPY_CBLAS_CHUNK (CBLAS_INT_MAX / 2 + 1)
+#else
+# define NPY_CBLAS_CHUNK NPY_MAX_INTP
+#endif
+
+
#ifdef __cplusplus
}
#endif
diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h
index 27328aa73..61cc3c7f1 100644
--- a/numpy/core/src/common/npy_config.h
+++ b/numpy/core/src/common/npy_config.h
@@ -19,6 +19,15 @@
#endif
+/* Disable broken functions on z/OS */
+#if defined (__MVS__)
+
+#undef HAVE_POWF
+#undef HAVE_EXPF
+#undef HAVE___THREAD
+
+#endif
+
/* Disable broken MS math functions */
#if (defined(_MSC_VER) && (_MSC_VER < 1900)) || defined(__MINGW32_VERSION)
diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h
index 846d1ebb9..a0f82fa3d 100644
--- a/numpy/core/src/common/npy_cpu_dispatch.h
+++ b/numpy/core/src/common/npy_cpu_dispatch.h
@@ -17,7 +17,7 @@
* NumPy module's attributes `__cpu_baseline__` and `__cpu_dispaٍtch__`.
*/
/**
- * Note: Always gaurd the genreated headers within 'NPY_DISABLE_OPTIMIZATION',
+ * Note: Always guard the generated headers within 'NPY_DISABLE_OPTIMIZATION',
* due the nature of command argument '--disable-optimization',
* which is explicitly disabling the module ccompiler_opt.
*/
@@ -29,7 +29,7 @@
* It's better anyway to take them off and use built-in types(__vector, __pixel, __bool) instead,
* since c99 supports bool variables which may lead to ambiguous errors.
*/
- // backup 'bool' before including '_cpu_dispatch.h', since it may not defiend as a compiler token.
+ // backup 'bool' before including '_cpu_dispatch.h', since it may not be defined as a compiler token.
#define NPY__DISPATCH_DEFBOOL
typedef bool npy__dispatch_bkbool;
#endif
@@ -134,10 +134,10 @@
* NPY_CPU_DISPATCH_DECLARE(void dispatch_me, (const int*, int*))
* NPY_CPU_DISPATCH_DECLARE(extern cb_type callback_tab, [TAB_SIZE])
*
- * By assuming the provided config header drived from a dispatch-able source,
+ * By assuming the provided config header derived from a dispatch-able source,
* that configured with "@targets baseline sse41 vsx3 asimdhp",
* they supported by the compiler and enabled via '--cpu-dspatch',
- * then the prototype declrations at the above example will equlivent to the follows:
+ * then the prototype declarations at the above example will be equivalent to the following:
*
* - x86:
* void dispatch_me(const int*, int*); // baseline
@@ -179,7 +179,7 @@
/**
* Macro NPY_CPU_DISPATCH_DECLARE_XB(LEFT, ...)
*
- * Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declration even
+ * Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declaration even
* if it was provided within the configration statments.
*/
#define NPY_CPU_DISPATCH_DECLARE_XB(...) \
@@ -206,7 +206,7 @@
* In order to call or to assign the pointer of it from outside the dispatch-able source,
* you have to use this Macro as follows:
*
- * // bring the genreated config header of the dispatch-abel source
+ * // bring the generated config header of the dispatch-able source
* #ifndef NPY_DISABLE_OPTIMIZATION
* #include "dispatchable_source_name.dispatch.h"
* #endif
@@ -217,44 +217,49 @@
* func_type the_callee(const int *src, int *dst, func_type *cb)
* {
* // direct call
- * NPY_CPU_DISPATCH_CALL(dispatch_me, (src, dst))
+ * NPY_CPU_DISPATCH_CALL(dispatch_me, (src, dst));
* // assign the pointer
- * NPY_CPU_DISPATCH_CALL(*cb = dispatch_me)
+ * *cb = NPY_CPU_DISPATCH_CALL(dispatch_me);
+ * // or
+ * NPY_CPU_DISPATCH_CALL(*cb = dispatch_me);
* // return the pointer
- * NPY_CPU_DISPATCH_CALL(return dispatch_me)
+ * return NPY_CPU_DISPATCH_CALL(dispatch_me);
* }
*/
#define NPY_CPU_DISPATCH_CALL(...) \
- if (0) {/*DUMMY*/} \
NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_CB_, __VA_ARGS__) \
NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_CALL_BASE_CB_, __VA_ARGS__)
// Preprocessor callbacks
#define NPY_CPU_DISPATCH_CALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
- else if (TESTED_FEATURES) { NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__; }
+ (TESTED_FEATURES) ? (NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) :
#define NPY_CPU_DISPATCH_CALL_BASE_CB_(LEFT, ...) \
- else { LEFT __VA_ARGS__; }
+ (LEFT __VA_ARGS__)
/**
* Macro NPY_CPU_DISPATCH_CALL_XB(LEFT, ...)
*
- * Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declration even
- * if it was provided within the configration statments.
+ * Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declaration even
+ * if it was provided within the configuration statements.
+ * Returns void.
*/
+#define NPY_CPU_DISPATCH_CALL_XB_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
+ (TESTED_FEATURES) ? (void) (NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) :
#define NPY_CPU_DISPATCH_CALL_XB(...) \
- if (0) {/*DUMMY*/} \
- NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_CB_, __VA_ARGS__)
+ NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_XB_CB_, __VA_ARGS__) \
+ ((void) 0 /* discarded expression value */)
/**
* Macro NPY_CPU_DISPATCH_CALL_ALL(LEFT, ...)
*
* Same as `NPY_CPU_DISPATCH_CALL` but dispatching all the required optimizations for
* the exported functions and variables instead of highest interested one.
+ * Returns void.
*/
#define NPY_CPU_DISPATCH_CALL_ALL(...) \
- NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_ALL_CB_, __VA_ARGS__) \
- NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_, __VA_ARGS__)
+ (NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_ALL_CB_, __VA_ARGS__) \
+ NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_, __VA_ARGS__))
// Preprocessor callbacks
#define NPY_CPU_DISPATCH_CALL_ALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
- if (TESTED_FEATURES) { NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__; }
+ ((TESTED_FEATURES) ? (NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) : (void) 0),
#define NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_(LEFT, ...) \
- { LEFT __VA_ARGS__; }
+ ( LEFT __VA_ARGS__ )
#endif // NPY_CPU_DISPATCH_H_
diff --git a/numpy/core/src/common/npy_cpu_features.c.src b/numpy/core/src/common/npy_cpu_features.c.src
index dfcf98c74..69bbc83a2 100644
--- a/numpy/core/src/common/npy_cpu_features.c.src
+++ b/numpy/core/src/common/npy_cpu_features.c.src
@@ -19,11 +19,11 @@ npy__cpu_init_features(void);
* Multiple features can be present, and separated by space, comma, or tab.
* Raises an error if parsing fails or if the feature was not enabled
*/
-static void
+static int
npy__cpu_try_disable_env(void);
/* Ensure the build's CPU baseline features are supported at runtime */
-static void
+static int
npy__cpu_validate_baseline(void);
/******************** Public Definitions *********************/
@@ -40,11 +40,12 @@ NPY_VISIBILITY_HIDDEN int
npy_cpu_init(void)
{
npy__cpu_init_features();
- npy__cpu_validate_baseline();
- npy__cpu_try_disable_env();
-
- if (PyErr_Occurred())
+ if (npy__cpu_validate_baseline() < 0) {
+ return -1;
+ }
+ if (npy__cpu_try_disable_env() < 0) {
return -1;
+ }
return 0;
}
@@ -142,7 +143,7 @@ npy__cpu_dispatch_fid(const char *feature)
return 0;
}
-static void
+static int
npy__cpu_validate_baseline(void)
{
#if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_BASELINE_N > 0
@@ -165,16 +166,18 @@ npy__cpu_validate_baseline(void)
"(" NPY_WITH_CPU_BASELINE ") but your machine doesn't support:\n(%s).",
baseline_failure
);
+ return -1;
}
#endif
+ return 0;
}
-static void
+static int
npy__cpu_try_disable_env(void)
{
char *disenv = getenv("NPY_DISABLE_CPU_FEATURES");
if (disenv == NULL || disenv[0] == 0) {
- return;
+ return 0;
}
#define NPY__CPU_ENV_ERR_HEAD \
"During parsing environment variable 'NPY_DISABLE_CPU_FEATURES':\n"
@@ -187,7 +190,7 @@ npy__cpu_try_disable_env(void)
"Length of environment variable 'NPY_DISABLE_CPU_FEATURES' is %d, only %d accepted",
var_len, NPY__MAX_VAR_LEN - 1
);
- return;
+ return -1;
}
char disable_features[NPY__MAX_VAR_LEN];
memcpy(disable_features, disenv, var_len);
@@ -210,7 +213,7 @@ npy__cpu_try_disable_env(void)
"(" NPY_WITH_CPU_BASELINE ").",
feature
);
- break;
+ return -1;
}
// check if the feature is part of dispatched features
int feature_id = npy__cpu_dispatch_fid(feature);
@@ -236,36 +239,43 @@ npy__cpu_try_disable_env(void)
*nexist_cur = '\0';
if (nexist[0] != '\0') {
*(nexist_cur-1) = '\0'; // trim the last space
- PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
- NPY__CPU_ENV_ERR_HEAD
- "You cannot disable CPU features (%s), since "
- "they are not part of the dispatched optimizations\n"
- "(" NPY_WITH_CPU_DISPATCH ").",
- nexist
- );
+ if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
+ NPY__CPU_ENV_ERR_HEAD
+ "You cannot disable CPU features (%s), since "
+ "they are not part of the dispatched optimizations\n"
+ "(" NPY_WITH_CPU_DISPATCH ").",
+ nexist
+ ) < 0) {
+ return -1;
+ }
}
*notsupp_cur = '\0';
if (notsupp[0] != '\0') {
*(notsupp_cur-1) = '\0'; // trim the last space
- PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
- NPY__CPU_ENV_ERR_HEAD
- "You cannot disable CPU features (%s), since "
- "they are not supported by your machine.",
- notsupp
- );
+ if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
+ NPY__CPU_ENV_ERR_HEAD
+ "You cannot disable CPU features (%s), since "
+ "they are not supported by your machine.",
+ notsupp
+ ) < 0) {
+ return -1;
+ }
}
#else
- PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
- NPY__CPU_ENV_ERR_HEAD
- "You cannot use environment variable 'NPY_DISABLE_CPU_FEATURES', since "
- #ifdef NPY_DISABLE_OPTIMIZATION
- "the NumPy library was compiled with optimization disabled."
- #else
- "the NumPy library was compiled without any dispatched optimizations."
- #endif
- );
+ if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
+ NPY__CPU_ENV_ERR_HEAD
+ "You cannot use environment variable 'NPY_DISABLE_CPU_FEATURES', since "
+ #ifdef NPY_DISABLE_OPTIMIZATION
+ "the NumPy library was compiled with optimization disabled."
+ #else
+ "the NumPy library was compiled without any dispatched optimizations."
+ #endif
+ ) < 0) {
+ return -1;
+ }
#endif
+ return 0;
}
/****************************************************************
diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h
index 693a9857d..28dd00032 100644
--- a/numpy/core/src/common/npy_cpu_features.h
+++ b/numpy/core/src/common/npy_cpu_features.h
@@ -1,8 +1,8 @@
#ifndef _NPY_CPU_FEATURES_H_
#define _NPY_CPU_FEATURES_H_
-#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN
#include <Python.h> // for PyObject
+#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN
#ifdef __cplusplus
extern "C" {
diff --git a/numpy/core/src/common/npy_partition.h.src b/numpy/core/src/common/npy_partition.h.src
index 97dc2536b..72c2095f1 100644
--- a/numpy/core/src/common/npy_partition.h.src
+++ b/numpy/core/src/common/npy_partition.h.src
@@ -42,12 +42,12 @@
* npy_cdouble, npy_clongdouble#
*/
-NPY_VISIBILITY_HIDDEN int introselect_@suff@(@type@ *v, npy_intp num,
+NPY_NO_EXPORT int introselect_@suff@(@type@ *v, npy_intp num,
npy_intp kth,
npy_intp * pivots,
npy_intp * npiv,
void *NOT_USED);
-NPY_VISIBILITY_HIDDEN int aintroselect_@suff@(@type@ *v, npy_intp* tosort, npy_intp num,
+NPY_NO_EXPORT int aintroselect_@suff@(@type@ *v, npy_intp* tosort, npy_intp num,
npy_intp kth,
npy_intp * pivots,
npy_intp * npiv,
diff --git a/numpy/core/src/common/npy_sort.h.src b/numpy/core/src/common/npy_sort.h.src
index 16a105499..ddbde0c9b 100644
--- a/numpy/core/src/common/npy_sort.h.src
+++ b/numpy/core/src/common/npy_sort.h.src
@@ -33,14 +33,14 @@ static NPY_INLINE int npy_get_msb(npy_uintp unum)
* cfloat, cdouble, clongdouble, datetime, timedelta#
*/
-int quicksort_@suff@(void *vec, npy_intp cnt, void *null);
-int heapsort_@suff@(void *vec, npy_intp cnt, void *null);
-int mergesort_@suff@(void *vec, npy_intp cnt, void *null);
-int timsort_@suff@(void *vec, npy_intp cnt, void *null);
-int aquicksort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
-int aheapsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
-int amergesort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
-int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int quicksort_@suff@(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_@suff@(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_@suff@(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_@suff@(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
/**end repeat**/
@@ -50,8 +50,8 @@ int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
* longlong, ulonglong#
*/
-int radixsort_@suff@(void *vec, npy_intp cnt, void *null);
-int aradixsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int radixsort_@suff@(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
/**end repeat**/
@@ -69,14 +69,14 @@ int aradixsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
* #suff = string, unicode#
*/
-int quicksort_@suff@(void *vec, npy_intp cnt, void *arr);
-int heapsort_@suff@(void *vec, npy_intp cnt, void *arr);
-int mergesort_@suff@(void *vec, npy_intp cnt, void *arr);
-int timsort_@suff@(void *vec, npy_intp cnt, void *arr);
-int aquicksort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
-int aheapsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
-int amergesort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
-int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int quicksort_@suff@(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int heapsort_@suff@(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int mergesort_@suff@(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int timsort_@suff@(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int aquicksort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int aheapsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int amergesort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
/**end repeat**/
@@ -88,13 +88,13 @@ int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
*/
-int npy_quicksort(void *vec, npy_intp cnt, void *arr);
-int npy_heapsort(void *vec, npy_intp cnt, void *arr);
-int npy_mergesort(void *vec, npy_intp cnt, void *arr);
-int npy_timsort(void *vec, npy_intp cnt, void *arr);
-int npy_aquicksort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
-int npy_aheapsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
-int npy_amergesort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
-int npy_atimsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_quicksort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_heapsort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_mergesort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_timsort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_aquicksort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_aheapsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_amergesort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_atimsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
#endif
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index 9d8b4ab5e..3a6dc9535 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -72,4 +72,71 @@
#define npyv_div_f32 _mm256_div_ps
#define npyv_div_f64 _mm256_div_pd
+/***************************
+ * FUSED
+ ***************************/
+#ifdef NPY_HAVE_FMA3
+ // multiply and add, a*b + c
+ #define npyv_muladd_f32 _mm256_fmadd_ps
+ #define npyv_muladd_f64 _mm256_fmadd_pd
+ // multiply and subtract, a*b - c
+ #define npyv_mulsub_f32 _mm256_fmsub_ps
+ #define npyv_mulsub_f64 _mm256_fmsub_pd
+ // negate multiply and add, -(a*b) + c
+ #define npyv_nmuladd_f32 _mm256_fnmadd_ps
+ #define npyv_nmuladd_f64 _mm256_fnmadd_pd
+ // negate multiply and subtract, -(a*b) - c
+ #define npyv_nmulsub_f32 _mm256_fnmsub_ps
+ #define npyv_nmulsub_f64 _mm256_fnmsub_pd
+#else
+ // multiply and add, a*b + c
+ NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_add_f32(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_add_f64(npyv_mul_f64(a, b), c); }
+ // multiply and subtract, a*b - c
+ NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_sub_f32(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_sub_f64(npyv_mul_f64(a, b), c); }
+ // negate multiply and add, -(a*b) + c
+ NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_sub_f32(c, npyv_mul_f32(a, b)); }
+ NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_sub_f64(c, npyv_mul_f64(a, b)); }
+ // negate multiply and subtract, -(a*b) - c
+ NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ {
+ npyv_f32 neg_a = npyv_xor_f32(a, npyv_setall_f32(-0.0f));
+ return npyv_sub_f32(npyv_mul_f32(neg_a, b), c);
+ }
+ NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0));
+ return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
+ }
+#endif // !NPY_HAVE_FMA3
+
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(__m256 a)
+{
+ __m256 sum_halves = _mm256_hadd_ps(a, a);
+ sum_halves = _mm256_hadd_ps(sum_halves, sum_halves);
+ __m128 lo = _mm256_castps256_ps128(sum_halves);
+ __m128 hi = _mm256_extractf128_ps(sum_halves, 1);
+ __m128 sum = _mm_add_ps(lo, hi);
+ return _mm_cvtss_f32(sum);
+}
+
+NPY_FINLINE double npyv_sum_f64(__m256d a)
+{
+ __m256d sum_halves = _mm256_hadd_pd(a, a);
+ __m128d lo = _mm256_castpd256_pd128(sum_halves);
+ __m128d hi = _mm256_extractf128_pd(sum_halves, 1);
+ __m128d sum = _mm_add_pd(lo, hi);
+ return _mm_cvtsd_f64(sum);
+}
+
#endif // _NPY_SIMD_AVX2_ARITHMETIC_H
+
+
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index c99d628ee..6f0d3c0d9 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -5,6 +5,8 @@
#define NPY_SIMD 256
#define NPY_SIMD_WIDTH 32
#define NPY_SIMD_F64 1
+// Stride limit, set low enough to allow us to use _mm256_i32gather_*
+#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8)
typedef __m256i npyv_u8;
typedef __m256i npyv_s8;
@@ -65,3 +67,4 @@ typedef struct { __m256d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h
index 9fd86016d..f72678b54 100644
--- a/numpy/core/src/common/simd/avx2/conversion.h
+++ b/numpy/core/src/common/simd/avx2/conversion.h
@@ -14,8 +14,8 @@
#define npyv_cvt_s32_b32(A) A
#define npyv_cvt_u64_b64(A) A
#define npyv_cvt_s64_b64(A) A
-#define npyv_cvt_f32_b32(A) _mm256_castsi256_ps(A)
-#define npyv_cvt_f64_b64(A) _mm256_castsi256_pd(A)
+#define npyv_cvt_f32_b32 _mm256_castsi256_ps
+#define npyv_cvt_f64_b64 _mm256_castsi256_pd
// convert integer types to mask types
#define npyv_cvt_b8_u8(BL) BL
@@ -26,7 +26,21 @@
#define npyv_cvt_b32_s32(BL) BL
#define npyv_cvt_b64_u64(BL) BL
#define npyv_cvt_b64_s64(BL) BL
-#define npyv_cvt_b32_f32(BL) _mm256_castps_si256(BL)
-#define npyv_cvt_b64_f64(BL) _mm256_castpd_si256(BL)
+#define npyv_cvt_b32_f32 _mm256_castps_si256
+#define npyv_cvt_b64_f64 _mm256_castpd_si256
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{ return (npy_uint32)_mm256_movemask_epi8(a); }
+
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ __m128i pack = _mm_packs_epi16(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1));
+ return (npy_uint16)_mm_movemask_epi8(pack);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint8)_mm256_movemask_ps(_mm256_castsi256_ps(a)); }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)_mm256_movemask_pd(_mm256_castsi256_pd(a)); }
#endif // _NPY_SIMD_AVX2_CVT_H
diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h
new file mode 100644
index 000000000..b3eba6f5f
--- /dev/null
+++ b/numpy/core/src/common/simd/avx2/math.h
@@ -0,0 +1,40 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX2_MATH_H
+#define _NPY_SIMD_AVX2_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm256_sqrt_ps
+#define npyv_sqrt_f64 _mm256_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm256_div_ps(_mm256_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm256_div_pd(_mm256_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+ return _mm256_and_ps(
+ a, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))
+ );
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+ return _mm256_and_pd(
+ a, _mm256_castsi256_pd(npyv_setall_s64(0x7fffffffffffffffLL))
+ );
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm256_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm256_mul_pd(a, a); }
+
+#endif
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h
index 5ea7414fd..e27bf15fe 100644
--- a/numpy/core/src/common/simd/avx2/memory.h
+++ b/numpy/core/src/common/simd/avx2/memory.h
@@ -2,6 +2,8 @@
#error "Not a standalone header"
#endif
+#include "misc.h"
+
#ifndef _NPY_SIMD_AVX2_MEMORY_H
#define _NPY_SIMD_AVX2_MEMORY_H
@@ -66,5 +68,289 @@ NPYV_IMPL_AVX2_MEM_INT(npy_int64, s64)
// store higher part
#define npyv_storeh_f32(PTR, VEC) _mm_storeu_ps(PTR, _mm256_extractf128_ps(VEC, 1))
#define npyv_storeh_f64(PTR, VEC) _mm_storeu_pd(PTR, _mm256_extractf128_pd(VEC, 1))
+/***************************
+ * Non-contiguous Load
+ ***************************/
+//// 32
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ const __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32((int)stride), steps);
+ return _mm256_i32gather_epi32((const int*)ptr, idx, 4);
+}
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{ return npyv_loadn_u32((const npy_uint32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{ return _mm256_castsi256_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); }
+//// 64
+#if 0 // slower
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{
+ const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
+ return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
+}
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{ return _mm256_castsi256_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
+#endif
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{
+ __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr));
+ __m128d a2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2)));
+ __m128d a01 = _mm_loadh_pd(a0, ptr + stride);
+ __m128d a23 = _mm_loadh_pd(a2, ptr + stride*3);
+ return _mm256_insertf128_pd(_mm256_castpd128_pd256(a01), a23, 1);
+}
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{
+ __m128i a0 = _mm256_castsi256_si128(a);
+ __m128i a1 = _mm256_extracti128_si256(a, 1);
+ ptr[stride * 0] = _mm_cvtsi128_si32(a0);
+ ptr[stride * 1] = _mm_extract_epi32(a0, 1);
+ ptr[stride * 2] = _mm_extract_epi32(a0, 2);
+ ptr[stride * 3] = _mm_extract_epi32(a0, 3);
+ ptr[stride * 4] = _mm_cvtsi128_si32(a1);
+ ptr[stride * 5] = _mm_extract_epi32(a1, 1);
+ ptr[stride * 6] = _mm_extract_epi32(a1, 2);
+ ptr[stride * 7] = _mm_extract_epi32(a1, 3);
+}
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, _mm256_castps_si256(a)); }
+//// 64
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{
+ __m128d a0 = _mm256_castpd256_pd128(a);
+ __m128d a1 = _mm256_extractf128_pd(a, 1);
+ _mm_storel_pd(ptr + stride * 0, a0);
+ _mm_storeh_pd(ptr + stride * 1, a0);
+ _mm_storel_pd(ptr + stride * 2, a1);
+ _mm_storeh_pd(ptr + stride * 3, a1);
+}
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); }
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); }
+
+/*********************************
+ * Partial Load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ const __m256i vfill = _mm256_set1_epi32(fill);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
+ __m256i payload = _mm256_maskload_epi32((const int*)ptr, mask);
+ return _mm256_blendv_epi8(vfill, payload, mask);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
+ return _mm256_maskload_epi32((const int*)ptr, mask);
+}
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ const __m256i vfill = _mm256_set1_epi64x(fill);
+ const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
+ __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+ return _mm256_blendv_epi8(vfill, payload, mask);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
+ __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ return _mm256_maskload_epi64((const void*)ptr, mask);
+}
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32);
+ const __m256i vfill = _mm256_set1_epi32(fill);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ const __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32((int)stride), steps);
+ __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
+ return _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ const __m256i vfill = _mm256_set1_epi64x(fill);
+ const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
+ const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
+ __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64
+npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
+ _mm256_maskstore_epi32((int*)ptr, mask, a);
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
+    __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ _mm256_maskstore_epi64((void*)ptr, mask, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ __m128i a0 = _mm256_castsi256_si128(a);
+ __m128i a1 = _mm256_extracti128_si256(a, 1);
+ switch(nlane) {
+ default:
+ ptr[stride*7] = _mm_extract_epi32(a1, 3);
+ case 7:
+ ptr[stride*6] = _mm_extract_epi32(a1, 2);
+ case 6:
+ ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ case 5:
+ ptr[stride*4] = _mm_extract_epi32(a1, 0);
+ case 4:
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
+ case 3:
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ case 2:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ case 1:
+ ptr[stride*0] = _mm_extract_epi32(a0, 0);
+ }
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
+ __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+ double *dptr = (double*)ptr;
+ switch(nlane) {
+ default:
+ _mm_storeh_pd(dptr + stride * 3, a1);
+ case 3:
+ _mm_storel_pd(dptr + stride * 2, a1);
+ case 2:
+ _mm_storeh_pd(dptr + stride * 1, a0);
+ case 1:
+ _mm_storel_pd(dptr + stride * 0, a0);
+ }
+}
+
+/*****************************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via reinterpret cast
+ *****************************************************************************/
+#define NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u64, s64)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f64, s64)
#endif // _NPY_SIMD_AVX2_MEMORY_H
diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h
index 5a9e68e32..4d6ec8f75 100644
--- a/numpy/core/src/common/simd/avx2/reorder.h
+++ b/numpy/core/src/common/simd/avx2/reorder.h
@@ -94,4 +94,36 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b)
return npyv_combine_f64(ab0, ab1);
}
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return _mm256_shuffle_epi8(a, idx);
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return _mm256_shuffle_epi8(a, idx);
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ return _mm256_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+ return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
#endif // _NPY_SIMD_AVX2_REORDER_H
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index fcaef0efd..7372ca29e 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -113,4 +113,63 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
#define npyv_div_f32 _mm512_div_ps
#define npyv_div_f64 _mm512_div_pd
+/***************************
+ * FUSED
+ ***************************/
+// multiply and add, a*b + c
+#define npyv_muladd_f32 _mm512_fmadd_ps
+#define npyv_muladd_f64 _mm512_fmadd_pd
+// multiply and subtract, a*b - c
+#define npyv_mulsub_f32 _mm512_fmsub_ps
+#define npyv_mulsub_f64 _mm512_fmsub_pd
+// negate multiply and add, -(a*b) + c
+#define npyv_nmuladd_f32 _mm512_fnmadd_ps
+#define npyv_nmuladd_f64 _mm512_fnmadd_pd
+// negate multiply and subtract, -(a*b) - c
+#define npyv_nmulsub_f32 _mm512_fnmsub_ps
+#define npyv_nmulsub_f64 _mm512_fnmsub_pd
+
+/***************************
+ * Reduce Sum
+ * there are three ways to implement reduce sum for AVX512:
+ * 1- split(256) /add /split(128) /add /hadd /hadd /extract
+ * 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract
+ * 3- _mm512_reduce_add_ps/pd
+ * The first one has been widely used by many projects
+ *
+ * the second one is used by Intel Compiler, maybe because the
+ * latency of hadd increased by (2-3) starting from Skylake-X which makes two
+ * extra shuffles(non-cross) cheaper. check https://godbolt.org/z/s3G9Er for more info.
+ *
+ * The third one is almost the same as the second one but only works for
+ * intel compiler/GCC 7.1/Clang 4, we still need to support older GCC.
+ ***************************/
+#ifdef NPY_HAVE_AVX512F_REDUCE
+ #define npyv_sum_f32 _mm512_reduce_add_ps
+ #define npyv_sum_f64 _mm512_reduce_add_pd
+#else
+ NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+ {
+ __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2));
+ __m512 sum32 = _mm512_add_ps(a, h64);
+ __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2));
+ __m512 sum16 = _mm512_add_ps(sum32, h32);
+ __m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2));
+ __m512 sum8 = _mm512_add_ps(sum16, h16);
+ __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1));
+ __m512 sum4 = _mm512_add_ps(sum8, h4);
+ return _mm_cvtss_f32(_mm512_castps512_ps128(sum4));
+ }
+ NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
+ {
+ __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2));
+ __m512d sum32 = _mm512_add_pd(a, h64);
+ __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2));
+ __m512d sum16 = _mm512_add_pd(sum32, h32);
+ __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1));
+ __m512d sum8 = _mm512_add_pd(sum16, h16);
+ return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8));
+ }
+#endif
+
#endif // _NPY_SIMD_AVX512_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index 96fdf72b9..2de33765a 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -4,6 +4,9 @@
#define NPY_SIMD 512
#define NPY_SIMD_WIDTH 64
#define NPY_SIMD_F64 1
+// Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
+#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16)
+#define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
typedef __m512i npyv_u8;
typedef __m512i npyv_s8;
@@ -69,3 +72,4 @@ typedef struct { __m512d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h
index 0f7e27de3..6ad299dd5 100644
--- a/numpy/core/src/common/simd/avx512/conversion.h
+++ b/numpy/core/src/common/simd/avx512/conversion.h
@@ -51,4 +51,35 @@
#define npyv_cvt_b32_f32(A) npyv_cvt_b32_u32(_mm512_castps_si512(A))
#define npyv_cvt_b64_f64(A) npyv_cvt_b64_u64(_mm512_castpd_si512(A))
+// convert boolean vectors to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+#ifdef NPY_HAVE_AVX512BW_MASK
+ return (npy_uint64)_cvtmask64_u64(a);
+#elif defined(NPY_HAVE_AVX512BW)
+ return (npy_uint64)a;
+#else
+ int mask_lo = _mm256_movemask_epi8(npyv512_lower_si256(a));
+ int mask_hi = _mm256_movemask_epi8(npyv512_higher_si256(a));
+ return (unsigned)mask_lo | ((npy_uint64)(unsigned)mask_hi << 32);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+#ifdef NPY_HAVE_AVX512BW_MASK
+ return (npy_uint32)_cvtmask32_u32(a);
+#elif defined(NPY_HAVE_AVX512BW)
+ return (npy_uint32)a;
+#else
+ __m256i pack = _mm256_packs_epi16(
+ npyv512_lower_si256(a), npyv512_higher_si256(a)
+ );
+ return (npy_uint32)_mm256_movemask_epi8(_mm256_permute4x64_epi64(pack, _MM_SHUFFLE(3, 1, 2, 0)));
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint16)a; }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)a; }
+
#endif // _NPY_SIMD_AVX512_CVT_H
diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h
new file mode 100644
index 000000000..1db710670
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/math.h
@@ -0,0 +1,49 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_MATH_H
+#define _NPY_SIMD_AVX512_MATH_H
+
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm512_sqrt_ps
+#define npyv_sqrt_f64 _mm512_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm512_div_ps(_mm512_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm512_div_pd(_mm512_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+#if 0 // def NPY_HAVE_AVX512DQ
+ return _mm512_range_ps(a, a, 8);
+#else
+ return npyv_and_f32(
+ a, _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))
+ );
+#endif
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+#if 0 // def NPY_HAVE_AVX512DQ
+ return _mm512_range_pd(a, a, 8);
+#else
+ return npyv_and_f64(
+ a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL))
+ );
+#endif
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm512_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm512_mul_pd(a, a); }
+
+#endif
diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h
index e212c4555..bffd6e907 100644
--- a/numpy/core/src/common/simd/avx512/memory.h
+++ b/numpy/core/src/common/simd/avx512/memory.h
@@ -90,5 +90,243 @@ NPYV_IMPL_AVX512_MEM_INT(npy_int64, s64)
// store higher part
#define npyv_storeh_f32(PTR, VEC) _mm256_storeu_ps(PTR, npyv512_higher_ps256(VEC))
#define npyv_storeh_f64(PTR, VEC) _mm256_storeu_pd(PTR, npyv512_higher_pd256(VEC))
+/***************************
+ * Non-contiguous Load
+ ***************************/
+//// 32
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32);
+ const __m512i steps = npyv_set_s32(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ );
+ const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
+ return _mm512_i32gather_epi32(idx, (const __m512i*)ptr, 4);
+}
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{ return npyv_loadn_u32((const npy_uint32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{ return _mm512_castsi512_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); }
+//// 64
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{
+ const __m512i idx = _mm512_setr_epi64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ return _mm512_i64gather_epi64(idx, (const __m512i*)ptr, 8);
+}
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{ return _mm512_castsi512_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ assert(llabs(stride) <= NPY_SIMD_MAXSTORE_STRIDE32);
+ const __m512i steps = _mm512_setr_epi32(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ );
+ const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
+ _mm512_i32scatter_epi32((__m512i*)ptr, idx, a, 4);
+}
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_u32((npy_uint32*)ptr, stride, _mm512_castps_si512(a)); }
+//// 64
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{
+ const __m512i idx = _mm512_setr_epi64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ _mm512_i64scatter_epi64((__m512i*)ptr, idx, a, 8);
+}
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen_u64((npy_uint64*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); }
+
+/*********************************
+ * Partial Load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ const __m512i vfill = _mm512_set1_epi32(fill);
+    const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+    const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
+}
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ const __m512i vfill = _mm512_set1_epi64(fill);
+    const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+    const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
+}
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32);
+ const __m512i steps = npyv_set_s32(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ );
+ const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
+ const __m512i vfill = _mm512_set1_epi32(fill);
+    const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ const __m512i idx = _mm512_setr_epi64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ const __m512i vfill = _mm512_set1_epi64(fill);
+    const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64
+npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+    const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_storeu_epi32((__m512i*)ptr, mask, a);
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+    const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ assert(llabs(stride) <= NPY_SIMD_MAXSTORE_STRIDE32);
+ const __m512i steps = _mm512_setr_epi32(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ );
+ const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
+    const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_i32scatter_epi32((__m512i*)ptr, mask, idx, a, 4);
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ const __m512i idx = _mm512_setr_epi64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+    const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8);
+}
+
+/*****************************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via reinterpret cast
+ *****************************************************************************/
+#define NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u64, s64)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f64, s64)
#endif // _NPY_SIMD_AVX512_MEMORY_H
diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h
index cdbae7aac..f043004ec 100644
--- a/numpy/core/src/common/simd/avx512/reorder.h
+++ b/numpy/core/src/common/simd/avx512/reorder.h
@@ -167,4 +167,60 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b)
return r;
}
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#ifdef NPY_HAVE_AVX512BW
+ const __m512i idx = npyv_set_u8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return _mm512_shuffle_epi8(a, idx);
+#else
+ const __m256i idx = _mm256_setr_epi8(
+ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ );
+ __m256i lo = _mm256_shuffle_epi8(npyv512_lower_si256(a), idx);
+ __m256i hi = _mm256_shuffle_epi8(npyv512_higher_si256(a), idx);
+ return npyv512_combine_si256(lo, hi);
+#endif
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+#ifdef NPY_HAVE_AVX512BW
+ const __m512i idx = npyv_set_u8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return _mm512_shuffle_epi8(a, idx);
+#else
+ const __m256i idx = _mm256_setr_epi8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ __m256i lo = _mm256_shuffle_epi8(npyv512_lower_si256(a), idx);
+ __m256i hi = _mm256_shuffle_epi8(npyv512_higher_si256(a), idx);
+ return npyv512_combine_si256(lo, hi);
+#endif
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ return _mm512_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+ return _mm512_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
#endif // _NPY_SIMD_AVX512_REORDER_H
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index ec8b8ecd0..87e00d5d1 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -63,16 +63,84 @@
/***************************
* Division
***************************/
-#ifdef __aarch64__
+#if NPY_SIMD_F64
#define npyv_div_f32 vdivq_f32
#else
- NPY_FINLINE float32x4_t npyv_div_f32(float32x4_t a, float32x4_t b)
+ NPY_FINLINE npyv_f32 npyv_div_f32(npyv_f32 a, npyv_f32 b)
{
- float32x4_t recip = vrecpeq_f32(b);
- recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
- return vmulq_f32(a, recip);
+ // Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI
+ // estimate to 1/b
+ npyv_f32 recipe = vrecpeq_f32(b);
+ /**
+ * Newton-Raphson iteration:
+ * x[n+1] = x[n] * (2-d * x[n])
+ * converges to (1/d) if x0 is the result of VRECPE applied to d.
+ *
+     * NOTE: at least 3 iterations are needed to improve precision
+ */
+ recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+ // a/b = a*recip(b)
+ return vmulq_f32(a, recipe);
}
#endif
#define npyv_div_f64 vdivq_f64
+/***************************
+ * FUSED F32
+ ***************************/
+#ifdef NPY_HAVE_NEON_VFPV4 // FMA
+ // multiply and add, a*b + c
+ NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vfmaq_f32(c, a, b); }
+ // multiply and subtract, a*b - c
+ NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vfmaq_f32(vnegq_f32(c), a, b); }
+ // negate multiply and add, -(a*b) + c
+ NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vfmsq_f32(c, a, b); }
+ // negate multiply and subtract, -(a*b) - c
+ NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vfmsq_f32(vnegq_f32(c), a, b); }
+#else
+ // multiply and add, a*b + c
+ NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vmlaq_f32(c, a, b); }
+ // multiply and subtract, a*b - c
+ NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vmlaq_f32(vnegq_f32(c), a, b); }
+ // negate multiply and add, -(a*b) + c
+ NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vmlsq_f32(c, a, b); }
+ // negate multiply and subtract, -(a*b) - c
+ NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vmlsq_f32(vnegq_f32(c), a, b); }
+#endif
+/***************************
+ * FUSED F64
+ ***************************/
+#if NPY_SIMD_F64
+ NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return vfmaq_f64(c, a, b); }
+ NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return vfmaq_f64(vnegq_f64(c), a, b); }
+ NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return vfmsq_f64(c, a, b); }
+ NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return vfmsq_f64(vnegq_f64(c), a, b); }
+#endif // NPY_SIMD_F64
+
+// Horizontal add: Calculates the sum of all vector elements.
+#if NPY_SIMD_F64
+ #define npyv_sum_f32 vaddvq_f32
+ #define npyv_sum_f64 vaddvq_f64
+#else
+ NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+ {
+ float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a));
+ return vget_lane_f32(vpadd_f32(r, r), 0);
+ }
+#endif
+
#endif // _NPY_SIMD_NEON_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h
index b286931d1..f9840b1cb 100644
--- a/numpy/core/src/common/simd/neon/conversion.h
+++ b/numpy/core/src/common/simd/neon/conversion.h
@@ -7,26 +7,68 @@
// convert boolean vectors to integer vectors
#define npyv_cvt_u8_b8(A) A
-#define npyv_cvt_s8_b8(A) vreinterpretq_s8_u8(A)
+#define npyv_cvt_s8_b8 vreinterpretq_s8_u8
#define npyv_cvt_u16_b16(A) A
-#define npyv_cvt_s16_b16(A) vreinterpretq_s16_u16(A)
+#define npyv_cvt_s16_b16 vreinterpretq_s16_u16
#define npyv_cvt_u32_b32(A) A
-#define npyv_cvt_s32_b32(A) vreinterpretq_s32_u32(A)
+#define npyv_cvt_s32_b32 vreinterpretq_s32_u32
#define npyv_cvt_u64_b64(A) A
-#define npyv_cvt_s64_b64(A) vreinterpretq_s64_u64(A)
-#define npyv_cvt_f32_b32(A) vreinterpretq_f32_u32(A)
-#define npyv_cvt_f64_b64(A) vreinterpretq_f64_u64(A)
+#define npyv_cvt_s64_b64 vreinterpretq_s64_u64
+#define npyv_cvt_f32_b32 vreinterpretq_f32_u32
+#define npyv_cvt_f64_b64 vreinterpretq_f64_u64
// convert integer vectors to boolean vectors
#define npyv_cvt_b8_u8(BL) BL
-#define npyv_cvt_b8_s8(BL) vreinterpretq_u8_s8(BL)
+#define npyv_cvt_b8_s8 vreinterpretq_u8_s8
#define npyv_cvt_b16_u16(BL) BL
-#define npyv_cvt_b16_s16(BL) vreinterpretq_u16_s16(BL)
+#define npyv_cvt_b16_s16 vreinterpretq_u16_s16
#define npyv_cvt_b32_u32(BL) BL
-#define npyv_cvt_b32_s32(BL) vreinterpretq_u32_s32(BL)
+#define npyv_cvt_b32_s32 vreinterpretq_u32_s32
#define npyv_cvt_b64_u64(BL) BL
-#define npyv_cvt_b64_s64(BL) vreinterpretq_u64_s64(BL)
-#define npyv_cvt_b32_f32(BL) vreinterpretq_u32_f32(BL)
-#define npyv_cvt_b64_f64(BL) vreinterpretq_u64_f64(BL)
+#define npyv_cvt_b64_s64 vreinterpretq_u64_s64
+#define npyv_cvt_b32_f32 vreinterpretq_u32_f32
+#define npyv_cvt_b64_f64 vreinterpretq_u64_f64
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+ const npyv_u8 scale = npyv_set_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+ npyv_u8 seq_scale = vandq_u8(a, scale);
+#if NPY_SIMD_F64
+ npy_uint8 sumlo = vaddv_u8(vget_low_u8(seq_scale));
+ npy_uint8 sumhi = vaddv_u8(vget_high_u8(seq_scale));
+ return sumlo + ((int)sumhi << 8);
+#else
+ npyv_u64 sumh = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(seq_scale)));
+ return vgetq_lane_u64(sumh, 0) + ((int)vgetq_lane_u64(sumh, 1) << 8);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ const npyv_u16 scale = npyv_set_u16(1, 2, 4, 8, 16, 32, 64, 128);
+ npyv_u16 seq_scale = vandq_u16(a, scale);
+#if NPY_SIMD_F64
+ return vaddvq_u16(seq_scale);
+#else
+ npyv_u64 sumh = vpaddlq_u32(vpaddlq_u16(seq_scale));
+ return vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{
+ const npyv_u32 scale = npyv_set_u32(1, 2, 4, 8);
+ npyv_u32 seq_scale = vandq_u32(a, scale);
+#if NPY_SIMD_F64
+ return vaddvq_u32(seq_scale);
+#else
+ npyv_u64 sumh = vpaddlq_u32(seq_scale);
+ return vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{
+ npyv_u64 bit = vshrq_n_u64(a, 63);
+ return vgetq_lane_u64(bit, 0) | ((int)vgetq_lane_u64(bit, 1) << 1);
+}
#endif // _NPY_SIMD_NEON_CVT_H
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
new file mode 100644
index 000000000..a2bbdf2a5
--- /dev/null
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -0,0 +1,86 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_NEON_MATH_H
+#define _NPY_SIMD_NEON_MATH_H
+
+/***************************
+ * Elementary
+ ***************************/
+// Absolute
+#define npyv_abs_f32 vabsq_f32
+#define npyv_abs_f64 vabsq_f64
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return vmulq_f32(a, a); }
+#if NPY_SIMD_F64
+ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+ { return vmulq_f64(a, a); }
+#endif
+
+// Square root
+#if NPY_SIMD_F64
+ #define npyv_sqrt_f32 vsqrtq_f32
+ #define npyv_sqrt_f64 vsqrtq_f64
+#else
+ // Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI
+ NPY_FINLINE npyv_f32 npyv_sqrt_f32(npyv_f32 a)
+ {
+ const npyv_f32 zero = vdupq_n_f32(0.0f);
+ const npyv_u32 pinf = vdupq_n_u32(0x7f800000);
+ npyv_u32 is_zero = vceqq_f32(a, zero), is_inf = vceqq_u32(vreinterpretq_u32_f32(a), pinf);
+        // guard against floating-point division-by-zero error
+ npyv_f32 guard_byz = vbslq_f32(is_zero, vreinterpretq_f32_u32(pinf), a);
+ // estimate to (1/√a)
+ npyv_f32 rsqrte = vrsqrteq_f32(guard_byz);
+ /**
+ * Newton-Raphson iteration:
+         *  x[n+1] = x[n] * ((3 - d * x[n]*x[n]) / 2)
+         * converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
+         *
+         * NOTE: at least 3 iterations are needed to improve precision
+ */
+ rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+ rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+ rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+ // a * (1/√a)
+ npyv_f32 sqrt = vmulq_f32(a, rsqrte);
+        // handle the special cases:
+ // - return zero if a is zero.
+ // - return positive infinity if a is positive infinity
+ return vbslq_f32(vorrq_u32(is_zero, is_inf), a, sqrt);
+ }
+#endif // NPY_SIMD_F64
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{
+#if NPY_SIMD_F64
+ const npyv_f32 one = vdupq_n_f32(1.0f);
+ return npyv_div_f32(one, a);
+#else
+ npyv_f32 recipe = vrecpeq_f32(a);
+ /**
+ * Newton-Raphson iteration:
+ * x[n+1] = x[n] * (2-d * x[n])
+ * converges to (1/d) if x0 is the result of VRECPE applied to d.
+ *
+     * NOTE: at least 3 iterations are needed to improve precision
+ */
+ recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+ return recipe;
+#endif
+}
+#if NPY_SIMD_F64
+ NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+ {
+ const npyv_f64 one = vdupq_n_f64(1.0);
+ return npyv_div_f64(one, a);
+ }
+#endif // NPY_SIMD_F64
+
+#endif // _NPY_SIMD_NEON_MATH_H
diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h
index afa703584..1e258f1bc 100644
--- a/numpy/core/src/common/simd/neon/memory.h
+++ b/numpy/core/src/common/simd/neon/memory.h
@@ -5,6 +5,8 @@
#ifndef _NPY_SIMD_NEON_MEMORY_H
#define _NPY_SIMD_NEON_MEMORY_H
+#include "misc.h"
+
/***************************
* load/store
***************************/
@@ -45,5 +47,290 @@ NPYV_IMPL_NEON_MEM(f32, float)
#if NPY_SIMD_F64
NPYV_IMPL_NEON_MEM(f64, double)
#endif
+/***************************
+ * Non-contiguous Load
+ ***************************/
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{
+ switch (stride) {
+ case 2:
+ return vld2q_s32((const int32_t*)ptr).val[0];
+ case 3:
+ return vld3q_s32((const int32_t*)ptr).val[0];
+ case 4:
+ return vld4q_s32((const int32_t*)ptr).val[0];
+ default:;
+ int32x2_t ax = vcreate_s32(*ptr);
+ int32x4_t a = vcombine_s32(ax, ax);
+ a = vld1q_lane_s32((const int32_t*)ptr + stride, a, 1);
+ a = vld1q_lane_s32((const int32_t*)ptr + stride*2, a, 2);
+ a = vld1q_lane_s32((const int32_t*)ptr + stride*3, a, 3);
+ return a;
+ }
+}
+
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ return npyv_reinterpret_u32_s32(
+ npyv_loadn_s32((const npy_int32*)ptr, stride)
+ );
+}
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{
+ return npyv_reinterpret_f32_s32(
+ npyv_loadn_s32((const npy_int32*)ptr, stride)
+ );
+}
+//// 64
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{
+ return vcombine_s64(
+ vld1_s64((const int64_t*)ptr), vld1_s64((const int64_t*)ptr + stride)
+ );
+}
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{
+ return npyv_reinterpret_u64_s64(
+ npyv_loadn_s64((const npy_int64*)ptr, stride)
+ );
+}
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{
+ return npyv_reinterpret_f64_s64(
+ npyv_loadn_s64((const npy_int64*)ptr, stride)
+ );
+}
+#endif
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{
+ vst1q_lane_s32((int32_t*)ptr, a, 0);
+ vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
+ vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2);
+ vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3);
+}
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, npyv_reinterpret_s32_u32(a)); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, npyv_reinterpret_s32_f32(a)); }
+//// 64
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{
+ vst1q_lane_s64((int64_t*)ptr, a, 0);
+ vst1q_lane_s64((int64_t*)ptr + stride, a, 1);
+}
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_u64(a)); }
+
+#if NPY_SIMD_F64
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_f64(a)); }
+#endif
+
+/*********************************
+ * Partial Load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ return vld1q_lane_s32((const int32_t*)ptr, vdupq_n_s32(fill), 0);
+ case 2:
+ return vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(fill));
+ case 3:
+ return vcombine_s32(
+ vld1_s32((const int32_t*)ptr),
+ vld1_lane_s32((const int32_t*)ptr + 2, vdup_n_s32(fill), 0)
+ );
+ default:
+ return npyv_load_s32(ptr);
+ }
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_till_s32(ptr, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return vcombine_s64(vld1_s64((const int64_t*)ptr), vdup_n_s64(fill));
+ }
+ return npyv_load_s64(ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ return npyv_load_till_s64(ptr, nlane, 0); }
+
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ int32x4_t vfill = vdupq_n_s32(fill);
+ switch(nlane) {
+ case 3:
+ vfill = vld1q_lane_s32((const int32_t*)ptr + stride*2, vfill, 2);
+ case 2:
+ vfill = vld1q_lane_s32((const int32_t*)ptr + stride, vfill, 1);
+ case 1:
+ vfill = vld1q_lane_s32((const int32_t*)ptr, vfill, 0);
+ return vfill;
+ default:
+ return npyv_loadn_s32(ptr, stride);
+ }
+}
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); }
+
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return vcombine_s64(vld1_s64((const int64_t*)ptr), vdup_n_s64(fill));
+ }
+ return npyv_loadn_s64(ptr, stride);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ vst1q_lane_s32((int32_t*)ptr, a, 0);
+ break;
+ case 2:
+ vst1_s32((int32_t*)ptr, vget_low_s32(a));
+ break;
+ case 3:
+ vst1_s32((int32_t*)ptr, vget_low_s32(a));
+ vst1q_lane_s32((int32_t*)ptr + 2, a, 2);
+ break;
+ default:
+ npyv_store_s32(ptr, a);
+ }
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ vst1q_lane_s64((int64_t*)ptr, a, 0);
+ return;
+ }
+ npyv_store_s64(ptr, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ default:
+ vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3);
+ case 3:
+ vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2);
+ case 2:
+ vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
+ case 1:
+ vst1q_lane_s32((int32_t*)ptr, a, 0);
+ break;
+ }
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ vst1q_lane_s64((int64_t*)ptr, a, 0);
+ return;
+ }
+ npyv_storen_s64(ptr, stride, a);
+}
+
+/*****************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via casting
+ *****************************************************************/
+#define NPYV_IMPL_NEON_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u64, s64)
+#if NPY_SIMD_F64
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f64, s64)
+#endif
#endif // _NPY_SIMD_NEON_MEMORY_H
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index 280a34297..c8ddc92ad 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -72,3 +72,4 @@ typedef float64x2x3_t npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h
index 712a77982..50b06ed11 100644
--- a/numpy/core/src/common/simd/neon/reorder.h
+++ b/numpy/core/src/common/simd/neon/reorder.h
@@ -107,4 +107,13 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#define npyv_zip_u64 npyv_combine_u64
#define npyv_zip_s64 npyv_combine_s64
+// Reverse elements of each 64-bit lane
+#define npyv_rev64_u8 vrev64q_u8
+#define npyv_rev64_s8 vrev64q_s8
+#define npyv_rev64_u16 vrev64q_u16
+#define npyv_rev64_s16 vrev64q_s16
+#define npyv_rev64_u32 vrev64q_u32
+#define npyv_rev64_s32 vrev64q_s32
+#define npyv_rev64_f32 vrev64q_f32
+
#endif // _NPY_SIMD_NEON_REORDER_H
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index 2f39c8427..8804223c9 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -49,6 +49,55 @@ typedef double npyv_lanetype_f64;
#define NPY_SIMD_WIDTH 0
#define NPY_SIMD_F64 0
#endif
+/**
+ * Some SIMD extensions currently (AVX2, AVX512F) impose (de facto)
+ * a maximum limit on stride sizes when dealing with non-contiguous memory access.
+ *
+ * Therefore the following functions must be used to check the maximum
+ * acceptable limit of strides before using any of non-contiguous load/store intrinsics.
+ *
+ * For instance:
+ * npy_intp ld_stride = step[0] / sizeof(float);
+ * npy_intp st_stride = step[1] / sizeof(float);
+ *
+ * if (npyv_loadable_stride_f32(ld_stride) && npyv_storable_stride_f32(st_stride)) {
+ * for (;;)
+ * npyv_f32 a = npyv_loadn_f32(ld_pointer, ld_stride);
+ * // ...
+ * npyv_storen_f32(st_pointer, st_stride, a);
+ * }
+ * else {
+ * for (;;)
+ * // C scalars
+ * }
+ */
+#ifndef NPY_SIMD_MAXLOAD_STRIDE32
+ #define NPY_SIMD_MAXLOAD_STRIDE32 0
+#endif
+#ifndef NPY_SIMD_MAXSTORE_STRIDE32
+ #define NPY_SIMD_MAXSTORE_STRIDE32 0
+#endif
+#ifndef NPY_SIMD_MAXLOAD_STRIDE64
+ #define NPY_SIMD_MAXLOAD_STRIDE64 0
+#endif
+#ifndef NPY_SIMD_MAXSTORE_STRIDE64
+ #define NPY_SIMD_MAXSTORE_STRIDE64 0
+#endif
+#define NPYV_IMPL_MAXSTRIDE(SFX, MAXLOAD, MAXSTORE) \
+ NPY_FINLINE int npyv_loadable_stride_##SFX(npy_intp stride) \
+ { return MAXLOAD > 0 ? llabs(stride) <= MAXLOAD : 1; } \
+ NPY_FINLINE int npyv_storable_stride_##SFX(npy_intp stride) \
+ { return MAXSTORE > 0 ? llabs(stride) <= MAXSTORE : 1; }
+#if NPY_SIMD
+ NPYV_IMPL_MAXSTRIDE(u32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
+ NPYV_IMPL_MAXSTRIDE(s32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
+ NPYV_IMPL_MAXSTRIDE(f32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
+ NPYV_IMPL_MAXSTRIDE(u64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
+ NPYV_IMPL_MAXSTRIDE(s64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
+#endif
+#if NPY_SIMD_F64
+ NPYV_IMPL_MAXSTRIDE(f64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
+#endif
#ifdef __cplusplus
}
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index 12d0af05c..8440cc52e 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -91,5 +91,87 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
// TODO: emulate integer division
#define npyv_div_f32 _mm_div_ps
#define npyv_div_f64 _mm_div_pd
+/***************************
+ * FUSED
+ ***************************/
+#ifdef NPY_HAVE_FMA3
+ // multiply and add, a*b + c
+ #define npyv_muladd_f32 _mm_fmadd_ps
+ #define npyv_muladd_f64 _mm_fmadd_pd
+ // multiply and subtract, a*b - c
+ #define npyv_mulsub_f32 _mm_fmsub_ps
+ #define npyv_mulsub_f64 _mm_fmsub_pd
+ // negate multiply and add, -(a*b) + c
+ #define npyv_nmuladd_f32 _mm_fnmadd_ps
+ #define npyv_nmuladd_f64 _mm_fnmadd_pd
+ // negate multiply and subtract, -(a*b) - c
+ #define npyv_nmulsub_f32 _mm_fnmsub_ps
+ #define npyv_nmulsub_f64 _mm_fnmsub_pd
+#elif defined(NPY_HAVE_FMA4)
+ // multiply and add, a*b + c
+ #define npyv_muladd_f32 _mm_macc_ps
+ #define npyv_muladd_f64 _mm_macc_pd
+ // multiply and subtract, a*b - c
+ #define npyv_mulsub_f32 _mm_msub_ps
+ #define npyv_mulsub_f64 _mm_msub_pd
+ // negate multiply and add, -(a*b) + c
+ #define npyv_nmuladd_f32 _mm_nmacc_ps
+ #define npyv_nmuladd_f64 _mm_nmacc_pd
+#else
+ // multiply and add, a*b + c
+ NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_add_f32(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_add_f64(npyv_mul_f64(a, b), c); }
+ // multiply and subtract, a*b - c
+ NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_sub_f32(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_sub_f64(npyv_mul_f64(a, b), c); }
+ // negate multiply and add, -(a*b) + c
+ NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_sub_f32(c, npyv_mul_f32(a, b)); }
+ NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_sub_f64(c, npyv_mul_f64(a, b)); }
+#endif // NPY_HAVE_FMA3
+#ifndef NPY_HAVE_FMA3 // for FMA4 and NON-FMA3
+ // negate multiply and subtract, -(a*b) - c
+ NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ {
+ npyv_f32 neg_a = npyv_xor_f32(a, npyv_setall_f32(-0.0f));
+ return npyv_sub_f32(npyv_mul_f32(neg_a, b), c);
+ }
+ NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0));
+ return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
+ }
+#endif // !NPY_HAVE_FMA3
+
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(__m128 a)
+{
+#ifdef NPY_HAVE_SSE3
+ __m128 sum_halves = _mm_hadd_ps(a, a);
+ return _mm_cvtss_f32(_mm_hadd_ps(sum_halves, sum_halves));
+#else
+ __m128 t1 = _mm_movehl_ps(a, a);
+ __m128 t2 = _mm_add_ps(a, t1);
+ __m128 t3 = _mm_shuffle_ps(t2, t2, 1);
+ __m128 t4 = _mm_add_ss(t2, t3);
+ return _mm_cvtss_f32(t4);
+#endif
+}
+
+NPY_FINLINE double npyv_sum_f64(__m128d a)
+{
+#ifdef NPY_HAVE_SSE3
+ return _mm_cvtsd_f64(_mm_hadd_pd(a, a));
+#else
+ return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a)));
+#endif
+}
#endif // _NPY_SIMD_SSE_ARITHMETIC_H
+
+
diff --git a/numpy/core/src/common/simd/sse/conversion.h b/numpy/core/src/common/simd/sse/conversion.h
index ea9660d13..ab4beea96 100644
--- a/numpy/core/src/common/simd/sse/conversion.h
+++ b/numpy/core/src/common/simd/sse/conversion.h
@@ -14,8 +14,8 @@
#define npyv_cvt_s32_b32(BL) BL
#define npyv_cvt_u64_b64(BL) BL
#define npyv_cvt_s64_b64(BL) BL
-#define npyv_cvt_f32_b32(BL) _mm_castsi128_ps(BL)
-#define npyv_cvt_f64_b64(BL) _mm_castsi128_pd(BL)
+#define npyv_cvt_f32_b32 _mm_castsi128_ps
+#define npyv_cvt_f64_b64 _mm_castsi128_pd
// convert integer types to mask types
#define npyv_cvt_b8_u8(A) A
@@ -26,7 +26,20 @@
#define npyv_cvt_b32_s32(A) A
#define npyv_cvt_b64_u64(A) A
#define npyv_cvt_b64_s64(A) A
-#define npyv_cvt_b32_f32(A) _mm_castps_si128(A)
-#define npyv_cvt_b64_f64(A) _mm_castpd_si128(A)
+#define npyv_cvt_b32_f32 _mm_castps_si128
+#define npyv_cvt_b64_f64 _mm_castpd_si128
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{ return (npy_uint16)_mm_movemask_epi8(a); }
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ __m128i pack = _mm_packs_epi16(a, a);
+ return (npy_uint8)_mm_movemask_epi8(pack);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint8)_mm_movemask_ps(_mm_castsi128_ps(a)); }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)_mm_movemask_pd(_mm_castsi128_pd(a)); }
#endif // _NPY_SIMD_SSE_CVT_H
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
new file mode 100644
index 000000000..b7203cd89
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -0,0 +1,40 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_MATH_H
+#define _NPY_SIMD_SSE_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm_sqrt_ps
+#define npyv_sqrt_f64 _mm_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm_div_ps(_mm_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm_div_pd(_mm_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+ return _mm_and_ps(
+ a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))
+ );
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+ return _mm_and_pd(
+ a, _mm_castsi128_pd(npyv_setall_s64(0x7fffffffffffffffLL))
+ );
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm_mul_pd(a, a); }
+
+#endif
diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h
index 1a555d6f0..1074c3b02 100644
--- a/numpy/core/src/common/simd/sse/memory.h
+++ b/numpy/core/src/common/simd/sse/memory.h
@@ -5,6 +5,8 @@
#ifndef _NPY_SIMD_SSE_MEMORY_H
#define _NPY_SIMD_SSE_MEMORY_H
+#include "misc.h"
+
/***************************
* load/store
***************************/
@@ -70,5 +72,427 @@ NPYV_IMPL_SSE_MEM_INT(npy_int64, s64)
// store higher part
#define npyv_storeh_f32(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castps_si128(VEC))
#define npyv_storeh_f64(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castpd_si128(VEC))
+/***************************
+ * Non-contiguous Load
+ ***************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{
+ __m128i a = _mm_cvtsi32_si128(*ptr);
+#ifdef NPY_HAVE_SSE41
+ a = _mm_insert_epi32(a, ptr[stride], 1);
+ a = _mm_insert_epi32(a, ptr[stride*2], 2);
+ a = _mm_insert_epi32(a, ptr[stride*3], 3);
+#else
+ __m128i a1 = _mm_cvtsi32_si128(ptr[stride]);
+ __m128i a2 = _mm_cvtsi32_si128(ptr[stride*2]);
+ __m128i a3 = _mm_cvtsi32_si128(ptr[stride*3]);
+ a = _mm_unpacklo_epi32(a, a1);
+ a = _mm_unpacklo_epi64(a, _mm_unpacklo_epi32(a2, a3));
+#endif
+ return a;
+}
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return npyv_loadn_s32((const npy_int32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{ return _mm_castsi128_ps(npyv_loadn_s32((const npy_int32*)ptr, stride)); }
+//// 64
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{ return _mm_loadh_pd(npyv_loadl_f64(ptr), ptr + stride); }
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{
+ ptr[stride * 0] = _mm_cvtsi128_si32(a);
+#ifdef NPY_HAVE_SSE41
+ ptr[stride * 1] = _mm_extract_epi32(a, 1);
+ ptr[stride * 2] = _mm_extract_epi32(a, 2);
+ ptr[stride * 3] = _mm_extract_epi32(a, 3);
+#else
+ ptr[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
+ ptr[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
+ ptr[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
+#endif
+}
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, _mm_castps_si128(a)); }
+//// 64
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{
+ _mm_storel_pd(ptr, a);
+ _mm_storeh_pd(ptr + stride, a);
+}
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); }
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); }
+
+/*********************************
+ * Partial Load
+ *********************************/
+#if defined(__clang__) && __clang_major__ > 7
+ /**
+ * Clang >=8 performs aggressive optimizations that tend to
+ * zero the bits of the upper half part of vectors even
+ * when we try to fill it up with certain scalars,
+ * which may lead to zero division errors.
+ */
+ #define NPYV__CLANG_ZEROUPPER
+#endif
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+#ifdef NPYV__CLANG_ZEROUPPER
+ if (nlane > 3) {
+ return npyv_load_s32(ptr);
+ }
+ npy_int32 NPY_DECL_ALIGNED(16) data[4] = {fill, fill, fill, fill};
+ for (npy_uint64 i = 0; i < nlane; ++i) {
+ data[i] = ptr[i];
+ }
+ return npyv_loada_s32(data);
+#else
+ #ifndef NPY_HAVE_SSE41
+ const short *wptr = (const short*)ptr;
+ #endif
+ const __m128i vfill = npyv_setall_s32(fill);
+ __m128i a;
+ switch(nlane) {
+ case 2:
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ #ifdef NPY_HAVE_SSE41
+ case 1:
+ return _mm_insert_epi32(vfill, ptr[0], 0);
+ case 3:
+ a = _mm_loadl_epi64((const __m128i*)ptr);
+ a = _mm_insert_epi32(a, ptr[2], 2);
+ a = _mm_insert_epi32(a, fill, 3);
+ return a;
+ #else
+ case 1:
+ a = _mm_insert_epi16(vfill, wptr[0], 0);
+ return _mm_insert_epi16(a, wptr[1], 1);
+ case 3:
+ a = _mm_loadl_epi64((const __m128i*)ptr);
+ a = _mm_unpacklo_epi64(a, vfill);
+ a = _mm_insert_epi16(a, wptr[4], 4);
+ a = _mm_insert_epi16(a, wptr[5], 5);
+ return a;
+ #endif // NPY_HAVE_SSE41
+ default:
+ return npyv_load_s32(ptr);
+ }
+#endif
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ return _mm_cvtsi32_si128(*ptr);
+ case 2:
+ return _mm_loadl_epi64((const __m128i*)ptr);
+ case 3:;
+ npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
+ #ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[2], 2);
+ #else
+ return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
+ #endif
+ default:
+ return npyv_load_s32(ptr);
+ }
+}
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+#ifdef NPYV__CLANG_ZEROUPPER
+ if (nlane <= 2) {
+ npy_int64 NPY_DECL_ALIGNED(16) data[2] = {fill, fill};
+ for (npy_uint64 i = 0; i < nlane; ++i) {
+ data[i] = ptr[i];
+ }
+ return npyv_loada_s64(data);
+ }
+#else
+ if (nlane == 1) {
+ const __m128i vfill = npyv_setall_s64(fill);
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ }
+#endif
+ return npyv_load_s64(ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return _mm_loadl_epi64((const __m128i*)ptr);
+ }
+ return npyv_load_s64(ptr);
+}
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+#ifdef NPYV__CLANG_ZEROUPPER
+ if (nlane > 3) {
+ return npyv_loadn_s32(ptr, stride);
+ }
+ npy_int32 NPY_DECL_ALIGNED(16) data[4] = {fill, fill, fill, fill};
+ for (npy_uint64 i = 0; i < nlane; ++i) {
+ data[i] = ptr[stride*i];
+ }
+ return npyv_loada_s32(data);
+#else
+ __m128i vfill = npyv_setall_s32(fill);
+ #ifndef NPY_HAVE_SSE41
+ const short *wptr = (const short*)ptr;
+ #endif
+ switch(nlane) {
+ #ifdef NPY_HAVE_SSE41
+ case 3:
+ vfill = _mm_insert_epi32(vfill, ptr[stride*2], 2);
+ case 2:
+ vfill = _mm_insert_epi32(vfill, ptr[stride], 1);
+ case 1:
+ vfill = _mm_insert_epi32(vfill, ptr[0], 0);
+ break;
+ #else
+ case 3:
+ vfill = _mm_unpacklo_epi32(_mm_cvtsi32_si128(ptr[stride*2]), vfill);
+ case 2:
+ vfill = _mm_unpacklo_epi64(_mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*ptr), _mm_cvtsi32_si128(ptr[stride])
+ ), vfill);
+ break;
+ case 1:
+ vfill = _mm_insert_epi16(vfill, wptr[0], 0);
+ vfill = _mm_insert_epi16(vfill, wptr[1], 1);
+ break;
+ #endif // NPY_HAVE_SSE41
+ default:
+ return npyv_loadn_s32(ptr, stride);
+ } // switch
+ return vfill;
+#endif
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ return _mm_cvtsi32_si128(ptr[0]);
+ case 2:;
+ npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+#ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[stride], 1);
+#else
+ return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+#endif // NPY_HAVE_SSE41
+ case 3:;
+ a = _mm_cvtsi32_si128(ptr[0]);
+#ifdef NPY_HAVE_SSE41
+ a = _mm_insert_epi32(a, ptr[stride], 1);
+ a = _mm_insert_epi32(a, ptr[stride*2], 2);
+ return a;
+#else
+ a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+ a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
+ return a;
+#endif // NPY_HAVE_SSE41
+ default:
+ return npyv_loadn_s32(ptr, stride);
+ }
+}
+//// 64
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+#ifdef NPYV__CLANG_ZEROUPPER
+ if (nlane <= 2) {
+ npy_int64 NPY_DECL_ALIGNED(16) data[2] = {fill, fill};
+ for (npy_uint64 i = 0; i < nlane; ++i) {
+ data[i] = ptr[i*stride];
+ }
+ return npyv_loada_s64(data);
+ }
+#else
+ if (nlane == 1) {
+ const __m128i vfill = npyv_setall_s64(fill);
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ }
+#endif
+ return npyv_loadn_s64(ptr, stride);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return _mm_loadl_epi64((const __m128i*)ptr);
+ }
+ return npyv_loadn_s64(ptr, stride);
+}
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ *ptr = _mm_cvtsi128_si32(a);
+ break;
+ case 2:
+ _mm_storel_epi64((__m128i *)ptr, a);
+ break;
+ case 3:
+ _mm_storel_epi64((__m128i *)ptr, a);
+ #ifdef NPY_HAVE_SSE41
+ ptr[2] = _mm_extract_epi32(a, 2);
+ #else
+ ptr[2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
+ #endif
+ break;
+ default:
+ npyv_store_s32(ptr, a);
+ }
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ _mm_storel_epi64((__m128i *)ptr, a);
+ return;
+ }
+ npyv_store_s64(ptr, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+#ifdef NPY_HAVE_SSE41
+ default:
+ ptr[stride*3] = _mm_extract_epi32(a, 3);
+ case 3:
+ ptr[stride*2] = _mm_extract_epi32(a, 2);
+ case 2:
+ ptr[stride*1] = _mm_extract_epi32(a, 1);
+#else
+ default:
+ ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
+ case 3:
+ ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
+ case 2:
+ ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
+#endif
+ case 1:
+ ptr[stride*0] = _mm_cvtsi128_si32(a);
+ break;
+ }
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ _mm_storel_epi64((__m128i *)ptr, a);
+ return;
+ }
+ npyv_storen_s64(ptr, stride, a);
+}
+/*****************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via casting
+ *****************************************************************/
+#define NPYV_IMPL_SSE_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64)
#endif // _NPY_SIMD_SSE_MEMORY_H
diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h
index 3f68b4ad7..d96ab9c56 100644
--- a/numpy/core/src/common/simd/sse/reorder.h
+++ b/numpy/core/src/common/simd/sse/reorder.h
@@ -81,4 +81,45 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64)
NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps)
NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd)
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return _mm_shuffle_epi8(a, idx);
+#else
+ __m128i lo = _mm_shufflelo_epi16(a, _MM_SHUFFLE(0, 1, 2, 3));
+ return _mm_shufflehi_epi16(lo, _MM_SHUFFLE(0, 1, 2, 3));
+#endif
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return _mm_shuffle_epi8(a, idx);
+#else
+ __m128i rev16 = npyv_rev64_u16(a);
+ // swap 8bit pairs
+ return _mm_or_si128(_mm_slli_epi16(rev16, 8), _mm_srli_epi16(rev16, 8));
+#endif
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ return _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+ return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
#endif // _NPY_SIMD_SSE_REORDER_H
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index 364b4baf1..132d3d347 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -64,3 +64,4 @@ typedef struct { __m128d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
index dd23b5b11..2f6762e63 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -100,4 +100,32 @@
#define npyv_div_f32 vec_div
#define npyv_div_f64 vec_div
+/***************************
+ * FUSED
+ ***************************/
+// multiply and add, a*b + c
+#define npyv_muladd_f32 vec_madd
+#define npyv_muladd_f64 vec_madd
+// multiply and subtract, a*b - c
+#define npyv_mulsub_f32 vec_msub
+#define npyv_mulsub_f64 vec_msub
+// negate multiply and add, -(a*b) + c
+#define npyv_nmuladd_f32 vec_nmsub // equivalent to -(a*b - c)
+#define npyv_nmuladd_f64 vec_nmsub
+// negate multiply and subtract, -(a*b) - c
+#define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c)
+#define npyv_nmulsub_f64 vec_nmadd
+
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+{
+ npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
+ return vec_extract(sum, 0) + vec_extract(sum, 1);
+}
+
+NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
+{
+ return vec_extract(a, 0) + vec_extract(a, 1);
+}
+
#endif // _NPY_SIMD_VSX_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h
index 6ed135990..5803e1cdd 100644
--- a/numpy/core/src/common/simd/vsx/conversion.h
+++ b/numpy/core/src/common/simd/vsx/conversion.h
@@ -29,4 +29,26 @@
#define npyv_cvt_b32_f32(A) ((npyv_b32) A)
#define npyv_cvt_b64_f64(A) ((npyv_b64) A)
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+ const npyv_u8 qperm = npyv_set_u8(120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0);
+ return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ const npyv_u8 qperm = npyv_setf_u8(128, 112, 96, 80, 64, 48, 32, 16, 0);
+ return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{
+ const npyv_u8 qperm = npyv_setf_u8(128, 96, 64, 32, 0);
+ return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{
+ npyv_u64 bit = npyv_shri_u64((npyv_u64)a, 63);
+ return vec_extract(bit, 0) | (int)vec_extract(bit, 1) << 1;
+}
+
#endif // _NPY_SIMD_VSX_CVT_H
diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h
new file mode 100644
index 000000000..7c8610b19
--- /dev/null
+++ b/numpy/core/src/common/simd/vsx/math.h
@@ -0,0 +1,36 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_VSX_MATH_H
+#define _NPY_SIMD_VSX_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 vec_sqrt
+#define npyv_sqrt_f64 vec_sqrt
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{
+ const npyv_f32 one = npyv_setall_f32(1.0f);
+ return vec_div(one, a);
+}
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{
+ const npyv_f64 one = npyv_setall_f64(1.0);
+ return vec_div(one, a);
+}
+
+// Absolute
+#define npyv_abs_f32 vec_abs
+#define npyv_abs_f64 vec_abs
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return vec_mul(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return vec_mul(a, a); }
+
+#endif // _NPY_SIMD_VSX_MATH_H
diff --git a/numpy/core/src/common/simd/vsx/memory.h b/numpy/core/src/common/simd/vsx/memory.h
index e0d908bf9..08a0a9276 100644
--- a/numpy/core/src/common/simd/vsx/memory.h
+++ b/numpy/core/src/common/simd/vsx/memory.h
@@ -4,147 +4,343 @@
#ifndef _NPY_SIMD_VSX_MEMORY_H
#define _NPY_SIMD_VSX_MEMORY_H
+
+#include "misc.h"
+
/****************************
- * load/store
+ * Private utilities
****************************/
// TODO: test load by cast
#define VSX__CAST_lOAD 0
#if VSX__CAST_lOAD
- #define npyv__load(PTR, T_VEC) (*((T_VEC*)(PTR)))
+ #define npyv__load(T_VEC, PTR) (*((T_VEC*)(PTR)))
#else
/**
* CLANG fails to load unaligned addresses via vec_xl, vec_xst
* so we failback to vec_vsx_ld, vec_vsx_st
*/
#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
- #define npyv__load(PTR, T_VEC) vec_vsx_ld(0, PTR)
+ #define npyv__load(T_VEC, PTR) vec_vsx_ld(0, PTR)
#else
- #define npyv__load(PTR, T_VEC) vec_xl(0, PTR)
+ #define npyv__load(T_VEC, PTR) vec_xl(0, PTR)
#endif
#endif
-// unaligned load
-#define npyv_load_u8(PTR) npyv__load(PTR, npyv_u8)
-#define npyv_load_s8(PTR) npyv__load(PTR, npyv_s8)
-#define npyv_load_u16(PTR) npyv__load(PTR, npyv_u16)
-#define npyv_load_s16(PTR) npyv__load(PTR, npyv_s16)
-#define npyv_load_u32(PTR) npyv__load(PTR, npyv_u32)
-#define npyv_load_s32(PTR) npyv__load(PTR, npyv_s32)
-#define npyv_load_f32(PTR) npyv__load(PTR, npyv_f32)
-#define npyv_load_f64(PTR) npyv__load(PTR, npyv_f64)
-#if VSX__CAST_lOAD
- #define npyv_load_u64(PTR) npyv__load(PTR, npyv_u64)
- #define npyv_load_s64(PTR) npyv__load(PTR, npyv_s64)
+// unaligned store
+#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
+ #define npyv__store(PTR, VEC) vec_vsx_st(VEC, 0, PTR)
#else
- #define npyv_load_u64(PTR) ((npyv_u64)npyv_load_u32((const unsigned int*)PTR))
- #define npyv_load_s64(PTR) ((npyv_s64)npyv_load_s32((const unsigned int*)PTR))
+ #define npyv__store(PTR, VEC) vec_xst(VEC, 0, PTR)
#endif
-// aligned load
-#define npyv_loada_u8(PTR) vec_ld(0, PTR)
-#define npyv_loada_s8 npyv_loada_u8
-#define npyv_loada_u16 npyv_loada_u8
-#define npyv_loada_s16 npyv_loada_u8
-#define npyv_loada_u32 npyv_loada_u8
-#define npyv_loada_s32 npyv_loada_u8
-#define npyv_loada_u64 npyv_load_u64
-#define npyv_loada_s64 npyv_load_s64
-#define npyv_loada_f32 npyv_loada_u8
-#define npyv_loada_f64 npyv_load_f64
-// stream load
-#define npyv_loads_u8 npyv_loada_u8
-#define npyv_loads_s8 npyv_loada_s8
-#define npyv_loads_u16 npyv_loada_u16
-#define npyv_loads_s16 npyv_loada_s16
-#define npyv_loads_u32 npyv_loada_u32
-#define npyv_loads_s32 npyv_loada_s32
-#define npyv_loads_u64 npyv_loada_u64
-#define npyv_loads_s64 npyv_loada_s64
-#define npyv_loads_f32 npyv_loada_f32
-#define npyv_loads_f64 npyv_loada_f64
-// load lower part
+
// avoid aliasing rules
#ifdef __cplusplus
template<typename T_PTR>
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(T_PTR *ptr)
- { return npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr; }
+ NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr)
+ { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
#else
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(void *ptr)
- { npy_uint64 *ptr64 = ptr; return ptr64; }
+ NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
+ { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
#endif // __cplusplus
-#if defined(__clang__) && !defined(__IBMC__)
- // vec_promote doesn't support doubleword on clang
- #define npyv_loadl_u64(PTR) npyv_setall_u64(*npyv__ptr2u64(PTR))
-#else
- #define npyv_loadl_u64(PTR) vec_promote(*npyv__ptr2u64(PTR), 0)
-#endif
-#define npyv_loadl_u8(PTR) ((npyv_u8)npyv_loadl_u64(PTR))
-#define npyv_loadl_s8(PTR) ((npyv_s8)npyv_loadl_u64(PTR))
-#define npyv_loadl_u16(PTR) ((npyv_u16)npyv_loadl_u64(PTR))
-#define npyv_loadl_s16(PTR) ((npyv_s16)npyv_loadl_u64(PTR))
-#define npyv_loadl_u32(PTR) ((npyv_u32)npyv_loadl_u64(PTR))
-#define npyv_loadl_s32(PTR) ((npyv_s32)npyv_loadl_u64(PTR))
-#define npyv_loadl_s64(PTR) ((npyv_s64)npyv_loadl_u64(PTR))
-#define npyv_loadl_f32(PTR) ((npyv_f32)npyv_loadl_u64(PTR))
-#define npyv_loadl_f64(PTR) ((npyv_f64)npyv_loadl_u64(PTR))
-// unaligned store
-#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
- #define npyv_store_u8(PTR, VEC) vec_vsx_st(VEC, 0, PTR)
-#else
- #define npyv_store_u8(PTR, VEC) vec_xst(VEC, 0, PTR)
-#endif
-#define npyv_store_s8 npyv_store_u8
-#define npyv_store_u16 npyv_store_u8
-#define npyv_store_s16 npyv_store_u8
-#define npyv_store_u32 npyv_store_u8
-#define npyv_store_s32 npyv_store_u8
-#define npyv_store_u64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC)
-#define npyv_store_s64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC)
-#define npyv_store_f32 npyv_store_u8
-#define npyv_store_f64 npyv_store_u8
-// aligned store
-#define npyv_storea_u8(PTR, VEC) vec_st(VEC, 0, PTR)
-#define npyv_storea_s8 npyv_storea_u8
-#define npyv_storea_u16 npyv_storea_u8
-#define npyv_storea_s16 npyv_storea_u8
-#define npyv_storea_u32 npyv_storea_u8
-#define npyv_storea_s32 npyv_storea_u8
-#define npyv_storea_u64 npyv_store_u64
-#define npyv_storea_s64 npyv_store_s64
-#define npyv_storea_f32 npyv_storea_u8
-#define npyv_storea_f64 npyv_store_f64
-// stream store
-#define npyv_stores_u8 npyv_storea_u8
-#define npyv_stores_s8 npyv_storea_s8
-#define npyv_stores_u16 npyv_storea_u16
-#define npyv_stores_s16 npyv_storea_s16
-#define npyv_stores_u32 npyv_storea_u32
-#define npyv_stores_s32 npyv_storea_s32
-#define npyv_stores_u64 npyv_storea_u64
-#define npyv_stores_s64 npyv_storea_s64
-#define npyv_stores_f32 npyv_storea_f32
-#define npyv_stores_f64 npyv_storea_f64
+
+// load lower part
+NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
+{
+ #if defined(__clang__) && !defined(__IBMC__)
+ // vec_promote doesn't support doubleword on clang
+ return npyv_setall_u64(*npyv__ptr2u64(ptr));
+ #else
+ return vec_promote(*npyv__ptr2u64(ptr), 0);
+ #endif
+}
// store lower part
-#define npyv_storel_u8(PTR, VEC) \
+#define npyv__storel(PTR, VEC) \
*npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 0)
-#define npyv_storel_s8 npyv_storel_u8
-#define npyv_storel_u16 npyv_storel_u8
-#define npyv_storel_s16 npyv_storel_u8
-#define npyv_storel_u32 npyv_storel_u8
-#define npyv_storel_s32 npyv_storel_u8
-#define npyv_storel_s64 npyv_storel_u8
-#define npyv_storel_u64 npyv_storel_u8
-#define npyv_storel_f32 npyv_storel_u8
-#define npyv_storel_f64 npyv_storel_u8
-// store higher part
-#define npyv_storeh_u8(PTR, VEC) \
+
+#define npyv__storeh(PTR, VEC) \
*npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 1)
-#define npyv_storeh_s8 npyv_storeh_u8
-#define npyv_storeh_u16 npyv_storeh_u8
-#define npyv_storeh_s16 npyv_storeh_u8
-#define npyv_storeh_u32 npyv_storeh_u8
-#define npyv_storeh_s32 npyv_storeh_u8
-#define npyv_storeh_s64 npyv_storeh_u8
-#define npyv_storeh_u64 npyv_storeh_u8
-#define npyv_storeh_f32 npyv_storeh_u8
-#define npyv_storeh_f64 npyv_storeh_u8
+
+/****************************
+ * load/store
+ ****************************/
+#define NPYV_IMPL_VSX_MEM(SFX, DW_CAST) \
+ NPY_FINLINE npyv_##SFX npyv_load_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return (npyv_##SFX)npyv__load(npyv_##SFX, (const npyv_lanetype_##DW_CAST*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return (npyv_##SFX)vec_ld(0, (const npyv_lanetype_u32*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return npyv_loada_##SFX(ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return (npyv_##SFX)npyv__loadl(ptr); } \
+ NPY_FINLINE void npyv_store_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { npyv__store((npyv_lanetype_##DW_CAST*)ptr, (npyv_##DW_CAST)vec); } \
+ NPY_FINLINE void npyv_storea_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { vec_st((npyv_u32)vec, 0, (npyv_lanetype_u32*)ptr); } \
+ NPY_FINLINE void npyv_stores_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { npyv_storea_##SFX(ptr, vec); } \
+ NPY_FINLINE void npyv_storel_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { npyv__storel(ptr, vec); } \
+ NPY_FINLINE void npyv_storeh_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { npyv__storeh(ptr, vec); }
+
+NPYV_IMPL_VSX_MEM(u8, u8)
+NPYV_IMPL_VSX_MEM(s8, s8)
+NPYV_IMPL_VSX_MEM(u16, u16)
+NPYV_IMPL_VSX_MEM(s16, s16)
+NPYV_IMPL_VSX_MEM(u32, u32)
+NPYV_IMPL_VSX_MEM(s32, s32)
+NPYV_IMPL_VSX_MEM(u64, f64)
+NPYV_IMPL_VSX_MEM(s64, f64)
+NPYV_IMPL_VSX_MEM(f32, f32)
+NPYV_IMPL_VSX_MEM(f64, f64)
+
+/***************************
+ * Non-contiguous Load
+ ***************************/
+//// 32
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ return npyv_set_u32(
+ ptr[stride * 0], ptr[stride * 1],
+ ptr[stride * 2], ptr[stride * 3]
+ );
+}
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{ return (npyv_s32)npyv_loadn_u32((const npy_uint32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{ return (npyv_f32)npyv_loadn_u32((const npy_uint32*)ptr, stride); }
+//// 64
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return npyv_set_u64(ptr[0], ptr[stride]); }
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return npyv_set_s64(ptr[0], ptr[stride]); }
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{ return npyv_set_f64(ptr[0], ptr[stride]); }
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ ptr[stride * 0] = vec_extract(a, 0);
+ ptr[stride * 1] = vec_extract(a, 1);
+ ptr[stride * 2] = vec_extract(a, 2);
+ ptr[stride * 3] = vec_extract(a, 3);
+}
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+//// 64
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{
+ ptr[stride * 0] = vec_extract(a, 0);
+ ptr[stride * 1] = vec_extract(a, 1);
+}
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); }
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); }
+
+/*********************************
+ * Partial Load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ npyv_s32 vfill = npyv_setall_s32(fill);
+ switch(nlane) {
+ case 1:
+ return vec_insert(ptr[0], vfill, 0);
+ case 2:
+ return (npyv_s32)vec_insert(
+ *npyv__ptr2u64(ptr), (npyv_u64)vfill, 0
+ );
+ case 3:
+ vfill = vec_insert(ptr[2], vfill, 2);
+ return (npyv_s32)vec_insert(
+ *npyv__ptr2u64(ptr), (npyv_u64)vfill, 0
+ );
+ default:
+ return npyv_load_s32(ptr);
+ }
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_till_s32(ptr, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return npyv_set_s64(ptr[0], fill);
+ }
+ return npyv_load_s64(ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ return npyv_load_till_s64(ptr, nlane, 0); }
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ npyv_s32 vfill = npyv_setall_s32(fill);
+ switch(nlane) {
+ case 3:
+ vfill = vec_insert(ptr[stride*2], vfill, 2);
+ case 2:
+ vfill = vec_insert(ptr[stride], vfill, 1);
+ case 1:
+ vfill = vec_insert(*ptr, vfill, 0);
+ break;
+ default:
+ return npyv_loadn_s32(ptr, stride);
+ } // switch
+ return vfill;
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return npyv_set_s64(*ptr, fill);
+ }
+ return npyv_loadn_s64(ptr, stride);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ *ptr = vec_extract(a, 0);
+ break;
+ case 2:
+ npyv_storel_s32(ptr, a);
+ break;
+ case 3:
+ npyv_storel_s32(ptr, a);
+ ptr[2] = vec_extract(a, 2);
+ break;
+ default:
+ npyv_store_s32(ptr, a);
+ }
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ npyv_storel_s64(ptr, a);
+ return;
+ }
+ npyv_store_s64(ptr, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ default:
+ ptr[stride*3] = vec_extract(a, 3);
+ case 3:
+ ptr[stride*2] = vec_extract(a, 2);
+ case 2:
+ ptr[stride*1] = vec_extract(a, 1);
+ case 1:
+ ptr[stride*0] = vec_extract(a, 0);
+ break;
+ }
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ npyv_storel_s64(ptr, a);
+ return;
+ }
+ npyv_storen_s64(ptr, stride, a);
+}
+/*****************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via casting
+ *****************************************************************/
+#define NPYV_IMPL_VSX_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(u64, s64)
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f64, s64)
#endif // _NPY_SIMD_VSX_MEMORY_H
diff --git a/numpy/core/src/common/simd/vsx/reorder.h b/numpy/core/src/common/simd/vsx/reorder.h
index bfb9115fa..6533e5093 100644
--- a/numpy/core/src/common/simd/vsx/reorder.h
+++ b/numpy/core/src/common/simd/vsx/reorder.h
@@ -62,4 +62,45 @@ NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s64, s64)
NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f32, f32)
NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f64, f64)
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#if defined(NPY_HAVE_VSX3) && ((defined(__GNUC__) && __GNUC__ > 7) || defined(__IBMC__))
+ return (npyv_u8)vec_revb((npyv_u64)a);
+#elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+ npyv_u8 ret;
+ __asm__ ("xxbrd %x0,%x1" : "=wa" (ret) : "wa" (a));
+ return ret;
+#else
+ const npyv_u8 idx = npyv_set_u8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return vec_perm(a, a, idx);
+#endif
+}
+NPY_FINLINE npyv_s8 npyv_rev64_s8(npyv_s8 a)
+{ return (npyv_s8)npyv_rev64_u8((npyv_u8)a); }
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+ const npyv_u8 idx = npyv_set_u8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return vec_perm(a, a, idx);
+}
+NPY_FINLINE npyv_s16 npyv_rev64_s16(npyv_s16 a)
+{ return (npyv_s16)npyv_rev64_u16((npyv_u16)a); }
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ const npyv_u8 idx = npyv_set_u8(
+ 4, 5, 6, 7, 0, 1, 2, 3,/*64*/12, 13, 14, 15, 8, 9, 10, 11
+ );
+ return vec_perm(a, a, idx);
+}
+NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a)
+{ return (npyv_s32)npyv_rev64_u32((npyv_u32)a); }
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{ return (npyv_f32)npyv_rev64_u32((npyv_u32)a); }
+
#endif // _NPY_SIMD_VSX_REORDER_H
diff --git a/numpy/core/src/common/simd/vsx/vsx.h b/numpy/core/src/common/simd/vsx/vsx.h
index 5525dc1e6..27dde98e7 100644
--- a/numpy/core/src/common/simd/vsx/vsx.h
+++ b/numpy/core/src/common/simd/vsx/vsx.h
@@ -62,3 +62,4 @@ typedef struct { npyv_f64 val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/multiarray/_datetime.h b/numpy/core/src/multiarray/_datetime.h
index 4e7ade5ed..c0d2f1967 100644
--- a/numpy/core/src/multiarray/_datetime.h
+++ b/numpy/core/src/multiarray/_datetime.h
@@ -200,17 +200,15 @@ convert_pyobject_to_datetime_metadata(PyObject *obj,
PyArray_DatetimeMetaData *out_meta);
/*
- * 'ret' is a PyUString containing the datetime string, and this
- * function appends the metadata string to it.
+ * Returns datetime metadata as a new reference to a Unicode object.
+ * Returns NULL on error.
*
* If 'skip_brackets' is true, skips the '[]'.
*
- * This function steals the reference 'ret'
*/
NPY_NO_EXPORT PyObject *
-append_metastr_to_string(PyArray_DatetimeMetaData *meta,
- int skip_brackets,
- PyObject *ret);
+metastr_to_unicode(PyArray_DatetimeMetaData *meta, int skip_brackets);
+
/*
* Tests for and converts a Python datetime.datetime or datetime.date
@@ -375,4 +373,7 @@ datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
NPY_NO_EXPORT PyArray_Descr *
find_object_datetime_type(PyObject *obj, int type_num);
+NPY_NO_EXPORT int
+PyArray_InitializeDatetimeCasts(void);
+
#endif
diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src
index da631c830..3c8caefce 100644
--- a/numpy/core/src/multiarray/_multiarray_tests.c.src
+++ b/numpy/core/src/multiarray/_multiarray_tests.c.src
@@ -9,8 +9,7 @@
#include "common.h"
#include "mem_overlap.h"
#include "npy_extint128.h"
-#include "common.h"
-
+#include "array_method.h"
#if defined(MS_WIN32) || defined(__CYGWIN__)
#define EXPORT(x) __declspec(dllexport) x
@@ -38,6 +37,7 @@ IsPythonScalar(PyObject * dummy, PyObject *args)
#include "npy_pycompat.h"
+
/** Function to test calling via ctypes */
EXPORT(void*) forward_pointer(void *x)
{
@@ -178,17 +178,20 @@ test_neighborhood_iterator(PyObject* NPY_UNUSED(self), PyObject* args)
/* Compute boundaries for the neighborhood iterator */
for (i = 0; i < 2 * PyArray_NDIM(ax); ++i) {
PyObject* bound;
+
bound = PySequence_GetItem(b, i);
if (bound == NULL) {
goto clean_itx;
}
- if (!PyInt_Check(bound)) {
+ /* PyLong_AsSsize_t checks for PyLong */
+ bounds[i] = PyLong_AsSsize_t(bound);
+ if (error_converting(bounds[i])) {
+ PyErr_Clear();
PyErr_SetString(PyExc_ValueError,
- "bound not long");
+ "bound is invalid");
Py_DECREF(bound);
goto clean_itx;
}
- bounds[i] = PyInt_AsLong(bound);
Py_DECREF(bound);
}
@@ -337,17 +340,20 @@ test_neighborhood_iterator_oob(PyObject* NPY_UNUSED(self), PyObject* args)
/* Compute boundaries for the neighborhood iterator */
for (i = 0; i < 2 * PyArray_NDIM(ax); ++i) {
PyObject* bound;
+
bound = PySequence_GetItem(b1, i);
if (bound == NULL) {
goto clean_itx;
}
- if (!PyInt_Check(bound)) {
+ /* PyLong_AsSsize_t checks for PyLong */
+ bounds[i] = PyLong_AsSsize_t(bound);
+ if (error_converting(bounds[i])) {
+ PyErr_Clear();
PyErr_SetString(PyExc_ValueError,
- "bound not long");
+ "bound is invalid");
Py_DECREF(bound);
goto clean_itx;
}
- bounds[i] = PyInt_AsLong(bound);
Py_DECREF(bound);
}
@@ -361,17 +367,20 @@ test_neighborhood_iterator_oob(PyObject* NPY_UNUSED(self), PyObject* args)
for (i = 0; i < 2 * PyArray_NDIM(ax); ++i) {
PyObject* bound;
+
bound = PySequence_GetItem(b2, i);
if (bound == NULL) {
goto clean_itx;
}
- if (!PyInt_Check(bound)) {
+ /* PyLong_AsSsize_t checks for PyLong */
+ bounds[i] = PyLong_AsSsize_t(bound);
+ if (error_converting(bounds[i])) {
+ PyErr_Clear();
PyErr_SetString(PyExc_ValueError,
- "bound not long");
+ "bound is invalid");
Py_DECREF(bound);
goto clean_itx;
}
- bounds[i] = PyInt_AsLong(bound);
Py_DECREF(bound);
}
@@ -612,6 +621,105 @@ fromstring_null_term_c_api(PyObject *dummy, PyObject *byte_obj)
}
+/*
+ * Create a custom field dtype from an existing void one (and test some errors).
+ * The dtypes created by this function may be not be usable (or even crash
+ * while using).
+ */
+static PyObject *
+create_custom_field_dtype(PyObject *NPY_UNUSED(mod), PyObject *args)
+{
+ PyArray_Descr *dtype;
+ PyTypeObject *scalar_type;
+ PyTypeObject *original_type = NULL;
+ int error_path;
+
+ if (!PyArg_ParseTuple(args, "O!O!i",
+ &PyArrayDescr_Type, &dtype,
+ &PyType_Type, &scalar_type,
+ &error_path)) {
+ return NULL;
+ }
+ /* check that the result should be more or less valid */
+ if (dtype->type_num != NPY_VOID || dtype->fields == NULL ||
+ !PyDict_CheckExact(dtype->fields) ||
+ PyTuple_Size(dtype->names) != 1 ||
+ !PyDataType_REFCHK(dtype) ||
+ dtype->elsize != sizeof(PyObject *)) {
+ PyErr_SetString(PyExc_ValueError,
+ "Bad dtype passed to test function, must be an object "
+ "containing void with a single field.");
+ return NULL;
+ }
+
+ /* Copy and then appropriate this dtype */
+ original_type = Py_TYPE(dtype);
+ dtype = PyArray_DescrNew(dtype);
+ if (dtype == NULL) {
+ return NULL;
+ }
+
+ Py_INCREF(scalar_type);
+ Py_SETREF(dtype->typeobj, scalar_type);
+ if (error_path == 1) {
+ /* Test that we reject this, if fields was not already set */
+ Py_SETREF(dtype->fields, NULL);
+ }
+ else if (error_path == 2) {
+ /*
+ * Test that we reject this if the type is not set to something that
+ * we are pretty sure can be safely replaced.
+ */
+ Py_SET_TYPE(dtype, scalar_type);
+ }
+ else if (error_path != 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "invalid error argument to test function.");
+ }
+ if (PyArray_RegisterDataType(dtype) < 0) {
+ /* Fix original type in the error_path == 2 case and delete it */
+ Py_SET_TYPE(dtype, original_type);
+ Py_DECREF(dtype);
+ return NULL;
+ }
+ Py_INCREF(dtype); /* hold on to the original (leaks a reference) */
+ return (PyObject *)dtype;
+}
+
+
+PyObject *
+corrupt_or_fix_bufferinfo(PyObject *dummy, PyObject *obj)
+{
+ void **buffer_info_ptr;
+ if (PyArray_Check(obj)) {
+ buffer_info_ptr = &((PyArrayObject_fields *)obj)->_buffer_info;
+ }
+ else if (PyArray_IsScalar(obj, Void)) {
+ buffer_info_ptr = &((PyVoidScalarObject *)obj)->_buffer_info;
+ }
+ else {
+ PyErr_SetString(PyExc_TypeError,
+ "argument must be an array or void scalar");
+ return NULL;
+ }
+ if (*buffer_info_ptr == NULL) {
+ /* set to an invalid value (as a subclass might accidentally) */
+ *buffer_info_ptr = obj;
+ assert(((uintptr_t)obj & 7) == 0);
+ }
+ else if (*buffer_info_ptr == obj) {
+ /* Reset to a NULL (good value) */
+ *buffer_info_ptr = NULL;
+ }
+ else {
+ PyErr_SetString(PyExc_TypeError,
+ "buffer was already exported, this test doesn't support that");
+ return NULL;
+ }
+ Py_RETURN_NONE;
+}
+
+
/* check no elison for avoided increfs */
static PyObject *
incref_elide(PyObject *dummy, PyObject *args)
@@ -905,6 +1013,79 @@ get_c_wrapping_array(PyObject* NPY_UNUSED(self), PyObject* arg)
}
+static PyObject *
+get_all_cast_information(PyObject *NPY_UNUSED(mod), PyObject *NPY_UNUSED(args))
+{
+ PyObject *result = PyList_New(0);
+ if (result == NULL) {
+ return NULL;
+ }
+ PyObject *classes = PyObject_CallMethod(
+ (PyObject *)&PyArrayDescr_Type, "__subclasses__", "");
+ if (classes == NULL) {
+ return NULL;
+ }
+ Py_SETREF(classes, PySequence_Fast(classes, NULL));
+ if (classes == NULL) {
+ goto fail;
+ }
+
+ Py_ssize_t nclass = PySequence_Length(classes);
+ for (Py_ssize_t i = 0; i < nclass; i++) {
+ PyArray_DTypeMeta *from_dtype = (
+ (PyArray_DTypeMeta *)PySequence_Fast_GET_ITEM(classes, i));
+ if (from_dtype->abstract) {
+ /*
+ * TODO: In principle probably needs to recursively check this,
+ * also we may allow casts to abstract dtypes at some point.
+ */
+ continue;
+ }
+
+ PyObject *to_dtype, *cast_obj;
+ Py_ssize_t pos = 0;
+
+ while (PyDict_Next(from_dtype->castingimpls, &pos, &to_dtype, &cast_obj)) {
+ if (cast_obj == Py_None) {
+ continue;
+ }
+ PyArrayMethodObject *cast = (PyArrayMethodObject *)cast_obj;
+
+ /* Pass some information about this cast out! */
+ PyObject *cast_info = Py_BuildValue("{sOsOsisisisisisssi}",
+ "from", from_dtype,
+ "to", to_dtype,
+ "legacy", (cast->name != NULL &&
+ strncmp(cast->name, "legacy_", 7) == 0),
+ "casting", cast->casting & ~_NPY_CAST_IS_VIEW,
+ "requires_pyapi", cast->flags & NPY_METH_REQUIRES_PYAPI,
+ "supports_unaligned",
+ cast->flags & NPY_METH_SUPPORTS_UNALIGNED,
+ "no_floatingpoint_errors",
+ cast->flags & NPY_METH_NO_FLOATINGPOINT_ERRORS,
+ "name", cast->name,
+ "cast_is_view",
+ cast->casting & _NPY_CAST_IS_VIEW);
+ if (cast_info == NULL) {
+ goto fail;
+ }
+ int res = PyList_Append(result, cast_info);
+ Py_DECREF(cast_info);
+ if (res < 0) {
+ goto fail;
+ }
+ }
+ }
+ Py_DECREF(classes);
+ return result;
+
+ fail:
+ Py_XDECREF(classes);
+ Py_XDECREF(result);
+ return NULL;
+}
+
+
/*
* Test C-api level item getting.
*/
@@ -1155,11 +1336,11 @@ array_solve_diophantine(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject
}
for (j = 0; j < nterms; ++j) {
- terms[j].a = (npy_int64)PyInt_AsSsize_t(PyTuple_GET_ITEM(A, j));
+ terms[j].a = (npy_int64)PyLong_AsSsize_t(PyTuple_GET_ITEM(A, j));
if (error_converting(terms[j].a)) {
goto fail;
}
- terms[j].ub = (npy_int64)PyInt_AsSsize_t(PyTuple_GET_ITEM(U, j));
+ terms[j].ub = (npy_int64)PyLong_AsSsize_t(PyTuple_GET_ITEM(U, j));
if (error_converting(terms[j].ub)) {
goto fail;
}
@@ -1735,8 +1916,8 @@ get_struct_alignments(PyObject *NPY_UNUSED(self), PyObject *args) {
/**begin repeat
* #N = 1,2,3#
*/
- alignment = PyInt_FromLong(_ALIGN(struct TestStruct@N@));
- size = PyInt_FromLong(sizeof(struct TestStruct@N@));
+ alignment = PyLong_FromLong(_ALIGN(struct TestStruct@N@));
+ size = PyLong_FromLong(sizeof(struct TestStruct@N@));
val = PyTuple_Pack(2, alignment, size);
Py_DECREF(alignment);
Py_DECREF(size);
@@ -1902,7 +2083,7 @@ PrintFloat_Printf_g(PyObject *obj, int precision)
PyOS_snprintf(str, sizeof(str), "%.*g", precision, val);
}
- return PyUString_FromString(str);
+ return PyUnicode_FromString(str);
}
@@ -1938,6 +2119,18 @@ getset_numericops(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
return ret;
}
+
+static PyObject *
+uses_new_casts(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
+{
+#if NPY_USE_NEW_CASTINGIMPL
+ Py_RETURN_TRUE;
+#else
+ Py_RETURN_FALSE;
+#endif
+}
+
+
static PyObject *
run_byteorder_converter(PyObject* NPY_UNUSED(self), PyObject *args)
{
@@ -1952,7 +2145,7 @@ run_byteorder_converter(PyObject* NPY_UNUSED(self), PyObject *args)
case NPY_SWAP: return PyUnicode_FromString("NPY_SWAP");
case NPY_IGNORE: return PyUnicode_FromString("NPY_IGNORE");
}
- return PyInt_FromLong(byteorder);
+ return PyLong_FromLong(byteorder);
}
static PyObject *
@@ -1967,7 +2160,7 @@ run_sortkind_converter(PyObject* NPY_UNUSED(self), PyObject *args)
case NPY_HEAPSORT: return PyUnicode_FromString("NPY_HEAPSORT");
case NPY_STABLESORT: return PyUnicode_FromString("NPY_STABLESORT");
}
- return PyInt_FromLong(kind);
+ return PyLong_FromLong(kind);
}
static PyObject *
@@ -1980,7 +2173,7 @@ run_selectkind_converter(PyObject* NPY_UNUSED(self), PyObject *args)
switch (kind) {
case NPY_INTROSELECT: return PyUnicode_FromString("NPY_INTROSELECT");
}
- return PyInt_FromLong(kind);
+ return PyLong_FromLong(kind);
}
static PyObject *
@@ -1994,7 +2187,7 @@ run_searchside_converter(PyObject* NPY_UNUSED(self), PyObject *args)
case NPY_SEARCHLEFT: return PyUnicode_FromString("NPY_SEARCHLEFT");
case NPY_SEARCHRIGHT: return PyUnicode_FromString("NPY_SEARCHRIGHT");
}
- return PyInt_FromLong(side);
+ return PyLong_FromLong(side);
}
static PyObject *
@@ -2010,7 +2203,7 @@ run_order_converter(PyObject* NPY_UNUSED(self), PyObject *args)
case NPY_FORTRANORDER: return PyUnicode_FromString("NPY_FORTRANORDER");
case NPY_KEEPORDER: return PyUnicode_FromString("NPY_KEEPORDER");
}
- return PyInt_FromLong(order);
+ return PyLong_FromLong(order);
}
static PyObject *
@@ -2025,7 +2218,7 @@ run_clipmode_converter(PyObject* NPY_UNUSED(self), PyObject *args)
case NPY_WRAP: return PyUnicode_FromString("NPY_WRAP");
case NPY_RAISE: return PyUnicode_FromString("NPY_RAISE");
}
- return PyInt_FromLong(mode);
+ return PyLong_FromLong(mode);
}
static PyObject *
@@ -2041,8 +2234,8 @@ run_casting_converter(PyObject* NPY_UNUSED(self), PyObject *args)
case NPY_SAFE_CASTING: return PyUnicode_FromString("NPY_SAFE_CASTING");
case NPY_SAME_KIND_CASTING: return PyUnicode_FromString("NPY_SAME_KIND_CASTING");
case NPY_UNSAFE_CASTING: return PyUnicode_FromString("NPY_UNSAFE_CASTING");
+ default: return PyLong_FromLong(casting);
}
- return PyInt_FromLong(casting);
}
static PyObject *
@@ -2083,6 +2276,12 @@ static PyMethodDef Multiarray_TestsMethods[] = {
{"fromstring_null_term_c_api",
fromstring_null_term_c_api,
METH_O, NULL},
+ {"create_custom_field_dtype",
+ create_custom_field_dtype,
+ METH_VARARGS, NULL},
+ {"corrupt_or_fix_bufferinfo",
+ corrupt_or_fix_bufferinfo,
+ METH_O, NULL},
{"incref_elide",
incref_elide,
METH_VARARGS, NULL},
@@ -2119,6 +2318,12 @@ static PyMethodDef Multiarray_TestsMethods[] = {
{"get_c_wrapping_array",
get_c_wrapping_array,
METH_O, NULL},
+ {"get_all_cast_information",
+ get_all_cast_information,
+ METH_NOARGS,
 "Return a list with info on all available casts. Some of the info "
+ "may differ for an actual cast if it uses value-based casting "
+ "(flexible types)."},
{"array_indexing",
array_indexing,
METH_VARARGS, NULL},
@@ -2179,6 +2384,9 @@ static PyMethodDef Multiarray_TestsMethods[] = {
{"getset_numericops",
getset_numericops,
METH_NOARGS, NULL},
+ {"uses_new_casts",
+ uses_new_casts,
+ METH_NOARGS, NULL},
/**begin repeat
* #name = cabs, carg#
*/
diff --git a/numpy/core/src/multiarray/alloc.c b/numpy/core/src/multiarray/alloc.c
index 795fc7315..887deff53 100644
--- a/numpy/core/src/multiarray/alloc.c
+++ b/numpy/core/src/multiarray/alloc.c
@@ -2,17 +2,12 @@
#include <Python.h>
#include "structmember.h"
-#if PY_VERSION_HEX >= 0x03060000
#include <pymem.h>
/* public api in 3.7 */
#if PY_VERSION_HEX < 0x03070000
#define PyTraceMalloc_Track _PyTraceMalloc_Track
#define PyTraceMalloc_Untrack _PyTraceMalloc_Untrack
#endif
-#else
-#define PyTraceMalloc_Track(...)
-#define PyTraceMalloc_Untrack(...)
-#endif
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c
index b8dc7d516..361964a5c 100644
--- a/numpy/core/src/multiarray/array_assign_array.c
+++ b/numpy/core/src/multiarray/array_assign_array.c
@@ -132,17 +132,22 @@ raw_array_assign_array(int ndim, npy_intp const *shape,
NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
/* Process the innermost dimension */
- stransfer(dst_data, dst_strides_it[0], src_data, src_strides_it[0],
- shape_it[0], src_itemsize, transferdata);
+ if (stransfer(
+ dst_data, dst_strides_it[0], src_data, src_strides_it[0],
+ shape_it[0], src_itemsize, transferdata) < 0) {
+ goto fail;
+ }
} NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape_it,
dst_data, dst_strides_it,
src_data, src_strides_it);
NPY_END_THREADS;
-
NPY_AUXDATA_FREE(transferdata);
-
- return (needs_api && PyErr_Occurred()) ? -1 : 0;
+ return 0;
+fail:
+ NPY_END_THREADS;
+ NPY_AUXDATA_FREE(transferdata);
+ return -1;
}
/*
diff --git a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c
index 41eb75f1c..023772776 100644
--- a/numpy/core/src/multiarray/array_assign_scalar.c
+++ b/numpy/core/src/multiarray/array_assign_scalar.c
@@ -82,16 +82,21 @@ raw_array_assign_scalar(int ndim, npy_intp const *shape,
NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
/* Process the innermost dimension */
- stransfer(dst_data, dst_strides_it[0], src_data, 0,
- shape_it[0], src_itemsize, transferdata);
+ if (stransfer(
+ dst_data, dst_strides_it[0], src_data, 0,
+ shape_it[0], src_itemsize, transferdata) < 0) {
+ goto fail;
+ }
} NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord,
shape_it, dst_data, dst_strides_it);
NPY_END_THREADS;
-
NPY_AUXDATA_FREE(transferdata);
-
- return (needs_api && PyErr_Occurred()) ? -1 : 0;
+ return 0;
+fail:
+ NPY_END_THREADS;
+ NPY_AUXDATA_FREE(transferdata);
+ return -1;
}
/*
diff --git a/numpy/core/src/multiarray/array_coercion.c b/numpy/core/src/multiarray/array_coercion.c
index 9d367da1f..53d891049 100644
--- a/numpy/core/src/multiarray/array_coercion.c
+++ b/numpy/core/src/multiarray/array_coercion.c
@@ -128,7 +128,9 @@ _prime_global_pytype_to_type_dict(void)
/**
- * Add a new mapping from a python type to the DType class.
+ * Add a new mapping from a python type to the DType class. For a user
+ * defined legacy dtype, this function does nothing unless the pytype
+ * subclass from `np.generic`.
*
* This assumes that the DType class is guaranteed to hold on the
* python type (this assumption is guaranteed).
@@ -145,21 +147,29 @@ _PyArray_MapPyTypeToDType(
{
PyObject *Dtype_obj = (PyObject *)DType;
- if (userdef) {
+ if (userdef && !PyObject_IsSubclass(
+ (PyObject *)pytype, (PyObject *)&PyGenericArrType_Type)) {
/*
- * It seems we did not strictly enforce this in the legacy dtype
- * API, but assume that it is always true. Further, this could be
- * relaxed in the future. In particular we should have a new
- * superclass of ``np.generic`` in order to note enforce the array
- * scalar behaviour.
+ * We expect that user dtypes (for now) will subclass some numpy
+ * scalar class to allow automatic discovery.
*/
- if (!PyObject_IsSubclass((PyObject *)pytype, (PyObject *)&PyGenericArrType_Type)) {
- PyErr_Format(PyExc_RuntimeError,
- "currently it is only possible to register a DType "
- "for scalars deriving from `np.generic`, got '%S'.",
- (PyObject *)pytype);
- return -1;
+ if (DType->legacy) {
+ /*
+ * For legacy user dtypes, discovery relied on subclassing, but
+ * arbitrary type objects are supported, so do nothing.
+ */
+ return 0;
}
+ /*
+ * We currently enforce that user DTypes subclass from `np.generic`
+ * (this should become a `np.generic` base class and may be lifted
+ * entirely).
+ */
+ PyErr_Format(PyExc_RuntimeError,
+ "currently it is only possible to register a DType "
+ "for scalars deriving from `np.generic`, got '%S'.",
+ (PyObject *)pytype);
+ return -1;
}
/* Create the global dictionary if it does not exist */
@@ -288,7 +298,7 @@ discover_dtype_from_pyobject(
Py_INCREF(DType);
Py_DECREF(legacy_descr);
/* TODO: Enable warning about subclass handling */
- if (0 && !((*flags) & GAVE_SUBCLASS_WARNING)) {
+ if ((0) && !((*flags) & GAVE_SUBCLASS_WARNING)) {
if (DEPRECATE_FUTUREWARNING(
"in the future NumPy will not automatically find the "
"dtype for subclasses of scalars known to NumPy (i.e. "
@@ -306,51 +316,6 @@ discover_dtype_from_pyobject(
}
-/*
- * This function should probably become public API eventually. At this
- * time it is implemented by falling back to `PyArray_AdaptFlexibleDType`.
- * We will use `CastingImpl[from, to].adjust_descriptors(...)` to implement
- * this logic.
- */
-static NPY_INLINE PyArray_Descr *
-cast_descriptor_to_fixed_dtype(
- PyArray_Descr *descr, PyArray_DTypeMeta *fixed_DType)
-{
- if (fixed_DType == NULL) {
- /* Nothing to do, we only need to promote the new dtype */
- Py_INCREF(descr);
- return descr;
- }
-
- if (!fixed_DType->parametric) {
- /*
- * Don't actually do anything, the default is always the result
- * of any cast.
- */
- return fixed_DType->default_descr(fixed_DType);
- }
- if (PyObject_TypeCheck((PyObject *)descr, (PyTypeObject *)fixed_DType)) {
- Py_INCREF(descr);
- return descr;
- }
- /*
- * TODO: When this is implemented for all dtypes, the special cases
- * can be removed...
- */
- if (fixed_DType->legacy && fixed_DType->parametric &&
- NPY_DTYPE(descr)->legacy) {
- PyArray_Descr *flex_dtype = PyArray_DescrFromType(fixed_DType->type_num);
- return PyArray_AdaptFlexibleDType(descr, flex_dtype);
- }
-
- PyErr_SetString(PyExc_NotImplementedError,
- "Must use casting to find the correct dtype, this is "
- "not yet implemented! "
- "(It should not be possible to hit this code currently!)");
- return NULL;
-}
-
-
/**
* Discover the correct descriptor from a known DType class and scalar.
* If the fixed DType can discover a dtype instance/descr all is fine,
@@ -392,7 +357,7 @@ find_scalar_descriptor(
return descr;
}
- Py_SETREF(descr, cast_descriptor_to_fixed_dtype(descr, fixed_DType));
+ Py_SETREF(descr, PyArray_CastDescrToDType(descr, fixed_DType));
return descr;
}
@@ -434,7 +399,7 @@ PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value)
.flags = NPY_ARRAY_WRITEABLE, /* assume array is not behaved. */
};
Py_SET_TYPE(&arr_fields, &PyArray_Type);
- Py_REFCNT(&arr_fields) = 1;
+ Py_SET_REFCNT(&arr_fields, 1);
if (NPY_UNLIKELY(descr->type_num == NPY_OBJECT)) {
/*
@@ -495,12 +460,10 @@ PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value)
res = -1;
goto finish;
}
- stransfer(item, 0, data, 0, 1, tmp_descr->elsize, transferdata);
- NPY_AUXDATA_FREE(transferdata);
-
- if (needs_api && PyErr_Occurred()) {
+ if (stransfer(item, 0, data, 0, 1, tmp_descr->elsize, transferdata) < 0) {
res = -1;
}
+ NPY_AUXDATA_FREE(transferdata);
finish:
if (PyDataType_REFCHK(tmp_descr)) {
@@ -550,7 +513,7 @@ update_shape(int curr_ndim, int *max_ndim,
success = -1;
if (!sequence) {
/* Remove dimensions that we cannot use: */
- *max_ndim -= new_ndim + i;
+ *max_ndim -= new_ndim - i;
}
else {
assert(i == 0);
@@ -585,7 +548,7 @@ npy_new_coercion_cache(
cache = _coercion_cache_cache[_coercion_cache_num];
}
else {
- cache = PyObject_MALLOC(sizeof(coercion_cache_obj));
+ cache = PyMem_Malloc(sizeof(coercion_cache_obj));
}
if (cache == NULL) {
PyErr_NoMemory();
@@ -617,7 +580,7 @@ npy_unlink_coercion_cache(coercion_cache_obj *current)
_coercion_cache_num++;
}
else {
- PyObject_FREE(current);
+ PyMem_Free(current);
}
return next;
}
@@ -639,12 +602,13 @@ npy_free_coercion_cache(coercion_cache_obj *next) {
*
* @param out_descr The current descriptor.
* @param descr The newly found descriptor to promote with
+ * @param fixed_DType The user provided (fixed) DType or NULL
* @param flags dtype discover flags to signal failed promotion.
* @return -1 on error, 0 on success.
*/
static NPY_INLINE int
handle_promotion(PyArray_Descr **out_descr, PyArray_Descr *descr,
- enum _dtype_discovery_flags *flags)
+ PyArray_DTypeMeta *fixed_DType, enum _dtype_discovery_flags *flags)
{
assert(!(*flags & DESCRIPTOR_WAS_SET));
@@ -654,7 +618,11 @@ handle_promotion(PyArray_Descr **out_descr, PyArray_Descr *descr,
return 0;
}
PyArray_Descr *new_descr = PyArray_PromoteTypes(descr, *out_descr);
- if (new_descr == NULL) {
+ if (NPY_UNLIKELY(new_descr == NULL)) {
+ if (fixed_DType != NULL) {
+ /* If a DType is fixed, promotion must not fail. */
+ return -1;
+ }
PyErr_Clear();
*flags |= PROMOTION_FAILED;
/* Continue with object, since we may need the dimensionality */
@@ -669,13 +637,15 @@ handle_promotion(PyArray_Descr **out_descr, PyArray_Descr *descr,
* Handle a leave node (known scalar) during dtype and shape discovery.
*
* @param obj The python object or nested sequence to convert
- * @param max_dims The maximum number of dimensions.
* @param curr_dims The current number of dimensions (depth in the recursion)
+ * @param max_dims The maximum number of dimensions.
* @param out_shape The discovered output shape, will be filled
- * @param coercion_cache The coercion cache object to use.
- * @param DType the DType class that should be used, or NULL, if not provided.
+ * @param fixed_DType The user provided (fixed) DType or NULL
* @param flags used signal that this is a ragged array, used internally and
* can be expanded if necessary.
+ * @param DType the DType class that should be used, or NULL, if not provided.
+ *
+ * @return 0 on success -1 on error
*/
static NPY_INLINE int
handle_scalar(
@@ -700,7 +670,7 @@ handle_scalar(
if (descr == NULL) {
return -1;
}
- if (handle_promotion(out_descr, descr, flags) < 0) {
+ if (handle_promotion(out_descr, descr, fixed_DType, flags) < 0) {
Py_DECREF(descr);
return -1;
}
@@ -729,8 +699,13 @@ find_descriptor_from_array(
enum _dtype_discovery_flags flags = 0;
*out_descr = NULL;
- if (NPY_UNLIKELY(DType != NULL && DType->parametric &&
- PyArray_ISOBJECT(arr))) {
+ if (DType == NULL) {
+ *out_descr = PyArray_DESCR(arr);
+ Py_INCREF(*out_descr);
+ return 0;
+ }
+
+ if (NPY_UNLIKELY(DType->parametric && PyArray_ISOBJECT(arr))) {
/*
* We have one special case, if (and only if) the input array is of
* object DType and the dtype is not fixed already but parametric.
@@ -779,7 +754,7 @@ find_descriptor_from_array(
}
Py_DECREF(iter);
}
- else if (DType != NULL && NPY_UNLIKELY(DType->type_num == NPY_DATETIME) &&
+ else if (NPY_UNLIKELY(DType->type_num == NPY_DATETIME) &&
PyArray_ISSTRING(arr)) {
/*
* TODO: This branch should be deprecated IMO, the workaround is
@@ -808,8 +783,7 @@ find_descriptor_from_array(
* If this is not an object array figure out the dtype cast,
* or simply use the returned DType.
*/
- *out_descr = cast_descriptor_to_fixed_dtype(
- PyArray_DESCR(arr), DType);
+ *out_descr = PyArray_CastDescrToDType(PyArray_DESCR(arr), DType);
if (*out_descr == NULL) {
return -1;
}
@@ -992,7 +966,7 @@ PyArray_DiscoverDTypeAndShape_Recursive(
/* object array with no elements, no need to promote/adjust. */
return max_dims;
}
- if (handle_promotion(out_descr, cast_descr, flags) < 0) {
+ if (handle_promotion(out_descr, cast_descr, fixed_DType, flags) < 0) {
Py_DECREF(cast_descr);
return -1;
}
@@ -1221,7 +1195,58 @@ PyArray_DiscoverDTypeAndShape(
}
else if (fixed_DType->type_num != NPY_OBJECT) {
/* Only object DType supports ragged cases unify error */
- if (!too_deep) {
+
+ /*
+ * We used to let certain ragged arrays pass if they also
+ * support e.g. conversion using `float(arr)`, which currently
+ * works for arrays with only one element.
+ * Thus we catch at least most of such cases here and give a
+ * DeprecationWarning instead of an error.
+ * Note that some of these will actually error later on when
+ * attempting to do the actual assign.
+ */
+ int deprecate_single_element_ragged = 0;
+ coercion_cache_obj *current = *coercion_cache_head;
+ while (current != NULL) {
+ if (current->sequence) {
+ if (current->depth == ndim) {
+ /*
+ * Assume that only array-likes will allow the deprecated
+ * behaviour
+ */
+ deprecate_single_element_ragged = 0;
+ break;
+ }
+ /* check next converted sequence/array-like */
+ current = current->next;
+ continue;
+ }
+ PyArrayObject *arr = (PyArrayObject *)(current->arr_or_sequence);
+ assert(PyArray_NDIM(arr) + current->depth >= ndim);
+ if (PyArray_NDIM(arr) != ndim - current->depth) {
+ /* This array is not compatible with the final shape */
+ if (PyArray_SIZE(arr) != 1) {
+ deprecate_single_element_ragged = 0;
+ break;
+ }
+ deprecate_single_element_ragged = 1;
+ }
+ current = current->next;
+ }
+
+ if (deprecate_single_element_ragged) {
+ /* Deprecated 2020-07-24, NumPy 1.20 */
+ if (DEPRECATE(
+ "setting an array element with a sequence. "
+ "This was supported in some cases where the elements "
+ "are arrays with a single element. For example "
+ "`np.array([1, np.array([2])], dtype=int)`. "
+ "In the future this will raise the same ValueError as "
+ "`np.array([1, [2]], dtype=int)`.") < 0) {
+ goto fail;
+ }
+ }
+ else if (!too_deep) {
PyObject *shape = PyArray_IntTupleFromIntp(ndim, out_shape);
PyErr_Format(PyExc_ValueError,
"setting an array element with a sequence. The "
@@ -1276,15 +1301,9 @@ PyArray_DiscoverDTypeAndShape(
* the correct default.
*/
if (fixed_DType != NULL) {
- if (fixed_DType->default_descr == NULL) {
- Py_INCREF(fixed_DType->singleton);
- *out_descr = fixed_DType->singleton;
- }
- else {
- *out_descr = fixed_DType->default_descr(fixed_DType);
- if (*out_descr == NULL) {
- goto fail;
- }
+ *out_descr = fixed_DType->default_descr(fixed_DType);
+ if (*out_descr == NULL) {
+ goto fail;
}
}
}
diff --git a/numpy/core/src/multiarray/array_method.c b/numpy/core/src/multiarray/array_method.c
new file mode 100644
index 000000000..cae452454
--- /dev/null
+++ b/numpy/core/src/multiarray/array_method.c
@@ -0,0 +1,614 @@
+/*
+ * This file implements an abstraction layer for "Array methods", which
+ * work with a specific DType class input and provide low-level C function
+ * pointers to do fast operations on the given input functions.
+ * It thus adds an abstraction layer around individual ufunc loops.
+ *
+ * Unlike methods, an ArrayMethod can have multiple inputs and outputs.
+ * This has some serious implications for garbage collection, and as far
+ * as I (@seberg) understand, it is not possible to always guarantee correct
+ * cyclic garbage collection of dynamically created DTypes with methods.
+ * The keyword (or rather the solution) for this seems to be an "ephemeron"
+ * which I believe should allow correct garbage collection but seems
+ * not implemented in Python at this time.
+ * The vast majority of use-cases will not require correct garbage collection.
+ * Some use cases may require the user to be careful.
+ *
+ * Generally there are two main ways to solve this issue:
+ *
+ * 1. A method with a single input (or inputs of all the same DTypes) can
+ * be "owned" by that DType (it becomes unusable when the DType is deleted).
+ * This holds especially for all casts, which must have a defined output
+ * DType and must hold on to it strongly.
+ * 2. A method which can infer the output DType(s) from the input types does
+ * not need to keep the output type alive. (It can use NULL for the type,
+ * or an abstract base class which is known to be persistent.)
+ * It is then sufficient for a ufunc (or other owner) to only hold a
+ * weak reference to the input DTypes.
+ */
+
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include <npy_pycompat.h>
+#include "arrayobject.h"
+#include "array_method.h"
+#include "dtypemeta.h"
+#include "convert_datatype.h"
+
+
+/*
+ * The default descriptor resolution function. The logic is as follows:
+ *
+ * 1. The output is ensured to be canonical (currently native byte order),
+ *    if it is of the correct DType.
+ * 2. If any DType was not defined, it is replaced by the common DType
+ *    of all inputs. (If that common DType is parametric, this is an error.)
+ *
+ * We could allow setting the output descriptors specifically to simplify
+ * this step.
+ *
+ * Returns the casting level on success, -1 on error (with an exception set).
+ * On success the caller owns a reference to each entry of `output_descrs`.
+ */
+static NPY_CASTING
+default_resolve_descriptors(
+        PyArrayMethodObject *method,
+        PyArray_DTypeMeta **dtypes,
+        PyArray_Descr **input_descrs,
+        PyArray_Descr **output_descrs)
+{
+    int nin = method->nin;
+    int nout = method->nout;
+    /* Whether every operand has a defined (non-NULL) DType. */
+    int all_defined = 1;
+
+    /*
+     * Clear all outputs up-front so that the `fail` cleanup below never
+     * calls Py_XDECREF on an uninitialized pointer (previously a goto from
+     * the first loop left the remaining entries uninitialized).
+     */
+    for (int i = 0; i < nin + nout; i++) {
+        output_descrs[i] = NULL;
+    }
+
+    for (int i = 0; i < nin + nout; i++) {
+        PyArray_DTypeMeta *dtype = dtypes[i];
+        if (dtype == NULL) {
+            all_defined = 0;
+            continue;
+        }
+        /* Output descriptors may be omitted (NULL); use the default then. */
+        if (input_descrs[i] != NULL && NPY_DTYPE(input_descrs[i]) == dtype) {
+            output_descrs[i] = ensure_dtype_nbo(input_descrs[i]);
+        }
+        else {
+            output_descrs[i] = dtype->default_descr(dtype);
+        }
+        if (NPY_UNLIKELY(output_descrs[i] == NULL)) {
+            goto fail;
+        }
+    }
+    if (all_defined) {
+        return method->casting;
+    }
+
+    if (NPY_UNLIKELY(nin == 0 || dtypes[0] == NULL)) {
+        /* Registration should reject this, so this would indicate a bug */
+        PyErr_SetString(PyExc_RuntimeError,
+                "Invalid use of default resolver without inputs or with "
+                "input or output DType incorrectly missing.");
+        goto fail;
+    }
+    /* We find the common dtype of all inputs, and use it for the unknowns */
+    PyArray_DTypeMeta *common_dtype = dtypes[0];
+    assert(common_dtype != NULL);
+    for (int i = 1; i < nin; i++) {
+        /*
+         * NOTE(review): Py_SETREF decrefs the previous value; on the first
+         * iteration that is the borrowed dtypes[0] — verify an INCREF is not
+         * required (benign for statically allocated DTypes).
+         */
+        Py_SETREF(common_dtype, PyArray_CommonDType(common_dtype, dtypes[i]));
+        if (common_dtype == NULL) {
+            goto fail;
+        }
+    }
+    for (int i = nin; i < nin + nout; i++) {
+        if (output_descrs[i] != NULL) {
+            continue;
+        }
+        /* A given descriptor may be NULL here (omitted output). */
+        if (input_descrs[i] != NULL &&
+                NPY_DTYPE(input_descrs[i]) == common_dtype) {
+            output_descrs[i] = ensure_dtype_nbo(input_descrs[i]);
+        }
+        else {
+            output_descrs[i] = common_dtype->default_descr(common_dtype);
+        }
+        if (NPY_UNLIKELY(output_descrs[i] == NULL)) {
+            goto fail;
+        }
+    }
+
+    return method->casting;
+
+  fail:
+    for (int i = 0; i < nin + nout; i++) {
+        Py_XDECREF(output_descrs[i]);
+        output_descrs[i] = NULL;
+    }
+    return -1;
+}
+
+
+/**
+ * The default method to fetch the correct loop for a cast or ufunc
+ * (at the time of writing only casts).
+ * The default version can return loops explicitly registered during method
+ * creation. It does specialize contiguous loops, although has to check
+ * all descriptors itemsizes for this.
+ *
+ * NOTE(review): the paragraph above appears aspirational — the default
+ * below only raises NotImplementedError, so loops currently must come
+ * from the explicitly registered loop fields; confirm callers use those
+ * directly.
+ *
+ * @param context UNUSED (all parameters are currently ignored).
+ * @param aligned UNUSED.
+ * @param move_references UNUSED.
+ * @param strides UNUSED.
+ * @param out_loop UNUSED.
+ * @param out_transferdata UNUSED.
+ * @param flags UNUSED.
+ * @return 0 on success -1 on failure (always -1 here).
+ */
+static int
+default_get_strided_loop(
+        PyArrayMethod_Context *NPY_UNUSED(context),
+        int NPY_UNUSED(aligned), int NPY_UNUSED(move_references),
+        npy_intp *NPY_UNUSED(strides),
+        PyArray_StridedUnaryOp **NPY_UNUSED(out_loop),
+        NpyAuxData **NPY_UNUSED(out_transferdata),
+        NPY_ARRAYMETHOD_FLAGS *NPY_UNUSED(flags))
+{
+    PyErr_SetString(PyExc_NotImplementedError,
+            "default loop getter is not implemented");
+    return -1;
+}
+
+
+/**
+ * Validate that the input is usable to create a new ArrayMethod.
+ *
+ * @param spec The user-provided specification (name, nin/nout, casting,
+ *        flags, DTypes, and slots).
+ * @return 0 on success -1 on error (with a Python exception set).
+ */
+static int
+validate_spec(PyArrayMethod_Spec *spec)
+{
+    int nargs = spec->nin + spec->nout;
+    /* Check the passed spec for invalid fields/values */
+    if (spec->nin < 0 || spec->nout < 0 || nargs > NPY_MAXARGS) {
+        /* (a space was missing between the two string literals here) */
+        PyErr_Format(PyExc_ValueError,
+                "ArrayMethod inputs and outputs must be greater zero and "
+                "not exceed %d. (method: %s)", NPY_MAXARGS, spec->name);
+        return -1;
+    }
+    switch (spec->casting & ~_NPY_CAST_IS_VIEW) {
+        case NPY_NO_CASTING:
+        case NPY_EQUIV_CASTING:
+        case NPY_SAFE_CASTING:
+        case NPY_SAME_KIND_CASTING:
+        case NPY_UNSAFE_CASTING:
+            break;
+        default:
+            PyErr_Format(PyExc_TypeError,
+                    "ArrayMethod has invalid casting `%d`. (method: %s)",
+                    spec->casting, spec->name);
+            return -1;
+    }
+
+    for (int i = 0; i < nargs; i++) {
+        if (spec->dtypes[i] == NULL) {
+            /* Output DTypes may be omitted (NULL); input DTypes may not. */
+            if (i < spec->nin) {
+                PyErr_Format(PyExc_TypeError,
+                        "ArrayMethod must have well defined input DTypes. "
+                        "(method: %s)", spec->name);
+                return -1;
+            }
+            /* The checks below would dereference a NULL DType. */
+            continue;
+        }
+        if (!PyObject_TypeCheck(spec->dtypes[i], &PyArrayDTypeMeta_Type)) {
+            PyErr_Format(PyExc_TypeError,
+                    "ArrayMethod provided object %R is not a DType."
+                    "(method: %s)", spec->dtypes[i], spec->name);
+            return -1;
+        }
+        if (spec->dtypes[i]->abstract && i < spec->nin) {
+            /* (the old message had a third conversion without an argument) */
+            PyErr_Format(PyExc_TypeError,
+                    "abstract DType %S is currently not allowed for inputs. "
+                    "(method: %s)", spec->dtypes[i], spec->name);
+            return -1;
+        }
+    }
+    return 0;
+}
+
+
+/**
+ * Initialize a new BoundArrayMethodObject from slots. Slots which are
+ * not provided may be filled with defaults.
+ *
+ * @param res The new PyBoundArrayMethodObject to be filled.
+ * @param spec The specification list passed by the user.
+ * @param private Private flag to limit certain slots to use in NumPy.
+ * @return -1 on error 0 on success
+ */
+static int
+fill_arraymethod_from_slots(
+        PyBoundArrayMethodObject *res, PyArrayMethod_Spec *spec,
+        int private)
+{
+    PyArrayMethodObject *meth = res->method;
+
+    /* Set the defaults */
+    meth->get_strided_loop = &default_get_strided_loop;
+    meth->resolve_descriptors = &default_resolve_descriptors;
+
+    /* Fill in the slots passed by the user */
+    /*
+     * TODO: This is reasonable for now, but it would be nice to find a
+     *       shorter solution, and add some additional error checking (e.g.
+     *       the same slot used twice). Python uses an array of slot offsets.
+     */
+    for (PyType_Slot *slot = &spec->slots[0]; slot->slot != 0; slot++) {
+        switch (slot->slot) {
+            case NPY_METH_resolve_descriptors:
+                meth->resolve_descriptors = slot->pfunc;
+                continue;
+            case NPY_METH_get_loop:
+                if (private) {
+                    /* Only allow override for private functions initially */
+                    meth->get_strided_loop = slot->pfunc;
+                    continue;
+                }
+                break;
+            case NPY_METH_strided_loop:
+                meth->strided_loop = slot->pfunc;
+                continue;
+            case NPY_METH_contiguous_loop:
+                meth->contiguous_loop = slot->pfunc;
+                continue;
+            case NPY_METH_unaligned_strided_loop:
+                meth->unaligned_strided_loop = slot->pfunc;
+                continue;
+            case NPY_METH_unaligned_contiguous_loop:
+                meth->unaligned_contiguous_loop = slot->pfunc;
+                continue;
+            default:
+                break;
+        }
+        PyErr_Format(PyExc_RuntimeError,
+                "invalid slot number %d to ArrayMethod: %s",
+                slot->slot, spec->name);
+        return -1;
+    }
+
+    /* Check whether the slots are valid: */
+    if (meth->resolve_descriptors == &default_resolve_descriptors) {
+        for (int i = 0; i < meth->nin + meth->nout; i++) {
+            if (res->dtypes[i] == NULL) {
+                if (i < meth->nin) {
+                    PyErr_Format(PyExc_TypeError,
+                            "All input DTypes must be specified when using "
+                            "the default `resolve_descriptors` function. "
+                            "(method: %s)", spec->name);
+                    return -1;
+                }
+                else if (meth->nin == 0) {
+                    /* (the old message had a `%s` without an argument) */
+                    PyErr_Format(PyExc_TypeError,
+                            "Must specify output DTypes or use custom "
+                            "`resolve_descriptors` when there are no inputs. "
+                            "(method: %s)", spec->name);
+                    return -1;
+                }
+                /*
+                 * Unknown output DTypes are resolved from the inputs; the
+                 * parametric check below must not dereference the NULL.
+                 */
+                continue;
+            }
+            if (i >= meth->nin && res->dtypes[i]->parametric) {
+                PyErr_Format(PyExc_TypeError,
+                        "must provide a `resolve_descriptors` function if any "
+                        "output DType is parametric. (method: %s)",
+                        spec->name);
+                return -1;
+            }
+        }
+    }
+    if (meth->get_strided_loop != &default_get_strided_loop) {
+        /* Do not check the actual loop fields. */
+        return 0;
+    }
+
+    /* Check whether the provided loops make sense. */
+    if (meth->strided_loop == NULL) {
+        PyErr_Format(PyExc_TypeError,
+                "Must provide a strided inner loop function. (method: %s)",
+                spec->name);
+        return -1;
+    }
+    if (meth->contiguous_loop == NULL) {
+        /* A strided loop is always usable for contiguous data. */
+        meth->contiguous_loop = meth->strided_loop;
+    }
+    if (meth->unaligned_contiguous_loop != NULL &&
+            meth->unaligned_strided_loop == NULL) {
+        PyErr_Format(PyExc_TypeError,
+                "Must provide unaligned strided inner loop when providing "
+                "a contiguous version. (method: %s)", spec->name);
+        return -1;
+    }
+    if ((meth->unaligned_strided_loop == NULL) !=
+            !(meth->flags & NPY_METH_SUPPORTS_UNALIGNED)) {
+        /* (the old message was copy-pasted from the check above) */
+        PyErr_Format(PyExc_TypeError,
+                "An unaligned strided inner loop must be provided exactly "
+                "when the NPY_METH_SUPPORTS_UNALIGNED flag is set. "
+                "(method: %s)", spec->name);
+        return -1;
+    }
+
+    return 0;
+}
+
+
+/**
+ * Create a new ArrayMethod (internal version).
+ *
+ * @param spec A filled specification, passing generic information about
+ *        the method (name, nin/nout, flags, casting, the DTypes, and the
+ *        slot table with the (function) pointers). Unused fields must be
+ *        NULL.
+ * @param private Some slots are currently considered private, if not true,
+ *        these will be rejected.
+ *
+ * @returns A new (bound) ArrayMethod object, or NULL with an exception set.
+ */
+NPY_NO_EXPORT PyBoundArrayMethodObject *
+PyArrayMethod_FromSpec_int(PyArrayMethod_Spec *spec, int private)
+{
+    int nargs = spec->nin + spec->nout;
+
+    if (spec->name == NULL) {
+        spec->name = "<unknown>";
+    }
+
+    if (validate_spec(spec) < 0) {
+        return NULL;
+    }
+
+    PyBoundArrayMethodObject *res;
+    res = PyObject_New(PyBoundArrayMethodObject, &PyBoundArrayMethod_Type);
+    if (res == NULL) {
+        return NULL;
+    }
+    res->method = NULL;
+
+    res->dtypes = PyMem_Malloc(sizeof(PyArray_DTypeMeta *) * nargs);
+    if (res->dtypes == NULL) {
+        /*
+         * Cannot Py_DECREF(res) yet: boundarraymethod_dealloc reads
+         * res->method->nin and res->method is still NULL. Free directly.
+         */
+        PyObject_Del((PyObject *)res);
+        PyErr_NoMemory();
+        return NULL;
+    }
+    for (int i = 0; i < nargs ; i++) {
+        Py_XINCREF(spec->dtypes[i]);
+        res->dtypes[i] = spec->dtypes[i];
+    }
+
+    res->method = PyObject_New(PyArrayMethodObject, &PyArrayMethod_Type);
+    if (res->method == NULL) {
+        /* Same as above: clean up manually, the object is half-built. */
+        for (int i = 0; i < nargs; i++) {
+            Py_XDECREF(res->dtypes[i]);
+        }
+        PyMem_Free(res->dtypes);
+        PyObject_Del((PyObject *)res);
+        PyErr_NoMemory();
+        return NULL;
+    }
+    /* Zero all fields after the header (PyObject_New does not clear them) */
+    memset((char *)(res->method) + sizeof(PyObject), 0,
+           sizeof(PyArrayMethodObject) - sizeof(PyObject));
+
+    res->method->nin = spec->nin;
+    res->method->nout = spec->nout;
+    res->method->flags = spec->flags;
+    res->method->casting = spec->casting;
+    if (fill_arraymethod_from_slots(res, spec, private) < 0) {
+        /* res is now fully initialized; normal dealloc is safe. */
+        Py_DECREF(res);
+        return NULL;
+    }
+
+    /* Own a copy of the name; the spec's string may not be persistent. */
+    ssize_t length = strlen(spec->name);
+    res->method->name = PyMem_Malloc(length + 1);
+    if (res->method->name == NULL) {
+        Py_DECREF(res);
+        PyErr_NoMemory();
+        return NULL;
+    }
+    strcpy(res->method->name, spec->name);
+
+    return res;
+}
+
+
+static void
+arraymethod_dealloc(PyObject *self)
+{
+    PyArrayMethodObject *meth;
+    meth = ((PyArrayMethodObject *)self);
+
+    /* `name` is a PyMem_Malloc'd copy made in PyArrayMethod_FromSpec_int */
+    PyMem_Free(meth->name);
+
+    Py_TYPE(self)->tp_free(self);
+}
+
+
+/* Type of the unbound ArrayMethod; private, only reachable from NumPy. */
+NPY_NO_EXPORT PyTypeObject PyArrayMethod_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy._ArrayMethod",
+    .tp_basicsize = sizeof(PyArrayMethodObject),
+    .tp_flags = Py_TPFLAGS_DEFAULT,
+    .tp_dealloc = arraymethod_dealloc,
+};
+
+
+
+/*
+ * repr for the bound method, e.g.
+ * `<np._BoundArrayMethod `cast` for dtypes (...)>`.
+ */
+static PyObject *
+boundarraymethod_repr(PyBoundArrayMethodObject *self)
+{
+    int nargs = self->method->nin + self->method->nout;
+    PyObject *dtypes = PyTuple_New(nargs);
+    if (dtypes == NULL) {
+        return NULL;
+    }
+    for (int i = 0; i < nargs; i++) {
+        Py_INCREF(self->dtypes[i]);
+        PyTuple_SET_ITEM(dtypes, i, (PyObject *)self->dtypes[i]);
+    }
+    PyObject *repr = PyUnicode_FromFormat(
+            "<np._BoundArrayMethod `%s` for dtypes %S>",
+            self->method->name, dtypes);
+    /* The tuple was previously leaked here; release our reference. */
+    Py_DECREF(dtypes);
+    return repr;
+}
+
+
+static void
+boundarraymethod_dealloc(PyObject *self)
+{
+    PyBoundArrayMethodObject *meth;
+    meth = ((PyBoundArrayMethodObject *)self);
+
+    /*
+     * Be robust against partially initialized objects: construction in
+     * PyArrayMethod_FromSpec_int can fail while `method` (and possibly
+     * `dtypes`) is still NULL; `meth->method->nin` must not be read then.
+     */
+    if (meth->method != NULL && meth->dtypes != NULL) {
+        int nargs = meth->method->nin + meth->method->nout;
+        for (int i = 0; i < nargs; i++) {
+            Py_XDECREF(meth->dtypes[i]);
+        }
+    }
+    PyMem_Free(meth->dtypes);
+
+    Py_XDECREF(meth->method);
+
+    Py_TYPE(self)->tp_free(self);
+}
+
+
+/*
+ * Calls resolve_descriptors() and returns the casting level and the resolved
+ * descriptors as a tuple. If the operation is impossible returns (-1, None).
+ * May raise an error, but usually should not.
+ * The function validates the casting attribute compared to the returned
+ * casting level.
+ */
+static PyObject *
+boundarraymethod__resolve_descripors(
+        PyBoundArrayMethodObject *self, PyObject *descr_tuple)
+{
+    /*
+     * NOTE: "descripors" in the name is a typo, kept consistent with the
+     * reference in `boundarraymethod_methods` below.
+     */
+    int nin = self->method->nin;
+    int nout = self->method->nout;
+
+    /* `given_descrs` holds borrowed refs; `loop_descrs` will be owned. */
+    PyArray_Descr *given_descrs[NPY_MAXARGS];
+    PyArray_Descr *loop_descrs[NPY_MAXARGS];
+
+    if (!PyTuple_CheckExact(descr_tuple) ||
+            PyTuple_Size(descr_tuple) != nin + nout) {
+        PyErr_Format(PyExc_ValueError,
+                "_resolve_descriptors() takes exactly one tuple with as many "
+                "elements as the method takes arguments (%d+%d).", nin, nout);
+        return NULL;
+    }
+
+    for (int i = 0; i < nin + nout; i++) {
+        PyObject *tmp = PyTuple_GetItem(descr_tuple, i);
+        if (tmp == NULL) {
+            return NULL;
+        }
+        else if (tmp == Py_None) {
+            if (i < nin) {
+                PyErr_SetString(PyExc_ValueError,
+                        "only output dtypes may be omitted (set to None).");
+                return NULL;
+            }
+            given_descrs[i] = NULL;
+        }
+        else if (PyArray_DescrCheck(tmp)) {
+            if (Py_TYPE(tmp) != (PyTypeObject *)self->dtypes[i]) {
+                PyErr_Format(PyExc_ValueError,
+                        "input dtype %S was not an exact instance of the bound "
+                        "DType class %S.", tmp, self->dtypes[i]);
+                return NULL;
+            }
+            given_descrs[i] = (PyArray_Descr *)tmp;
+        }
+        else {
+            PyErr_SetString(PyExc_TypeError,
+                    "dtype tuple can only contain dtype instances or None.");
+            return NULL;
+        }
+    }
+
+    NPY_CASTING casting = self->method->resolve_descriptors(
+            self->method, self->dtypes, given_descrs, loop_descrs);
+
+    if (casting < 0 && PyErr_Occurred()) {
+        return NULL;
+    }
+    else if (casting < 0) {
+        /* Impossible operation, signalled without an exception. */
+        return Py_BuildValue("iO", casting, Py_None);
+    }
+
+    PyObject *result_tuple = PyTuple_New(nin + nout);
+    if (result_tuple == NULL) {
+        return NULL;
+    }
+    for (int i = 0; i < nin + nout; i++) {
+        /* transfer ownership to the tuple. */
+        PyTuple_SET_ITEM(result_tuple, i, (PyObject *)loop_descrs[i]);
+    }
+
+    /*
+     * The casting flags should be the most generic casting level (except the
+     * cast-is-view flag). If no input is parametric, it must match exactly.
+     */
+    int parametric = 0;
+    for (int i = 0; i < nin + nout; i++) {
+        if (self->dtypes[i]->parametric) {
+            parametric = 1;
+            break;
+        }
+    }
+    if (!parametric) {
+        /*
+         * Non-parametric can only mismatch if it switches from no to equiv
+         * (e.g. due to byteorder changes).
+         */
+        if (self->method->casting != (casting & ~_NPY_CAST_IS_VIEW) &&
+                !(self->method->casting == NPY_NO_CASTING &&
+                  casting == NPY_EQUIV_CASTING)) {
+            PyErr_Format(PyExc_RuntimeError,
+                    "resolve_descriptors cast level did not match stored one "
+                    "(expected %d, got %d) for method %s",
+                    self->method->casting, (casting & ~_NPY_CAST_IS_VIEW),
+                    self->method->name);
+            Py_DECREF(result_tuple);
+            return NULL;
+        }
+    }
+    else {
+        NPY_CASTING cast = casting & ~_NPY_CAST_IS_VIEW;
+        if (cast != PyArray_MinCastSafety(cast, self->method->casting)) {
+            PyErr_Format(PyExc_RuntimeError,
+                    "resolve_descriptors cast level did not match stored one "
+                    "(expected %d, got %d) for method %s",
+                    self->method->casting, (casting & ~_NPY_CAST_IS_VIEW),
+                    self->method->name);
+            Py_DECREF(result_tuple);
+            return NULL;
+        }
+    }
+
+    /* "N" steals the reference to result_tuple. */
+    return Py_BuildValue("iN", casting, result_tuple);
+}
+
+
+/* Methods exposed on numpy._BoundArrayMethod (private testing API). */
+PyMethodDef boundarraymethod_methods[] = {
+    {"_resolve_descriptors", (PyCFunction)boundarraymethod__resolve_descripors,
+        METH_O, "Resolve the given dtypes."},
+    {NULL, 0, 0, NULL},
+};
+
+
+/* Getter: expose the static unaligned-support flag to Python as a bool. */
+static PyObject *
+boundarraymethod__supports_unaligned(PyBoundArrayMethodObject *self)
+{
+    long supports_unaligned =
+            self->method->flags & NPY_METH_SUPPORTS_UNALIGNED;
+    return PyBool_FromLong(supports_unaligned);
+}
+
+
+/* Attribute getters for numpy._BoundArrayMethod (no setters). */
+PyGetSetDef boundarraymethods_getters[] = {
+    {"_supports_unaligned",
+        (getter)boundarraymethod__supports_unaligned, NULL,
+        "whether the method supports unaligned inputs/outputs.", NULL},
+    {NULL, NULL, NULL, NULL, NULL},
+};
+
+
+/* Type of the bound ArrayMethod; private, only reachable from NumPy. */
+NPY_NO_EXPORT PyTypeObject PyBoundArrayMethod_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy._BoundArrayMethod",
+    .tp_basicsize = sizeof(PyBoundArrayMethodObject),
+    .tp_flags = Py_TPFLAGS_DEFAULT,
+    .tp_repr = (reprfunc)boundarraymethod_repr,
+    .tp_dealloc = boundarraymethod_dealloc,
+    .tp_methods = boundarraymethod_methods,
+    .tp_getset = boundarraymethods_getters,
+};
diff --git a/numpy/core/src/multiarray/array_method.h b/numpy/core/src/multiarray/array_method.h
new file mode 100644
index 000000000..15ea948ce
--- /dev/null
+++ b/numpy/core/src/multiarray/array_method.h
@@ -0,0 +1,150 @@
+#ifndef _NPY_ARRAY_METHOD_H
+#define _NPY_ARRAY_METHOD_H
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+
+#include <Python.h>
+#include <numpy/ndarraytypes.h>
+#include <lowlevel_strided_loops.h>
+
+
+typedef enum {
+    /* NOTE(review): bit 0 (1 << 0) is unused — confirm if it is reserved. */
+    /* Flag for whether the GIL is required */
+    NPY_METH_REQUIRES_PYAPI = 1 << 1,
+    /*
+     * Some functions cannot set floating point error flags, this flag
+     * gives us the option (not requirement) to skip floating point error
+     * setup/check. No function should set error flags and ignore them
+     * since it would interfere with chaining operations (e.g. casting).
+     */
+    NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 2,
+    /* Whether the method supports unaligned access (not runtime) */
+    NPY_METH_SUPPORTS_UNALIGNED = 1 << 3,
+
+    /* All flags which can change at runtime */
+    NPY_METH_RUNTIME_FLAGS = (
+            NPY_METH_REQUIRES_PYAPI |
+            NPY_METH_NO_FLOATINGPOINT_ERRORS),
+} NPY_ARRAYMETHOD_FLAGS;
+
+
+/* Forward declaration; the full struct is defined further below. */
+struct PyArrayMethodObject_tag;
+
+/*
+ * This struct is specific to an individual (possibly repeated) call of
+ * the ArrayMethods strided operator, and as such is passed into the various
+ * methods of the ArrayMethod object (the resolve_descriptors function,
+ * the get_loop function and the individual lowlevel strided operator calls).
+ * It thus has to be persistent for one end-user call, and then be discarded.
+ *
+ * TODO: Before making this public, we should review which information should
+ *       be stored on the Context/BoundArrayMethod vs. the ArrayMethod.
+ */
+typedef struct {
+    PyObject *caller;  /* E.g. the original ufunc, may be NULL */
+    struct PyArrayMethodObject_tag *method;
+
+    /* Operand descriptors, filled in by resolve_descriptors */
+    PyArray_Descr **descriptors;  /* one entry per operand (nin + nout) */
+} PyArrayMethod_Context;
+
+
+/*
+ * Resolves the loop descriptors: returns the casting level on success and
+ * a negative value on error; writes one owned descriptor per operand into
+ * `loop_descrs`.
+ */
+typedef NPY_CASTING (resolve_descriptors_function)(
+        struct PyArrayMethodObject_tag *method,
+        PyArray_DTypeMeta **dtypes,
+        PyArray_Descr **given_descrs,
+        PyArray_Descr **loop_descrs);
+
+
+/* Fetches the inner-loop function pointer (and auxiliary data) for a call. */
+typedef int (get_loop_function)(
+        PyArrayMethod_Context *context,
+        int aligned, int move_references,
+        npy_intp *strides,
+        PyArray_StridedUnaryOp **out_loop,
+        NpyAuxData **out_transferdata,
+        NPY_ARRAYMETHOD_FLAGS *flags);
+
+
+/*
+ * This struct will be public and necessary for creating a new ArrayMethod
+ * object (casting and ufuncs).
+ * We could version the struct, although since we allow passing arbitrary
+ * data using the slots, and have flags, that may be enough?
+ * (See also PyBoundArrayMethodObject.)
+ */
+typedef struct {
+    const char *name;            /* may be NULL; defaults to "<unknown>" */
+    int nin, nout;
+    NPY_CASTING casting;
+    NPY_ARRAYMETHOD_FLAGS flags;
+    PyArray_DTypeMeta **dtypes;  /* nin + nout entries; outputs may be NULL */
+    PyType_Slot *slots;          /* zero-terminated (slot id, pfunc) pairs */
+} PyArrayMethod_Spec;
+
+
+/*
+ * Structure of the ArrayMethod. This structure should probably not be made
+ * public. If necessary, we can make certain operations on it public
+ * (e.g. to allow users indirect access to `get_strided_loop`).
+ *
+ * NOTE: In some cases, it may not be clear whether information should be
+ * stored here or on the bound version. E.g. `nin` and `nout` (and in the
+ * future the gufunc `signature`) is already stored on the ufunc so that
+ * storing these here duplicates the information.
+ */
+typedef struct PyArrayMethodObject_tag {
+    PyObject_HEAD
+    char *name;  /* owned (PyMem_Malloc'd) copy of the spec name */
+    int nin, nout;
+    /* Casting is normally "safe" for functions, but is important for casts */
+    NPY_CASTING casting;
+    /* default flags. The get_strided_loop function can override these */
+    NPY_ARRAYMETHOD_FLAGS flags;
+    resolve_descriptors_function *resolve_descriptors;
+    get_loop_function *get_strided_loop;
+    /* Typical loop functions (contiguous ones are used in current casts) */
+    PyArray_StridedUnaryOp *strided_loop;
+    PyArray_StridedUnaryOp *contiguous_loop;
+    PyArray_StridedUnaryOp *unaligned_strided_loop;
+    PyArray_StridedUnaryOp *unaligned_contiguous_loop;
+} PyArrayMethodObject;
+
+
+/*
+ * We will sometimes have to create a ArrayMethod and allow passing it around,
+ * similar to `instance.method` returning a bound method, e.g. a function like
+ * `ufunc.resolve()` can return a bound object.
+ * The current main purpose of the BoundArrayMethod is that it holds on to the
+ * `dtypes` (the classes), so that the `ArrayMethod` (e.g. for casts) will
+ * not create references cycles. In principle, it could hold any information
+ * which is also stored on the ufunc (and thus does not need to be repeated
+ * on the `ArrayMethod` itself).
+ */
+typedef struct {
+    PyObject_HEAD
+    PyArray_DTypeMeta **dtypes;  /* owned refs, nin + nout entries */
+    PyArrayMethodObject *method;
+} PyBoundArrayMethodObject;
+
+
+/* Type objects defined in array_method.c */
+extern NPY_NO_EXPORT PyTypeObject PyArrayMethod_Type;
+extern NPY_NO_EXPORT PyTypeObject PyBoundArrayMethod_Type;
+
+/*
+ * SLOTS IDs For the ArrayMethod creation, one public, the IDs are fixed.
+ * TODO: Before making it public, consider adding a large constant to private
+ *       slots.
+ */
+#define NPY_METH_resolve_descriptors 1
+#define NPY_METH_get_loop 2
+#define NPY_METH_strided_loop 3
+#define NPY_METH_contiguous_loop 4
+#define NPY_METH_unaligned_strided_loop 5
+#define NPY_METH_unaligned_contiguous_loop 6
+
+
+NPY_NO_EXPORT PyBoundArrayMethodObject *
+PyArrayMethod_FromSpec_int(PyArrayMethod_Spec *spec, int private);
+
+#endif /*_NPY_ARRAY_METHOD_H*/
diff --git a/numpy/core/src/multiarray/arrayfunction_override.c b/numpy/core/src/multiarray/arrayfunction_override.c
index 9ea8efdd9..2c07cdebc 100644
--- a/numpy/core/src/multiarray/arrayfunction_override.c
+++ b/numpy/core/src/multiarray/arrayfunction_override.c
@@ -26,7 +26,6 @@ static PyObject *
get_array_function(PyObject *obj)
{
static PyObject *ndarray_array_function = NULL;
- PyObject *array_function;
if (ndarray_array_function == NULL) {
ndarray_array_function = get_ndarray_array_function();
@@ -38,7 +37,7 @@ get_array_function(PyObject *obj)
return ndarray_array_function;
}
- array_function = PyArray_LookupSpecial(obj, "__array_function__");
+ PyObject *array_function = PyArray_LookupSpecial(obj, "__array_function__");
if (array_function == NULL && PyErr_Occurred()) {
PyErr_Clear(); /* TODO[gh-14801]: propagate crashes during attribute access? */
}
@@ -53,9 +52,7 @@ get_array_function(PyObject *obj)
static void
pyobject_array_insert(PyObject **array, int length, int index, PyObject *item)
{
- int j;
-
- for (j = length; j > index; j--) {
+ for (int j = length; j > index; j--) {
array[j] = array[j - 1];
}
array[index] = item;
@@ -74,18 +71,16 @@ get_implementing_args_and_methods(PyObject *relevant_args,
PyObject **methods)
{
int num_implementing_args = 0;
- Py_ssize_t i;
- int j;
PyObject **items = PySequence_Fast_ITEMS(relevant_args);
Py_ssize_t length = PySequence_Fast_GET_SIZE(relevant_args);
- for (i = 0; i < length; i++) {
+ for (Py_ssize_t i = 0; i < length; i++) {
int new_class = 1;
PyObject *argument = items[i];
/* Have we seen this type before? */
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
if (Py_TYPE(argument) == Py_TYPE(implementing_args[j])) {
new_class = 0;
break;
@@ -109,7 +104,7 @@ get_implementing_args_and_methods(PyObject *relevant_args,
/* "subclasses before superclasses, otherwise left to right" */
arg_index = num_implementing_args;
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
PyObject *other_type;
other_type = (PyObject *)Py_TYPE(implementing_args[j]);
if (PyObject_IsInstance(argument, other_type)) {
@@ -129,7 +124,7 @@ get_implementing_args_and_methods(PyObject *relevant_args,
return num_implementing_args;
fail:
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
Py_DECREF(implementing_args[j]);
Py_DECREF(methods[j]);
}
@@ -161,13 +156,10 @@ NPY_NO_EXPORT PyObject *
array_function_method_impl(PyObject *func, PyObject *types, PyObject *args,
PyObject *kwargs)
{
- Py_ssize_t j;
- PyObject *implementation, *result;
-
PyObject **items = PySequence_Fast_ITEMS(types);
Py_ssize_t length = PySequence_Fast_GET_SIZE(types);
- for (j = 0; j < length; j++) {
+ for (Py_ssize_t j = 0; j < length; j++) {
int is_subclass = PyObject_IsSubclass(
items[j], (PyObject *)&PyArray_Type);
if (is_subclass == -1) {
@@ -179,11 +171,11 @@ array_function_method_impl(PyObject *func, PyObject *types, PyObject *args,
}
}
- implementation = PyObject_GetAttr(func, npy_ma_str_implementation);
+ PyObject *implementation = PyObject_GetAttr(func, npy_ma_str_implementation);
if (implementation == NULL) {
return NULL;
}
- result = PyObject_Call(implementation, args, kwargs);
+ PyObject *result = PyObject_Call(implementation, args, kwargs);
Py_DECREF(implementation);
return result;
}
@@ -208,32 +200,32 @@ call_array_function(PyObject* argument, PyObject* method,
}
-/*
- * Implements the __array_function__ protocol for a function, as described in
- * in NEP-18. See numpy.core.overrides for a full docstring.
+/**
+ * Internal handler for the array-function dispatching. The helper returns
+ * either the result, or NotImplemented (as a borrowed reference).
+ *
+ * @param public_api The public API symbol used for dispatching
+ * @param relevant_args Arguments which may implement __array_function__
+ * @param args Original arguments
+ * @param kwargs Original keyword arguments
+ *
+ * @returns The result of the dispatched version, or a borrowed reference
+ * to NotImplemented to indicate the default implementation should
+ * be used.
*/
NPY_NO_EXPORT PyObject *
-array_implement_array_function(
- PyObject *NPY_UNUSED(dummy), PyObject *positional_args)
+array_implement_array_function_internal(
+ PyObject *public_api, PyObject *relevant_args,
+ PyObject *args, PyObject *kwargs)
{
- PyObject *implementation, *public_api, *relevant_args, *args, *kwargs;
-
- PyObject *types = NULL;
PyObject *implementing_args[NPY_MAXARGS];
PyObject *array_function_methods[NPY_MAXARGS];
+ PyObject *types = NULL;
- int j, any_overrides;
- int num_implementing_args = 0;
PyObject *result = NULL;
static PyObject *errmsg_formatter = NULL;
- if (!PyArg_UnpackTuple(
- positional_args, "implement_array_function", 5, 5,
- &implementation, &public_api, &relevant_args, &args, &kwargs)) {
- return NULL;
- }
-
relevant_args = PySequence_Fast(
relevant_args,
"dispatcher for __array_function__ did not return an iterable");
@@ -242,7 +234,7 @@ array_implement_array_function(
}
/* Collect __array_function__ implementations */
- num_implementing_args = get_implementing_args_and_methods(
+ int num_implementing_args = get_implementing_args_and_methods(
relevant_args, implementing_args, array_function_methods);
if (num_implementing_args == -1) {
goto cleanup;
@@ -254,15 +246,19 @@ array_implement_array_function(
* arguments implement __array_function__ at all (e.g., if they are all
* built-in types).
*/
- any_overrides = 0;
- for (j = 0; j < num_implementing_args; j++) {
+ int any_overrides = 0;
+ for (int j = 0; j < num_implementing_args; j++) {
if (!is_default_array_function(array_function_methods[j])) {
any_overrides = 1;
break;
}
}
if (!any_overrides) {
- result = PyObject_Call(implementation, args, kwargs);
+ /*
+ * When the default implementation should be called, return
+ * `Py_NotImplemented` to indicate this.
+ */
+ result = Py_NotImplemented;
goto cleanup;
}
@@ -275,14 +271,14 @@ array_implement_array_function(
if (types == NULL) {
goto cleanup;
}
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
PyObject *arg_type = (PyObject *)Py_TYPE(implementing_args[j]);
Py_INCREF(arg_type);
PyTuple_SET_ITEM(types, j, arg_type);
}
/* Call __array_function__ methods */
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
PyObject *argument = implementing_args[j];
PyObject *method = array_function_methods[j];
@@ -319,7 +315,7 @@ array_implement_array_function(
}
cleanup:
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
Py_DECREF(implementing_args[j]);
Py_DECREF(array_function_methods[j]);
}
@@ -330,6 +326,109 @@ cleanup:
/*
+ * Implements the __array_function__ protocol for a Python function, as described
+ * in NEP-18. See numpy.core.overrides for a full docstring.
+ */
+NPY_NO_EXPORT PyObject *
+array_implement_array_function(
+    PyObject *NPY_UNUSED(dummy), PyObject *positional_args)
+{
+    PyObject *implementation, *public_api, *relevant_args, *args, *kwargs;
+
+    if (!PyArg_UnpackTuple(
+            positional_args, "implement_array_function", 5, 5,
+            &implementation, &public_api, &relevant_args, &args, &kwargs)) {
+        return NULL;
+    }
+
+    /* Remove `like=` kwarg, which is NumPy-exclusive and thus not present
+     * in downstream libraries. If `like=` is specified but doesn't
+     * implement `__array_function__`, raise a `TypeError`.
+     */
+    if (kwargs != NULL && PyDict_Contains(kwargs, npy_ma_str_like)) {
+        PyObject *like_arg = PyDict_GetItem(kwargs, npy_ma_str_like);
+        /*
+         * NOTE(review): get_array_function() looks like it can return a new
+         * reference (PyArray_LookupSpecial) which is not released here —
+         * confirm ownership; this may leak.
+         */
+        if (like_arg && !get_array_function(like_arg)) {
+            return PyErr_Format(PyExc_TypeError,
+                    "The `like` argument must be an array-like that implements "
+                    "the `__array_function__` protocol.");
+        }
+        PyDict_DelItem(kwargs, npy_ma_str_like);
+    }
+
+    PyObject *res = array_implement_array_function_internal(
+            public_api, relevant_args, args, kwargs);
+
+    /* The internal helper returns NotImplemented as a *borrowed* reference
+     * to signal that the default implementation should run. */
+    if (res == Py_NotImplemented) {
+        return PyObject_Call(implementation, args, kwargs);
+    }
+    return res;
+}
+
+
+/*
+ * Implements the __array_function__ protocol for C array creation functions
+ * only. Added as an extension to NEP-18 in an effort to bring NEP-35 to
+ * life with minimal dispatch overhead.
+ *
+ * Returns a new reference with the dispatched result, NULL on error, or a
+ * *borrowed* reference to Py_NotImplemented when the caller should simply
+ * continue with the default (NumPy) implementation.
+ */
+NPY_NO_EXPORT PyObject *
+array_implement_c_array_function_creation(
+    const char *function_name, PyObject *args, PyObject *kwargs)
+{
+    if (kwargs == NULL) {
+        return Py_NotImplemented;
+    }
+
+    /* Remove `like=` kwarg, which is NumPy-exclusive and thus not present
+     * in downstream libraries. If that key isn't present, return
+     * `Py_NotImplemented` to let the originating call continue. If the key
+     * is present but doesn't implement `__array_function__`, raise a
+     * `TypeError`.
+     */
+    if (!PyDict_Contains(kwargs, npy_ma_str_like)) {
+        return Py_NotImplemented;
+    }
+
+    PyObject *like_arg = PyDict_GetItem(kwargs, npy_ma_str_like);
+    if (like_arg == NULL) {
+        /* Should be unreachable after the Contains check above. */
+        return NULL;
+    }
+    else if (!get_array_function(like_arg)) {
+        /*
+         * NOTE(review): as in array_implement_array_function, the reference
+         * returned by get_array_function() is not released — confirm.
+         */
+        return PyErr_Format(PyExc_TypeError,
+                "The `like` argument must be an array-like that implements "
+                "the `__array_function__` protocol.");
+    }
+    PyObject *relevant_args = PyTuple_Pack(1, like_arg);
+    if (relevant_args == NULL) {
+        /* Previously unchecked: Py_DECREF(NULL) below would crash. */
+        return NULL;
+    }
+    PyDict_DelItem(kwargs, npy_ma_str_like);
+
+    PyObject *numpy_module = PyImport_Import(npy_ma_str_numpy);
+    if (numpy_module == NULL) {
+        Py_DECREF(relevant_args);
+        return NULL;
+    }
+
+    PyObject *public_api = PyObject_GetAttrString(numpy_module, function_name);
+    Py_DECREF(numpy_module);
+    if (public_api == NULL) {
+        Py_DECREF(relevant_args);
+        return NULL;
+    }
+    if (!PyCallable_Check(public_api)) {
+        Py_DECREF(relevant_args);
+        Py_DECREF(public_api);
+        return PyErr_Format(PyExc_RuntimeError,
+                "numpy.%s is not callable.",
+                function_name);
+    }
+
+    PyObject* result = array_implement_array_function_internal(
+            public_api, relevant_args, args, kwargs);
+
+    Py_DECREF(relevant_args);
+    Py_DECREF(public_api);
+    return result;
+}
+
+
+/*
* Python wrapper for get_implementing_args_and_methods, for testing purposes.
*/
NPY_NO_EXPORT PyObject *
@@ -337,8 +436,6 @@ array__get_implementing_args(
PyObject *NPY_UNUSED(dummy), PyObject *positional_args)
{
PyObject *relevant_args;
- int j;
- int num_implementing_args = 0;
PyObject *implementing_args[NPY_MAXARGS];
PyObject *array_function_methods[NPY_MAXARGS];
PyObject *result = NULL;
@@ -355,7 +452,7 @@ array__get_implementing_args(
return NULL;
}
- num_implementing_args = get_implementing_args_and_methods(
+ int num_implementing_args = get_implementing_args_and_methods(
relevant_args, implementing_args, array_function_methods);
if (num_implementing_args == -1) {
goto cleanup;
@@ -366,14 +463,14 @@ array__get_implementing_args(
if (result == NULL) {
goto cleanup;
}
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
PyObject *argument = implementing_args[j];
Py_INCREF(argument);
PyList_SET_ITEM(result, j, argument);
}
cleanup:
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
Py_DECREF(implementing_args[j]);
Py_DECREF(array_function_methods[j]);
}
diff --git a/numpy/core/src/multiarray/arrayfunction_override.h b/numpy/core/src/multiarray/arrayfunction_override.h
index 0d224e2b6..fdcf1746d 100644
--- a/numpy/core/src/multiarray/arrayfunction_override.h
+++ b/numpy/core/src/multiarray/arrayfunction_override.h
@@ -10,6 +10,10 @@ array__get_implementing_args(
PyObject *NPY_UNUSED(dummy), PyObject *positional_args);
NPY_NO_EXPORT PyObject *
+array_implement_c_array_function_creation(
+ const char *function_name, PyObject *args, PyObject *kwargs);
+
+NPY_NO_EXPORT PyObject *
array_function_method_impl(PyObject *func, PyObject *types, PyObject *args,
PyObject *kwargs);
diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c
index 95c650674..a2474d79f 100644
--- a/numpy/core/src/multiarray/arrayobject.c
+++ b/numpy/core/src/multiarray/arrayobject.c
@@ -416,7 +416,7 @@ WARN_IN_DEALLOC(PyObject* warning, const char * msg) {
if (PyErr_WarnEx(warning, msg, 1) < 0) {
PyObject * s;
- s = PyUString_FromString("array_dealloc");
+ s = PyUnicode_FromString("array_dealloc");
if (s) {
PyErr_WriteUnraisable(s);
Py_DECREF(s);
@@ -434,7 +434,9 @@ array_dealloc(PyArrayObject *self)
{
PyArrayObject_fields *fa = (PyArrayObject_fields *)self;
- _dealloc_cached_buffer_info((PyObject*)self);
+ if (_buffer_info_free(fa->_buffer_info, (PyObject *)self) < 0) {
+ PyErr_WriteUnraisable(NULL);
+ }
if (fa->weakreflist != NULL) {
PyObject_ClearWeakRefs((PyObject *)self);
@@ -1745,7 +1747,7 @@ array_free(PyObject * v)
NPY_NO_EXPORT PyTypeObject PyArray_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
.tp_name = "numpy.ndarray",
- .tp_basicsize = NPY_SIZEOF_PYARRAYOBJECT,
+ .tp_basicsize = sizeof(PyArrayObject_fields),
/* methods */
.tp_dealloc = (destructor)array_dealloc,
.tp_repr = (reprfunc)array_repr,
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 1c93fa0ef..ecaca72a1 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -47,7 +47,7 @@ static NPY_INLINE npy_bool
PySequence_NoString_Check(PyObject *op) {
return
PySequence_Check(op) &&
- !PyString_Check(op) &&
+ !PyBytes_Check(op) &&
!PyUnicode_Check(op) &&
!PyArray_IsZeroDim(op);
}
@@ -175,7 +175,7 @@ MyPyLong_AsUnsigned@Type@ (PyObject *obj)
*
* #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, LONG, UINT, ULONG,
* LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE#
- * #func1 = PyBool_FromLong, PyInt_FromLong*6, PyLong_FromUnsignedLong*2,
+ * #func1 = PyBool_FromLong, PyLong_FromLong*6, PyLong_FromUnsignedLong*2,
* PyLong_FromLongLong, PyLong_FromUnsignedLongLong,
* MyPyFloat_FromHalf, PyFloat_FromDouble*2#
* #func2 = PyObject_IsTrue, MyPyLong_AsLong*6, MyPyLong_AsUnsignedLong*2,
@@ -302,9 +302,42 @@ static int
oop.real = NPY_NAN;
oop.imag = NPY_NAN;
}
+ else if (PyBytes_Check(op) || PyUnicode_Check(op)) {
+ /*
+ * Unlike most numeric conversion functions PyComplex_AsCComplex
+ * does not handle strings, so we have to use its constructor.
+ */
+ PyObject *pycomplex, *args;
+ if (PyBytes_Check(op)) {
+ /* The complex constructor expects unicode */
+ PyObject *unicode;
+ unicode = PyUnicode_FromEncodedObject(op, NULL, NULL);
+ if (unicode == NULL) {
+ return -1;
+ }
+ args = PyTuple_Pack(1, unicode);
+ Py_DECREF(unicode);
+ }
+ else {
+ args = PyTuple_Pack(1, op);
+ }
+ if (args == NULL) {
+ return -1;
+ }
+ pycomplex = PyComplex_Type.tp_new(&PyComplex_Type, args, NULL);
+ Py_DECREF(args);
+ if (pycomplex == NULL) {
+ return -1;
+ }
+ oop = PyComplex_AsCComplex(pycomplex);
+ Py_DECREF(pycomplex);
+ if (error_converting(oop.real)) {
+ return -1;
+ }
+ }
else {
- oop = PyComplex_AsCComplex (op);
- if (PyErr_Occurred()) {
+ oop = PyComplex_AsCComplex(op);
+ if (error_converting(oop.real)) {
return -1;
}
}
@@ -615,7 +648,7 @@ static PyObject *
OBJECT_getitem(void *ip, void *NPY_UNUSED(ap))
{
PyObject *obj;
- NPY_COPY_PYOBJECT_PTR(&obj, ip);
+ memcpy(&obj, ip, sizeof(obj));
if (obj == NULL) {
Py_RETURN_NONE;
}
@@ -631,12 +664,12 @@ OBJECT_setitem(PyObject *op, void *ov, void *NPY_UNUSED(ap))
{
PyObject *obj;
- NPY_COPY_PYOBJECT_PTR(&obj, ov);
+ memcpy(&obj, ov, sizeof(obj));
Py_INCREF(op);
Py_XDECREF(obj);
- NPY_COPY_PYOBJECT_PTR(ov, &op);
+ memcpy(ov, &op, sizeof(op));
return PyErr_Occurred() ? -1 : 0;
}
@@ -832,7 +865,7 @@ VOID_setitem(PyObject *op, void *input, void *vap)
npy_intp names_size = PyTuple_GET_SIZE(descr->names);
if (names_size != PyTuple_Size(op)) {
- errmsg = PyUString_FromFormat(
+ errmsg = PyUnicode_FromFormat(
"could not assign tuple of length %zd to structure "
"with %" NPY_INTP_FMT " fields.",
PyTuple_Size(op), names_size);
@@ -2204,11 +2237,11 @@ OBJECT_copyswapn(PyObject **dst, npy_intp dstride, PyObject **src,
dstp = (unsigned char*)dst;
srcp = (unsigned char*)src;
for (i = 0; i < n; i++) {
- NPY_COPY_PYOBJECT_PTR(&tmp, srcp);
+ memcpy(&tmp, srcp, sizeof(tmp));
Py_XINCREF(tmp);
- NPY_COPY_PYOBJECT_PTR(&tmp, dstp);
+ memcpy(&tmp, dstp, sizeof(tmp));
Py_XDECREF(tmp);
- NPY_COPY_PYOBJECT_PTR(dstp, srcp);
+ memcpy(dstp, srcp, sizeof(tmp));
dstp += dstride;
srcp += sstride;
}
@@ -2232,11 +2265,11 @@ OBJECT_copyswap(PyObject **dst, PyObject **src, int NPY_UNUSED(swap),
}
else {
PyObject *tmp;
- NPY_COPY_PYOBJECT_PTR(&tmp, src);
+ memcpy(&tmp, src, sizeof(tmp));
Py_XINCREF(tmp);
- NPY_COPY_PYOBJECT_PTR(&tmp, dst);
+ memcpy(&tmp, dst, sizeof(tmp));
Py_XDECREF(tmp);
- NPY_COPY_PYOBJECT_PTR(dst, src);
+ memcpy(dst, src, sizeof(tmp));
}
}
}
@@ -2653,7 +2686,7 @@ OBJECT_nonzero (PyObject **ip, PyArrayObject *ap)
}
else {
PyObject *obj;
- NPY_COPY_PYOBJECT_PTR(&obj, ip);
+ memcpy(&obj, ip, sizeof(obj));
if (obj == NULL) {
return NPY_FALSE;
}
@@ -3105,8 +3138,8 @@ BOOL_argmax(npy_bool *ip, npy_intp n, npy_intp *max_ind,
#if defined(__ARM_NEON__) || defined (__ARM_NEON)
uint8x16_t zero = vdupq_n_u8(0);
for(; i < n - (n % 32); i+=32) {
- uint8x16_t d1 = vld1q_u8((char *)&ip[i]);
- uint8x16_t d2 = vld1q_u8((char *)&ip[i + 16]);
+ uint8x16_t d1 = vld1q_u8((uint8_t *)&ip[i]);
+ uint8x16_t d2 = vld1q_u8((uint8_t *)&ip[i + 16]);
d1 = vceqq_u8(d1, zero);
d2 = vceqq_u8(d2, zero);
if(_mm_movemask_epi8_neon(vminq_u8(d1, d2)) != 0xFFFF) {
@@ -4428,7 +4461,7 @@ set_typeinfo(PyObject *dict)
return -1;
}
}
- key = PyInt_FromLong(NPY_@name2@);
+ key = PyLong_FromLong(NPY_@name2@);
if (key == NULL) {
return -1;
}
diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
index 8b482dc03..813850224 100644
--- a/numpy/core/src/multiarray/buffer.c
+++ b/numpy/core/src/multiarray/buffer.c
@@ -267,7 +267,7 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str,
child = (PyArray_Descr*)PyTuple_GetItem(item, 0);
offset_obj = PyTuple_GetItem(item, 1);
- new_offset = PyInt_AsLong(offset_obj);
+ new_offset = PyLong_AsLong(offset_obj);
if (error_converting(new_offset)) {
return -1;
}
@@ -428,35 +428,27 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str,
/*
- * Global information about all active buffers
+ * Information about all active buffers is stored as a linked list on
+ * the ndarray. The initial pointer is currently tagged to have a chance of
+ * detecting incompatible subclasses.
*
* Note: because for backward compatibility we cannot define bf_releasebuffer,
* we must manually keep track of the additional data required by the buffers.
*/
/* Additional per-array data required for providing the buffer interface */
-typedef struct {
+typedef struct _buffer_info_t_tag {
char *format;
int ndim;
Py_ssize_t *strides;
Py_ssize_t *shape;
+ struct _buffer_info_t_tag *next;
} _buffer_info_t;
-/*
- * { id(array): [list of pointers to _buffer_info_t, the last one is latest] }
- *
- * Because shape, strides, and format can be different for different buffers,
- * we may need to keep track of multiple buffer infos for each array.
- *
- * However, when none of them has changed, the same buffer info may be reused.
- *
- * Thread-safety is provided by GIL.
- */
-static PyObject *_buffer_info_cache = NULL;
/* Fill in the info structure */
static _buffer_info_t*
-_buffer_info_new(PyObject *obj)
+_buffer_info_new(PyObject *obj, int flags)
{
/*
* Note that the buffer info is cached as PyLongObjects making them appear
@@ -474,18 +466,18 @@ _buffer_info_new(PyObject *obj)
PyErr_NoMemory();
goto fail;
}
+ info->ndim = 0;
+ info->shape = NULL;
+ info->strides = NULL;
+
descr = PyArray_DescrFromScalar(obj);
if (descr == NULL) {
goto fail;
}
- info->ndim = 0;
- info->shape = NULL;
- info->strides = NULL;
}
else {
assert(PyArray_Check(obj));
PyArrayObject * arr = (PyArrayObject *)obj;
- descr = PyArray_DESCR(arr);
info = PyObject_Malloc(sizeof(_buffer_info_t) +
sizeof(Py_ssize_t) * PyArray_NDIM(arr) * 2);
@@ -504,25 +496,67 @@ _buffer_info_new(PyObject *obj)
info->shape = (npy_intp *)((char *)info + sizeof(_buffer_info_t));
assert((size_t)info->shape % sizeof(npy_intp) == 0);
info->strides = info->shape + PyArray_NDIM(arr);
- for (k = 0; k < PyArray_NDIM(arr); ++k) {
- info->shape[k] = PyArray_DIMS(arr)[k];
- info->strides[k] = PyArray_STRIDES(arr)[k];
+
+#if NPY_RELAXED_STRIDES_CHECKING
+ /*
+ * When NPY_RELAXED_STRIDES_CHECKING is used, some buffer users
+ * may expect a contiguous buffer to have well formatted strides
+ * also when a dimension is 1, but we do not guarantee this
+ * internally. Thus, recalculate strides for contiguous arrays.
+ * (This is unnecessary, but has no effect in the case where
+         * NPY_RELAXED_STRIDES_CHECKING is disabled.)
+ */
+ int f_contiguous = (flags & PyBUF_F_CONTIGUOUS) == PyBUF_F_CONTIGUOUS;
+ if (PyArray_IS_C_CONTIGUOUS(arr) && !(
+ f_contiguous && PyArray_IS_F_CONTIGUOUS(arr))) {
+ Py_ssize_t sd = PyArray_ITEMSIZE(arr);
+ for (k = info->ndim-1; k >= 0; --k) {
+ info->shape[k] = PyArray_DIMS(arr)[k];
+ info->strides[k] = sd;
+ sd *= info->shape[k];
+ }
+ }
+ else if (PyArray_IS_F_CONTIGUOUS(arr)) {
+ Py_ssize_t sd = PyArray_ITEMSIZE(arr);
+ for (k = 0; k < info->ndim; ++k) {
+ info->shape[k] = PyArray_DIMS(arr)[k];
+ info->strides[k] = sd;
+ sd *= info->shape[k];
+ }
+ }
+ else {
+#else /* NPY_RELAXED_STRIDES_CHECKING */
+ /* We can always use the arrays strides directly */
+ {
+#endif
+
+ for (k = 0; k < PyArray_NDIM(arr); ++k) {
+ info->shape[k] = PyArray_DIMS(arr)[k];
+ info->strides[k] = PyArray_STRIDES(arr)[k];
+ }
}
}
+ descr = PyArray_DESCR(arr);
Py_INCREF(descr);
}
/* Fill in format */
- err = _buffer_format_string(descr, &fmt, obj, NULL, NULL);
- Py_DECREF(descr);
- if (err != 0) {
- goto fail;
+ if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) {
+ err = _buffer_format_string(descr, &fmt, obj, NULL, NULL);
+ Py_DECREF(descr);
+ if (err != 0) {
+ goto fail;
+ }
+ if (_append_char(&fmt, '\0') < 0) {
+ goto fail;
+ }
+ info->format = fmt.s;
}
- if (_append_char(&fmt, '\0') < 0) {
- goto fail;
+ else {
+ Py_DECREF(descr);
+ info->format = NULL;
}
- info->format = fmt.s;
-
+ info->next = NULL;
return info;
fail:
@@ -538,9 +572,10 @@ _buffer_info_cmp(_buffer_info_t *a, _buffer_info_t *b)
Py_ssize_t c;
int k;
- c = strcmp(a->format, b->format);
- if (c != 0) return c;
-
+ if (a->format != NULL && b->format != NULL) {
+ c = strcmp(a->format, b->format);
+ if (c != 0) return c;
+ }
c = a->ndim - b->ndim;
if (c != 0) return c;
@@ -554,113 +589,161 @@ _buffer_info_cmp(_buffer_info_t *a, _buffer_info_t *b)
return 0;
}
-static void
-_buffer_info_free(_buffer_info_t *info)
+
+/*
+ * Tag the buffer info pointer by adding 3 (unless it is NULL to simplify
+ * object initialization).
+ * The linked list of buffer-infos was appended to the array struct in
+ * NumPy 1.20. Tagging the pointer gives us a chance to raise/print
+ * a useful error message instead of crashing hard if a C-subclass uses
+ * the same field.
+ */
+static NPY_INLINE void *
+buffer_info_tag(void *buffer_info)
{
- if (info->format) {
- PyObject_Free(info->format);
+ if (buffer_info == NULL) {
+ return buffer_info;
+ }
+ else {
+ return (void *)((uintptr_t)buffer_info + 3);
}
- PyObject_Free(info);
}
-/* Get buffer info from the global dictionary */
-static _buffer_info_t*
-_buffer_get_info(PyObject *obj)
-{
- PyObject *key = NULL, *item_list = NULL, *item = NULL;
- _buffer_info_t *info = NULL, *old_info = NULL;
- if (_buffer_info_cache == NULL) {
- _buffer_info_cache = PyDict_New();
- if (_buffer_info_cache == NULL) {
- return NULL;
- }
- }
-
- /* Compute information */
- info = _buffer_info_new(obj);
- if (info == NULL) {
- return NULL;
+static NPY_INLINE int
+_buffer_info_untag(
+ void *tagged_buffer_info, _buffer_info_t **buffer_info, PyObject *obj)
+{
+ if (tagged_buffer_info == NULL) {
+ *buffer_info = NULL;
+ return 0;
}
-
- /* Check if it is identical with an old one; reuse old one, if yes */
- key = PyLong_FromVoidPtr((void*)obj);
- if (key == NULL) {
- goto fail;
+ if (NPY_UNLIKELY(((uintptr_t)tagged_buffer_info & 0x7) != 3)) {
+ PyErr_Format(PyExc_RuntimeError,
+ "Object of type %S appears to be C subclassed NumPy array, "
+                "void scalar, or allocated in a non-standard way. "
+ "NumPy reserves the right to change the size of these "
+ "structures. Projects are required to take this into account "
+ "by either recompiling against a specific NumPy version or "
+ "padding the struct and enforcing a maximum NumPy version.",
+ Py_TYPE(obj));
+ return -1;
}
- item_list = PyDict_GetItem(_buffer_info_cache, key);
+ *buffer_info = (void *)((uintptr_t)tagged_buffer_info - 3);
+ return 0;
+}
- if (item_list != NULL) {
- Py_INCREF(item_list);
- if (PyList_GET_SIZE(item_list) > 0) {
- item = PyList_GetItem(item_list, PyList_GET_SIZE(item_list) - 1);
- old_info = (_buffer_info_t*)PyLong_AsVoidPtr(item);
- if (_buffer_info_cmp(info, old_info) == 0) {
- _buffer_info_free(info);
- info = old_info;
- }
- }
- }
- else {
- item_list = PyList_New(0);
- if (item_list == NULL) {
- goto fail;
- }
- if (PyDict_SetItem(_buffer_info_cache, key, item_list) != 0) {
- goto fail;
+/*
+ * NOTE: for backward compatibility (esp. with PyArg_ParseTuple("s#", ...))
+ * we do *not* define bf_releasebuffer at all.
+ *
+ * Instead, any extra data allocated with the buffer is released only in
+ * array_dealloc.
+ *
+ * Ensuring that the buffer stays in place is taken care by refcounting;
+ * ndarrays do not reallocate if there are references to them, and a buffer
+ * view holds one reference.
+ *
+ * This is stored in the array's _buffer_info slot (currently as a void *).
+ */
+static void
+_buffer_info_free_untagged(void *_buffer_info)
+{
+ _buffer_info_t *next = _buffer_info;
+ while (next != NULL) {
+ _buffer_info_t *curr = next;
+ next = curr->next;
+ if (curr->format) {
+ PyObject_Free(curr->format);
}
+ /* Shape is allocated as part of info */
+ PyObject_Free(curr);
}
+}
- if (info != old_info) {
- /* Needs insertion */
- item = PyLong_FromVoidPtr((void*)info);
- if (item == NULL) {
- goto fail;
- }
- PyList_Append(item_list, item);
- Py_DECREF(item);
- }
- Py_DECREF(item_list);
- Py_DECREF(key);
- return info;
-
-fail:
- if (info != NULL && info != old_info) {
- _buffer_info_free(info);
+/*
+ * Checks whether the pointer is tagged, and then frees the cache list.
+ * (The tag check is only for transition due to changed structure size in 1.20)
+ */
+NPY_NO_EXPORT int
+_buffer_info_free(void *buffer_info, PyObject *obj)
+{
+ _buffer_info_t *untagged_buffer_info;
+ if (_buffer_info_untag(buffer_info, &untagged_buffer_info, obj) < 0) {
+ return -1;
}
- Py_XDECREF(item_list);
- Py_XDECREF(key);
- return NULL;
+ _buffer_info_free_untagged(untagged_buffer_info);
+ return 0;
}
-/* Clear buffer info from the global dictionary */
-static void
-_buffer_clear_info(PyObject *arr)
+
+/*
+ * Get the buffer info returning either the old one (passed in) or a new
+ * buffer info which holds on to (and thus replaces) the old one.
+ */
+static _buffer_info_t*
+_buffer_get_info(void **buffer_info_cache_ptr, PyObject *obj, int flags)
{
- PyObject *key, *item_list, *item;
- _buffer_info_t *info;
- int k;
+ _buffer_info_t *info = NULL;
+ _buffer_info_t *stored_info; /* First currently stored buffer info */
- if (_buffer_info_cache == NULL) {
- return;
+ if (_buffer_info_untag(*buffer_info_cache_ptr, &stored_info, obj) < 0) {
+ return NULL;
}
+ _buffer_info_t *old_info = stored_info;
- key = PyLong_FromVoidPtr((void*)arr);
- item_list = PyDict_GetItem(_buffer_info_cache, key);
- if (item_list != NULL) {
- for (k = 0; k < PyList_GET_SIZE(item_list); ++k) {
- item = PyList_GET_ITEM(item_list, k);
- info = (_buffer_info_t*)PyLong_AsVoidPtr(item);
- _buffer_info_free(info);
+ /* Compute information (it would be nice to skip this in simple cases) */
+ info = _buffer_info_new(obj, flags);
+ if (info == NULL) {
+ return NULL;
+ }
+
+ if (old_info != NULL && _buffer_info_cmp(info, old_info) != 0) {
+ _buffer_info_t *next_info = old_info->next;
+ old_info = NULL; /* Can't use this one, but possibly next */
+
+ if (info->ndim > 1 && next_info != NULL) {
+ /*
+ * Some arrays are C- and F-contiguous and if they have more
+ * than one dimension, the buffer-info may differ between
+ * the two due to RELAXED_STRIDES_CHECKING.
+ * If we export both buffers, the first stored one may be
+ * the one for the other contiguity, so check both.
+ * This is generally very unlikely in all other cases, since
+ * in all other cases the first one will match unless array
+ * metadata was modified in-place (which is discouraged).
+ */
+ if (_buffer_info_cmp(info, next_info) == 0) {
+ old_info = next_info;
+ }
+ }
+ }
+ if (old_info != NULL) {
+ /*
+ * The two info->format are considered equal if one of them
+ * has no format set (meaning the format is arbitrary and can
+ * be modified). If the new info has a format, but we reuse
+ * the old one, this transfers the ownership to the old one.
+ */
+ if (old_info->format == NULL) {
+ old_info->format = info->format;
+ info->format = NULL;
}
- PyDict_DelItem(_buffer_info_cache, key);
+ _buffer_info_free_untagged(info);
+ info = old_info;
+ }
+ else {
+ /* Insert new info as first item in the linked buffer-info list. */
+ info->next = stored_info;
+ *buffer_info_cache_ptr = buffer_info_tag(info);
}
- Py_DECREF(key);
+ return info;
}
+
/*
* Retrieving buffers for ndarray
*/
@@ -705,8 +788,9 @@ array_getbuffer(PyObject *obj, Py_buffer *view, int flags)
goto fail;
}
- /* Fill in information */
- info = _buffer_get_info(obj);
+ /* Fill in information (and add it to _buffer_info if necessary) */
+ info = _buffer_get_info(
+ &((PyArrayObject_fields *)self)->_buffer_info, obj, flags);
if (info == NULL) {
goto fail;
}
@@ -742,35 +826,6 @@ array_getbuffer(PyObject *obj, Py_buffer *view, int flags)
}
if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) {
view->strides = info->strides;
-
-#ifdef NPY_RELAXED_STRIDES_CHECKING
- /*
- * If NPY_RELAXED_STRIDES_CHECKING is on, the array may be
- * contiguous, but it won't look that way to Python when it
- * tries to determine contiguity by looking at the strides
- * (since one of the elements may be -1). In that case, just
- * regenerate strides from shape.
- */
- if (PyArray_CHKFLAGS(self, NPY_ARRAY_C_CONTIGUOUS) &&
- !((flags & PyBUF_F_CONTIGUOUS) == PyBUF_F_CONTIGUOUS)) {
- Py_ssize_t sd = view->itemsize;
- int i;
-
- for (i = view->ndim-1; i >= 0; --i) {
- view->strides[i] = sd;
- sd *= view->shape[i];
- }
- }
- else if (PyArray_CHKFLAGS(self, NPY_ARRAY_F_CONTIGUOUS)) {
- Py_ssize_t sd = view->itemsize;
- int i;
-
- for (i = 0; i < view->ndim; ++i) {
- view->strides[i] = sd;
- sd *= view->shape[i];
- }
- }
-#endif
}
else {
view->strides = NULL;
@@ -785,90 +840,48 @@ fail:
}
/*
- * Retrieving buffers for scalars
+ * Retrieving buffers for void scalar (which can contain any complex types),
+ * defined in buffer.c since it requires the complex format building logic.
*/
-int
+NPY_NO_EXPORT int
void_getbuffer(PyObject *self, Py_buffer *view, int flags)
{
- _buffer_info_t *info = NULL;
- PyArray_Descr *descr = NULL;
- int elsize;
+ PyVoidScalarObject *scalar = (PyVoidScalarObject *)self;
if (flags & PyBUF_WRITABLE) {
PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
- goto fail;
- }
-
- /* Fill in information */
- info = _buffer_get_info(self);
- if (info == NULL) {
- goto fail;
- }
-
- view->ndim = info->ndim;
- view->shape = info->shape;
- view->strides = info->strides;
-
- if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) {
- view->format = info->format;
- } else {
- view->format = NULL;
- }
-
- descr = PyArray_DescrFromScalar(self);
- view->buf = (void *)scalar_value(self, descr);
- elsize = descr->elsize;
- view->len = elsize;
- if (PyArray_IsScalar(self, Datetime) || PyArray_IsScalar(self, Timedelta)) {
- elsize = 1; /* descr->elsize,char is 8,'M', but we return 1,'B' */
+ return -1;
}
- view->itemsize = elsize;
-
- Py_DECREF(descr);
+ view->ndim = 0;
+ view->shape = NULL;
+ view->strides = NULL;
+ view->suboffsets = NULL;
+ view->len = scalar->descr->elsize;
+ view->itemsize = scalar->descr->elsize;
view->readonly = 1;
view->suboffsets = NULL;
- view->obj = self;
Py_INCREF(self);
- return 0;
-
-fail:
- view->obj = NULL;
- return -1;
-}
-
-/*
- * NOTE: for backward compatibility (esp. with PyArg_ParseTuple("s#", ...))
- * we do *not* define bf_releasebuffer at all.
- *
- * Instead, any extra data allocated with the buffer is released only in
- * array_dealloc.
- *
- * Ensuring that the buffer stays in place is taken care by refcounting;
- * ndarrays do not reallocate if there are references to them, and a buffer
- * view holds one reference.
- */
-
-NPY_NO_EXPORT void
-_dealloc_cached_buffer_info(PyObject *self)
-{
- int reset_error_state = 0;
- PyObject *ptype, *pvalue, *ptraceback;
-
- /* This function may be called when processing an exception --
- * we need to stash the error state to avoid confusing PyDict
- */
+ view->obj = self;
+ view->buf = scalar->obval;
- if (PyErr_Occurred()) {
- reset_error_state = 1;
- PyErr_Fetch(&ptype, &pvalue, &ptraceback);
+ if (((flags & PyBUF_FORMAT) != PyBUF_FORMAT)) {
+ /* It is unnecessary to find the correct format */
+ view->format = NULL;
+ return 0;
}
- _buffer_clear_info(self);
-
- if (reset_error_state) {
- PyErr_Restore(ptype, pvalue, ptraceback);
+ /*
+ * If a format is being exported, we need to use _buffer_get_info
+ * to find the correct format. This format must also be stored, since
+ * at least in theory it can change (in practice it should never change).
+ */
+ _buffer_info_t *info = _buffer_get_info(&scalar->_buffer_info, self, flags);
+ if (info == NULL) {
+ return -1;
}
+ view->format = info->format;
+ return 0;
}
@@ -931,7 +944,7 @@ _descriptor_from_pep3118_format(char const *s)
}
*p = '\0';
- str = PyUString_FromStringAndSize(buf, strlen(buf));
+ str = PyUnicode_FromStringAndSize(buf, strlen(buf));
if (str == NULL) {
free(buf);
return NULL;
diff --git a/numpy/core/src/multiarray/calculation.c b/numpy/core/src/multiarray/calculation.c
index 92ab75053..43d88271b 100644
--- a/numpy/core/src/multiarray/calculation.c
+++ b/numpy/core/src/multiarray/calculation.c
@@ -392,7 +392,7 @@ __New_PyArray_Std(PyArrayObject *self, int axis, int rtype, PyArrayObject *out,
else {
val = PyArray_DIM(arrnew,i);
}
- PyTuple_SET_ITEM(newshape, i, PyInt_FromLong((long)val));
+ PyTuple_SET_ITEM(newshape, i, PyLong_FromLong((long)val));
}
arr2 = (PyArrayObject *)PyArray_Reshape(arr1, newshape);
Py_DECREF(arr1);
@@ -1023,7 +1023,7 @@ PyArray_Clip(PyArrayObject *self, PyObject *min, PyObject *max, PyArrayObject *o
if (min != NULL) {
if (PyArray_ISUNSIGNED(self)) {
int cmp;
- zero = PyInt_FromLong(0);
+ zero = PyLong_FromLong(0);
cmp = PyObject_RichCompareBool(min, zero, Py_LT);
if (cmp == -1) {
Py_DECREF(zero);
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index 2abc79167..841ed799d 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -12,7 +12,6 @@
#include "abstractdtypes.h"
#include "usertypes.h"
-#include "common.h"
#include "npy_buffer.h"
#include "get_attr_string.h"
@@ -127,26 +126,6 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims, PyArray_Descr **out_dtype)
return 0;
}
-
-/* new reference */
-NPY_NO_EXPORT PyArray_Descr *
-_array_typedescr_fromstr(char const *c_str)
-{
- PyArray_Descr *descr = NULL;
- PyObject *stringobj = PyString_FromString(c_str);
-
- if (stringobj == NULL) {
- return NULL;
- }
- if (PyArray_DescrConverter(stringobj, &descr) != NPY_SUCCEED) {
- Py_DECREF(stringobj);
- return NULL;
- }
- Py_DECREF(stringobj);
- return descr;
-}
-
-
NPY_NO_EXPORT char *
index2ptr(PyArrayObject *mp, npy_intp i)
{
@@ -169,7 +148,7 @@ NPY_NO_EXPORT int
_zerofill(PyArrayObject *ret)
{
if (PyDataType_REFCHK(PyArray_DESCR(ret))) {
- PyObject *zero = PyInt_FromLong(0);
+ PyObject *zero = PyLong_FromLong(0);
PyArray_FillObjectArray(ret, zero);
Py_DECREF(zero);
if (PyErr_Occurred()) {
@@ -254,7 +233,6 @@ NPY_NO_EXPORT PyObject *
convert_shape_to_string(npy_intp n, npy_intp const *vals, char *ending)
{
npy_intp i;
- PyObject *ret, *tmp;
/*
* Negative dimension indicates "newaxis", which can
@@ -264,40 +242,40 @@ convert_shape_to_string(npy_intp n, npy_intp const *vals, char *ending)
for (i = 0; i < n && vals[i] < 0; i++);
if (i == n) {
- return PyUString_FromFormat("()%s", ending);
- }
- else {
- ret = PyUString_FromFormat("(%" NPY_INTP_FMT, vals[i++]);
- if (ret == NULL) {
- return NULL;
- }
+ return PyUnicode_FromFormat("()%s", ending);
}
+ PyObject *ret = PyUnicode_FromFormat("%" NPY_INTP_FMT, vals[i++]);
+ if (ret == NULL) {
+ return NULL;
+ }
for (; i < n; ++i) {
+ PyObject *tmp;
+
if (vals[i] < 0) {
- tmp = PyUString_FromString(",newaxis");
+ tmp = PyUnicode_FromString(",newaxis");
}
else {
- tmp = PyUString_FromFormat(",%" NPY_INTP_FMT, vals[i]);
+ tmp = PyUnicode_FromFormat(",%" NPY_INTP_FMT, vals[i]);
}
if (tmp == NULL) {
Py_DECREF(ret);
return NULL;
}
- PyUString_ConcatAndDel(&ret, tmp);
+ Py_SETREF(ret, PyUnicode_Concat(ret, tmp));
+ Py_DECREF(tmp);
if (ret == NULL) {
return NULL;
}
}
if (i == 1) {
- tmp = PyUString_FromFormat(",)%s", ending);
+ Py_SETREF(ret, PyUnicode_FromFormat("(%S,)%s", ret, ending));
}
else {
- tmp = PyUString_FromFormat(")%s", ending);
+ Py_SETREF(ret, PyUnicode_FromFormat("(%S)%s", ret, ending));
}
- PyUString_ConcatAndDel(&ret, tmp);
return ret;
}
@@ -310,7 +288,7 @@ dot_alignment_error(PyArrayObject *a, int i, PyArrayObject *b, int j)
*shape1 = NULL, *shape2 = NULL,
*shape1_i = NULL, *shape2_j = NULL;
- format = PyUString_FromString("shapes %s and %s not aligned:"
+ format = PyUnicode_FromString("shapes %s and %s not aligned:"
" %d (dim %d) != %d (dim %d)");
shape1 = convert_shape_to_string(PyArray_NDIM(a), PyArray_DIMS(a), "");
@@ -333,7 +311,7 @@ dot_alignment_error(PyArrayObject *a, int i, PyArrayObject *b, int j)
goto end;
}
- errmsg = PyUString_Format(format, fmt_args);
+ errmsg = PyUnicode_Format(format, fmt_args);
if (errmsg != NULL) {
PyErr_SetObject(PyExc_ValueError, errmsg);
}
@@ -373,10 +351,7 @@ _unpack_field(PyObject *value, PyArray_Descr **descr, npy_intp *offset)
*descr = (PyArray_Descr *)PyTuple_GET_ITEM(value, 0);
off = PyTuple_GET_ITEM(value, 1);
- if (PyInt_Check(off)) {
- *offset = PyInt_AsSsize_t(off);
- }
- else if (PyLong_Check(off)) {
+ if (PyLong_Check(off)) {
*offset = PyLong_AsSsize_t(off);
}
else {
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 793cefaf8..ef9bc79da 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -2,7 +2,6 @@
#define _NPY_PRIVATE_COMMON_H_
#include "structmember.h"
#include <numpy/npy_common.h>
-#include <numpy/npy_cpu.h>
#include <numpy/ndarraytypes.h>
#include <limits.h>
#include "npy_import.h"
@@ -292,43 +291,6 @@ npy_memchr(char * haystack, char needle,
return p;
}
-/*
- * Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done
- * (BLAS won't handle negative or zero strides the way we want).
- */
-static NPY_INLINE int
-blas_stride(npy_intp stride, unsigned itemsize)
-{
- /*
- * Should probably check pointer alignment also, but this may cause
- * problems if we require complex to be 16 byte aligned.
- */
- if (stride > 0 && npy_is_aligned((void *)stride, itemsize)) {
- stride /= itemsize;
-#ifndef HAVE_BLAS_ILP64
- if (stride <= INT_MAX) {
-#else
- if (stride <= NPY_MAX_INT64) {
-#endif
- return stride;
- }
- }
- return 0;
-}
-
-/*
- * Define a chunksize for CBLAS. CBLAS counts in integers.
- */
-#if NPY_MAX_INTP > INT_MAX
-# ifndef HAVE_BLAS_ILP64
-# define NPY_CBLAS_CHUNK (INT_MAX / 2 + 1)
-# else
-# define NPY_CBLAS_CHUNK (NPY_MAX_INT64 / 2 + 1)
-# endif
-#else
-# define NPY_CBLAS_CHUNK NPY_MAX_INTP
-#endif
-
#include "ucsnarrow.h"
/*
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index a8e4aa789..da857071b 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -249,7 +249,7 @@ arr__monotonicity(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
NPY_END_THREADS
Py_DECREF(arr_x);
- return PyInt_FromLong(monotonic);
+ return PyLong_FromLong(monotonic);
}
/*
@@ -1229,41 +1229,6 @@ arr_unravel_index(PyObject *self, PyObject *args, PyObject *kwds)
char *kwlist[] = {"indices", "shape", "order", NULL};
- /*
- * TODO: remove this in favor of warning raised in the dispatcher when
- * __array_function__ is enabled by default.
- */
-
- /*
- * Continue to support the older "dims" argument in place
- * of the "shape" argument. Issue an appropriate warning
- * if "dims" is detected in keywords, then replace it with
- * the new "shape" argument and continue processing as usual.
- */
- if (kwds) {
- PyObject *dims_item, *shape_item;
- dims_item = _PyDict_GetItemStringWithError(kwds, "dims");
- if (dims_item == NULL && PyErr_Occurred()){
- return NULL;
- }
- shape_item = _PyDict_GetItemStringWithError(kwds, "shape");
- if (shape_item == NULL && PyErr_Occurred()){
- return NULL;
- }
- if (dims_item != NULL && shape_item == NULL) {
- if (DEPRECATE("'shape' argument should be"
- " used instead of 'dims'") < 0) {
- return NULL;
- }
- if (PyDict_SetItemString(kwds, "shape", dims_item) < 0) {
- return NULL;
- }
- if (PyDict_DelItemString(kwds, "dims") < 0) {
- return NULL;
- }
- }
- }
-
if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&|O&:unravel_index",
kwlist,
&indices0,
@@ -1420,7 +1385,7 @@ arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args)
{
PyObject *obj;
PyObject *str;
- #if (PY_VERSION_HEX >= 0x030700A2)
+ #if PY_VERSION_HEX >= 0x030700A2 && (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM > 0x07030300)
const char *docstr;
#else
char *docstr;
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
index e41fdc8f1..dd18f71fd 100644
--- a/numpy/core/src/multiarray/conversion_utils.c
+++ b/numpy/core/src/multiarray/conversion_utils.c
@@ -6,7 +6,6 @@
#define _MULTIARRAYMODULE
#include "numpy/arrayobject.h"
#include "numpy/arrayscalars.h"
-#include "numpy/arrayobject.h"
#include "npy_config.h"
#include "npy_pycompat.h"
@@ -1152,7 +1151,7 @@ PyArray_IntTupleFromIntp(int len, npy_intp const *vals)
}
for (i = 0; i < len; i++) {
#if NPY_SIZEOF_INTP <= NPY_SIZEOF_LONG
- PyObject *o = PyInt_FromLong((long) vals[i]);
+ PyObject *o = PyLong_FromLong((long) vals[i]);
#else
PyObject *o = PyLong_FromLongLong((npy_longlong) vals[i]);
#endif
diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c
index e7cbeaa77..29a2bb0e8 100644
--- a/numpy/core/src/multiarray/convert.c
+++ b/numpy/core/src/multiarray/convert.c
@@ -8,9 +8,6 @@
#define _MULTIARRAYMODULE
#include "numpy/arrayobject.h"
#include "numpy/arrayscalars.h"
-
-#include "npy_config.h"
-
#include "npy_pycompat.h"
#include "common.h"
@@ -248,13 +245,13 @@ PyArray_ToFile(PyArrayObject *self, FILE *fp, char *sep, char *format)
return -1;
}
PyTuple_SET_ITEM(tupobj,0,obj);
- obj = PyUString_FromString((const char *)format);
+ obj = PyUnicode_FromString((const char *)format);
if (obj == NULL) {
Py_DECREF(tupobj);
Py_DECREF(it);
return -1;
}
- strobj = PyUString_Format(obj, tupobj);
+ strobj = PyUnicode_Format(obj, tupobj);
Py_DECREF(obj);
Py_DECREF(tupobj);
if (strobj == NULL) {
@@ -403,7 +400,7 @@ PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj)
}
}
/* Python integer */
- else if (PyLong_Check(obj) || PyInt_Check(obj)) {
+ else if (PyLong_Check(obj)) {
/* Try long long before unsigned long long */
npy_longlong ll_v = PyLong_AsLongLong(obj);
if (error_converting(ll_v)) {
diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c
index 94cd1e5fa..f9dd35a73 100644
--- a/numpy/core/src/multiarray/convert_datatype.c
+++ b/numpy/core/src/multiarray/convert_datatype.c
@@ -18,10 +18,13 @@
#include "dtypemeta.h"
#include "scalartypes.h"
#include "mapping.h"
+#include "legacy_dtype_implementation.h"
#include "convert_datatype.h"
#include "_datetime.h"
#include "datetime_strings.h"
+#include "array_method.h"
+#include "usertypes.h"
/*
@@ -35,6 +38,183 @@
*/
NPY_NO_EXPORT npy_intp REQUIRED_STR_LEN[] = {0, 3, 5, 10, 10, 20, 20, 20, 20};
+
+static PyObject *
+PyArray_GetGenericToVoidCastingImpl(void);
+
+static PyObject *
+PyArray_GetVoidToGenericCastingImpl(void);
+
+static PyObject *
+PyArray_GetGenericToObjectCastingImpl(void);
+
+static PyObject *
+PyArray_GetObjectToGenericCastingImpl(void);
+
+
+/**
+ * Fetch the casting implementation from one DType to another.
+ *
+ * @param from
+ * @param to
+ *
+ * @returns A castingimpl (PyArrayDTypeMethod *), None or NULL with an
+ * error set.
+ */
+static PyObject *
+PyArray_GetCastingImpl(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to)
+{
+ PyObject *res = PyDict_GetItem(from->castingimpls, (PyObject *)to);
+ if (res != NULL || PyErr_Occurred()) {
+ Py_XINCREF(res);
+ return res;
+ }
+ /*
+ * The following code looks up CastingImpl based on the fact that anything
+ * can be cast to and from objects or structured (void) dtypes.
+ *
+ * The last part adds casts dynamically based on legacy definition
+ */
+ if (from->type_num == NPY_OBJECT) {
+ res = PyArray_GetObjectToGenericCastingImpl();
+ }
+ else if (to->type_num == NPY_OBJECT) {
+ res = PyArray_GetGenericToObjectCastingImpl();
+ }
+ else if (from->type_num == NPY_VOID) {
+ res = PyArray_GetVoidToGenericCastingImpl();
+ }
+ else if (to->type_num == NPY_VOID) {
+ res = PyArray_GetGenericToVoidCastingImpl();
+ }
+ else if (from->type_num < NPY_NTYPES && to->type_num < NPY_NTYPES) {
+ /* All builtin dtypes have their casts explicitly defined. */
+ PyErr_Format(PyExc_RuntimeError,
+ "builtin cast from %S to %S not found, this should not "
+ "be possible.", from, to);
+ return NULL;
+ }
+ else {
+ if (from->parametric || to->parametric) {
+ Py_RETURN_NONE;
+ }
+ /* Reject non-legacy dtypes (they need to use the new API) */
+ if (!from->legacy || !to->legacy) {
+ Py_RETURN_NONE;
+ }
+ if (from != to) {
+ /* A cast function must have been registered */
+ PyArray_VectorUnaryFunc *castfunc = PyArray_GetCastFunc(
+ from->singleton, to->type_num);
+ if (castfunc == NULL) {
+ PyErr_Clear();
+ /* Remember that this cast is not possible */
+ if (PyDict_SetItem(from->castingimpls, (PyObject *) to, Py_None) < 0) {
+ return NULL;
+ }
+ Py_RETURN_NONE;
+ }
+ }
+
+ /* PyArray_AddLegacyWrapping_CastingImpl finds the correct casting level: */
+ /*
+ * TODO: Possibly move this to the cast registration time. But if we do
+ * that, we have to also update the cast when the casting safety
+ * is registered.
+ */
+ if (PyArray_AddLegacyWrapping_CastingImpl(from, to, -1) < 0) {
+ return NULL;
+ }
+ return PyArray_GetCastingImpl(from, to);
+ }
+
+ if (res == NULL) {
+ return NULL;
+ }
+ if (PyDict_SetItem(from->castingimpls, (PyObject *)to, res) < 0) {
+ Py_DECREF(res);
+ return NULL;
+ }
+ return res;
+}
+
+
+/**
+ * Fetch the (bound) casting implementation from one DType to another.
+ *
+ * @param from
+ * @param to
+ *
+ * @returns A bound casting implementation or None (or NULL for error).
+ */
+static PyObject *
+PyArray_GetBoundCastingImpl(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to)
+{
+ PyObject *method = PyArray_GetCastingImpl(from, to);
+ if (method == NULL || method == Py_None) {
+ return method;
+ }
+
+ /* TODO: Create better way to wrap method into bound method */
+ PyBoundArrayMethodObject *res;
+ res = PyObject_New(PyBoundArrayMethodObject, &PyBoundArrayMethod_Type);
+ if (res == NULL) {
+ return NULL;
+ }
+ res->method = (PyArrayMethodObject *)method;
+ res->dtypes = PyMem_Malloc(2 * sizeof(PyArray_DTypeMeta *));
+ if (res->dtypes == NULL) {
+ Py_DECREF(res);
+ return NULL;
+ }
+ Py_INCREF(from);
+ res->dtypes[0] = from;
+ Py_INCREF(to);
+ res->dtypes[1] = to;
+
+ return (PyObject *)res;
+}
+
+
+NPY_NO_EXPORT PyObject *
+_get_castingimpl(PyObject *NPY_UNUSED(module), PyObject *args)
+{
+ PyArray_DTypeMeta *from, *to;
+ if (!PyArg_ParseTuple(args, "O!O!:_get_castingimpl",
+ &PyArrayDTypeMeta_Type, &from, &PyArrayDTypeMeta_Type, &to)) {
+ return NULL;
+ }
+ return PyArray_GetBoundCastingImpl(from, to);
+}
+
+
+/**
+ * Find the minimal cast safety level given two cast-levels as input.
+ * Supports the NPY_CAST_IS_VIEW check, and should be preferred to allow
+ * extending cast-levels if necessary.
+ * It is not valid for one of the arguments to be -1 to indicate an error.
+ *
+ * @param casting1
+ * @param casting2
+ * @return The minimal casting safety level (returns -1 on error).
+ */
+NPY_NO_EXPORT NPY_CASTING
+PyArray_MinCastSafety(NPY_CASTING casting1, NPY_CASTING casting2)
+{
+ if (casting1 < 0 || casting2 < 0) {
+ return -1;
+ }
+ NPY_CASTING view = casting1 & casting2 & _NPY_CAST_IS_VIEW;
+ casting1 = casting1 & ~_NPY_CAST_IS_VIEW;
+ casting2 = casting2 & ~_NPY_CAST_IS_VIEW;
+ /* larger casting values are less safe */
+ if (casting1 > casting2) {
+ return casting1 | view;
+ }
+ return casting2 | view;
+}
+
+
/*NUMPY_API
* For backward compatibility
*
@@ -92,11 +272,14 @@ PyArray_GetCastFunc(PyArray_Descr *descr, int type_num)
PyObject *key;
PyObject *cobj;
- key = PyInt_FromLong(type_num);
+ key = PyLong_FromLong(type_num);
cobj = PyDict_GetItem(obj, key);
Py_DECREF(key);
- if (cobj && NpyCapsule_Check(cobj)) {
- castfunc = NpyCapsule_AsVoidPtr(cobj);
+ if (cobj && PyCapsule_CheckExact(cobj)) {
+ castfunc = PyCapsule_GetPointer(cobj, NULL);
+ if (castfunc == NULL) {
+ return NULL;
+ }
}
}
}
@@ -129,170 +312,6 @@ PyArray_GetCastFunc(PyArray_Descr *descr, int type_num)
return NULL;
}
-/*
- * Legacy function to find the correct dtype when casting from any built-in
- * dtype to NPY_STRING, NPY_UNICODE, NPY_VOID, and NPY_DATETIME with generic
- * units.
- *
- * This function returns a dtype based on flex_dtype and the values in
- * data_dtype. It also calls Py_DECREF on the flex_dtype. If the
- * flex_dtype is not flexible, it returns it as-is.
- *
- * Usually, if data_obj is not an array, dtype should be the result
- * given by the PyArray_GetArrayParamsFromObject function.
- *
- * If *flex_dtype is NULL, returns immediately, without setting an
- * exception, leaving any previous error handling intact.
- */
-NPY_NO_EXPORT PyArray_Descr *
-PyArray_AdaptFlexibleDType(PyArray_Descr *data_dtype, PyArray_Descr *flex_dtype)
-{
- PyArray_DatetimeMetaData *meta;
- PyArray_Descr *retval = NULL;
- int flex_type_num;
-
- if (flex_dtype == NULL) {
- return retval;
- }
-
- flex_type_num = flex_dtype->type_num;
-
- /* Flexible types with expandable size */
- if (PyDataType_ISUNSIZED(flex_dtype)) {
- /* First replace the flex_dtype */
- retval = PyArray_DescrNew(flex_dtype);
- Py_DECREF(flex_dtype);
- if (retval == NULL) {
- return retval;
- }
-
- if (data_dtype->type_num == flex_type_num ||
- flex_type_num == NPY_VOID) {
- (retval)->elsize = data_dtype->elsize;
- }
- else if (flex_type_num == NPY_STRING || flex_type_num == NPY_UNICODE) {
- npy_intp size = 8;
-
- /*
- * Get a string-size estimate of the input. These
- * are generallly the size needed, rounded up to
- * a multiple of eight.
- */
- switch (data_dtype->type_num) {
- case NPY_BOOL:
- case NPY_UBYTE:
- case NPY_BYTE:
- case NPY_USHORT:
- case NPY_SHORT:
- case NPY_UINT:
- case NPY_INT:
- case NPY_ULONG:
- case NPY_LONG:
- case NPY_ULONGLONG:
- case NPY_LONGLONG:
- if (data_dtype->kind == 'b') {
- /* 5 chars needed for cast to 'True' or 'False' */
- size = 5;
- }
- else if (data_dtype->elsize > 8 ||
- data_dtype->elsize < 0) {
- /*
- * Element size should never be greater than 8 or
- * less than 0 for integer type, but just in case...
- */
- break;
- }
- else if (data_dtype->kind == 'u') {
- size = REQUIRED_STR_LEN[data_dtype->elsize];
- }
- else if (data_dtype->kind == 'i') {
- /* Add character for sign symbol */
- size = REQUIRED_STR_LEN[data_dtype->elsize] + 1;
- }
- break;
- case NPY_HALF:
- case NPY_FLOAT:
- case NPY_DOUBLE:
- size = 32;
- break;
- case NPY_LONGDOUBLE:
- size = 48;
- break;
- case NPY_CFLOAT:
- case NPY_CDOUBLE:
- size = 2 * 32;
- break;
- case NPY_CLONGDOUBLE:
- size = 2 * 48;
- break;
- case NPY_OBJECT:
- size = 64;
- break;
- case NPY_STRING:
- case NPY_VOID:
- size = data_dtype->elsize;
- break;
- case NPY_UNICODE:
- size = data_dtype->elsize / 4;
- break;
- case NPY_DATETIME:
- meta = get_datetime_metadata_from_dtype(data_dtype);
- if (meta == NULL) {
- Py_DECREF(retval);
- return NULL;
- }
- size = get_datetime_iso_8601_strlen(0, meta->base);
- break;
- case NPY_TIMEDELTA:
- size = 21;
- break;
- }
-
- if (flex_type_num == NPY_STRING) {
- retval->elsize = size;
- }
- else if (flex_type_num == NPY_UNICODE) {
- retval->elsize = size * 4;
- }
- }
- else {
- /*
- * We should never get here, but just in case someone adds
- * a new flex dtype...
- */
- PyErr_SetString(PyExc_TypeError,
- "don't know how to adapt flex dtype");
- Py_DECREF(retval);
- return NULL;
- }
- }
- /* Flexible type with generic time unit that adapts */
- else if (flex_type_num == NPY_DATETIME ||
- flex_type_num == NPY_TIMEDELTA) {
- meta = get_datetime_metadata_from_dtype(flex_dtype);
- retval = flex_dtype;
- if (meta == NULL) {
- return NULL;
- }
-
- if (meta->base == NPY_FR_GENERIC) {
- if (data_dtype->type_num == NPY_DATETIME ||
- data_dtype->type_num == NPY_TIMEDELTA) {
- meta = get_datetime_metadata_from_dtype(data_dtype);
- if (meta == NULL) {
- return NULL;
- }
-
- retval = create_datetime_dtype(flex_type_num, meta);
- Py_DECREF(flex_dtype);
- }
- }
- }
- else {
- retval = flex_dtype;
- }
- return retval;
-}
/*
* Must be broadcastable.
@@ -322,60 +341,115 @@ PyArray_CastAnyTo(PyArrayObject *out, PyArrayObject *mp)
return PyArray_CopyAnyInto(out, mp);
}
+
+/**
+ * Given two dtype instances, find the correct casting safety.
+ *
+ * Note that in many cases, it may be preferable to fetch the casting
+ * implementations fully to have them available for doing the actual cast
+ * later.
+ *
+ * @param from
+ * @param to The descriptor to cast to (may be NULL)
+ * @param to_dtype If `to` is NULL, must pass the to_dtype (otherwise this
+ * is ignored).
+ * @return NPY_CASTING or -1 on error or if the cast is not possible.
+ */
+NPY_NO_EXPORT NPY_CASTING
+PyArray_GetCastSafety(
+ PyArray_Descr *from, PyArray_Descr *to, PyArray_DTypeMeta *to_dtype)
+{
+ NPY_CASTING casting;
+ if (to != NULL) {
+ to_dtype = NPY_DTYPE(to);
+ }
+ PyObject *meth = PyArray_GetCastingImpl(NPY_DTYPE(from), to_dtype);
+ if (meth == NULL) {
+ return -1;
+ }
+ if (meth == Py_None) {
+ Py_DECREF(Py_None);
+ return -1;
+ }
+
+ PyArrayMethodObject *castingimpl = (PyArrayMethodObject *)meth;
+
+ PyArray_DTypeMeta *dtypes[2] = {NPY_DTYPE(from), to_dtype};
+ PyArray_Descr *descrs[2] = {from, to};
+ PyArray_Descr *out_descrs[2];
+
+ casting = castingimpl->resolve_descriptors(
+ castingimpl, dtypes, descrs, out_descrs);
+ Py_DECREF(meth);
+ if (casting < 0) {
+ return -1;
+ }
+ /* The returned descriptors may not match, requiring a second check */
+ if (out_descrs[0] != descrs[0]) {
+ NPY_CASTING from_casting = PyArray_GetCastSafety(
+ descrs[0], out_descrs[0], NULL);
+ casting = PyArray_MinCastSafety(casting, from_casting);
+ if (casting < 0) {
+ goto finish;
+ }
+ }
+ if (descrs[1] != NULL && out_descrs[1] != descrs[1]) {
+ NPY_CASTING from_casting = PyArray_GetCastSafety(
+ descrs[1], out_descrs[1], NULL);
+ casting = PyArray_MinCastSafety(casting, from_casting);
+ if (casting < 0) {
+ goto finish;
+ }
+ }
+
+ finish:
+ Py_DECREF(out_descrs[0]);
+ Py_DECREF(out_descrs[1]);
+ /* NPY_NO_CASTING has to be used for (NPY_EQUIV_CASTING|_NPY_CAST_IS_VIEW) */
+ assert(casting != (NPY_EQUIV_CASTING|_NPY_CAST_IS_VIEW));
+ return casting;
+}
+
+
/*NUMPY_API
*Check the type coercion rules.
*/
NPY_NO_EXPORT int
PyArray_CanCastSafely(int fromtype, int totype)
{
- PyArray_Descr *from;
-
- /* Fast table lookup for small type numbers */
- if ((unsigned int)fromtype < NPY_NTYPES &&
- (unsigned int)totype < NPY_NTYPES) {
- return _npy_can_cast_safely_table[fromtype][totype];
+#if NPY_USE_NEW_CASTINGIMPL
+ PyArray_DTypeMeta *from = PyArray_DTypeFromTypeNum(fromtype);
+ if (from == NULL) {
+ PyErr_WriteUnraisable(NULL);
+ return 0;
}
-
- /* Identity */
- if (fromtype == totype) {
- return 1;
+ PyArray_DTypeMeta *to = PyArray_DTypeFromTypeNum(totype);
+ if (to == NULL) {
+ PyErr_WriteUnraisable(NULL);
+ return 0;
}
- /* Special-cases for some types */
- switch (fromtype) {
- case NPY_DATETIME:
- case NPY_TIMEDELTA:
- case NPY_OBJECT:
- case NPY_VOID:
- return 0;
- case NPY_BOOL:
- return 1;
+ PyObject *castingimpl = PyArray_GetCastingImpl(from, to);
+ Py_DECREF(from);
+ Py_DECREF(to);
+
+ if (castingimpl == NULL) {
+ PyErr_WriteUnraisable(NULL);
+ return 0;
}
- switch (totype) {
- case NPY_BOOL:
- case NPY_DATETIME:
- case NPY_TIMEDELTA:
- return 0;
- case NPY_OBJECT:
- case NPY_VOID:
- return 1;
+ else if (castingimpl == Py_None) {
+ Py_DECREF(Py_None);
+ return 0;
}
+ NPY_CASTING safety = ((PyArrayMethodObject *)castingimpl)->casting;
+ int res = PyArray_MinCastSafety(safety, NPY_SAFE_CASTING) == NPY_SAFE_CASTING;
+ Py_DECREF(castingimpl);
+ return res;
+#else
+ return PyArray_LegacyCanCastSafely(fromtype, totype);
+#endif
+}
- from = PyArray_DescrFromType(fromtype);
- /*
- * cancastto is a NPY_NOTYPE terminated C-int-array of types that
- * the data-type can be cast to safely.
- */
- if (from->f->cancastto) {
- int *curtype = from->f->cancastto;
- while (*curtype != NPY_NOTYPE) {
- if (*curtype++ == totype) {
- return 1;
- }
- }
- }
- return 0;
-}
/*NUMPY_API
* leaves reference count alone --- cannot be NULL
@@ -386,117 +460,16 @@ PyArray_CanCastSafely(int fromtype, int totype)
NPY_NO_EXPORT npy_bool
PyArray_CanCastTo(PyArray_Descr *from, PyArray_Descr *to)
{
- int from_type_num = from->type_num;
- int to_type_num = to->type_num;
- npy_bool ret;
-
- ret = (npy_bool) PyArray_CanCastSafely(from_type_num, to_type_num);
- if (ret) {
- /* Check String and Unicode more closely */
- if (from_type_num == NPY_STRING) {
- if (to_type_num == NPY_STRING) {
- ret = (from->elsize <= to->elsize);
- }
- else if (to_type_num == NPY_UNICODE) {
- ret = (from->elsize << 2 <= to->elsize);
- }
- }
- else if (from_type_num == NPY_UNICODE) {
- if (to_type_num == NPY_UNICODE) {
- ret = (from->elsize <= to->elsize);
- }
- }
- /*
- * For datetime/timedelta, only treat casts moving towards
- * more precision as safe.
- */
- else if (from_type_num == NPY_DATETIME && to_type_num == NPY_DATETIME) {
- PyArray_DatetimeMetaData *meta1, *meta2;
- meta1 = get_datetime_metadata_from_dtype(from);
- if (meta1 == NULL) {
- PyErr_Clear();
- return 0;
- }
- meta2 = get_datetime_metadata_from_dtype(to);
- if (meta2 == NULL) {
- PyErr_Clear();
- return 0;
- }
-
- return can_cast_datetime64_metadata(meta1, meta2,
- NPY_SAFE_CASTING);
- }
- else if (from_type_num == NPY_TIMEDELTA &&
- to_type_num == NPY_TIMEDELTA) {
- PyArray_DatetimeMetaData *meta1, *meta2;
- meta1 = get_datetime_metadata_from_dtype(from);
- if (meta1 == NULL) {
- PyErr_Clear();
- return 0;
- }
- meta2 = get_datetime_metadata_from_dtype(to);
- if (meta2 == NULL) {
- PyErr_Clear();
- return 0;
- }
-
- return can_cast_timedelta64_metadata(meta1, meta2,
- NPY_SAFE_CASTING);
- }
- /*
- * If to_type_num is STRING or unicode
- * see if the length is long enough to hold the
- * stringified value of the object.
- */
- else if (to_type_num == NPY_STRING || to_type_num == NPY_UNICODE) {
- /*
- * Boolean value cast to string type is 5 characters max
- * for string 'False'.
- */
- int char_size = 1;
- if (to_type_num == NPY_UNICODE) {
- char_size = 4;
- }
-
- ret = 0;
- if (PyDataType_ISUNSIZED(to)) {
- ret = 1;
- }
- /*
- * Need at least 5 characters to convert from boolean
- * to 'True' or 'False'.
- */
- else if (from->kind == 'b' && to->elsize >= 5 * char_size) {
- ret = 1;
- }
- else if (from->kind == 'u') {
- /* Guard against unexpected integer size */
- if (from->elsize > 8 || from->elsize < 0) {
- ret = 0;
- }
- else if (to->elsize >=
- REQUIRED_STR_LEN[from->elsize] * char_size) {
- ret = 1;
- }
- }
- else if (from->kind == 'i') {
- /* Guard against unexpected integer size */
- if (from->elsize > 8 || from->elsize < 0) {
- ret = 0;
- }
- /* Extra character needed for sign */
- else if (to->elsize >=
- (REQUIRED_STR_LEN[from->elsize] + 1) * char_size) {
- ret = 1;
- }
- }
- }
- }
- return ret;
+#if NPY_USE_NEW_CASTINGIMPL
+ return PyArray_CanCastTypeTo(from, to, NPY_SAFE_CASTING);
+#else
+ return PyArray_LegacyCanCastTo(from, to);
+#endif
}
+
/* Provides an ordering for the dtype 'kind' character codes */
-static int
+NPY_NO_EXPORT int
dtype_kind_to_ordering(char kind)
{
switch (kind) {
@@ -557,51 +530,6 @@ type_num_unsigned_to_signed(int type_num)
}
}
-/*
- * Compare two field dictionaries for castability.
- *
- * Return 1 if 'field1' can be cast to 'field2' according to the rule
- * 'casting', 0 if not.
- *
- * Castabiliy of field dictionaries is defined recursively: 'field1' and
- * 'field2' must have the same field names (possibly in different
- * orders), and the corresponding field types must be castable according
- * to the given casting rule.
- */
-static int
-can_cast_fields(PyObject *field1, PyObject *field2, NPY_CASTING casting)
-{
- Py_ssize_t ppos;
- PyObject *key;
- PyObject *tuple1, *tuple2;
-
- if (field1 == field2) {
- return 1;
- }
- if (field1 == NULL || field2 == NULL) {
- return 0;
- }
- if (PyDict_Size(field1) != PyDict_Size(field2)) {
- return 0;
- }
-
- /* Iterate over all the fields and compare for castability */
- ppos = 0;
- while (PyDict_Next(field1, &ppos, &key, &tuple1)) {
- if ((tuple2 = PyDict_GetItem(field2, key)) == NULL) {
- return 0;
- }
- /* Compare the dtype of the field for castability */
- if (!PyArray_CanCastTypeTo(
- (PyArray_Descr *)PyTuple_GET_ITEM(tuple1, 0),
- (PyArray_Descr *)PyTuple_GET_ITEM(tuple2, 0),
- casting)) {
- return 0;
- }
- }
-
- return 1;
-}
/*NUMPY_API
* Returns true if data of type 'from' may be cast to data of type
@@ -609,224 +537,41 @@ can_cast_fields(PyObject *field1, PyObject *field2, NPY_CASTING casting)
*/
NPY_NO_EXPORT npy_bool
PyArray_CanCastTypeTo(PyArray_Descr *from, PyArray_Descr *to,
- NPY_CASTING casting)
+ NPY_CASTING casting)
{
+#if NPY_USE_NEW_CASTINGIMPL
/*
- * Fast paths for equality and for basic types.
+ * NOTE: This code supports U and S, this is identical to the code
+ * in `ctors.c` which does not allow these dtypes to be attached
+ * to an array. Unlike the code for `np.array(..., dtype=)`
+ * which uses `PyArray_ExtractDTypeAndDescriptor` it rejects "m8"
+ * as a flexible dtype instance representing a DType.
*/
- if (from == to ||
- ((NPY_LIKELY(PyDataType_ISNUMBER(from)) ||
- PyDataType_ISOBJECT(from)) &&
- NPY_LIKELY(from->type_num == to->type_num) &&
- NPY_LIKELY(from->byteorder == to->byteorder))) {
- return 1;
- }
/*
- * Cases with subarrays and fields need special treatment.
+ * TODO: We should grow support for `np.can_cast("d", "S")` being
+ * different from `np.can_cast("d", "S0")` here, at least for
+ * the python side API.
*/
- if (PyDataType_HASFIELDS(from)) {
- /*
- * If from is a structured data type, then it can be cast to a simple
- * non-object one only for unsafe casting *and* if it has a single
- * field; recurse just in case the single field is itself structured.
- */
- if (!PyDataType_HASFIELDS(to) && !PyDataType_ISOBJECT(to)) {
- if (casting == NPY_UNSAFE_CASTING &&
- PyDict_Size(from->fields) == 1) {
- Py_ssize_t ppos = 0;
- PyObject *tuple;
- PyArray_Descr *field;
- PyDict_Next(from->fields, &ppos, NULL, &tuple);
- field = (PyArray_Descr *)PyTuple_GET_ITEM(tuple, 0);
- /*
- * For a subarray, we need to get the underlying type;
- * since we already are casting unsafely, we can ignore
- * the shape.
- */
- if (PyDataType_HASSUBARRAY(field)) {
- field = field->subarray->base;
- }
- return PyArray_CanCastTypeTo(field, to, casting);
- }
- else {
- return 0;
- }
- }
- /*
- * Casting from one structured data type to another depends on the fields;
- * we pass that case on to the EquivTypenums case below.
- *
- * TODO: move that part up here? Need to check whether equivalent type
- * numbers is an addition constraint that is needed.
- *
- * TODO/FIXME: For now, always allow structured to structured for unsafe
- * casting; this is not correct, but needed since the treatment in can_cast
- * below got out of sync with astype; see gh-13667.
- */
- if (casting == NPY_UNSAFE_CASTING) {
- return 1;
- }
+ NPY_CASTING safety;
+ if (PyDataType_ISUNSIZED(to) && to->subarray == NULL) {
+ safety = PyArray_GetCastSafety(from, NULL, NPY_DTYPE(to));
}
- else if (PyDataType_HASFIELDS(to)) {
- /*
- * If "from" is a simple data type and "to" has fields, then only
- * unsafe casting works (and that works always, even to multiple fields).
- */
- return casting == NPY_UNSAFE_CASTING;
- }
- /*
- * Everything else we consider castable for unsafe for now.
- * FIXME: ensure what we do here is consistent with "astype",
- * i.e., deal more correctly with subarrays and user-defined dtype.
- */
- else if (casting == NPY_UNSAFE_CASTING) {
- return 1;
- }
- /*
- * Equivalent simple types can be cast with any value of 'casting', but
- * we need to be careful about structured to structured.
- */
- if (PyArray_EquivTypenums(from->type_num, to->type_num)) {
- /* For complicated case, use EquivTypes (for now) */
- if (PyTypeNum_ISUSERDEF(from->type_num) ||
- from->subarray != NULL) {
- int ret;
-
- /* Only NPY_NO_CASTING prevents byte order conversion */
- if ((casting != NPY_NO_CASTING) &&
- (!PyArray_ISNBO(from->byteorder) ||
- !PyArray_ISNBO(to->byteorder))) {
- PyArray_Descr *nbo_from, *nbo_to;
-
- nbo_from = PyArray_DescrNewByteorder(from, NPY_NATIVE);
- nbo_to = PyArray_DescrNewByteorder(to, NPY_NATIVE);
- if (nbo_from == NULL || nbo_to == NULL) {
- Py_XDECREF(nbo_from);
- Py_XDECREF(nbo_to);
- PyErr_Clear();
- return 0;
- }
- ret = PyArray_EquivTypes(nbo_from, nbo_to);
- Py_DECREF(nbo_from);
- Py_DECREF(nbo_to);
- }
- else {
- ret = PyArray_EquivTypes(from, to);
- }
- return ret;
- }
-
- if (PyDataType_HASFIELDS(from)) {
- switch (casting) {
- case NPY_EQUIV_CASTING:
- case NPY_SAFE_CASTING:
- case NPY_SAME_KIND_CASTING:
- /*
- * `from' and `to' must have the same fields, and
- * corresponding fields must be (recursively) castable.
- */
- return can_cast_fields(from->fields, to->fields, casting);
-
- case NPY_NO_CASTING:
- default:
- return PyArray_EquivTypes(from, to);
- }
- }
-
- switch (from->type_num) {
- case NPY_DATETIME: {
- PyArray_DatetimeMetaData *meta1, *meta2;
- meta1 = get_datetime_metadata_from_dtype(from);
- if (meta1 == NULL) {
- PyErr_Clear();
- return 0;
- }
- meta2 = get_datetime_metadata_from_dtype(to);
- if (meta2 == NULL) {
- PyErr_Clear();
- return 0;
- }
-
- if (casting == NPY_NO_CASTING) {
- return PyArray_ISNBO(from->byteorder) ==
- PyArray_ISNBO(to->byteorder) &&
- can_cast_datetime64_metadata(meta1, meta2, casting);
- }
- else {
- return can_cast_datetime64_metadata(meta1, meta2, casting);
- }
- }
- case NPY_TIMEDELTA: {
- PyArray_DatetimeMetaData *meta1, *meta2;
- meta1 = get_datetime_metadata_from_dtype(from);
- if (meta1 == NULL) {
- PyErr_Clear();
- return 0;
- }
- meta2 = get_datetime_metadata_from_dtype(to);
- if (meta2 == NULL) {
- PyErr_Clear();
- return 0;
- }
-
- if (casting == NPY_NO_CASTING) {
- return PyArray_ISNBO(from->byteorder) ==
- PyArray_ISNBO(to->byteorder) &&
- can_cast_timedelta64_metadata(meta1, meta2, casting);
- }
- else {
- return can_cast_timedelta64_metadata(meta1, meta2, casting);
- }
- }
- default:
- switch (casting) {
- case NPY_NO_CASTING:
- return PyArray_EquivTypes(from, to);
- case NPY_EQUIV_CASTING:
- return (from->elsize == to->elsize);
- case NPY_SAFE_CASTING:
- return (from->elsize <= to->elsize);
- default:
- return 1;
- }
- break;
- }
+ else {
+ safety = PyArray_GetCastSafety(from, to, NPY_DTYPE(to));
}
- /* If safe or same-kind casts are allowed */
- else if (casting == NPY_SAFE_CASTING || casting == NPY_SAME_KIND_CASTING) {
- if (PyArray_CanCastTo(from, to)) {
- return 1;
- }
- else if(casting == NPY_SAME_KIND_CASTING) {
- /*
- * Also allow casting from lower to higher kinds, according
- * to the ordering provided by dtype_kind_to_ordering.
- * Some kinds, like datetime, don't fit in the hierarchy,
- * and are special cased as -1.
- */
- int from_order, to_order;
-
- from_order = dtype_kind_to_ordering(from->kind);
- to_order = dtype_kind_to_ordering(to->kind);
- if (to->kind == 'm') {
- /* both types being timedelta is already handled before. */
- int integer_order = dtype_kind_to_ordering('i');
- return (from_order != -1) && (from_order <= integer_order);
- }
-
- return (from_order != -1) && (from_order <= to_order);
- }
- else {
- return 0;
- }
- }
- /* NPY_NO_CASTING or NPY_EQUIV_CASTING was specified */
- else {
+ if (safety < 0) {
+ PyErr_Clear();
return 0;
}
+ /* If casting is the smaller (or equal) safety we match */
+ return PyArray_MinCastSafety(safety, casting) == casting;
+#else
+ return PyArray_LegacyCanCastTypeTo(from, to, casting);
+#endif
}
+
/* CanCastArrayTo needs this function */
static int min_scalar_type_num(char *valueptr, int type_num,
int *is_small_unsigned);
@@ -1035,7 +780,7 @@ promote_types(PyArray_Descr *type1, PyArray_Descr *type2,
* Returns a new reference to type if it is already NBO, otherwise
* returns a copy converted to NBO.
*/
-static PyArray_Descr *
+NPY_NO_EXPORT PyArray_Descr *
ensure_dtype_nbo(PyArray_Descr *type)
{
if (PyArray_ISNBO(type->byteorder)) {
@@ -1047,327 +792,178 @@ ensure_dtype_nbo(PyArray_Descr *type)
}
}
-/*NUMPY_API
- * Produces the smallest size and lowest kind type to which both
- * input types can be cast.
+
+/**
+ * This function should possibly become public API eventually. At this
+ * time it is implemented by falling back to `PyArray_AdaptFlexibleDType`.
+ * We will use `CastingImpl[from, to].resolve_descriptors(...)` to implement
+ * this logic.
+ * Before that, the API needs to be reviewed though.
+ *
+ * WARNING: This function currently does not guarantee that `descr` can
+ * actually be cast to the given DType.
+ *
+ * @param descr The dtype instance to adapt "cast"
+ * @param given_DType The DType class for which we wish to find an instance able
+ * to represent `descr`.
+ * @returns Instance of `given_DType`. If `given_DType` is parametric the
+ * descr may be adapted to hold it.
 */
NPY_NO_EXPORT PyArray_Descr *
-PyArray_PromoteTypes(PyArray_Descr *type1, PyArray_Descr *type2)
+PyArray_CastDescrToDType(PyArray_Descr *descr, PyArray_DTypeMeta *given_DType)
{
- int type_num1, type_num2, ret_type_num;
-
- /*
- * Fast path for identical dtypes.
- *
- * Non-native-byte-order types are converted to native ones below, so we
- * can't quit early.
- */
- if (type1 == type2 && PyArray_ISNBO(type1->byteorder)) {
- Py_INCREF(type1);
- return type1;
+ /* Fast path: descr is already an instance of the requested DType class */
+ if (NPY_DTYPE(descr) == given_DType) {
+ Py_INCREF(descr);
+ return descr;
}
-
- type_num1 = type1->type_num;
- type_num2 = type2->type_num;
-
- /* If they're built-in types, use the promotion table */
- if (type_num1 < NPY_NTYPES && type_num2 < NPY_NTYPES) {
- ret_type_num = _npy_type_promotion_table[type_num1][type_num2];
+ if (!given_DType->parametric) {
/*
- * The table doesn't handle string/unicode/void/datetime/timedelta,
- * so check the result
+ * Don't actually do anything, the default is always the result
+ * of any cast.
*/
- if (ret_type_num >= 0) {
- return PyArray_DescrFromType(ret_type_num);
- }
+ return given_DType->default_descr(given_DType);
+ }
+ if (PyObject_TypeCheck((PyObject *)descr, (PyTypeObject *)given_DType)) {
+ Py_INCREF(descr);
+ return descr;
+ }
+
+#if NPY_USE_NEW_CASTINGIMPL
+ PyObject *tmp = PyArray_GetCastingImpl(NPY_DTYPE(descr), given_DType);
+ if (tmp == NULL || tmp == Py_None) {
+ Py_XDECREF(tmp);
+ goto error;
+ }
+ PyArray_DTypeMeta *dtypes[2] = {NPY_DTYPE(descr), given_DType};
+ PyArray_Descr *given_descrs[2] = {descr, NULL};
+ PyArray_Descr *loop_descrs[2];
+
+ PyArrayMethodObject *meth = (PyArrayMethodObject *)tmp;
+ NPY_CASTING casting = meth->resolve_descriptors(
+ meth, dtypes, given_descrs, loop_descrs);
+ Py_DECREF(tmp);
+ if (casting < 0) {
+ goto error;
+ }
+ /* resolve_descriptors returned new references; only the target is needed */
+ Py_DECREF(loop_descrs[0]);
+ return loop_descrs[1];
+
+ error:; /* (; due to compiler limitations) */
+ /* Chain any pending error into the uniform "cannot cast" ValueError */
+ PyObject *err_type = NULL, *err_value = NULL, *err_traceback = NULL;
+ PyErr_Fetch(&err_type, &err_value, &err_traceback);
+ PyErr_Format(PyExc_ValueError,
+ "cannot cast dtype %S to %S.", descr, given_DType);
+ npy_PyErr_ChainExceptions(err_type, err_value, err_traceback);
+ return NULL;
+
+#else /* NPY_USE_NEW_CASTS */
+ if (!given_DType->legacy) {
+ PyErr_SetString(PyExc_NotImplementedError,
+ "Must use casting to find the correct DType for a parametric "
+ "user DType. This is not yet implemented (this error should be "
+ "unreachable).");
+ return NULL;
}
- /* If one or both are user defined, calculate it */
- else {
- int skind1 = NPY_NOSCALAR, skind2 = NPY_NOSCALAR, skind;
-
- if (PyArray_CanCastTo(type2, type1)) {
- /* Promoted types are always native byte order */
- return ensure_dtype_nbo(type1);
- }
- else if (PyArray_CanCastTo(type1, type2)) {
- /* Promoted types are always native byte order */
- return ensure_dtype_nbo(type2);
- }
-
- /* Convert the 'kind' char into a scalar kind */
- switch (type1->kind) {
- case 'b':
- skind1 = NPY_BOOL_SCALAR;
- break;
- case 'u':
- skind1 = NPY_INTPOS_SCALAR;
- break;
- case 'i':
- skind1 = NPY_INTNEG_SCALAR;
- break;
- case 'f':
- skind1 = NPY_FLOAT_SCALAR;
- break;
- case 'c':
- skind1 = NPY_COMPLEX_SCALAR;
- break;
- }
- switch (type2->kind) {
- case 'b':
- skind2 = NPY_BOOL_SCALAR;
- break;
- case 'u':
- skind2 = NPY_INTPOS_SCALAR;
- break;
- case 'i':
- skind2 = NPY_INTNEG_SCALAR;
- break;
- case 'f':
- skind2 = NPY_FLOAT_SCALAR;
- break;
- case 'c':
- skind2 = NPY_COMPLEX_SCALAR;
- break;
- }
-
- /* If both are scalars, there may be a promotion possible */
- if (skind1 != NPY_NOSCALAR && skind2 != NPY_NOSCALAR) {
-
- /* Start with the larger scalar kind */
- skind = (skind1 > skind2) ? skind1 : skind2;
- ret_type_num = _npy_smallest_type_of_kind_table[skind];
-
- for (;;) {
-
- /* If there is no larger type of this kind, try a larger kind */
- if (ret_type_num < 0) {
- ++skind;
- /* Use -1 to signal no promoted type found */
- if (skind < NPY_NSCALARKINDS) {
- ret_type_num = _npy_smallest_type_of_kind_table[skind];
- }
- else {
- break;
- }
- }
- /* If we found a type to which we can promote both, done! */
- if (PyArray_CanCastSafely(type_num1, ret_type_num) &&
- PyArray_CanCastSafely(type_num2, ret_type_num)) {
- return PyArray_DescrFromType(ret_type_num);
- }
+ PyArray_Descr *flex_dtype = PyArray_DescrNew(given_DType->singleton);
+ return PyArray_AdaptFlexibleDType(descr, flex_dtype);
+#endif /* NPY_USE_NEW_CASTS */
+}
- /* Try the next larger type of this kind */
- ret_type_num = _npy_next_larger_type_table[ret_type_num];
- }
- }
+/**
+ * This function defines the common DType operator.
+ *
+ * Note that the common DType will not be "object" (unless one of the dtypes
+ * is object), even though object can technically represent all values
+ * correctly.
+ *
+ * TODO: Before exposure, we should review the return value (e.g. no error
+ * when no common DType is found).
+ *
+ * @param dtype1 DType class to find the common type for.
+ * @param dtype2 Second DType class.
+ * @return The common DType or NULL with an error set
+ */
+NPY_NO_EXPORT PyArray_DTypeMeta *
+PyArray_CommonDType(PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2)
+{
+ if (dtype1 == dtype2) {
+ Py_INCREF(dtype1);
+ return dtype1;
+ }
- PyErr_SetString(PyExc_TypeError,
- "invalid type promotion with custom data type");
+ PyArray_DTypeMeta *common_dtype;
+
+ /* Ask dtype1 first; if it returns NotImplemented defer to dtype2 */
+ common_dtype = dtype1->common_dtype(dtype1, dtype2);
+ if (common_dtype == (PyArray_DTypeMeta *)Py_NotImplemented) {
+ Py_DECREF(common_dtype);
+ common_dtype = dtype2->common_dtype(dtype2, dtype1);
+ }
+ if (common_dtype == NULL) {
 return NULL;
 }
+ if (common_dtype == (PyArray_DTypeMeta *)Py_NotImplemented) {
+ Py_DECREF(Py_NotImplemented);
+ PyErr_Format(PyExc_TypeError,
+ "The DTypes %S and %S do not have a common DType. "
+ "For example they cannot be stored in a single array unless "
+ "the dtype is `object`.", dtype1, dtype2);
+ return NULL;
+ }
+ return common_dtype;
+}
- switch (type_num1) {
- /* BOOL can convert to anything except datetime/void */
- case NPY_BOOL:
- if (type_num2 == NPY_STRING || type_num2 == NPY_UNICODE) {
- int char_size = 1;
- if (type_num2 == NPY_UNICODE) {
- char_size = 4;
- }
- if (type2->elsize < 5 * char_size) {
- PyArray_Descr *ret = NULL;
- PyArray_Descr *temp = PyArray_DescrNew(type2);
- ret = ensure_dtype_nbo(temp);
- ret->elsize = 5 * char_size;
- Py_DECREF(temp);
- return ret;
- }
- return ensure_dtype_nbo(type2);
- }
- else if (type_num2 != NPY_DATETIME && type_num2 != NPY_VOID) {
- return ensure_dtype_nbo(type2);
- }
- break;
- /* For strings and unicodes, take the larger size */
- case NPY_STRING:
- if (type_num2 == NPY_STRING) {
- if (type1->elsize > type2->elsize) {
- return ensure_dtype_nbo(type1);
- }
- else {
- return ensure_dtype_nbo(type2);
- }
- }
- else if (type_num2 == NPY_UNICODE) {
- if (type2->elsize >= type1->elsize * 4) {
- return ensure_dtype_nbo(type2);
- }
- else {
- PyArray_Descr *d = PyArray_DescrNewFromType(NPY_UNICODE);
- if (d == NULL) {
- return NULL;
- }
- d->elsize = type1->elsize * 4;
- return d;
- }
- }
- /* Allow NUMBER -> STRING */
- else if (PyTypeNum_ISNUMBER(type_num2)) {
- PyArray_Descr *ret = NULL;
- PyArray_Descr *temp = PyArray_DescrNew(type1);
- PyDataType_MAKEUNSIZED(temp);
- temp = PyArray_AdaptFlexibleDType(type2, temp);
- if (temp == NULL) {
- return NULL;
- }
- if (temp->elsize > type1->elsize) {
- ret = ensure_dtype_nbo(temp);
- }
- else {
- ret = ensure_dtype_nbo(type1);
- }
- Py_DECREF(temp);
- return ret;
- }
- break;
- case NPY_UNICODE:
- if (type_num2 == NPY_UNICODE) {
- if (type1->elsize > type2->elsize) {
- return ensure_dtype_nbo(type1);
- }
- else {
- return ensure_dtype_nbo(type2);
- }
- }
- else if (type_num2 == NPY_STRING) {
- if (type1->elsize >= type2->elsize * 4) {
- return ensure_dtype_nbo(type1);
- }
- else {
- PyArray_Descr *d = PyArray_DescrNewFromType(NPY_UNICODE);
- if (d == NULL) {
- return NULL;
- }
- d->elsize = type2->elsize * 4;
- return d;
- }
- }
- /* Allow NUMBER -> UNICODE */
- else if (PyTypeNum_ISNUMBER(type_num2)) {
- PyArray_Descr *ret = NULL;
- PyArray_Descr *temp = PyArray_DescrNew(type1);
- PyDataType_MAKEUNSIZED(temp);
- temp = PyArray_AdaptFlexibleDType(type2, temp);
- if (temp == NULL) {
- return NULL;
- }
- if (temp->elsize > type1->elsize) {
- ret = ensure_dtype_nbo(temp);
- }
- else {
- ret = ensure_dtype_nbo(type1);
- }
- Py_DECREF(temp);
- return ret;
- }
- break;
- case NPY_DATETIME:
- case NPY_TIMEDELTA:
- if (type_num2 == NPY_DATETIME || type_num2 == NPY_TIMEDELTA) {
- return datetime_type_promotion(type1, type2);
- }
- break;
+/*NUMPY_API
+ * Produces the smallest size and lowest kind type to which both
+ * input types can be cast.
+ */
+NPY_NO_EXPORT PyArray_Descr *
+PyArray_PromoteTypes(PyArray_Descr *type1, PyArray_Descr *type2)
+{
+ PyArray_DTypeMeta *common_dtype;
+ PyArray_Descr *res;
+
+ /* Fast path for identical inputs (NOTE: This path preserves metadata!) */
+ if (type1 == type2 && PyArray_ISNBO(type1->byteorder)) {
+ Py_INCREF(type1);
+ return type1;
}
- switch (type_num2) {
- /* BOOL can convert to almost anything */
- case NPY_BOOL:
- if (type_num2 == NPY_STRING || type_num2 == NPY_UNICODE) {
- int char_size = 1;
- if (type_num2 == NPY_UNICODE) {
- char_size = 4;
- }
- if (type2->elsize < 5 * char_size) {
- PyArray_Descr *ret = NULL;
- PyArray_Descr *temp = PyArray_DescrNew(type2);
- ret = ensure_dtype_nbo(temp);
- ret->elsize = 5 * char_size;
- Py_DECREF(temp);
- return ret;
- }
- return ensure_dtype_nbo(type2);
- }
- else if (type_num1 != NPY_DATETIME && type_num1 != NPY_TIMEDELTA &&
- type_num1 != NPY_VOID) {
- return ensure_dtype_nbo(type1);
- }
- break;
- case NPY_STRING:
- /* Allow NUMBER -> STRING */
- if (PyTypeNum_ISNUMBER(type_num1)) {
- PyArray_Descr *ret = NULL;
- PyArray_Descr *temp = PyArray_DescrNew(type2);
- PyDataType_MAKEUNSIZED(temp);
- temp = PyArray_AdaptFlexibleDType(type1, temp);
- if (temp == NULL) {
- return NULL;
- }
- if (temp->elsize > type2->elsize) {
- ret = ensure_dtype_nbo(temp);
- }
- else {
- ret = ensure_dtype_nbo(type2);
- }
- Py_DECREF(temp);
- return ret;
- }
- break;
- case NPY_UNICODE:
- /* Allow NUMBER -> UNICODE */
- if (PyTypeNum_ISNUMBER(type_num1)) {
- PyArray_Descr *ret = NULL;
- PyArray_Descr *temp = PyArray_DescrNew(type2);
- PyDataType_MAKEUNSIZED(temp);
- temp = PyArray_AdaptFlexibleDType(type1, temp);
- if (temp == NULL) {
- return NULL;
- }
- if (temp->elsize > type2->elsize) {
- ret = ensure_dtype_nbo(temp);
- }
- else {
- ret = ensure_dtype_nbo(type2);
- }
- Py_DECREF(temp);
- return ret;
- }
- break;
- case NPY_TIMEDELTA:
- if (PyTypeNum_ISSIGNED(type_num1)) {
- return ensure_dtype_nbo(type2);
- }
- break;
+ /* New-style promotion: first find the common DType class ... */
+ common_dtype = PyArray_CommonDType(NPY_DTYPE(type1), NPY_DTYPE(type2));
+ if (common_dtype == NULL) {
+ return NULL;
}
- /* For types equivalent up to endianness, can return either */
- if (PyArray_CanCastTypeTo(type1, type2, NPY_EQUIV_CASTING)) {
- return ensure_dtype_nbo(type1);
+ if (!common_dtype->parametric) {
+ res = common_dtype->default_descr(common_dtype);
+ Py_DECREF(common_dtype);
+ return res;
}
- /* TODO: Also combine fields, subarrays, strings, etc */
+ /* Cast the input types to the common DType if necessary */
+ type1 = PyArray_CastDescrToDType(type1, common_dtype);
+ if (type1 == NULL) {
+ Py_DECREF(common_dtype);
+ return NULL;
+ }
+ type2 = PyArray_CastDescrToDType(type2, common_dtype);
+ if (type2 == NULL) {
+ Py_DECREF(type1);
+ Py_DECREF(common_dtype);
+ return NULL;
+ }
/*
- printf("invalid type promotion: ");
- PyObject_Print(type1, stdout, 0);
- printf(" ");
- PyObject_Print(type2, stdout, 0);
- printf("\n");
- */
- PyErr_SetString(PyExc_TypeError, "invalid type promotion");
- return NULL;
+ * And find the common instance of the two inputs
+ * NOTE: Common instance preserves metadata (normally and of one input)
+ */
+ res = common_dtype->common_instance(type1, type2);
+ Py_DECREF(type1);
+ Py_DECREF(type2);
+ Py_DECREF(common_dtype);
+ return res;
}
/*
@@ -1989,7 +1585,7 @@ PyArray_Zero(PyArrayObject *arr)
}
if (zero_obj == NULL) {
- zero_obj = PyInt_FromLong((long) 0);
+ zero_obj = PyLong_FromLong((long) 0);
if (zero_obj == NULL) {
return NULL;
}
@@ -2035,7 +1631,7 @@ PyArray_One(PyArrayObject *arr)
}
if (one_obj == NULL) {
- one_obj = PyInt_FromLong((long) 1);
+ one_obj = PyLong_FromLong((long) 1);
if (one_obj == NULL) {
return NULL;
}
@@ -2202,3 +1798,1108 @@ PyArray_ConvertToCommonType(PyObject *op, int *retn)
PyDataMem_FREE(mps);
return NULL;
}
+
+
+/**
+ * Private function to add a casting implementation by unwrapping a bound
+ * array method.
+ *
+ * NOTE: the function name is misspelled ("Implmentation"); it is kept
+ * as-is because callers below use this exact spelling.
+ *
+ * @param meth Bound array method (must have one input and one output)
+ *     registered as the cast from meth->dtypes[0] to meth->dtypes[1].
+ * @return 0 on success -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_AddCastingImplmentation(PyBoundArrayMethodObject *meth)
+{
+ if (meth->method->nin != 1 || meth->method->nout != 1) {
+ PyErr_SetString(PyExc_TypeError,
+ "A cast must have one input and one output.");
+ return -1;
+ }
+ /* A within-DType cast must handle unaligned data and be a no-cast */
+ if (meth->dtypes[0] == meth->dtypes[1]) {
+ if (!(meth->method->flags & NPY_METH_SUPPORTS_UNALIGNED)) {
+ PyErr_Format(PyExc_TypeError,
+ "A cast where input and output DType (class) are identical "
+ "must currently support unaligned data. (method: %s)",
+ meth->method->name);
+ return -1;
+ }
+ /* Mask off the view flag before comparing the casting level */
+ if ((meth->method->casting & ~_NPY_CAST_IS_VIEW) != NPY_NO_CASTING) {
+ PyErr_Format(PyExc_TypeError,
+ "A cast where input and output DType (class) are identical "
+ "must signal `no-casting`. (method: %s)",
+ meth->method->name);
+ return -1;
+ }
+ }
+ /* Reject duplicate registrations for this DType pair */
+ if (PyDict_Contains(meth->dtypes[0]->castingimpls,
+ (PyObject *)meth->dtypes[1])) {
+ PyErr_Format(PyExc_RuntimeError,
+ "A cast was already added for %S -> %S. (method: %s)",
+ meth->dtypes[0], meth->dtypes[1], meth->method->name);
+ return -1;
+ }
+ /* Store the unbound method; the dict keeps the reference alive */
+ if (PyDict_SetItem(meth->dtypes[0]->castingimpls,
+ (PyObject *)meth->dtypes[1], (PyObject *)meth->method) < 0) {
+ return -1;
+ }
+ return 0;
+}
+
+/**
+ * Add a new casting implementation using a PyArrayMethod_Spec.
+ *
+ * @param spec Specification describing the cast (dtypes, slots, flags).
+ * @param private If private, allow slots not publically exposed.
+ * @return 0 on success -1 on failure
+ */
+NPY_NO_EXPORT int
+PyArray_AddCastingImplementation_FromSpec(PyArrayMethod_Spec *spec, int private)
+{
+ /* Create a bound method, unbind and store it */
+ PyBoundArrayMethodObject *meth = PyArrayMethod_FromSpec_int(spec, private);
+ if (meth == NULL) {
+ return -1;
+ }
+ /* Registration retains the unbound method; drop our bound wrapper */
+ int res = PyArray_AddCastingImplmentation(meth);
+ Py_DECREF(meth);
+ if (res < 0) {
+ return -1;
+ }
+ return 0;
+}
+
+
+NPY_NO_EXPORT NPY_CASTING
+legacy_same_dtype_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]),
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ /* Descriptor resolver for a cast within one legacy DType:
+  * passes the inputs through and reports view/equiv based on byte order. */
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+
+ /* NULL output descriptor means "DType class only": use native byte order */
+ if (given_descrs[1] == NULL) {
+ loop_descrs[1] = ensure_dtype_nbo(loop_descrs[0]);
+ if (loop_descrs[1] == NULL) {
+ Py_DECREF(loop_descrs[0]);
+ return -1;
+ }
+ }
+ else {
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+
+ /* this function only makes sense for non-flexible legacy dtypes: */
+ assert(loop_descrs[0]->elsize == loop_descrs[1]->elsize);
+
+ /*
+ * Legacy dtypes (except datetime) only have byte-order and elsize as
+ * storage parameters.
+ */
+ if (PyDataType_ISNOTSWAPPED(loop_descrs[0]) ==
+ PyDataType_ISNOTSWAPPED(loop_descrs[1])) {
+ /* Same byte order on both sides: the data can be viewed directly */
+ return NPY_NO_CASTING | _NPY_CAST_IS_VIEW;
+ }
+ return NPY_EQUIV_CASTING;
+}
+
+
+/*
+ * Simple dtype resolver for casting between two different (non-parametric)
+ * (legacy) dtypes.
+ */
+NPY_NO_EXPORT NPY_CASTING
+simple_cast_resolve_descriptors(
+ PyArrayMethodObject *self,
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ assert(dtypes[0]->legacy && dtypes[1]->legacy);
+
+ /* The loop implementations only handle native byte order */
+ loop_descrs[0] = ensure_dtype_nbo(given_descrs[0]);
+ if (loop_descrs[0] == NULL) {
+ return -1;
+ }
+ if (given_descrs[1] != NULL) {
+ loop_descrs[1] = ensure_dtype_nbo(given_descrs[1]);
+ if (loop_descrs[1] == NULL) {
+ Py_DECREF(loop_descrs[0]);
+ return -1;
+ }
+ }
+ else {
+ /* NOTE(review): result not NULL-checked; presumably default_descr
+  * cannot fail for non-parametric legacy dtypes -- confirm. */
+ loop_descrs[1] = dtypes[1]->default_descr(dtypes[1]);
+ }
+
+ /* The method's stored casting level wins unless it is no-casting */
+ if (self->casting != NPY_NO_CASTING) {
+ return self->casting;
+ }
+ if (PyDataType_ISNOTSWAPPED(loop_descrs[0]) ==
+ PyDataType_ISNOTSWAPPED(loop_descrs[1])) {
+ return NPY_NO_CASTING | _NPY_CAST_IS_VIEW;
+ }
+ return NPY_EQUIV_CASTING;
+}
+
+
+/* Register the cast between two builtin numeric/bool DTypes, wiring up the
+ * strided loops from PyArray_GetStridedNumericCastFn and choosing the
+ * casting safety level from the legacy tables. Returns 0/-1. */
+static int
+add_numeric_cast(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to)
+{
+ PyType_Slot slots[6];
+ PyArray_DTypeMeta *dtypes[2] = {from, to};
+ PyArrayMethod_Spec spec = {
+ .name = "numeric_cast",
+ .nin = 1,
+ .nout = 1,
+ .flags = NPY_METH_SUPPORTS_UNALIGNED,
+ .slots = slots,
+ .dtypes = dtypes,
+ };
+
+ npy_intp from_itemsize = dtypes[0]->singleton->elsize;
+ npy_intp to_itemsize = dtypes[1]->singleton->elsize;
+
+ slots[0].slot = NPY_METH_resolve_descriptors;
+ slots[0].pfunc = &simple_cast_resolve_descriptors;
+ /* Fetch the optimized loops (2<<10 is a non-contiguous stride) */
+ slots[1].slot = NPY_METH_strided_loop;
+ slots[1].pfunc = PyArray_GetStridedNumericCastFn(
+ 1, 2<<10, 2<<10, from->type_num, to->type_num);
+ slots[2].slot = NPY_METH_contiguous_loop;
+ slots[2].pfunc = PyArray_GetStridedNumericCastFn(
+ 1, from_itemsize, to_itemsize, from->type_num, to->type_num);
+ slots[3].slot = NPY_METH_unaligned_strided_loop;
+ slots[3].pfunc = PyArray_GetStridedNumericCastFn(
+ 0, 2<<10, 2<<10, from->type_num, to->type_num);
+ slots[4].slot = NPY_METH_unaligned_contiguous_loop;
+ slots[4].pfunc = PyArray_GetStridedNumericCastFn(
+ 0, from_itemsize, to_itemsize, from->type_num, to->type_num);
+ slots[5].slot = 0;
+ slots[5].pfunc = NULL;
+
+ /* All numeric<->numeric loops must exist for builtin types */
+ assert(slots[1].pfunc && slots[2].pfunc && slots[3].pfunc && slots[4].pfunc);
+
+ /* Find the correct casting level, and special case no-cast */
+ if (dtypes[0]->kind == dtypes[1]->kind && from_itemsize == to_itemsize) {
+ spec.casting = NPY_NO_CASTING;
+
+ /* When there is no casting (equivalent C-types) use byteswap loops */
+ slots[0].slot = NPY_METH_resolve_descriptors;
+ slots[0].pfunc = &legacy_same_dtype_resolve_descriptors;
+ /* Overwrite slots[1] and terminate: get_loop replaces all four loops */
+ slots[1].slot = NPY_METH_get_loop;
+ slots[1].pfunc = NULL;
+ slots[2].slot = 0;
+ slots[2].pfunc = NULL;
+
+ spec.name = "numeric_copy_or_byteswap";
+ spec.flags |= NPY_METH_NO_FLOATINGPOINT_ERRORS;
+ }
+ else if (_npy_can_cast_safely_table[from->type_num][to->type_num]) {
+ spec.casting = NPY_SAFE_CASTING;
+ }
+ else if (dtype_kind_to_ordering(dtypes[0]->kind) <=
+ dtype_kind_to_ordering(dtypes[1]->kind)) {
+ spec.casting = NPY_SAME_KIND_CASTING;
+ }
+ else {
+ spec.casting = NPY_UNSAFE_CASTING;
+ }
+
+ /* Create a bound method, unbind and store it */
+ return PyArray_AddCastingImplementation_FromSpec(&spec, 1);
+}
+
+
+/*
+ * This registers the castingimpl for all casts between numeric types.
+ * Eventually, this function should likely be defined as part of a .c.src
+ * file to remove `PyArray_GetStridedNumericCastFn` entirely.
+ */
+static int
+PyArray_InitializeNumericCasts(void)
+{
+ for (int from = 0; from < NPY_NTYPES; from++) {
+ /* Only bool and the numeric type numbers are handled here */
+ if (!PyTypeNum_ISNUMBER(from) && from != NPY_BOOL) {
+ continue;
+ }
+ PyArray_DTypeMeta *from_dt = PyArray_DTypeFromTypeNum(from);
+
+ for (int to = 0; to < NPY_NTYPES; to++) {
+ if (!PyTypeNum_ISNUMBER(to) && to != NPY_BOOL) {
+ continue;
+ }
+ PyArray_DTypeMeta *to_dt = PyArray_DTypeFromTypeNum(to);
+ int res = add_numeric_cast(from_dt, to_dt);
+ Py_DECREF(to_dt);
+ if (res < 0) {
+ Py_DECREF(from_dt);
+ return -1;
+ }
+ }
+ /* NOTE(review): from_dt is not released on the success path;
+  * presumably builtin DTypeMeta objects are effectively immortal,
+  * so this is benign -- confirm. */
+ }
+ return 0;
+}
+
+
+/* Resolver for casts into (byte-)string/unicode: computes the string length
+ * needed to represent the source dtype when no output descriptor is given.
+ * NOTE(review): declared `int` while sibling resolvers return NPY_CASTING;
+ * the values returned are NPY_CASTING members -- confirm intended. */
+static int
+cast_to_string_resolve_descriptors(
+ PyArrayMethodObject *self,
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ /*
+ * NOTE: The following code used to be part of PyArray_AdaptFlexibleDType
+ *
+ * Get a string-size estimate of the input. These
+ * are generallly the size needed, rounded up to
+ * a multiple of eight.
+ */
+ npy_intp size = -1;
+ switch (dtypes[0]->type_num) {
+ case NPY_BOOL:
+ case NPY_UBYTE:
+ case NPY_BYTE:
+ case NPY_USHORT:
+ case NPY_SHORT:
+ case NPY_UINT:
+ case NPY_INT:
+ case NPY_ULONG:
+ case NPY_LONG:
+ case NPY_ULONGLONG:
+ case NPY_LONGLONG:
+ assert(dtypes[0]->singleton->elsize <= 8);
+ assert(dtypes[0]->singleton->elsize > 0);
+ if (dtypes[0]->kind == 'b') {
+ /* 5 chars needed for cast to 'True' or 'False' */
+ size = 5;
+ }
+ else if (dtypes[0]->kind == 'u') {
+ size = REQUIRED_STR_LEN[dtypes[0]->singleton->elsize];
+ }
+ else if (dtypes[0]->kind == 'i') {
+ /* Add character for sign symbol */
+ size = REQUIRED_STR_LEN[dtypes[0]->singleton->elsize] + 1;
+ }
+ break;
+ case NPY_HALF:
+ case NPY_FLOAT:
+ case NPY_DOUBLE:
+ size = 32;
+ break;
+ case NPY_LONGDOUBLE:
+ size = 48;
+ break;
+ case NPY_CFLOAT:
+ case NPY_CDOUBLE:
+ /* Real and imaginary part each need a float's worth of characters */
+ size = 2 * 32;
+ break;
+ case NPY_CLONGDOUBLE:
+ size = 2 * 48;
+ break;
+ case NPY_STRING:
+ case NPY_VOID:
+ size = given_descrs[0]->elsize;
+ break;
+ case NPY_UNICODE:
+ /* Unicode stores 4 bytes per character */
+ size = given_descrs[0]->elsize / 4;
+ break;
+ default:
+ PyErr_SetString(PyExc_SystemError,
+ "Impossible cast to string path requested.");
+ return -1;
+ }
+ if (dtypes[1]->type_num == NPY_UNICODE) {
+ size *= 4;
+ }
+
+ if (given_descrs[1] == NULL) {
+ /* Output given as DType class only: build a descriptor of the
+ * computed size. */
+ loop_descrs[1] = PyArray_DescrNewFromType(dtypes[1]->type_num);
+ if (loop_descrs[1] == NULL) {
+ return -1;
+ }
+ loop_descrs[1]->elsize = size;
+ }
+ else {
+ /* The legacy loop can handle mismatching itemsizes */
+ loop_descrs[1] = ensure_dtype_nbo(given_descrs[1]);
+ if (loop_descrs[1] == NULL) {
+ return -1;
+ }
+ }
+
+ /* Set the input one as well (late for easier error management) */
+ loop_descrs[0] = ensure_dtype_nbo(given_descrs[0]);
+ if (loop_descrs[0] == NULL) {
+ /* Fix: release the already-created output descriptor (was leaked) */
+ Py_DECREF(loop_descrs[1]);
+ return -1;
+ }
+
+ if (self->casting == NPY_UNSAFE_CASTING) {
+ assert(dtypes[0]->type_num == NPY_UNICODE &&
+ dtypes[1]->type_num == NPY_STRING);
+ return NPY_UNSAFE_CASTING;
+ }
+ assert(self->casting == NPY_SAFE_CASTING);
+
+ /* A truncating string cast is only considered same-kind */
+ if (loop_descrs[1]->elsize >= size) {
+ return NPY_SAFE_CASTING;
+ }
+ return NPY_SAME_KIND_CASTING;
+}
+
+
+/* Register both directions of cast between `string` (bytes or unicode DType)
+ * and `other`. The string -> other direction is a plain legacy wrapping;
+ * the other -> string direction needs a resolver to compute the length. */
+static int
+add_other_to_and_from_string_cast(
+ PyArray_DTypeMeta *string, PyArray_DTypeMeta *other)
+{
+ if (string == other) {
+ return 0;
+ }
+
+ /* Casting from string, is always a simple legacy-style cast */
+ if (other->type_num != NPY_STRING && other->type_num != NPY_UNICODE) {
+ if (PyArray_AddLegacyWrapping_CastingImpl(
+ string, other, NPY_UNSAFE_CASTING) < 0) {
+ return -1;
+ }
+ }
+ /*
+ * Casting to strings, is almost the same, but requires a custom resolver
+ * to define the correct string length. Right now we use a generic function
+ * for this.
+ */
+ PyArray_DTypeMeta *dtypes[2] = {other, string};
+ PyType_Slot slots[] = {
+ {NPY_METH_get_loop, NULL},
+ {NPY_METH_resolve_descriptors, &cast_to_string_resolve_descriptors},
+ {0, NULL}};
+ PyArrayMethod_Spec spec = {
+ .name = "legacy_cast_to_string",
+ .nin = 1,
+ .nout = 1,
+ .flags = NPY_METH_REQUIRES_PYAPI,
+ .dtypes = dtypes,
+ .slots = slots,
+ };
+ /* Almost everything can be safely cast to string (except unicode) */
+ if (other->type_num != NPY_UNICODE) {
+ spec.casting = NPY_SAFE_CASTING;
+ }
+ else {
+ /* unicode -> bytes can lose information */
+ spec.casting = NPY_UNSAFE_CASTING;
+ }
+
+ return PyArray_AddCastingImplementation_FromSpec(&spec, 1);
+}
+
+
+/* Resolver for string->string (or unicode->unicode) casts; safety depends
+ * only on relative itemsize and byte order. */
+NPY_NO_EXPORT NPY_CASTING
+string_to_string_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]),
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+
+ /* NULL output means DType class only: reuse input in native byte order */
+ if (given_descrs[1] == NULL) {
+ loop_descrs[1] = ensure_dtype_nbo(loop_descrs[0]);
+ if (loop_descrs[1] == NULL) {
+ return -1;
+ }
+ }
+ else {
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+
+ if (loop_descrs[0]->elsize == loop_descrs[1]->elsize) {
+ if (PyDataType_ISNOTSWAPPED(loop_descrs[0]) ==
+ PyDataType_ISNOTSWAPPED(loop_descrs[1])) {
+ /* Identical size and byte order: the data can be viewed */
+ return NPY_NO_CASTING | _NPY_CAST_IS_VIEW;
+ }
+ else {
+ return NPY_EQUIV_CASTING;
+ }
+ }
+ else if (loop_descrs[0]->elsize <= loop_descrs[1]->elsize) {
+ /* Growing the string never loses data */
+ return NPY_SAFE_CASTING;
+ }
+ /* Shrinking may truncate */
+ return NPY_SAME_KIND_CASTING;
+}
+
+
+/*
+ * Add string casts. Right now all string casts are just legacy-wrapped ones
+ * (except string<->string and unicode<->unicode), but they do require
+ * custom type resolution for the string length.
+ *
+ * A bit like `object`, it could make sense to define a simpler protocol for
+ * string casts, however, we also need to remember that the itemsize of the
+ * output has to be found.
+ */
+static int
+PyArray_InitializeStringCasts(void)
+{
+ int result = -1;
+ PyArray_DTypeMeta *string = PyArray_DTypeFromTypeNum(NPY_STRING);
+ PyArray_DTypeMeta *unicode = PyArray_DTypeFromTypeNum(NPY_UNICODE);
+ PyArray_DTypeMeta *other_dt = NULL;
+
+ /* Add most casts as legacy ones */
+ for (int other = 0; other < NPY_NTYPES; other++) {
+ /* datetime/void/object casts are handled elsewhere (or unsupported) */
+ if (PyTypeNum_ISDATETIME(other) || other == NPY_VOID ||
+ other == NPY_OBJECT) {
+ continue;
+ }
+ other_dt = PyArray_DTypeFromTypeNum(other);
+
+ /* The functions skip string == other_dt or unicode == other_dt */
+ if (add_other_to_and_from_string_cast(string, other_dt) < 0) {
+ goto finish;
+ }
+ if (add_other_to_and_from_string_cast(unicode, other_dt) < 0) {
+ goto finish;
+ }
+
+ /* Clear so the error path's XDECREF cannot double-free */
+ Py_SETREF(other_dt, NULL);
+ }
+
+ /* string<->string and unicode<->unicode have their own specialized casts */
+ PyArray_DTypeMeta *dtypes[2];
+ PyType_Slot slots[] = {
+ {NPY_METH_get_loop, NULL},
+ {NPY_METH_resolve_descriptors, &string_to_string_resolve_descriptors},
+ {0, NULL}};
+ PyArrayMethod_Spec spec = {
+ .name = "string_to_string_cast",
+ .casting = NPY_NO_CASTING,
+ .nin = 1,
+ .nout = 1,
+ .flags = (NPY_METH_REQUIRES_PYAPI |
+ NPY_METH_NO_FLOATINGPOINT_ERRORS |
+ NPY_METH_SUPPORTS_UNALIGNED),
+ .dtypes = dtypes,
+ .slots = slots,
+ };
+
+ /* The same spec is registered twice, once per string DType */
+ dtypes[0] = string;
+ dtypes[1] = string;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto finish;
+ }
+
+ dtypes[0] = unicode;
+ dtypes[1] = unicode;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto finish;
+ }
+
+ result = 0;
+ finish:
+ Py_DECREF(string);
+ Py_DECREF(unicode);
+ Py_XDECREF(other_dt);
+ return result;
+}
+
+
+/*
+ * Small helper function to handle the case of `arr.astype(dtype="V")`.
+ * When the output descriptor is not passed, we always use `V<itemsize>`
+ * of the other dtype.
+ */
+static NPY_CASTING
+cast_to_void_dtype_class(
+ PyArray_Descr **given_descrs, PyArray_Descr **loop_descrs)
+{
+ /* `dtype="V"` means unstructured currently (compare final path) */
+ loop_descrs[1] = PyArray_DescrNewFromType(NPY_VOID);
+ if (loop_descrs[1] == NULL) {
+ return -1;
+ }
+ /* Take over the source itemsize so the raw bytes can simply be viewed */
+ loop_descrs[1]->elsize = given_descrs[0]->elsize;
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+ return NPY_SAFE_CASTING | _NPY_CAST_IS_VIEW;
+}
+
+
+/* Resolver for casting any non-void dtype into a void/structured dtype;
+ * the overall safety is the minimum over all target fields/subarray base. */
+static NPY_CASTING
+nonstructured_to_structured_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]),
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ NPY_CASTING casting;
+
+ if (given_descrs[1] == NULL) {
+ return cast_to_void_dtype_class(given_descrs, loop_descrs);
+ }
+
+ if (given_descrs[1]->subarray != NULL) {
+ /*
+ * We currently consider this at most a safe cast. It would be
+ * possible to allow a view if the field has exactly one element.
+ */
+ casting = NPY_SAFE_CASTING;
+ /* Subarray dtype */
+ NPY_CASTING base_casting = PyArray_GetCastSafety(
+ given_descrs[0], given_descrs[1]->subarray->base, NULL);
+ if (base_casting < 0) {
+ return -1;
+ }
+ casting = PyArray_MinCastSafety(casting, base_casting);
+ }
+ else if (given_descrs[1]->names != NULL) {
+ /* Structured dtype */
+ if (PyTuple_Size(given_descrs[1]->names) == 0) {
+ /* TODO: This retained behaviour, but likely should be changed. */
+ casting = NPY_UNSAFE_CASTING;
+ }
+ else {
+ /* Considered at most unsafe casting (but this could be changed) */
+ casting = NPY_UNSAFE_CASTING;
+ if (PyTuple_Size(given_descrs[1]->names) == 1) {
+ /* A view may be acceptable */
+ casting |= _NPY_CAST_IS_VIEW;
+ }
+
+ /* Fold in the cast safety of every field of the target */
+ Py_ssize_t pos = 0;
+ PyObject *key, *tuple;
+ while (PyDict_Next(given_descrs[1]->fields, &pos, &key, &tuple)) {
+ PyArray_Descr *field_descr = (PyArray_Descr *)PyTuple_GET_ITEM(tuple, 0);
+ NPY_CASTING field_casting = PyArray_GetCastSafety(
+ given_descrs[0], field_descr, NULL);
+ casting = PyArray_MinCastSafety(casting, field_casting);
+ if (casting < 0) {
+ return -1;
+ }
+ }
+ }
+ }
+ else {
+ /* Plain void type. This behaves much like a "view" */
+ if (given_descrs[0]->elsize == given_descrs[1]->elsize &&
+ !PyDataType_REFCHK(given_descrs[0])) {
+ /*
+ * A simple view, at the moment considered "safe" (the refcheck is
+ * probably not necessary, but more future proof
+ */
+ casting = NPY_SAFE_CASTING | _NPY_CAST_IS_VIEW;
+ }
+ else if (given_descrs[0]->elsize <= given_descrs[1]->elsize) {
+ casting = NPY_SAFE_CASTING;
+ }
+ else {
+ casting = NPY_UNSAFE_CASTING;
+ }
+ }
+
+ /* Void dtypes always do the full cast. */
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+
+ return casting;
+}
+
+
+/* Set (if not already set) an internal-error for a field lookup that should
+ * never fail; always returns -1 so callers can `return give_bad_field_error(...)`. */
+int give_bad_field_error(PyObject *key)
+{
+ if (!PyErr_Occurred()) {
+ PyErr_Format(PyExc_RuntimeError,
+ "Invalid or missing field %R, this should be impossible "
+ "and indicates a NumPy bug.", key);
+ }
+ return -1;
+}
+
+
+/* Lazily create (and cache in a static) the single ArrayMethod used for
+ * every "any dtype -> void/structured" cast. */
+static PyObject *
+PyArray_GetGenericToVoidCastingImpl(void)
+{
+ static PyArrayMethodObject *method = NULL;
+
+ if (method != NULL) {
+ Py_INCREF(method);
+ return (PyObject *)method;
+ }
+
+ method = PyObject_New(PyArrayMethodObject, &PyArrayMethod_Type);
+ if (method == NULL) {
+ return PyErr_NoMemory();
+ }
+
+ /* NOTE(review): PyObject_New does not zero the struct; fields other than
+  * those assigned below stay uninitialized -- confirm none are read.
+  * Also, the first call returns the sole reference while the static cache
+  * keeps the pointer; verify callers' decrefs cannot free it. */
+ method->name = "any_to_void_cast";
+ method->flags = NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_REQUIRES_PYAPI;
+ method->casting = NPY_SAFE_CASTING;
+ method->resolve_descriptors = &nonstructured_to_structured_resolve_descriptors;
+ method->get_strided_loop = NULL;
+
+ return (PyObject *)method;
+}
+
+
+/* Resolver for casting a void/structured dtype to any non-void dtype.
+ * Only single-field structs (or subarrays) are allowed; always unsafe. */
+static NPY_CASTING
+structured_to_nonstructured_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ PyArray_Descr *base_descr;
+
+ if (given_descrs[0]->subarray != NULL) {
+ base_descr = given_descrs[0]->subarray->base;
+ }
+ else if (given_descrs[0]->names != NULL) {
+ if (PyTuple_Size(given_descrs[0]->names) != 1) {
+ /* Only allow casting a single field */
+ return -1;
+ }
+ /* Both lookups return borrowed references */
+ PyObject *key = PyTuple_GetItem(given_descrs[0]->names, 0);
+ PyObject *base_tup = PyDict_GetItem(given_descrs[0]->fields, key);
+ base_descr = (PyArray_Descr *)PyTuple_GET_ITEM(base_tup, 0);
+ }
+ else {
+ /*
+ * unstructured voids are considered unsafe casts and defined, albeit,
+ * at this time they go back to legacy behaviour using getitem/setitem.
+ */
+ base_descr = NULL;
+ }
+
+ /*
+ * The cast is always considered unsafe, so the PyArray_GetCastSafety
+ * result currently does not matter.
+ */
+ if (base_descr != NULL && PyArray_GetCastSafety(
+ base_descr, given_descrs[1], dtypes[1]) < 0) {
+ return -1;
+ }
+
+ /* Void dtypes always do the full cast. */
+ if (given_descrs[1] == NULL) {
+ loop_descrs[1] = dtypes[1]->default_descr(dtypes[1]);
+ /*
+ * Special case strings here, it should be useless (and only actually
+ * work for empty arrays). Possibly this should simply raise for
+ * all parametric DTypes.
+ */
+ if (dtypes[1]->type_num == NPY_STRING) {
+ loop_descrs[1]->elsize = given_descrs[0]->elsize;
+ }
+ else if (dtypes[1]->type_num == NPY_UNICODE) {
+ /* Unicode needs 4 bytes per character */
+ loop_descrs[1]->elsize = given_descrs[0]->elsize * 4;
+ }
+ }
+ else {
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+
+ return NPY_UNSAFE_CASTING;
+}
+
+
+static PyObject *
+PyArray_GetVoidToGenericCastingImpl(void)
+{
+ static PyArrayMethodObject *method = NULL;
+
+ if (method != NULL) {
+ Py_INCREF(method);
+ return (PyObject *)method;
+ }
+
+ method = PyObject_New(PyArrayMethodObject, &PyArrayMethod_Type);
+ if (method == NULL) {
+ return PyErr_NoMemory();
+ }
+
+ method->name = "void_to_any_cast";
+ method->flags = NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_REQUIRES_PYAPI;
+ method->casting = NPY_UNSAFE_CASTING;
+ method->resolve_descriptors = &structured_to_nonstructured_resolve_descriptors;
+ method->get_strided_loop = NULL;
+
+ return (PyObject *)method;
+}
+
+
+/*
+ * Find the correct field casting safety. See the TODO note below;
+ * in 1.20 (and later) this was based on field names rather than field
+ * order, which it should be using.
+ *
+ * NOTE: In theory it would be possible to cache all the field casting
+ * implementations on the dtype, to avoid duplicate work.
+ */
+static NPY_CASTING
+can_cast_fields_safety(PyArray_Descr *from, PyArray_Descr *to)
+{
+ NPY_CASTING casting = NPY_NO_CASTING | _NPY_CAST_IS_VIEW;
+
+ Py_ssize_t field_count = PyTuple_Size(from->names);
+ if (field_count != PyTuple_Size(to->names)) {
+ /* TODO: This should be rejected! */
+ return NPY_UNSAFE_CASTING;
+ }
+ for (Py_ssize_t i = 0; i < field_count; i++) {
+ PyObject *from_key = PyTuple_GET_ITEM(from->names, i);
+ PyObject *from_tup = PyDict_GetItemWithError(from->fields, from_key);
+ if (from_tup == NULL) {
+ return give_bad_field_error(from_key);
+ }
+ PyArray_Descr *from_base = (PyArray_Descr*)PyTuple_GET_ITEM(from_tup, 0);
+
+ /*
+ * TODO: This should use to_key (order), compare gh-15509
+ * by Allan Haldane. And raise an error on failure.
+ * (Fixing that may also require fixing/changing promotion.)
+ */
+ PyObject *to_tup = PyDict_GetItem(to->fields, from_key);
+ if (to_tup == NULL) {
+ return NPY_UNSAFE_CASTING;
+ }
+ PyArray_Descr *to_base = (PyArray_Descr*)PyTuple_GET_ITEM(to_tup, 0);
+
+ NPY_CASTING field_casting = PyArray_GetCastSafety(from_base, to_base, NULL);
+ if (field_casting < 0) {
+ return -1;
+ }
+ casting = PyArray_MinCastSafety(casting, field_casting);
+ }
+ if (!(casting & _NPY_CAST_IS_VIEW)) {
+ assert((casting & ~_NPY_CAST_IS_VIEW) != NPY_NO_CASTING);
+ return casting;
+ }
+
+ /*
+ * If the itemsize (which includes padding at the end), fields, or names
+ * do not match, this cannot be a view and also not a "no" cast
+ * (identical dtypes).
+ * It may be possible that this can be relaxed in some cases.
+ */
+ if (from->elsize != to->elsize) {
+ /*
+ * The itemsize may mismatch even if all fields and formats match
+ * (due to additional padding).
+ */
+ return PyArray_MinCastSafety(casting, NPY_EQUIV_CASTING);
+ }
+
+ int cmp = PyObject_RichCompareBool(from->fields, to->fields, Py_EQ);
+ if (cmp != 1) {
+ if (cmp == -1) {
+ PyErr_Clear();
+ }
+ return PyArray_MinCastSafety(casting, NPY_EQUIV_CASTING);
+ }
+ cmp = PyObject_RichCompareBool(from->names, to->names, Py_EQ);
+ if (cmp != 1) {
+ if (cmp == -1) {
+ PyErr_Clear();
+ }
+ return PyArray_MinCastSafety(casting, NPY_EQUIV_CASTING);
+ }
+ return casting;
+}
+
+
+static NPY_CASTING
+void_to_void_resolve_descriptors(
+ PyArrayMethodObject *self,
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ NPY_CASTING casting;
+
+ if (given_descrs[1] == NULL) {
+ /* This is weird, since it doesn't return the original descr, but... */
+ return cast_to_void_dtype_class(given_descrs, loop_descrs);
+ }
+
+ if (given_descrs[0]->names != NULL && given_descrs[1]->names != NULL) {
+ /* From structured to structured, need to check fields */
+ casting = can_cast_fields_safety(given_descrs[0], given_descrs[1]);
+ }
+ else if (given_descrs[0]->names != NULL) {
+ return structured_to_nonstructured_resolve_descriptors(
+ self, dtypes, given_descrs, loop_descrs);
+ }
+ else if (given_descrs[1]->names != NULL) {
+ return nonstructured_to_structured_resolve_descriptors(
+ self, dtypes, given_descrs, loop_descrs);
+ }
+ else if (given_descrs[0]->subarray == NULL &&
+ given_descrs[1]->subarray == NULL) {
+ /* Both are plain void dtypes */
+ if (given_descrs[0]->elsize == given_descrs[1]->elsize) {
+ casting = NPY_NO_CASTING | _NPY_CAST_IS_VIEW;
+ }
+ else if (given_descrs[0]->elsize < given_descrs[1]->elsize) {
+ casting = NPY_SAFE_CASTING;
+ }
+ else {
+ casting = NPY_SAME_KIND_CASTING;
+ }
+ }
+ else {
+ /*
+ * At this point, one of the dtypes must be a subarray dtype, the
+ * other is definitely not a structured one.
+ */
+ PyArray_ArrayDescr *from_sub = given_descrs[0]->subarray;
+ PyArray_ArrayDescr *to_sub = given_descrs[1]->subarray;
+ assert(from_sub || to_sub);
+
+ /* If the shapes do not match, this is at most an unsafe cast */
+ casting = NPY_UNSAFE_CASTING;
+ if (from_sub && to_sub) {
+ int res = PyObject_RichCompareBool(from_sub->shape, to_sub->shape, Py_EQ);
+ if (res < 0) {
+ return -1;
+ }
+ else if (res) {
+ /* Both are subarrays and the shape matches */
+ casting = NPY_NO_CASTING | _NPY_CAST_IS_VIEW;
+ }
+ }
+ NPY_CASTING field_casting = PyArray_GetCastSafety(
+ given_descrs[0]->subarray->base, given_descrs[1]->subarray->base, NULL);
+ if (field_casting < 0) {
+ return -1;
+ }
+ casting = PyArray_MinCastSafety(casting, field_casting);
+ }
+
+ /* Void dtypes always do the full cast. */
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+
+ return casting;
+}
+
+
+/*
+ * This initializes the void to void cast. Voids include structured dtypes,
+ * which means that they can cast from and to any other dtype and, in that
+ * sense, are special (similar to Object).
+ */
+static int
+PyArray_InitializeVoidToVoidCast(void)
+{
+ PyArray_DTypeMeta *Void = PyArray_DTypeFromTypeNum(NPY_VOID);
+ PyArray_DTypeMeta *dtypes[2] = {Void, Void};
+ PyType_Slot slots[] = {
+ {NPY_METH_get_loop, NULL},
+ {NPY_METH_resolve_descriptors, &void_to_void_resolve_descriptors},
+ {0, NULL}};
+ PyArrayMethod_Spec spec = {
+ .name = "void_to_void_cast",
+ .casting = NPY_NO_CASTING,
+ .nin = 1,
+ .nout = 1,
+ .flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_SUPPORTS_UNALIGNED,
+ .dtypes = dtypes,
+ .slots = slots,
+ };
+
+ int res = PyArray_AddCastingImplementation_FromSpec(&spec, 1);
+ Py_DECREF(Void);
+ return res;
+}
+
+
+/*
+ * Implement object to any casting implementation. Casting from object may
+ * require inspecting of all array elements (for parametric dtypes), and
+ * the resolver will thus reject all parametric dtypes if the out dtype
+ * is not provided.
+ */
+static NPY_CASTING
+object_to_any_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ if (given_descrs[1] == NULL) {
+ /*
+ * This should not really be called, since object -> parametric casts
+ * require inspecting the object array. Allow legacy ones, the path
+ * here is that e.g. "M8" input is considered to be the DType class,
+ * and by allowing it here, we go back to the "M8" instance.
+ */
+ if (dtypes[1]->parametric) {
+ PyErr_Format(PyExc_TypeError,
+ "casting from object to the parametric DType %S requires "
+ "the specified output dtype instance. "
+ "This may be a NumPy issue, since the correct instance "
+ "should be discovered automatically, however.", dtypes[1]);
+ return -1;
+ }
+ loop_descrs[1] = dtypes[1]->default_descr(dtypes[1]);
+ if (loop_descrs[1] == NULL) {
+ return -1;
+ }
+ }
+ else {
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+ return NPY_UNSAFE_CASTING;
+}
+
+
+/*
+ * Casting to object is special since it is generic to all input dtypes.
+ */
+static PyObject *
+PyArray_GetObjectToGenericCastingImpl(void)
+{
+ static PyArrayMethodObject *method = NULL;
+
+ if (method != NULL) {
+ Py_INCREF(method);
+ return (PyObject *)method;
+ }
+
+ method = PyObject_New(PyArrayMethodObject, &PyArrayMethod_Type);
+ if (method == NULL) {
+ return PyErr_NoMemory();
+ }
+
+ method->nin = 1;
+ method->nout = 1;
+ method->name = "object_to_any_cast";
+ method->flags = NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_REQUIRES_PYAPI;
+ method->casting = NPY_UNSAFE_CASTING;
+ method->resolve_descriptors = &object_to_any_resolve_descriptors;
+ method->get_strided_loop = NULL;
+
+ return (PyObject *)method;
+}
+
+
+
+/* Casting any dtype to object is simple (could even use the default) */
+static NPY_CASTING
+any_to_object_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ if (given_descrs[1] == NULL) {
+ loop_descrs[1] = dtypes[1]->default_descr(dtypes[1]);
+ if (loop_descrs[1] == NULL) {
+ return -1;
+ }
+ }
+ else {
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+ return NPY_SAFE_CASTING;
+}
+
+
+/*
+ * Casting to object is special since it is generic to all input dtypes.
+ */
+static PyObject *
+PyArray_GetGenericToObjectCastingImpl(void)
+{
+ static PyArrayMethodObject *method = NULL;
+
+ if (method != NULL) {
+ Py_INCREF(method);
+ return (PyObject *)method;
+ }
+
+ method = PyObject_New(PyArrayMethodObject, &PyArrayMethod_Type);
+ if (method == NULL) {
+ return PyErr_NoMemory();
+ }
+
+ method->nin = 1;
+ method->nout = 1;
+ method->name = "any_to_object_cast";
+ method->flags = NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_REQUIRES_PYAPI;
+ method->casting = NPY_SAFE_CASTING;
+ method->resolve_descriptors = &any_to_object_resolve_descriptors;
+ method->get_strided_loop = NULL;
+
+ return (PyObject *)method;
+}
+
+
+static int
+PyArray_InitializeObjectToObjectCast(void)
+{
+ /*
+ * The object dtype does not support byte order changes, so its cast
+ * is always a direct view.
+ */
+ PyArray_DTypeMeta *Object = PyArray_DTypeFromTypeNum(NPY_OBJECT);
+ PyArray_DTypeMeta *dtypes[2] = {Object, Object};
+ PyType_Slot slots[] = {
+ {NPY_METH_get_loop, NULL},
+ {0, NULL}};
+ PyArrayMethod_Spec spec = {
+ .name = "object_to_object_cast",
+ .casting = NPY_NO_CASTING | _NPY_CAST_IS_VIEW,
+ .nin = 1,
+ .nout = 1,
+ .flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_SUPPORTS_UNALIGNED,
+ .dtypes = dtypes,
+ .slots = slots,
+ };
+
+ int res = PyArray_AddCastingImplementation_FromSpec(&spec, 1);
+ Py_DECREF(Object);
+ return res;
+}
+
+
+NPY_NO_EXPORT int
+PyArray_InitializeCasts()
+{
+ if (PyArray_InitializeNumericCasts() < 0) {
+ return -1;
+ }
+ if (PyArray_InitializeStringCasts() < 0) {
+ return -1;
+ }
+ if (PyArray_InitializeVoidToVoidCast() < 0) {
+ return -1;
+ }
+ if (PyArray_InitializeObjectToObjectCast() < 0) {
+ return -1;
+ }
+ /* Datetime casts are defined in datetime.c */
+ if (PyArray_InitializeDatetimeCasts() < 0) {
+ return -1;
+ }
+ return 0;
+}
diff --git a/numpy/core/src/multiarray/convert_datatype.h b/numpy/core/src/multiarray/convert_datatype.h
index 9b7f39db2..cc1930f77 100644
--- a/numpy/core/src/multiarray/convert_datatype.h
+++ b/numpy/core/src/multiarray/convert_datatype.h
@@ -1,6 +1,13 @@
#ifndef _NPY_ARRAY_CONVERT_DATATYPE_H_
#define _NPY_ARRAY_CONVERT_DATATYPE_H_
+#include "array_method.h"
+
+extern NPY_NO_EXPORT npy_intp REQUIRED_STR_LEN[];
+
+NPY_NO_EXPORT PyObject *
+_get_castingimpl(PyObject *NPY_UNUSED(module), PyObject *args);
+
NPY_NO_EXPORT PyArray_VectorUnaryFunc *
PyArray_GetCastFunc(PyArray_Descr *descr, int type_num);
@@ -10,14 +17,23 @@ PyArray_ObjectType(PyObject *op, int minimum_type);
NPY_NO_EXPORT PyArrayObject **
PyArray_ConvertToCommonType(PyObject *op, int *retn);
+NPY_NO_EXPORT PyArray_DTypeMeta *
+PyArray_CommonDType(PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2);
+
NPY_NO_EXPORT int
PyArray_ValidType(int type);
+NPY_NO_EXPORT int
+dtype_kind_to_ordering(char kind);
+
/* Like PyArray_CanCastArrayTo */
NPY_NO_EXPORT npy_bool
can_cast_scalar_to(PyArray_Descr *scal_type, char *scal_data,
PyArray_Descr *to, NPY_CASTING casting);
+NPY_NO_EXPORT PyArray_Descr *
+ensure_dtype_nbo(PyArray_Descr *type);
+
NPY_NO_EXPORT int
should_use_min_scalar(npy_intp narrs, PyArrayObject **arr,
npy_intp ndtypes, PyArray_Descr **dtypes);
@@ -30,23 +46,37 @@ npy_set_invalid_cast_error(
PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
NPY_CASTING casting, npy_bool scalar);
-/*
- * This function calls Py_DECREF on flex_dtype, and replaces it with
- * a new dtype that has been adapted based on the values in data_dtype
- * and data_obj. If the flex_dtype is not flexible, it returns it as-is.
- *
- * Usually, if data_obj is not an array, dtype should be the result
- * given by the PyArray_GetArrayParamsFromObject function.
- *
- * The data_obj may be NULL if just a dtype is known for the source.
- *
- * If *flex_dtype is NULL, returns immediately, without setting an
- * exception, leaving any previous error handling intact.
- *
- * The current flexible dtypes include NPY_STRING, NPY_UNICODE, NPY_VOID,
- * and NPY_DATETIME with generic units.
- */
NPY_NO_EXPORT PyArray_Descr *
-PyArray_AdaptFlexibleDType(PyArray_Descr *data_dtype, PyArray_Descr *flex_dtype);
+PyArray_CastDescrToDType(PyArray_Descr *descr, PyArray_DTypeMeta *given_DType);
+
+NPY_NO_EXPORT int
+PyArray_AddCastingImplmentation(PyBoundArrayMethodObject *meth);
+
+NPY_NO_EXPORT int
+PyArray_AddCastingImplementation_FromSpec(PyArrayMethod_Spec *spec, int private);
+
+NPY_NO_EXPORT NPY_CASTING
+PyArray_MinCastSafety(NPY_CASTING casting1, NPY_CASTING casting2);
+
+NPY_NO_EXPORT NPY_CASTING
+PyArray_GetCastSafety(
+ PyArray_Descr *from, PyArray_Descr *to, PyArray_DTypeMeta *to_dtype);
+
+NPY_NO_EXPORT NPY_CASTING
+legacy_same_dtype_resolve_descriptors(
+ PyArrayMethodObject *self,
+ PyArray_DTypeMeta **dtypes,
+ PyArray_Descr **given_descrs,
+ PyArray_Descr **loop_descrs);
+
+NPY_NO_EXPORT NPY_CASTING
+simple_cast_resolve_descriptors(
+ PyArrayMethodObject *self,
+ PyArray_DTypeMeta **dtypes,
+ PyArray_Descr **input_descrs,
+ PyArray_Descr **loop_descrs);
+
+NPY_NO_EXPORT int
+PyArray_InitializeCasts(void);
#endif
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index cb448756b..f6031e370 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -300,12 +300,12 @@ _update_descr_and_dimensions(PyArray_Descr **des, npy_intp *newdims,
}
if (tuple) {
for (i = 0; i < numnew; i++) {
- mydim[i] = (npy_intp) PyInt_AsLong(
+ mydim[i] = (npy_intp) PyLong_AsLong(
PyTuple_GET_ITEM(old->subarray->shape, i));
}
}
else {
- mydim[0] = (npy_intp) PyInt_AsLong(old->subarray->shape);
+ mydim[0] = (npy_intp) PyLong_AsLong(old->subarray->shape);
}
if (newstrides) {
@@ -610,6 +610,7 @@ PyArray_AssignFromCache(PyArrayObject *self, coercion_cache_obj *cache) {
PyErr_SetString(PyExc_RuntimeError,
"Inconsistent object during array creation? "
"Content of sequences changed (cache not consumed).");
+ npy_free_coercion_cache(cache);
return -1;
}
return 0;
@@ -755,9 +756,11 @@ PyArray_NewFromDescr_int(
Py_DECREF(descr);
return NULL;
}
+ fa->_buffer_info = NULL;
fa->nd = nd;
fa->dimensions = NULL;
fa->data = NULL;
+
if (data == NULL) {
fa->flags = NPY_ARRAY_DEFAULT;
if (flags) {
@@ -868,11 +871,14 @@ PyArray_NewFromDescr_int(
func = PyObject_GetAttr((PyObject *)fa, npy_ma_str_array_finalize);
if (func && func != Py_None) {
- if (NpyCapsule_Check(func)) {
+ if (PyCapsule_CheckExact(func)) {
/* A C-function is stored here */
PyArray_FinalizeFunc *cfunc;
- cfunc = NpyCapsule_AsVoidPtr(func);
+ cfunc = PyCapsule_GetPointer(func, NULL);
Py_DECREF(func);
+ if (cfunc == NULL) {
+ goto fail;
+ }
if (cfunc((PyArrayObject *)fa, obj) < 0) {
goto fail;
}
@@ -1364,6 +1370,160 @@ PyArray_GetArrayParamsFromObject(PyObject *NPY_UNUSED(op),
}
+/*
+ * This function is a legacy implementation to retain subarray dtype
+ * behaviour in array coercion. The behaviour here makes sense if tuples
+ * of matching dimensionality are being coerced. Because the result
+ * is ill-defined for lists of array-likes, this is deprecated.
+ *
+ * WARNING: Do not use this function, it exists purely to support a deprecated
+ * code path.
+ */
+static int
+setArrayFromSequence(PyArrayObject *a, PyObject *s,
+ int dim, PyArrayObject * dst)
+{
+ Py_ssize_t i, slen;
+ int res = -1;
+
+ /* first recursion, view equal destination */
+ if (dst == NULL)
+ dst = a;
+
+ /*
+ * This code is to ensure that the sequence access below will
+ * return a lower-dimensional sequence.
+ */
+
+ /* INCREF on entry DECREF on exit */
+ Py_INCREF(s);
+
+ PyObject *seq = NULL;
+
+ if (PyArray_Check(s)) {
+ if (!(PyArray_CheckExact(s))) {
+ /*
+ * make sure a base-class array is used so that the dimensionality
+ * reduction assumption is correct.
+ */
+ /* This will DECREF(s) if replaced */
+ s = PyArray_EnsureArray(s);
+ if (s == NULL) {
+ goto fail;
+ }
+ }
+
+ /* dst points to correct array subsection */
+ if (PyArray_CopyInto(dst, (PyArrayObject *)s) < 0) {
+ goto fail;
+ }
+
+ Py_DECREF(s);
+ return 0;
+ }
+
+ if (dim > PyArray_NDIM(a)) {
+ PyErr_Format(PyExc_ValueError,
+ "setArrayFromSequence: sequence/array dimensions mismatch.");
+ goto fail;
+ }
+
+ /* Try __array__ before using s as a sequence */
+ PyObject *tmp = _array_from_array_like(s, NULL, 0, NULL);
+ if (tmp == NULL) {
+ goto fail;
+ }
+ else if (tmp == Py_NotImplemented) {
+ Py_DECREF(tmp);
+ }
+ else {
+ int r = PyArray_CopyInto(dst, (PyArrayObject *)tmp);
+ Py_DECREF(tmp);
+ if (r < 0) {
+ goto fail;
+ }
+ Py_DECREF(s);
+ return 0;
+ }
+
+ seq = PySequence_Fast(s, "Could not convert object to sequence");
+ if (seq == NULL) {
+ goto fail;
+ }
+ slen = PySequence_Fast_GET_SIZE(seq);
+
+ /*
+ * Either the dimensions match, or the sequence has length 1 and can
+ * be broadcast to the destination.
+ */
+ if (slen != PyArray_DIMS(a)[dim] && slen != 1) {
+ PyErr_Format(PyExc_ValueError,
+ "cannot copy sequence with size %zd to array axis "
+ "with dimension %" NPY_INTP_FMT, slen, PyArray_DIMS(a)[dim]);
+ goto fail;
+ }
+
+ /* Broadcast the one element from the sequence to all the outputs */
+ if (slen == 1) {
+ PyObject *o = PySequence_Fast_GET_ITEM(seq, 0);
+ npy_intp alen = PyArray_DIM(a, dim);
+
+ for (i = 0; i < alen; i++) {
+ if ((PyArray_NDIM(a) - dim) > 1) {
+ PyArrayObject * tmp =
+ (PyArrayObject *)array_item_asarray(dst, i);
+ if (tmp == NULL) {
+ goto fail;
+ }
+
+ res = setArrayFromSequence(a, o, dim+1, tmp);
+ Py_DECREF(tmp);
+ }
+ else {
+ char * b = (PyArray_BYTES(dst) + i * PyArray_STRIDES(dst)[0]);
+ res = PyArray_SETITEM(dst, b, o);
+ }
+ if (res < 0) {
+ goto fail;
+ }
+ }
+ }
+ /* Copy element by element */
+ else {
+ for (i = 0; i < slen; i++) {
+ PyObject * o = PySequence_Fast_GET_ITEM(seq, i);
+ if ((PyArray_NDIM(a) - dim) > 1) {
+ PyArrayObject * tmp =
+ (PyArrayObject *)array_item_asarray(dst, i);
+ if (tmp == NULL) {
+ goto fail;
+ }
+
+ res = setArrayFromSequence(a, o, dim+1, tmp);
+ Py_DECREF(tmp);
+ }
+ else {
+ char * b = (PyArray_BYTES(dst) + i * PyArray_STRIDES(dst)[0]);
+ res = PyArray_SETITEM(dst, b, o);
+ }
+ if (res < 0) {
+ goto fail;
+ }
+ }
+ }
+
+ Py_DECREF(seq);
+ Py_DECREF(s);
+ return 0;
+
+ fail:
+ Py_XDECREF(seq);
+ Py_DECREF(s);
+ return res;
+}
+
+
+
/*NUMPY_API
* Does not check for NPY_ARRAY_ENSURECOPY and NPY_ARRAY_NOTSWAPPED in flags
* Steals a reference to newtype --- which can be NULL
@@ -1404,6 +1564,71 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
if (ndim < 0) {
return NULL;
}
+
+ if (NPY_UNLIKELY(fixed_descriptor != NULL && PyDataType_HASSUBARRAY(dtype))) {
+ /*
+ * When a subarray dtype was passed in, its dimensions are appended
+ * to the array dimension (causing a dimension mismatch).
+ * There is a problem with that, because if we coerce from non-arrays
+ * we do this correctly by element (as defined by tuples), but for
+ * arrays we first append the dimensions and then assign to the base
+ * dtype and then assign which causes the problem.
+ *
+ * Thus, we check if there is an array included, in that case we
+ * give a FutureWarning.
+ * When the warning is removed, PyArray_Pack will have to ensure
+ * that it does not append the dimensions when creating the
+ * subarrays to assign `arr[0] = obj[0]`.
+ */
+ int includes_array = 0;
+ if (cache != NULL) {
+ /* This is not ideal, but it is a pretty special case */
+ coercion_cache_obj *next = cache;
+ while (next != NULL) {
+ if (!next->sequence) {
+ includes_array = 1;
+ break;
+ }
+ next = next->next;
+ }
+ }
+ if (includes_array) {
+ npy_free_coercion_cache(cache);
+
+ ret = (PyArrayObject *) PyArray_NewFromDescr(
+ &PyArray_Type, dtype, ndim, dims, NULL, NULL,
+ flags & NPY_ARRAY_F_CONTIGUOUS, NULL);
+ if (ret == NULL) {
+ return NULL;
+ }
+ assert(PyArray_NDIM(ret) != ndim);
+
+ /* NumPy 1.20, 2020-10-01 */
+ if (DEPRECATE_FUTUREWARNING(
+ "creating an array with a subarray dtype will behave "
+ "differently when the `np.array()` (or `asarray`, etc.) "
+ "call includes an array or array object.\n"
+ "If you are converting a single array or a list of arrays,"
+ "you can opt-in to the future behaviour using:\n"
+ " np.array(arr, dtype=np.dtype(['f', dtype]))['f']\n"
+ " np.array([arr1, arr2], dtype=np.dtype(['f', dtype]))['f']\n"
+ "\n"
+ "By including a new field and indexing it after the "
+ "conversion.\n"
+ "This may lead to a different result or to current failures "
+ "succeeding. (FutureWarning since NumPy 1.20)") < 0) {
+ Py_DECREF(ret);
+ return NULL;
+ }
+
+ if (setArrayFromSequence(ret, op, 0, NULL) < 0) {
+ Py_DECREF(ret);
+ return NULL;
+ }
+ return (PyObject *)ret;
+ }
+ }
+
if (dtype == NULL) {
dtype = PyArray_DescrFromType(NPY_DEFAULT_TYPE);
}
@@ -1457,6 +1682,31 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
((PyVoidScalarObject *)op)->flags,
NULL, op);
}
+ else if (cache == 0 && newtype != NULL &&
+ PyDataType_ISSIGNED(newtype) && PyArray_IsScalar(op, Generic)) {
+ assert(ndim == 0);
+ /*
+ * This is an (possible) inconsistency where:
+ *
+ * np.array(np.float64(np.nan), dtype=np.int64)
+ *
+ * behaves differently from:
+ *
+ * np.array([np.float64(np.nan)], dtype=np.int64)
+ * arr1d_int64[0] = np.float64(np.nan)
+ * np.array(np.array(np.nan), dtype=np.int64)
+ *
+ * by not raising an error instead of using typical casting.
+ * The error is desirable, but to always error seems like a
+ * larger change to be considered at some other time and it is
+ * undesirable that 0-D arrays behave differently from scalars.
+ * This retains the behaviour, largely due to issues in pandas
+ * which relied on a try/except (although hopefully that will
+ * have a better solution at some point):
+ * https://github.com/pandas-dev/pandas/issues/35481
+ */
+ return PyArray_FromScalar(op, dtype);
+ }
/* There was no array (or array-like) passed in directly. */
if ((flags & NPY_ARRAY_WRITEBACKIFCOPY) ||
@@ -1464,28 +1714,57 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
PyErr_SetString(PyExc_TypeError,
"WRITEBACKIFCOPY used for non-array input.");
Py_DECREF(dtype);
+ npy_free_coercion_cache(cache);
return NULL;
}
/* Create a new array and copy the data */
+ Py_INCREF(dtype); /* hold on in case of a subarray that is replaced */
ret = (PyArrayObject *)PyArray_NewFromDescr(
&PyArray_Type, dtype, ndim, dims, NULL, NULL,
flags&NPY_ARRAY_F_CONTIGUOUS, NULL);
if (ret == NULL) {
+ npy_free_coercion_cache(cache);
+ Py_DECREF(dtype);
return NULL;
}
+ if (ndim == PyArray_NDIM(ret)) {
+ /*
+ * Appending of dimensions did not occur, so use the actual dtype
+ * below. This is relevant for S0 or U0 which can be replaced with
+ * S1 or U1, although that should likely change.
+ */
+ Py_SETREF(dtype, PyArray_DESCR(ret));
+ Py_INCREF(dtype);
+ }
+
if (cache == NULL) {
/* This is a single item. Set it directly. */
assert(ndim == 0);
- if (PyArray_Pack(PyArray_DESCR(ret), PyArray_DATA(ret), op) < 0) {
+
+ if (PyArray_Pack(dtype, PyArray_BYTES(ret), op) < 0) {
+ Py_DECREF(dtype);
Py_DECREF(ret);
return NULL;
}
+ Py_DECREF(dtype);
return (PyObject *)ret;
}
assert(ndim != 0);
assert(op == cache->converted_obj);
- if (PyArray_AssignFromCache(ret, cache) < 0) {
+
+ /* Decrease the number of dimensions to the detected ones */
+ int out_ndim = PyArray_NDIM(ret);
+ PyArray_Descr *out_descr = PyArray_DESCR(ret);
+ ((PyArrayObject_fields *)ret)->nd = ndim;
+ ((PyArrayObject_fields *)ret)->descr = dtype;
+
+ int success = PyArray_AssignFromCache(ret, cache);
+
+ ((PyArrayObject_fields *)ret)->nd = out_ndim;
+ ((PyArrayObject_fields *)ret)->descr = out_descr;
+ Py_DECREF(dtype);
+ if (success < 0) {
Py_DECREF(ret);
return NULL;
}
@@ -1575,6 +1854,7 @@ PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int min_depth,
return obj;
}
+
/*NUMPY_API
* steals reference to newtype --- acc. NULL
*/
@@ -1733,10 +2013,8 @@ NPY_NO_EXPORT PyObject *
PyArray_FromStructInterface(PyObject *input)
{
PyArray_Descr *thetype = NULL;
- char buf[40];
PyArrayInterface *inter;
PyObject *attr;
- PyArrayObject *ret;
char endian = NPY_NATBYTE;
attr = PyArray_LookupSpecial_OnInstance(input, "__array_struct__");
@@ -1747,10 +2025,22 @@ PyArray_FromStructInterface(PyObject *input)
return Py_NotImplemented;
}
}
- if (!NpyCapsule_Check(attr)) {
+ if (!PyCapsule_CheckExact(attr)) {
+ if (PyType_Check(input) && PyObject_HasAttrString(attr, "__get__")) {
+ /*
+ * If the input is a class `attr` should be a property-like object.
+ * This cannot be interpreted as an array, but is valid.
+ * (Needed due to the lookup being on the instance rather than type)
+ */
+ Py_DECREF(attr);
+ return Py_NotImplemented;
+ }
+ goto fail;
+ }
+ inter = PyCapsule_GetPointer(attr, NULL);
+ if (inter == NULL) {
goto fail;
}
- inter = NpyCapsule_AsVoidPtr(attr);
if (inter->two != 2) {
goto fail;
}
@@ -1767,20 +2057,26 @@ PyArray_FromStructInterface(PyObject *input)
}
if (thetype == NULL) {
- PyOS_snprintf(buf, sizeof(buf),
- "%c%c%d", endian, inter->typekind, inter->itemsize);
- if (!(thetype=_array_typedescr_fromstr(buf))) {
+ PyObject *type_str = PyUnicode_FromFormat(
+ "%c%c%d", endian, inter->typekind, inter->itemsize);
+ if (type_str == NULL) {
+ Py_DECREF(attr);
+ return NULL;
+ }
+ int ok = PyArray_DescrConverter(type_str, &thetype);
+ Py_DECREF(type_str);
+ if (ok != NPY_SUCCEED) {
Py_DECREF(attr);
return NULL;
}
}
- ret = (PyArrayObject *)PyArray_NewFromDescrAndBase(
+ PyObject *ret = PyArray_NewFromDescrAndBase(
&PyArray_Type, thetype,
inter->nd, inter->shape, inter->strides, inter->data,
inter->flags, NULL, input);
Py_DECREF(attr);
- return (PyObject *)ret;
+ return ret;
fail:
PyErr_SetString(PyExc_ValueError, "invalid __array_struct__");
@@ -1794,41 +2090,21 @@ PyArray_FromStructInterface(PyObject *input)
*/
NPY_NO_EXPORT int
_is_default_descr(PyObject *descr, PyObject *typestr) {
- PyObject *tuple, *name, *typestr2;
- PyObject *tmp = NULL;
- int ret = 0;
-
if (!PyList_Check(descr) || PyList_GET_SIZE(descr) != 1) {
return 0;
}
- tuple = PyList_GET_ITEM(descr, 0);
+ PyObject *tuple = PyList_GET_ITEM(descr, 0);
if (!(PyTuple_Check(tuple) && PyTuple_GET_SIZE(tuple) == 2)) {
return 0;
}
- name = PyTuple_GET_ITEM(tuple, 0);
+ PyObject *name = PyTuple_GET_ITEM(tuple, 0);
if (!(PyUnicode_Check(name) && PyUnicode_GetLength(name) == 0)) {
return 0;
}
- typestr2 = PyTuple_GET_ITEM(tuple, 1);
- /* Allow unicode type strings */
- if (PyUnicode_Check(typestr2)) {
- tmp = PyUnicode_AsASCIIString(typestr2);
- if (tmp == NULL) {
- return 0;
- }
- typestr2 = tmp;
- }
- if (PyBytes_Check(typestr2) &&
- PyObject_RichCompareBool(typestr, typestr2, Py_EQ)) {
- ret = 1;
- }
- Py_XDECREF(tmp);
-
- return ret;
+ PyObject *typestr2 = PyTuple_GET_ITEM(tuple, 1);
+ return PyObject_RichCompareBool(typestr, typestr2, Py_EQ);
}
-#define PyIntOrLong_Check(obj) (PyInt_Check(obj) || PyLong_Check(obj))
-
/*NUMPY_API*/
NPY_NO_EXPORT PyObject *
PyArray_FromInterface(PyObject *origin)
@@ -1840,12 +2116,12 @@ PyArray_FromInterface(PyObject *origin)
PyArray_Descr *dtype = NULL;
char *data = NULL;
Py_buffer view;
- int res, i, n;
+ int i, n;
npy_intp dims[NPY_MAXDIMS], strides[NPY_MAXDIMS];
int dataflags = NPY_ARRAY_BEHAVED;
- iface = PyArray_LookupSpecial_OnInstance(origin,
- "__array_interface__");
+ iface = PyArray_LookupSpecial_OnInstance(origin, "__array_interface__");
+
if (iface == NULL) {
if (PyErr_Occurred()) {
PyErr_Clear(); /* TODO[gh-14801]: propagate crashes during attribute access? */
@@ -1853,6 +2129,16 @@ PyArray_FromInterface(PyObject *origin)
return Py_NotImplemented;
}
if (!PyDict_Check(iface)) {
+ if (PyType_Check(origin) && PyObject_HasAttrString(iface, "__get__")) {
+ /*
+ * If the input is a class `iface` should be a property-like object.
+ * This cannot be interpreted as an array, but is valid.
+ * (Needed due to the lookup being on the instance rather than type)
+ */
+ Py_DECREF(iface);
+ return Py_NotImplemented;
+ }
+
Py_DECREF(iface);
PyErr_SetString(PyExc_ValueError,
"Invalid __array_interface__ value, must be a dict");
@@ -1870,26 +2156,15 @@ PyArray_FromInterface(PyObject *origin)
return NULL;
}
- /* Allow unicode type strings */
- if (PyUnicode_Check(attr)) {
- PyObject *tmp = PyUnicode_AsASCIIString(attr);
- if (tmp == NULL) {
- goto fail;
- }
- attr = tmp;
- }
- else {
- Py_INCREF(attr);
- }
-
- if (!PyBytes_Check(attr)) {
+ /* allow bytes for backwards compatibility */
+ if (!PyBytes_Check(attr) && !PyUnicode_Check(attr)) {
PyErr_SetString(PyExc_TypeError,
"__array_interface__ typestr must be a string");
goto fail;
}
+
/* Get dtype from type string */
- dtype = _array_typedescr_fromstr(PyString_AS_STRING(attr));
- if (dtype == NULL) {
+ if (PyArray_DescrConverter(attr, &dtype) != NPY_SUCCEED) {
goto fail;
}
@@ -1903,16 +2178,24 @@ PyArray_FromInterface(PyObject *origin)
goto fail;
}
PyArray_Descr *new_dtype = NULL;
+ if (descr != NULL) {
+ int is_default = _is_default_descr(descr, attr);
+ if (is_default < 0) {
+ goto fail;
+ }
+ if (!is_default) {
+ if (PyArray_DescrConverter2(descr, &new_dtype) != NPY_SUCCEED) {
+ goto fail;
+ }
+ if (new_dtype != NULL) {
+ Py_DECREF(dtype);
+ dtype = new_dtype;
+ }
+ }
- if (descr != NULL && !_is_default_descr(descr, attr) &&
- PyArray_DescrConverter2(descr, &new_dtype) == NPY_SUCCEED &&
- new_dtype != NULL) {
- Py_DECREF(dtype);
- dtype = new_dtype;
}
- }
- Py_DECREF(attr); /* Pairs with the unicode handling above */
+ }
/* Get shape tuple from interface specification */
attr = _PyDict_GetItemStringWithError(iface, "shape");
@@ -1971,22 +2254,16 @@ PyArray_FromInterface(PyObject *origin)
goto fail;
}
dataptr = PyTuple_GET_ITEM(attr, 0);
- if (PyString_Check(dataptr)) {
- res = sscanf(PyString_AsString(dataptr),
- "%p", (void **)&data);
- if (res < 1) {
- PyErr_SetString(PyExc_TypeError,
- "__array_interface__ data string cannot be converted");
+ if (PyLong_Check(dataptr)) {
+ data = PyLong_AsVoidPtr(dataptr);
+ if (data == NULL && PyErr_Occurred()) {
goto fail;
}
}
- else if (PyIntOrLong_Check(dataptr)) {
- data = PyLong_AsVoidPtr(dataptr);
- }
else {
PyErr_SetString(PyExc_TypeError,
"first element of __array_interface__ data tuple "
- "must be integer or string.");
+ "must be an integer.");
goto fail;
}
if (PyObject_IsTrue(PyTuple_GET_ITEM(attr,1))) {
@@ -2119,6 +2396,16 @@ PyArray_FromArrayAttr(PyObject *op, PyArray_Descr *typecode, PyObject *context)
}
return Py_NotImplemented;
}
+ if (PyType_Check(op) && PyObject_HasAttrString(array_meth, "__get__")) {
+ /*
+ * If the input is a class, `array_meth` may be a property-like object.
+ * This cannot be interpreted as an array (i.e. called), but it is a
+ * valid attribute, so calling `array_meth.__call__()` would not be useful.
+ * (Needed because the lookup is on the instance rather than the type.)
+ */
+ Py_DECREF(array_meth);
+ return Py_NotImplemented;
+ }
if (typecode == NULL) {
new = PyObject_CallFunction(array_meth, NULL);
}
@@ -2236,7 +2523,10 @@ PyArray_EnsureAnyArray(PyObject *op)
return PyArray_EnsureArray(op);
}
-/* TODO: Put the order parameter in PyArray_CopyAnyInto and remove this */
+/*
+ * Private implementation of PyArray_CopyAnyInto with an additional order
+ * parameter.
+ */
NPY_NO_EXPORT int
PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
{
@@ -2362,16 +2652,21 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
src_count = *src_countptr;
dst_data = dst_dataptr[0];
src_data = src_dataptr[0];
+ int res = 0;
for(;;) {
/* Transfer the biggest amount that fits both */
count = (src_count < dst_count) ? src_count : dst_count;
- stransfer(dst_data, dst_stride,
- src_data, src_stride,
- count, src_itemsize, transferdata);
+ if (stransfer(
+ dst_data, dst_stride, src_data, src_stride,
+ count, src_itemsize, transferdata) < 0) {
+ res = -1;
+ break;
+ }
/* If we exhausted the dst block, refresh it */
if (dst_count == count) {
- if (!dst_iternext(dst_iter)) {
+ res = dst_iternext(dst_iter);
+ if (!res) {
break;
}
dst_count = *dst_countptr;
@@ -2384,7 +2679,8 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
/* If we exhausted the src block, refresh it */
if (src_count == count) {
- if (!src_iternext(src_iter)) {
+ res = src_iternext(src_iter);
+ if (!res) {
break;
}
src_count = *src_countptr;
@@ -2401,8 +2697,11 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
NPY_AUXDATA_FREE(transferdata);
NpyIter_Deallocate(dst_iter);
NpyIter_Deallocate(src_iter);
-
- return PyErr_Occurred() ? -1 : 0;
+ if (res > 0) {
+ /* The iteration stopped successfully, do not report an error */
+ return 0;
+ }
+ return res;
}
/*NUMPY_API
@@ -2712,7 +3011,7 @@ _calc_length(PyObject *start, PyObject *stop, PyObject *step, PyObject **next, i
return -1;
}
- zero = PyInt_FromLong(0);
+ zero = PyLong_FromLong(0);
if (!zero) {
Py_DECREF(*next);
*next = NULL;
@@ -2857,14 +3156,14 @@ PyArray_ArangeObj(PyObject *start, PyObject *stop, PyObject *step, PyArray_Descr
Py_INCREF(dtype);
}
if (!step || step == Py_None) {
- step = PyInt_FromLong(1);
+ step = PyLong_FromLong(1);
}
else {
Py_XINCREF(step);
}
if (!stop || stop == Py_None) {
stop = start;
- start = PyInt_FromLong(0);
+ start = PyLong_FromLong(0);
}
else {
Py_INCREF(start);
@@ -2957,7 +3256,7 @@ PyArray_ArangeObj(PyObject *start, PyObject *stop, PyObject *step, PyArray_Descr
return NULL;
}
-/* This array creation function steals the reference to dtype. */
+/* This array creation function does not steal the reference to dtype. */
static PyArrayObject *
array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, npy_intp num, size_t *nread)
{
@@ -2985,7 +3284,6 @@ array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, npy_intp num, size_t *nrea
if (fail) {
PyErr_SetString(PyExc_IOError,
"could not seek in file");
- Py_DECREF(dtype);
return NULL;
}
num = numbytes / dtype->elsize;
@@ -2997,6 +3295,7 @@ array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, npy_intp num, size_t *nrea
*/
elsize = dtype->elsize;
+ Py_INCREF(dtype); /* do not steal the original dtype. */
r = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, dtype, 1, &num,
NULL, NULL, 0, NULL);
if (r == NULL) {
@@ -3012,7 +3311,7 @@ array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, npy_intp num, size_t *nrea
/*
* Create an array by reading from the given stream, using the passed
* next_element and skip_separator functions.
- * As typical for array creation functions, it steals the reference to dtype.
+ * Does not steal the reference to dtype.
*/
#define FROM_BUFFER_SIZE 4096
static PyArrayObject *
@@ -3041,7 +3340,6 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char const *sep, size_t *nre
PyArray_NewFromDescr(&PyArray_Type, dtype, 1, &size,
NULL, NULL, 0, NULL);
if (r == NULL) {
- Py_DECREF(dtype);
return NULL;
}
@@ -3104,7 +3402,6 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char const *sep, size_t *nre
if (PyErr_Occurred()) {
/* If an error is already set (unlikely), do not create new one */
Py_DECREF(r);
- Py_DECREF(dtype);
return NULL;
}
/* 2019-09-12, NumPy 1.18 */
@@ -3116,7 +3413,6 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char const *sep, size_t *nre
}
fail:
- Py_DECREF(dtype);
if (err == 1) {
PyErr_NoMemory();
}
@@ -3182,20 +3478,26 @@ PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, npy_intp num, char *sep)
(skip_separator) fromfile_skip_separator, NULL);
}
if (ret == NULL) {
+ Py_DECREF(dtype);
return NULL;
}
if (((npy_intp) nread) < num) {
- /* Realloc memory for smaller number of elements */
- const size_t nsize = PyArray_MAX(nread,1)*PyArray_DESCR(ret)->elsize;
+ /*
+ * Realloc memory for a smaller number of elements; use the original
+ * dtype, which may include a subarray (and is used for `nread`).
+ */
+ const size_t nsize = PyArray_MAX(nread,1) * dtype->elsize;
char *tmp;
- if((tmp = PyDataMem_RENEW(PyArray_DATA(ret), nsize)) == NULL) {
+ if ((tmp = PyDataMem_RENEW(PyArray_DATA(ret), nsize)) == NULL) {
+ Py_DECREF(dtype);
Py_DECREF(ret);
return PyErr_NoMemory();
}
((PyArrayObject_fields *)ret)->data = tmp;
PyArray_DIMS(ret)[0] = nread;
}
+ Py_DECREF(dtype);
return (PyObject *)ret;
}
@@ -3406,6 +3708,7 @@ PyArray_FromString(char *data, npy_intp slen, PyArray_Descr *dtype,
(next_element) fromstr_next_element,
(skip_separator) fromstr_skip_separator,
end);
+ Py_DECREF(dtype);
}
return (PyObject *)ret;
}
diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c
index 8f3948c23..9c1b606bb 100644
--- a/numpy/core/src/multiarray/datetime.c
+++ b/numpy/core/src/multiarray/datetime.c
@@ -25,6 +25,9 @@
#include "_datetime.h"
#include "datetime_strings.h"
#include "convert_datatype.h"
+#include "array_method.h"
+#include "dtypemeta.h"
+#include "usertypes.h"
/*
* Computes the python `ret, d = divmod(d, unit)`.
@@ -1434,18 +1437,20 @@ raise_if_datetime64_metadata_cast_error(char *object_type,
return 0;
}
else {
- PyObject *errmsg;
- errmsg = PyUString_FromFormat("Cannot cast %s "
- "from metadata ", object_type);
- errmsg = append_metastr_to_string(src_meta, 0, errmsg);
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromString(" to "));
- errmsg = append_metastr_to_string(dst_meta, 0, errmsg);
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromFormat(" according to the rule %s",
- npy_casting_to_string(casting)));
- PyErr_SetObject(PyExc_TypeError, errmsg);
- Py_DECREF(errmsg);
+ PyObject *src = metastr_to_unicode(src_meta, 0);
+ if (src == NULL) {
+ return -1;
+ }
+ PyObject *dst = metastr_to_unicode(dst_meta, 0);
+ if (dst == NULL) {
+ Py_DECREF(src);
+ return -1;
+ }
+ PyErr_Format(PyExc_TypeError,
+ "Cannot cast %s from metadata %S to %S according to the rule %s",
+ object_type, src, dst, npy_casting_to_string(casting));
+ Py_DECREF(src);
+ Py_DECREF(dst);
return -1;
}
}
@@ -1466,18 +1471,20 @@ raise_if_timedelta64_metadata_cast_error(char *object_type,
return 0;
}
else {
- PyObject *errmsg;
- errmsg = PyUString_FromFormat("Cannot cast %s "
- "from metadata ", object_type);
- errmsg = append_metastr_to_string(src_meta, 0, errmsg);
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromString(" to "));
- errmsg = append_metastr_to_string(dst_meta, 0, errmsg);
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromFormat(" according to the rule %s",
- npy_casting_to_string(casting)));
- PyErr_SetObject(PyExc_TypeError, errmsg);
- Py_DECREF(errmsg);
+ PyObject *src = metastr_to_unicode(src_meta, 0);
+ if (src == NULL) {
+ return -1;
+ }
+ PyObject *dst = metastr_to_unicode(dst_meta, 0);
+ if (dst == NULL) {
+ Py_DECREF(src);
+ return -1;
+ }
+ PyErr_Format(PyExc_TypeError,
+ "Cannot cast %s from metadata %S to %S according to the rule %s",
+ object_type, src, dst, npy_casting_to_string(casting));
+ Py_DECREF(src);
+ Py_DECREF(dst);
return -1;
}
}
@@ -1600,32 +1607,38 @@ compute_datetime_metadata_greatest_common_divisor(
return 0;
incompatible_units: {
- PyObject *errmsg;
- errmsg = PyUString_FromString("Cannot get "
- "a common metadata divisor for "
- "NumPy datetime metadata ");
- errmsg = append_metastr_to_string(meta1, 0, errmsg);
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromString(" and "));
- errmsg = append_metastr_to_string(meta2, 0, errmsg);
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromString(" because they have "
- "incompatible nonlinear base time units"));
- PyErr_SetObject(PyExc_TypeError, errmsg);
- Py_DECREF(errmsg);
+ PyObject *umeta1 = metastr_to_unicode(meta1, 0);
+ if (umeta1 == NULL) {
+ return -1;
+ }
+ PyObject *umeta2 = metastr_to_unicode(meta2, 0);
+ if (umeta2 == NULL) {
+ Py_DECREF(umeta1);
+ return -1;
+ }
+ PyErr_Format(PyExc_TypeError,
+ "Cannot get a common metadata divisor for Numpy datatime "
+ "metadata %S and %S because they have incompatible nonlinear "
+ "base time units.", umeta1, umeta2);
+ Py_DECREF(umeta1);
+ Py_DECREF(umeta2);
return -1;
}
units_overflow: {
- PyObject *errmsg;
- errmsg = PyUString_FromString("Integer overflow "
- "getting a common metadata divisor for "
- "NumPy datetime metadata ");
- errmsg = append_metastr_to_string(meta1, 0, errmsg);
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromString(" and "));
- errmsg = append_metastr_to_string(meta2, 0, errmsg);
- PyErr_SetObject(PyExc_OverflowError, errmsg);
- Py_DECREF(errmsg);
+ PyObject *umeta1 = metastr_to_unicode(meta1, 0);
+ if (umeta1 == NULL) {
+ return -1;
+ }
+ PyObject *umeta2 = metastr_to_unicode(meta2, 0);
+ if (umeta2 == NULL) {
+ Py_DECREF(umeta1);
+ return -1;
+ }
+ PyErr_Format(PyExc_OverflowError,
+ "Integer overflow getting a common metadata divisor for "
+ "NumPy datetime metadata %S and %S.", umeta1, umeta2);
+ Py_DECREF(umeta1);
+ Py_DECREF(umeta2);
return -1;
}
}
@@ -1717,6 +1730,10 @@ parse_datetime_unit_from_string(char const *str, Py_ssize_t len, char const *met
return NPY_FR_as;
}
}
+ else if (len == 3 && !strncmp(str, "\xce\xbcs", 3)) {
+ /* greek small letter mu, utf8-encoded */
+ return NPY_FR_us;
+ }
else if (len == 7 && !strncmp(str, "generic", 7)) {
return NPY_FR_GENERIC;
}
@@ -1747,9 +1764,9 @@ convert_datetime_metadata_to_tuple(PyArray_DatetimeMetaData *meta)
}
PyTuple_SET_ITEM(dt_tuple, 0,
- PyUString_FromString(_datetime_strings[meta->base]));
+ PyUnicode_FromString(_datetime_strings[meta->base]));
PyTuple_SET_ITEM(dt_tuple, 1,
- PyInt_FromLong(meta->num));
+ PyLong_FromLong(meta->num));
return dt_tuple;
}
@@ -1764,22 +1781,16 @@ convert_datetime_metadata_tuple_to_datetime_metadata(PyObject *tuple,
PyArray_DatetimeMetaData *out_meta,
npy_bool from_pickle)
{
- char *basestr = NULL;
- Py_ssize_t len = 0, tuple_size;
int den = 1;
- PyObject *unit_str = NULL;
if (!PyTuple_Check(tuple)) {
- PyObject *errmsg;
- errmsg = PyUString_FromString("Require tuple for tuple to NumPy "
- "datetime metadata conversion, not ");
- PyUString_ConcatAndDel(&errmsg, PyObject_Repr(tuple));
- PyErr_SetObject(PyExc_TypeError, errmsg);
- Py_DECREF(errmsg);
+ PyErr_Format(PyExc_TypeError,
+ "Require tuple for tuple to NumPy "
+ "datetime metadata conversion, not %R", tuple);
return -1;
}
- tuple_size = PyTuple_GET_SIZE(tuple);
+ Py_ssize_t tuple_size = PyTuple_GET_SIZE(tuple);
if (tuple_size < 2 || tuple_size > 4) {
PyErr_SetString(PyExc_TypeError,
"Require tuple of size 2 to 4 for "
@@ -1787,18 +1798,22 @@ convert_datetime_metadata_tuple_to_datetime_metadata(PyObject *tuple,
return -1;
}
- unit_str = PyTuple_GET_ITEM(tuple, 0);
- Py_INCREF(unit_str);
- if (PyUnicode_Check(unit_str)) {
- /* Allow unicode format strings: convert to bytes */
- PyObject *tmp = PyUnicode_AsASCIIString(unit_str);
- Py_DECREF(unit_str);
+ PyObject *unit_str = PyTuple_GET_ITEM(tuple, 0);
+ if (PyBytes_Check(unit_str)) {
+ /* Allow bytes format strings: convert to unicode */
+ PyObject *tmp = PyUnicode_FromEncodedObject(unit_str, NULL, NULL);
if (tmp == NULL) {
return -1;
}
unit_str = tmp;
}
- if (PyBytes_AsStringAndSize(unit_str, &basestr, &len) < 0) {
+ else {
+ Py_INCREF(unit_str);
+ }
+
+ Py_ssize_t len;
+ char const *basestr = PyUnicode_AsUTF8AndSize(unit_str, &len);
+ if (basestr == NULL) {
Py_DECREF(unit_str);
return -1;
}
@@ -1812,7 +1827,7 @@ convert_datetime_metadata_tuple_to_datetime_metadata(PyObject *tuple,
Py_DECREF(unit_str);
/* Convert the values to longs */
- out_meta->num = PyInt_AsLong(PyTuple_GET_ITEM(tuple, 1));
+ out_meta->num = PyLong_AsLong(PyTuple_GET_ITEM(tuple, 1));
if (error_converting(out_meta->num)) {
return -1;
}
@@ -1837,11 +1852,10 @@ convert_datetime_metadata_tuple_to_datetime_metadata(PyObject *tuple,
if (from_pickle) {
/* if (event == 1) */
PyObject *one = PyLong_FromLong(1);
- int equal_one;
if (one == NULL) {
return -1;
}
- equal_one = PyObject_RichCompareBool(event, one, Py_EQ);
+ int equal_one = PyObject_RichCompareBool(event, one, Py_EQ);
Py_DECREF(one);
if (equal_one == -1) {
return -1;
@@ -1868,7 +1882,7 @@ convert_datetime_metadata_tuple_to_datetime_metadata(PyObject *tuple,
return -1;
}
}
- den = PyInt_AsLong(PyTuple_GET_ITEM(tuple, 2));
+ den = PyLong_AsLong(PyTuple_GET_ITEM(tuple, 2));
if (error_converting(den)) {
return -1;
}
@@ -1900,26 +1914,23 @@ NPY_NO_EXPORT int
convert_pyobject_to_datetime_metadata(PyObject *obj,
PyArray_DatetimeMetaData *out_meta)
{
- PyObject *ascii = NULL;
- char *str = NULL;
- Py_ssize_t len = 0;
-
if (PyTuple_Check(obj)) {
return convert_datetime_metadata_tuple_to_datetime_metadata(
obj, out_meta, NPY_FALSE);
}
- /* Get an ASCII string */
- if (PyUnicode_Check(obj)) {
- /* Allow unicode format strings: convert to bytes */
- ascii = PyUnicode_AsASCIIString(obj);
- if (ascii == NULL) {
+ /* Get a UTF8 string */
+ PyObject *utf8 = NULL;
+ if (PyBytes_Check(obj)) {
+ /* Allow bytes format strings: convert to unicode */
+ utf8 = PyUnicode_FromEncodedObject(obj, NULL, NULL);
+ if (utf8 == NULL) {
return -1;
}
}
- else if (PyBytes_Check(obj)) {
- ascii = obj;
- Py_INCREF(ascii);
+ else if (PyUnicode_Check(obj)) {
+ utf8 = obj;
+ Py_INCREF(utf8);
}
else {
PyErr_SetString(PyExc_TypeError,
@@ -1927,58 +1938,52 @@ convert_pyobject_to_datetime_metadata(PyObject *obj,
return -1;
}
- if (PyBytes_AsStringAndSize(ascii, &str, &len) < 0) {
- Py_DECREF(ascii);
+ Py_ssize_t len = 0;
+ char const *str = PyUnicode_AsUTF8AndSize(utf8, &len);
+ if (str == NULL) {
+ Py_DECREF(utf8);
return -1;
}
if (len > 0 && str[0] == '[') {
int r = parse_datetime_metadata_from_metastr(str, len, out_meta);
- Py_DECREF(ascii);
+ Py_DECREF(utf8);
return r;
}
else {
if (parse_datetime_extended_unit_from_string(str, len,
NULL, out_meta) < 0) {
- Py_DECREF(ascii);
+ Py_DECREF(utf8);
return -1;
}
- Py_DECREF(ascii);
+ Py_DECREF(utf8);
return 0;
}
}
/*
- * 'ret' is a PyUString containing the datetime string, and this
- * function appends the metadata string to it.
+ * Return the datetime metadata as a Unicode object.
+ *
+ * Returns new reference, NULL on error.
*
* If 'skip_brackets' is true, skips the '[]'.
*
- * This function steals the reference 'ret'
*/
NPY_NO_EXPORT PyObject *
-append_metastr_to_string(PyArray_DatetimeMetaData *meta,
- int skip_brackets,
- PyObject *ret)
+metastr_to_unicode(PyArray_DatetimeMetaData *meta, int skip_brackets)
{
- PyObject *res;
int num;
char const *basestr;
- if (ret == NULL) {
- return NULL;
- }
-
if (meta->base == NPY_FR_GENERIC) {
/* Without brackets, give a string "generic" */
if (skip_brackets) {
- PyUString_ConcatAndDel(&ret, PyUString_FromString("generic"));
- return ret;
+ return PyUnicode_FromString("generic");
}
- /* But with brackets, append nothing */
+ /* But with brackets, return an empty string */
else {
- return ret;
+ return PyUnicode_FromString("");
}
}
@@ -1994,25 +1999,23 @@ append_metastr_to_string(PyArray_DatetimeMetaData *meta,
if (num == 1) {
if (skip_brackets) {
- res = PyUString_FromFormat("%s", basestr);
+ return PyUnicode_FromFormat("%s", basestr);
}
else {
- res = PyUString_FromFormat("[%s]", basestr);
+ return PyUnicode_FromFormat("[%s]", basestr);
}
}
else {
if (skip_brackets) {
- res = PyUString_FromFormat("%d%s", num, basestr);
+ return PyUnicode_FromFormat("%d%s", num, basestr);
}
else {
- res = PyUString_FromFormat("[%d%s]", num, basestr);
+ return PyUnicode_FromFormat("[%d%s]", num, basestr);
}
}
-
- PyUString_ConcatAndDel(&ret, res);
- return ret;
}
+
/*
* Adjusts a datetimestruct based on a seconds offset. Assumes
* the current values are valid.
@@ -2108,7 +2111,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
if (tmp == NULL) {
return -1;
}
- out->year = PyInt_AsLong(tmp);
+ out->year = PyLong_AsLong(tmp);
if (error_converting(out->year)) {
Py_DECREF(tmp);
return -1;
@@ -2120,7 +2123,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
if (tmp == NULL) {
return -1;
}
- out->month = PyInt_AsLong(tmp);
+ out->month = PyLong_AsLong(tmp);
if (error_converting(out->month)) {
Py_DECREF(tmp);
return -1;
@@ -2132,7 +2135,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
if (tmp == NULL) {
return -1;
}
- out->day = PyInt_AsLong(tmp);
+ out->day = PyLong_AsLong(tmp);
if (error_converting(out->day)) {
Py_DECREF(tmp);
return -1;
@@ -2166,7 +2169,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
if (tmp == NULL) {
return -1;
}
- out->hour = PyInt_AsLong(tmp);
+ out->hour = PyLong_AsLong(tmp);
if (error_converting(out->hour)) {
Py_DECREF(tmp);
return -1;
@@ -2178,7 +2181,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
if (tmp == NULL) {
return -1;
}
- out->min = PyInt_AsLong(tmp);
+ out->min = PyLong_AsLong(tmp);
if (error_converting(out->min)) {
Py_DECREF(tmp);
return -1;
@@ -2190,7 +2193,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
if (tmp == NULL) {
return -1;
}
- out->sec = PyInt_AsLong(tmp);
+ out->sec = PyLong_AsLong(tmp);
if (error_converting(out->sec)) {
Py_DECREF(tmp);
return -1;
@@ -2202,7 +2205,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
if (tmp == NULL) {
return -1;
}
- out->us = PyInt_AsLong(tmp);
+ out->us = PyLong_AsLong(tmp);
if (error_converting(out->us)) {
Py_DECREF(tmp);
return -1;
@@ -2350,32 +2353,33 @@ convert_pyobject_to_datetime(PyArray_DatetimeMetaData *meta, PyObject *obj,
NPY_CASTING casting, npy_datetime *out)
{
if (PyBytes_Check(obj) || PyUnicode_Check(obj)) {
- PyObject *bytes = NULL;
- char *str = NULL;
- Py_ssize_t len = 0;
- npy_datetimestruct dts;
- NPY_DATETIMEUNIT bestunit = NPY_FR_ERROR;
+ PyObject *utf8 = NULL;
- /* Convert to an ASCII string for the date parser */
- if (PyUnicode_Check(obj)) {
- bytes = PyUnicode_AsASCIIString(obj);
- if (bytes == NULL) {
+ /* Convert to a UTF-8 string for the date parser */
+ if (PyBytes_Check(obj)) {
+ utf8 = PyUnicode_FromEncodedObject(obj, NULL, NULL);
+ if (utf8 == NULL) {
return -1;
}
}
else {
- bytes = obj;
- Py_INCREF(bytes);
+ utf8 = obj;
+ Py_INCREF(utf8);
}
- if (PyBytes_AsStringAndSize(bytes, &str, &len) < 0) {
- Py_DECREF(bytes);
+
+ Py_ssize_t len = 0;
+ char const *str = PyUnicode_AsUTF8AndSize(utf8, &len);
+ if (str == NULL) {
+ Py_DECREF(utf8);
return -1;
}
/* Parse the ISO date */
+ npy_datetimestruct dts;
+ NPY_DATETIMEUNIT bestunit = NPY_FR_ERROR;
if (parse_iso_8601_datetime(str, len, meta->base, casting,
&dts, &bestunit, NULL) < 0) {
- Py_DECREF(bytes);
+ Py_DECREF(utf8);
return -1;
}
@@ -2386,15 +2390,15 @@ convert_pyobject_to_datetime(PyArray_DatetimeMetaData *meta, PyObject *obj,
}
if (convert_datetimestruct_to_datetime(meta, &dts, out) < 0) {
- Py_DECREF(bytes);
+ Py_DECREF(utf8);
return -1;
}
- Py_DECREF(bytes);
+ Py_DECREF(utf8);
return 0;
}
/* Do no conversion on raw integers */
- else if (PyInt_Check(obj) || PyLong_Check(obj)) {
+ else if (PyLong_Check(obj)) {
/* Don't allow conversion from an integer without specifying a unit */
if (meta->base == NPY_FR_ERROR || meta->base == NPY_FR_GENERIC) {
PyErr_SetString(PyExc_ValueError, "Converting an integer to a "
@@ -2544,24 +2548,25 @@ convert_pyobject_to_timedelta(PyArray_DatetimeMetaData *meta, PyObject *obj,
NPY_CASTING casting, npy_timedelta *out)
{
if (PyBytes_Check(obj) || PyUnicode_Check(obj)) {
- PyObject *bytes = NULL;
- char *str = NULL;
- Py_ssize_t len = 0;
+ PyObject *utf8 = NULL;
int succeeded = 0;
- /* Convert to an ASCII string for the date parser */
- if (PyUnicode_Check(obj)) {
- bytes = PyUnicode_AsASCIIString(obj);
- if (bytes == NULL) {
+ /* Convert to a UTF-8 string for the date parser */
+ if (PyBytes_Check(obj)) {
+ utf8 = PyUnicode_FromEncodedObject(obj, NULL, NULL);
+ if (utf8 == NULL) {
return -1;
}
}
else {
- bytes = obj;
- Py_INCREF(bytes);
+ utf8 = obj;
+ Py_INCREF(utf8);
}
- if (PyBytes_AsStringAndSize(bytes, &str, &len) < 0) {
- Py_DECREF(bytes);
+
+ Py_ssize_t len = 0;
+ char const *str = PyUnicode_AsUTF8AndSize(utf8, &len);
+ if (str == NULL) {
+ Py_DECREF(utf8);
return -1;
}
@@ -2582,7 +2587,7 @@ convert_pyobject_to_timedelta(PyArray_DatetimeMetaData *meta, PyObject *obj,
succeeded = 1;
}
}
- Py_DECREF(bytes);
+ Py_DECREF(utf8);
if (succeeded) {
/* Use generic units if none was specified */
@@ -2595,7 +2600,7 @@ convert_pyobject_to_timedelta(PyArray_DatetimeMetaData *meta, PyObject *obj,
}
}
/* Do no conversion on raw integers */
- else if (PyInt_Check(obj) || PyLong_Check(obj)) {
+ else if (PyLong_Check(obj)) {
/* Use the default unit if none was specified */
if (meta->base == NPY_FR_ERROR) {
meta->base = NPY_DATETIME_DEFAULTUNIT;
@@ -2699,7 +2704,7 @@ convert_pyobject_to_timedelta(PyArray_DatetimeMetaData *meta, PyObject *obj,
if (tmp == NULL) {
return -1;
}
- seconds = PyInt_AsLong(tmp);
+ seconds = PyLong_AsLong(tmp);
if (error_converting(seconds)) {
Py_DECREF(tmp);
return -1;
@@ -2711,7 +2716,7 @@ convert_pyobject_to_timedelta(PyArray_DatetimeMetaData *meta, PyObject *obj,
if (tmp == NULL) {
return -1;
}
- useconds = PyInt_AsLong(tmp);
+ useconds = PyLong_AsLong(tmp);
if (error_converting(useconds)) {
Py_DECREF(tmp);
return -1;
@@ -3320,8 +3325,7 @@ datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
type_nums[2] = NPY_TIMEDELTA;
}
else {
- if (PyInt_Check(objs[1]) ||
- PyLong_Check(objs[1]) ||
+ if (PyLong_Check(objs[1]) ||
PyArray_IsScalar(objs[1], Integer) ||
is_any_numpy_timedelta(objs[1])) {
type_nums[1] = NPY_TIMEDELTA;
@@ -3724,3 +3728,375 @@ find_object_datetime_type(PyObject *obj, int type_num)
return NULL;
}
}
+
+
+
+
+/*
+ * Describes casting within datetimes or timedelta
+ */
+static NPY_CASTING
+time_to_time_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]),
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ /* This is a within-dtype cast, which currently must handle byteswapping */
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+ if (given_descrs[1] == NULL) {
+ loop_descrs[1] = ensure_dtype_nbo(given_descrs[0]);
+ }
+ else {
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+
+ int is_timedelta = given_descrs[0]->type_num == NPY_TIMEDELTA;
+
+ if (given_descrs[0] == given_descrs[1]) {
+ return NPY_NO_CASTING | _NPY_CAST_IS_VIEW;
+ }
+
+ NPY_CASTING byteorder_may_allow_view = 0;
+ if (PyDataType_ISNOTSWAPPED(loop_descrs[0]) ==
+ PyDataType_ISNOTSWAPPED(loop_descrs[1])) {
+ byteorder_may_allow_view = _NPY_CAST_IS_VIEW;
+ }
+ PyArray_DatetimeMetaData *meta1, *meta2;
+ meta1 = get_datetime_metadata_from_dtype(loop_descrs[0]);
+ assert(meta1 != NULL);
+ meta2 = get_datetime_metadata_from_dtype(loop_descrs[1]);
+ assert(meta2 != NULL);
+
+ if (meta1->base == meta2->base && meta1->num == meta2->num) {
+ if (byteorder_may_allow_view) {
+ return NPY_NO_CASTING | byteorder_may_allow_view;
+ }
+ return NPY_EQUIV_CASTING;
+ }
+ else if (meta1->base == NPY_FR_GENERIC) {
+ return NPY_SAFE_CASTING | byteorder_may_allow_view;
+ }
+ else if (meta2->base == NPY_FR_GENERIC) {
+ /* TODO: This is actually an invalid cast (casting will error) */
+ return NPY_UNSAFE_CASTING;
+ }
+ else if (is_timedelta && (
+ /* jump between time units and date units is unsafe for timedelta */
+ (meta1->base <= NPY_FR_M && meta2->base > NPY_FR_M) ||
+ (meta1->base > NPY_FR_M && meta2->base <= NPY_FR_M))) {
+ return NPY_UNSAFE_CASTING;
+ }
+ else if (meta1->base <= meta2->base) {
+ /* Casting to a more precise unit is currently considered safe */
+ if (datetime_metadata_divides(meta1, meta2, is_timedelta)) {
+ /* If it divides, we consider it to be a safe cast */
+ return NPY_SAFE_CASTING;
+ }
+ else {
+ return NPY_SAME_KIND_CASTING;
+ }
+ }
+ return NPY_SAME_KIND_CASTING;
+}
+
+
+/* Handles datetime<->timedelta type resolution (both directions) */
+static NPY_CASTING
+datetime_to_timedelta_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ loop_descrs[0] = ensure_dtype_nbo(given_descrs[0]);
+ if (loop_descrs[0] == NULL) {
+ return -1;
+ }
+ if (given_descrs[1] == NULL) {
+ PyArray_DatetimeMetaData *meta = get_datetime_metadata_from_dtype(given_descrs[0]);
+ assert(meta != NULL);
+ loop_descrs[1] = create_datetime_dtype(dtypes[1]->type_num, meta);
+ }
+ else {
+ loop_descrs[1] = ensure_dtype_nbo(given_descrs[1]);
+ }
+ if (loop_descrs[1] == NULL) {
+ Py_DECREF(loop_descrs[0]);
+ return -1;
+ }
+ /*
+ * NPY_UNSAFE_CASTING is usually not accurate here; the cast will typically fail.
+ * TODO: Once ufuncs use dtype specific promotion rules,
+ * this is likely unnecessary
+ */
+ return NPY_UNSAFE_CASTING;
+}
+
+
+/* In the current setup both strings and unicode casts support all outputs */
+static NPY_CASTING
+time_to_string_resolve_descriptors(
+ PyArrayMethodObject *self,
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr **given_descrs,
+ PyArray_Descr **loop_descrs)
+{
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+ if (given_descrs[1] != NULL) {
+ /*
+ * At the time of writing, NumPy does not check the length here,
+ * but will error if filling fails.
+ */
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+ else {
+ /* Find the correct string length, possibly based on the unit */
+ int size;
+ if (given_descrs[0]->type_num == NPY_DATETIME) {
+ PyArray_DatetimeMetaData *meta = get_datetime_metadata_from_dtype(given_descrs[0]);
+ assert(meta != NULL);
+ size = get_datetime_iso_8601_strlen(0, meta->base);
+ }
+ else {
+ size = 21;
+ }
+ if (dtypes[1]->type_num == NPY_UNICODE) {
+ size *= 4;
+ }
+ loop_descrs[1] = PyArray_DescrNewFromType(dtypes[1]->type_num);
+ if (loop_descrs[1] == NULL) {
+ Py_DECREF(loop_descrs[0]);
+ return -1;
+ }
+ loop_descrs[1]->elsize = size;
+ }
+ assert(self->casting == NPY_UNSAFE_CASTING);
+ return NPY_UNSAFE_CASTING;
+}
+
+
+static NPY_CASTING
+string_to_datetime_cast_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ /* We currently support byte-swapping, so any (unicode) string is OK */
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+
+ if (given_descrs[1] == NULL) {
+ /* NOTE: This doesn't actually work, and will error during the cast */
+ loop_descrs[1] = dtypes[1]->default_descr(dtypes[1]);
+ if (loop_descrs[1] == NULL) {
+ Py_DECREF(loop_descrs[0]);
+ return -1;
+ }
+ }
+ else {
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+
+ return NPY_UNSAFE_CASTING;
+}
+
+
+/*
+ * This registers the castingimpl for all datetime related casts.
+ */
+NPY_NO_EXPORT int
+PyArray_InitializeDatetimeCasts()
+{
+ int result = -1;
+
+ PyType_Slot slots[3];
+ PyArray_DTypeMeta *dtypes[2];
+ PyArrayMethod_Spec spec = {
+ .name = "datetime_casts",
+ .nin = 1,
+ .nout = 1,
+ .casting = NPY_NO_CASTING,
+ .flags = NPY_METH_SUPPORTS_UNALIGNED,
+ .slots = slots,
+ .dtypes = dtypes,
+ };
+ slots[0].slot = NPY_METH_resolve_descriptors;
+ slots[0].pfunc = &time_to_time_resolve_descriptors;
+ slots[1].slot = NPY_METH_get_loop;
+ slots[1].pfunc = NULL;
+ slots[2].slot = 0;
+ slots[2].pfunc = NULL;
+
+ PyArray_DTypeMeta *datetime = PyArray_DTypeFromTypeNum(NPY_DATETIME);
+ PyArray_DTypeMeta *timedelta = PyArray_DTypeFromTypeNum(NPY_TIMEDELTA);
+ PyArray_DTypeMeta *string = PyArray_DTypeFromTypeNum(NPY_STRING);
+ PyArray_DTypeMeta *unicode = PyArray_DTypeFromTypeNum(NPY_UNICODE);
+ PyArray_DTypeMeta *tmp = NULL;
+
+ dtypes[0] = datetime;
+ dtypes[1] = datetime;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto fail;
+ }
+ dtypes[0] = timedelta;
+ dtypes[1] = timedelta;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto fail;
+ }
+
+ /*
+ * Casting between timedelta and datetime uses legacy casting loops, but
+ * custom dtype resolution (to handle copying of the time unit).
+ */
+ slots[0].slot = NPY_METH_resolve_descriptors;
+ slots[0].pfunc = &datetime_to_timedelta_resolve_descriptors;
+ slots[1].slot = NPY_METH_get_loop;
+ slots[1].pfunc = NULL;
+ slots[2].slot = 0;
+ slots[2].pfunc = NULL;
+
+ spec.name = "timedelta_and_datetime_cast";
+ dtypes[0] = timedelta;
+ dtypes[1] = datetime;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto fail;
+ }
+ spec.name = "datetime_to_timedelta_cast";
+ dtypes[0] = datetime;
+ dtypes[1] = timedelta;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto fail;
+ }
+
+ /*
+ * Cast from numeric types to times. These use the cast functions
+ * as stored on the datatype, which should be replaced at some point.
+ * Some of these casts can fail (casting to unitless datetime), but these
+ * are rather special.
+ */
+ for (int num = 0; num < NPY_NTYPES; num++) {
+ if (!PyTypeNum_ISNUMBER(num) && num != NPY_BOOL) {
+ continue;
+ }
+
+ Py_XSETREF(tmp, PyArray_DTypeFromTypeNum(num));
+
+ if (PyArray_AddLegacyWrapping_CastingImpl(
+ tmp, datetime, NPY_UNSAFE_CASTING) < 0) {
+ goto fail;
+ }
+ if (PyArray_AddLegacyWrapping_CastingImpl(
+ datetime, tmp, NPY_UNSAFE_CASTING) < 0) {
+ goto fail;
+ }
+
+ NPY_CASTING to_timedelta_casting = NPY_UNSAFE_CASTING;
+ if (PyTypeNum_ISINTEGER(num) || num == NPY_BOOL) {
+ /* timedelta casts like int64 right now... */
+ if (PyTypeNum_ISUNSIGNED(num) && tmp->singleton->elsize == 8) {
+ to_timedelta_casting = NPY_SAME_KIND_CASTING;
+ }
+ else {
+ to_timedelta_casting = NPY_SAFE_CASTING;
+ }
+ }
+ if (PyArray_AddLegacyWrapping_CastingImpl(
+ tmp, timedelta, to_timedelta_casting) < 0) {
+ goto fail;
+ }
+ if (PyArray_AddLegacyWrapping_CastingImpl(
+ timedelta, tmp, NPY_UNSAFE_CASTING) < 0) {
+ goto fail;
+ }
+ }
+
+ /*
+ * Cast times to string and unicode
+ */
+ spec.casting = NPY_UNSAFE_CASTING;
+ /*
+ * Casts can error and need the API (unicode needs it for string->unicode).
+ * Unicode handling is currently implemented via a legacy cast.
+ */
+ spec.flags = NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_REQUIRES_PYAPI;
+
+ slots[0].slot = NPY_METH_resolve_descriptors;
+ slots[0].pfunc = &time_to_string_resolve_descriptors;
+ slots[1].slot = NPY_METH_get_loop;
+ slots[1].pfunc = NULL;
+ slots[2].slot = 0;
+ slots[2].pfunc = NULL;
+
+ for (int num = NPY_DATETIME; num <= NPY_TIMEDELTA; num++) {
+ for (int str = NPY_STRING; str <= NPY_UNICODE; str++) {
+ dtypes[0] = PyArray_DTypeFromTypeNum(num);
+ dtypes[1] = PyArray_DTypeFromTypeNum(str);
+
+ int res = PyArray_AddCastingImplementation_FromSpec(&spec, 1);
+ Py_SETREF(dtypes[0], NULL);
+ Py_SETREF(dtypes[1], NULL);
+ if (res < 0) {
+ return -1;
+ }
+ }
+ }
+
+ /*
+ * Cast strings to timedelta are currently only legacy casts
+ */
+ if (PyArray_AddLegacyWrapping_CastingImpl(
+ string, timedelta, NPY_UNSAFE_CASTING) < 0) {
+ goto fail;
+ }
+ if (PyArray_AddLegacyWrapping_CastingImpl(
+ unicode, timedelta, NPY_UNSAFE_CASTING) < 0) {
+ goto fail;
+ }
+
+ /*
+ * Cast strings to datetime
+ */
+ dtypes[1] = datetime;
+ spec.casting = NPY_UNSAFE_CASTING;
+
+ /* A custom type resolution is used to handle string -> datetime. */
+ slots[0].slot = NPY_METH_resolve_descriptors;
+ slots[0].pfunc = &string_to_datetime_cast_resolve_descriptors;
+ slots[1].slot = NPY_METH_get_loop;
+ slots[1].pfunc = NULL;
+ slots[2].slot = 0;
+ slots[2].pfunc = NULL;
+
+ dtypes[0] = string;
+ spec.flags = NPY_METH_SUPPORTS_UNALIGNED;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto fail;
+ }
+
+ dtypes[0] = unicode;
+ /*
+ * Unicode handling is currently implemented via a legacy cast, which
+ * requires the Python API.
+ */
+ spec.flags = NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_REQUIRES_PYAPI;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto fail;
+ }
+
+ result = 0;
+ fail:
+ Py_DECREF(datetime);
+ Py_DECREF(timedelta);
+ Py_DECREF(string);
+ Py_DECREF(unicode);
+ Py_XDECREF(tmp);
+ return result;
+}
+
diff --git a/numpy/core/src/multiarray/datetime_busday.c b/numpy/core/src/multiarray/datetime_busday.c
index d3cce8a37..2cf157551 100644
--- a/numpy/core/src/multiarray/datetime_busday.c
+++ b/numpy/core/src/multiarray/datetime_busday.c
@@ -834,24 +834,23 @@ static int
PyArray_BusDayRollConverter(PyObject *roll_in, NPY_BUSDAY_ROLL *roll)
{
PyObject *obj = roll_in;
- char *str;
- Py_ssize_t len;
- /* Make obj into an ASCII string */
- Py_INCREF(obj);
- if (PyUnicode_Check(obj)) {
- /* accept unicode input */
- PyObject *obj_str;
- obj_str = PyUnicode_AsASCIIString(obj);
+    /* Make obj into a UTF-8 string */
+ if (PyBytes_Check(obj)) {
+ /* accept bytes input */
+ PyObject *obj_str = PyUnicode_FromEncodedObject(obj, NULL, NULL);
if (obj_str == NULL) {
- Py_DECREF(obj);
return 0;
}
- Py_DECREF(obj);
obj = obj_str;
}
+ else {
+ Py_INCREF(obj);
+ }
- if (PyBytes_AsStringAndSize(obj, &str, &len) < 0) {
+ Py_ssize_t len;
+ char const *str = PyUnicode_AsUTF8AndSize(obj, &len);
+ if (str == NULL) {
Py_DECREF(obj);
return 0;
}
diff --git a/numpy/core/src/multiarray/datetime_busdaycal.c b/numpy/core/src/multiarray/datetime_busdaycal.c
index 6936a803f..d48141d4c 100644
--- a/numpy/core/src/multiarray/datetime_busdaycal.c
+++ b/numpy/core/src/multiarray/datetime_busdaycal.c
@@ -30,33 +30,31 @@ PyArray_WeekMaskConverter(PyObject *weekmask_in, npy_bool *weekmask)
{
PyObject *obj = weekmask_in;
- /* Make obj into an ASCII string if it is UNICODE */
- Py_INCREF(obj);
- if (PyUnicode_Check(obj)) {
- /* accept unicode input */
- PyObject *obj_str;
- obj_str = PyUnicode_AsASCIIString(obj);
+    /* Make obj into a UTF-8 string */
+ if (PyBytes_Check(obj)) {
+ /* accept bytes input */
+ PyObject *obj_str = PyUnicode_FromEncodedObject(obj, NULL, NULL);
if (obj_str == NULL) {
- Py_DECREF(obj);
return 0;
}
- Py_DECREF(obj);
obj = obj_str;
}
+ else {
+ Py_INCREF(obj);
+ }
- if (PyBytes_Check(obj)) {
- char *str;
- Py_ssize_t len;
- int i;
- if (PyBytes_AsStringAndSize(obj, &str, &len) < 0) {
+ if (PyUnicode_Check(obj)) {
+ Py_ssize_t len;
+ char const *str = PyUnicode_AsUTF8AndSize(obj, &len);
+ if (str == NULL) {
Py_DECREF(obj);
return 0;
}
/* Length 7 is a string like "1111100" */
if (len == 7) {
- for (i = 0; i < 7; ++i) {
+ for (int i = 0; i < 7; ++i) {
switch(str[i]) {
case '0':
weekmask[i] = 0;
@@ -75,7 +73,7 @@ PyArray_WeekMaskConverter(PyObject *weekmask_in, npy_bool *weekmask)
general_weekmask_string:
/* a string like "SatSun" or "Mon Tue Wed" */
memset(weekmask, 0, 7);
- for (i = 0; i < len; i += 3) {
+ for (Py_ssize_t i = 0; i < len; i += 3) {
while (isspace(str[i]))
++i;
@@ -168,7 +166,7 @@ invalid_weekmask_string:
return 0;
}
- val = PyInt_AsLong(f);
+ val = PyLong_AsLong(f);
if (error_converting(val)) {
Py_DECREF(f);
Py_DECREF(obj);
diff --git a/numpy/core/src/multiarray/datetime_strings.c b/numpy/core/src/multiarray/datetime_strings.c
index f847c7ea8..360868568 100644
--- a/numpy/core/src/multiarray/datetime_strings.c
+++ b/numpy/core/src/multiarray/datetime_strings.c
@@ -1385,21 +1385,23 @@ array_datetime_as_string(PyObject *NPY_UNUSED(self), PyObject *args,
/* Parse the input unit if provided */
if (unit_in != NULL && unit_in != Py_None) {
PyObject *strobj;
- char *str = NULL;
- Py_ssize_t len = 0;
- if (PyUnicode_Check(unit_in)) {
- strobj = PyUnicode_AsASCIIString(unit_in);
- if (strobj == NULL) {
- goto fail;
+ if (PyBytes_Check(unit_in)) {
+ /* accept bytes input */
+ PyObject *obj_str = PyUnicode_FromEncodedObject(unit_in, NULL, NULL);
+ if (obj_str == NULL) {
+            goto fail;
}
+ strobj = obj_str;
}
else {
+ Py_INCREF(unit_in);
strobj = unit_in;
- Py_INCREF(strobj);
}
- if (PyBytes_AsStringAndSize(strobj, &str, &len) < 0) {
+ Py_ssize_t len;
+ char const *str = PyUnicode_AsUTF8AndSize(strobj, &len);
+ if (str == NULL) {
Py_DECREF(strobj);
goto fail;
}
@@ -1434,24 +1436,27 @@ array_datetime_as_string(PyObject *NPY_UNUSED(self), PyObject *args,
/* Get the input time zone */
if (timezone_obj != NULL) {
- /* Convert to ASCII if it's unicode */
- if (PyUnicode_Check(timezone_obj)) {
- /* accept unicode input */
- PyObject *obj_str;
- obj_str = PyUnicode_AsASCIIString(timezone_obj);
+ PyObject *strobj;
+ if (PyBytes_Check(timezone_obj)) {
+ /* accept bytes input */
+ PyObject *obj_str = PyUnicode_FromEncodedObject(timezone_obj, NULL, NULL);
if (obj_str == NULL) {
goto fail;
}
- Py_DECREF(timezone_obj);
- timezone_obj = obj_str;
+ strobj = obj_str;
}
+ else {
+ Py_INCREF(timezone_obj);
+ strobj = timezone_obj;
+ }
+
+ Py_SETREF(timezone_obj, strobj);
/* Check for the supported string inputs */
- if (PyBytes_Check(timezone_obj)) {
- char *str;
+ if (PyUnicode_Check(timezone_obj)) {
Py_ssize_t len;
-
- if (PyBytes_AsStringAndSize(timezone_obj, &str, &len) < 0) {
+ char const *str = PyUnicode_AsUTF8AndSize(timezone_obj, &len);
+ if (str == NULL) {
goto fail;
}
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index 67d57975b..a8d575248 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -386,7 +386,7 @@ _convert_from_tuple(PyObject *obj, int align)
}
for (int i=0; i < shape.len; i++) {
PyTuple_SET_ITEM(newdescr->subarray->shape, i,
- PyInt_FromLong((long)shape.ptr[i]));
+ PyLong_FromLong((long)shape.ptr[i]));
if (PyTuple_GET_ITEM(newdescr->subarray->shape, i) == NULL) {
Py_DECREF(newdescr);
@@ -441,7 +441,7 @@ _convert_from_array_descr(PyObject *obj, int align)
}
PyObject *name = PyTuple_GET_ITEM(item, 0);
PyObject *title;
- if (PyBaseString_Check(name)) {
+ if (PyUnicode_Check(name)) {
title = NULL;
}
else if (PyTuple_Check(name)) {
@@ -454,7 +454,7 @@ _convert_from_array_descr(PyObject *obj, int align)
}
title = PyTuple_GET_ITEM(name, 0);
name = PyTuple_GET_ITEM(name, 1);
- if (!PyBaseString_Check(name)) {
+ if (!PyUnicode_Check(name)) {
PyErr_SetString(PyExc_TypeError, "Field name must be a str");
goto fail;
}
@@ -472,7 +472,7 @@ _convert_from_array_descr(PyObject *obj, int align)
if (PyUnicode_GetLength(name) == 0) {
Py_DECREF(name);
if (title == NULL) {
- name = PyUString_FromFormat("f%d", i);
+ name = PyUnicode_FromFormat("f%d", i);
if (name == NULL) {
goto fail;
}
@@ -512,7 +512,7 @@ _convert_from_array_descr(PyObject *obj, int align)
}
if ((PyDict_GetItemWithError(fields, name) != NULL)
|| (title
- && PyBaseString_Check(title)
+ && PyUnicode_Check(title)
&& (PyDict_GetItemWithError(fields, title) != NULL))) {
PyErr_Format(PyExc_ValueError,
"field %R occurs more than once", name);
@@ -537,7 +537,7 @@ _convert_from_array_descr(PyObject *obj, int align)
goto fail;
}
PyTuple_SET_ITEM(tup, 0, (PyObject *)conv);
- PyTuple_SET_ITEM(tup, 1, PyInt_FromLong((long) totalsize));
+ PyTuple_SET_ITEM(tup, 1, PyLong_FromLong((long) totalsize));
/*
* Title can be "meta-data". Only insert it
@@ -550,7 +550,7 @@ _convert_from_array_descr(PyObject *obj, int align)
if (PyDict_SetItem(fields, name, tup) < 0) {
goto fail;
}
- if (PyBaseString_Check(title)) {
+ if (PyUnicode_Check(title)) {
PyObject *existing = PyDict_GetItemWithError(fields, title);
if (existing == NULL && PyErr_Occurred()) {
goto fail;
@@ -660,7 +660,7 @@ _convert_from_list(PyObject *obj, int align)
}
maxalign = PyArray_MAX(maxalign, _align);
}
- PyObject *size_obj = PyInt_FromLong((long) totalsize);
+ PyObject *size_obj = PyLong_FromLong((long) totalsize);
if (!size_obj) {
Py_DECREF(conv);
goto fail;
@@ -673,7 +673,7 @@ _convert_from_list(PyObject *obj, int align)
}
PyTuple_SET_ITEM(tup, 0, (PyObject *)conv);
PyTuple_SET_ITEM(tup, 1, size_obj);
- PyObject *key = PyUString_FromFormat("f%d", i);
+ PyObject *key = PyUnicode_FromFormat("f%d", i);
if (!key) {
Py_DECREF(tup);
goto fail;
@@ -1112,7 +1112,7 @@ _convert_from_dict(PyObject *obj, int align)
/* Build item to insert (descr, offset, [title])*/
int len = 2;
PyObject *title = NULL;
- PyObject *ind = PyInt_FromLong(i);
+ PyObject *ind = PyLong_FromLong(i);
if (titles) {
title=PyObject_GetItem(titles, ind);
if (title && title != Py_None) {
@@ -1166,7 +1166,7 @@ _convert_from_dict(PyObject *obj, int align)
goto fail;
}
- PyTuple_SET_ITEM(tup, 1, PyInt_FromLong(offset));
+ PyTuple_SET_ITEM(tup, 1, PyLong_FromLong(offset));
/* Flag whether the fields are specified out of order */
if (offset < totalsize) {
has_out_of_order_fields = 1;
@@ -1190,7 +1190,7 @@ _convert_from_dict(PyObject *obj, int align)
if (align && _align > 1) {
totalsize = NPY_NEXT_ALIGNED_OFFSET(totalsize, _align);
}
- PyTuple_SET_ITEM(tup, 1, PyInt_FromLong(totalsize));
+ PyTuple_SET_ITEM(tup, 1, PyLong_FromLong(totalsize));
totalsize += newdescr->elsize;
}
if (len == 3) {
@@ -1202,7 +1202,7 @@ _convert_from_dict(PyObject *obj, int align)
Py_DECREF(tup);
goto fail;
}
- if (!PyBaseString_Check(name)) {
+ if (!PyUnicode_Check(name)) {
PyErr_SetString(PyExc_ValueError,
"field names must be strings");
Py_DECREF(tup);
@@ -1228,7 +1228,7 @@ _convert_from_dict(PyObject *obj, int align)
goto fail;
}
if (len == 3) {
- if (PyBaseString_Check(title)) {
+ if (PyUnicode_Check(title)) {
if (PyDict_GetItemWithError(fields, title) != NULL) {
PyErr_SetString(PyExc_ValueError,
"title already used as a name or title.");
@@ -1497,15 +1497,36 @@ _convert_from_any(PyObject *obj, int align)
}
else if (PyTuple_Check(obj)) {
/* or a tuple */
- return _convert_from_tuple(obj, align);
+ if (Py_EnterRecursiveCall(
+ " while trying to convert the given data type from"
+ " a tuple object" ) != 0) {
+ return NULL;
+ }
+ PyArray_Descr *ret = _convert_from_tuple(obj, align);
+ Py_LeaveRecursiveCall();
+ return ret;
}
else if (PyList_Check(obj)) {
/* or a list */
- return _convert_from_array_descr(obj, align);
+ if (Py_EnterRecursiveCall(
+ " while trying to convert the given data type from"
+ " a list object" ) != 0) {
+ return NULL;
+ }
+ PyArray_Descr *ret = _convert_from_array_descr(obj, align);
+ Py_LeaveRecursiveCall();
+ return ret;
}
else if (PyDict_Check(obj) || PyDictProxy_Check(obj)) {
/* or a dictionary */
- return _convert_from_dict(obj, align);
+ if (Py_EnterRecursiveCall(
+ " while trying to convert the given data type from"
+ " a dict object" ) != 0) {
+ return NULL;
+ }
+ PyArray_Descr *ret = _convert_from_dict(obj, align);
+ Py_LeaveRecursiveCall();
+ return ret;
}
else if (PyArray_Check(obj)) {
PyErr_SetString(PyExc_TypeError, "Cannot construct a dtype from an array");
@@ -1887,23 +1908,31 @@ arraydescr_protocol_typestr_get(PyArray_Descr *self)
size >>= 2;
}
if (self->type_num == NPY_OBJECT) {
- ret = PyUString_FromFormat("%c%c", endian, basic_);
+ ret = PyUnicode_FromFormat("%c%c", endian, basic_);
}
else {
- ret = PyUString_FromFormat("%c%c%d", endian, basic_, size);
+ ret = PyUnicode_FromFormat("%c%c%d", endian, basic_, size);
}
+ if (ret == NULL) {
+ return NULL;
+ }
+
if (PyDataType_ISDATETIME(self)) {
PyArray_DatetimeMetaData *meta;
-
meta = get_datetime_metadata_from_dtype(self);
if (meta == NULL) {
Py_DECREF(ret);
return NULL;
}
+ PyObject *umeta = metastr_to_unicode(meta, 0);
+ if (umeta == NULL) {
+ Py_DECREF(ret);
+ return NULL;
+ }
- ret = append_metastr_to_string(meta, 0, ret);
+ Py_SETREF(ret, PyUnicode_Concat(ret, umeta));
+ Py_DECREF(umeta);
}
-
return ret;
}
@@ -1950,7 +1979,7 @@ arraydescr_ndim_get(PyArray_Descr *self)
Py_ssize_t ndim;
if (!PyDataType_HASSUBARRAY(self)) {
- return PyInt_FromLong(0);
+ return PyLong_FromLong(0);
}
/*
@@ -1958,7 +1987,7 @@ arraydescr_ndim_get(PyArray_Descr *self)
* for tuple argument
*/
ndim = PyTuple_Size(self->subarray->shape);
- return PyInt_FromLong(ndim);
+ return PyLong_FromLong(ndim);
}
@@ -1974,7 +2003,7 @@ arraydescr_protocol_descr_get(PyArray_Descr *self)
if (dobj == NULL) {
return NULL;
}
- PyTuple_SET_ITEM(dobj, 0, PyUString_FromString(""));
+ PyTuple_SET_ITEM(dobj, 0, PyUnicode_FromString(""));
PyTuple_SET_ITEM(dobj, 1, arraydescr_protocol_typestr_get(self));
res = PyList_New(1);
if (res == NULL) {
@@ -2010,7 +2039,7 @@ arraydescr_isbuiltin_get(PyArray_Descr *self)
if (PyTypeNum_ISUSERDEF(self->type_num)) {
val = 2;
}
- return PyInt_FromLong(val);
+ return PyLong_FromLong(val);
}
static int
@@ -2153,7 +2182,7 @@ arraydescr_names_set(PyArray_Descr *self, PyObject *val)
PyObject *item;
int valid = 1;
item = PySequence_GetItem(val, i);
- valid = PyUString_Check(item);
+ valid = PyUnicode_Check(item);
Py_DECREF(item);
if (!valid) {
PyErr_Format(PyExc_ValueError,
@@ -2391,11 +2420,11 @@ _get_pickleabletype_from_datetime_metadata(PyArray_Descr *dtype)
PyTuple_SET_ITEM(dt_tuple, 0,
PyBytes_FromString(_datetime_strings[meta->base]));
PyTuple_SET_ITEM(dt_tuple, 1,
- PyInt_FromLong(meta->num));
+ PyLong_FromLong(meta->num));
PyTuple_SET_ITEM(dt_tuple, 2,
- PyInt_FromLong(1));
+ PyLong_FromLong(1));
PyTuple_SET_ITEM(dt_tuple, 3,
- PyInt_FromLong(1));
+ PyLong_FromLong(1));
PyTuple_SET_ITEM(ret, 1, dt_tuple);
@@ -2450,7 +2479,7 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args))
if (self->type_num == NPY_UNICODE) {
elsize >>= 2;
}
- obj = PyUString_FromFormat("%c%d",self->kind, elsize);
+ obj = PyUnicode_FromFormat("%c%d",self->kind, elsize);
}
PyTuple_SET_ITEM(ret, 1, Py_BuildValue("(NOO)", obj, Py_False, Py_True));
@@ -2468,7 +2497,7 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args))
if (PyDataType_ISDATETIME(self)) {
PyObject *newobj;
state = PyTuple_New(9);
- PyTuple_SET_ITEM(state, 0, PyInt_FromLong(version));
+ PyTuple_SET_ITEM(state, 0, PyLong_FromLong(version));
/*
* newobj is a tuple of the Python metadata dictionary
* and tuple of date_time info (str, num)
@@ -2483,16 +2512,16 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args))
}
else if (self->metadata) {
state = PyTuple_New(9);
- PyTuple_SET_ITEM(state, 0, PyInt_FromLong(version));
+ PyTuple_SET_ITEM(state, 0, PyLong_FromLong(version));
Py_INCREF(self->metadata);
PyTuple_SET_ITEM(state, 8, self->metadata);
}
else { /* Use version 3 pickle format */
state = PyTuple_New(8);
- PyTuple_SET_ITEM(state, 0, PyInt_FromLong(3));
+ PyTuple_SET_ITEM(state, 0, PyLong_FromLong(3));
}
- PyTuple_SET_ITEM(state, 1, PyUString_FromFormat("%c", endian));
+ PyTuple_SET_ITEM(state, 1, PyUnicode_FromFormat("%c", endian));
PyTuple_SET_ITEM(state, 2, arraydescr_subdescr_get(self));
if (PyDataType_HASFIELDS(self)) {
Py_INCREF(self->names);
@@ -2516,9 +2545,9 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args))
elsize = -1;
alignment = -1;
}
- PyTuple_SET_ITEM(state, 5, PyInt_FromLong(elsize));
- PyTuple_SET_ITEM(state, 6, PyInt_FromLong(alignment));
- PyTuple_SET_ITEM(state, 7, PyInt_FromLong(self->flags));
+ PyTuple_SET_ITEM(state, 5, PyLong_FromLong(elsize));
+ PyTuple_SET_ITEM(state, 6, PyLong_FromLong(alignment));
+ PyTuple_SET_ITEM(state, 7, PyLong_FromLong(self->flags));
PyTuple_SET_ITEM(ret, 2, state);
return ret;
@@ -2628,7 +2657,7 @@ arraydescr_setstate(PyArray_Descr *self, PyObject *args)
default:
/* raise an error */
if (PyTuple_GET_SIZE(PyTuple_GET_ITEM(args,0)) > 5) {
- version = PyInt_AsLong(PyTuple_GET_ITEM(args, 0));
+ version = PyLong_AsLong(PyTuple_GET_ITEM(args, 0));
}
else {
version = -1;
@@ -2651,7 +2680,7 @@ arraydescr_setstate(PyArray_Descr *self, PyObject *args)
if (version == 1 || version == 0) {
if (fields != Py_None) {
PyObject *key, *list;
- key = PyInt_FromLong(-1);
+ key = PyLong_FromLong(-1);
list = PyDict_GetItemWithError(fields, key);
if (!list) {
if (!PyErr_Occurred()) {
@@ -2788,7 +2817,7 @@ arraydescr_setstate(PyArray_Descr *self, PyObject *args)
for (i = 0; i < PyTuple_GET_SIZE(names); ++i) {
name = PyTuple_GET_ITEM(names, i);
- if (!PyUString_Check(name)) {
+ if (!PyUnicode_Check(name)) {
names_ok = 0;
break;
}
@@ -2890,14 +2919,13 @@ arraydescr_setstate(PyArray_Descr *self, PyObject *args)
}
if (PyDataType_ISDATETIME(self) && (metadata != NULL)) {
- PyObject *old_metadata, *errmsg;
+ PyObject *old_metadata;
PyArray_DatetimeMetaData temp_dt_data;
if ((! PyTuple_Check(metadata)) || (PyTuple_Size(metadata) != 2)) {
- errmsg = PyUString_FromString("Invalid datetime dtype (metadata, c_metadata): ");
- PyUString_ConcatAndDel(&errmsg, PyObject_Repr(metadata));
- PyErr_SetObject(PyExc_ValueError, errmsg);
- Py_DECREF(errmsg);
+ PyErr_Format(PyExc_ValueError,
+ "Invalid datetime dtype (metadata, c_metadata): %R",
+ metadata);
return NULL;
}
@@ -3020,7 +3048,7 @@ PyArray_DescrNewByteorder(PyArray_Descr *self, char newendian)
if (NPY_TITLE_KEY(key, value)) {
continue;
}
- if (!PyUString_Check(key) || !PyTuple_Check(value) ||
+ if (!PyUnicode_Check(key) || !PyTuple_Check(value) ||
((len=PyTuple_GET_SIZE(value)) < 2)) {
continue;
}
@@ -3321,7 +3349,7 @@ _is_list_of_strings(PyObject *obj)
seqlen = PyList_GET_SIZE(obj);
for (i = 0; i < seqlen; i++) {
PyObject *item = PyList_GET_ITEM(obj, i);
- if (!PyBaseString_Check(item)) {
+ if (!PyUnicode_Check(item)) {
return NPY_FALSE;
}
}
@@ -3393,7 +3421,7 @@ arraydescr_field_subset_view(PyArray_Descr *self, PyObject *ind)
/* disallow duplicate field indices */
if (PyDict_Contains(fields, name)) {
PyObject *msg = NULL;
- PyObject *fmt = PyUString_FromString(
+ PyObject *fmt = PyUnicode_FromString(
"duplicate field of name {!r}");
if (fmt != NULL) {
msg = PyObject_CallMethod(fmt, "format", "O", name);
@@ -3431,7 +3459,7 @@ descr_subscript(PyArray_Descr *self, PyObject *op)
return NULL;
}
- if (PyBaseString_Check(op)) {
+ if (PyUnicode_Check(op)) {
return _subscript_by_name(self, op);
}
else if (_is_list_of_strings(op)) {
diff --git a/numpy/core/src/multiarray/dragon4.c b/numpy/core/src/multiarray/dragon4.c
index 553d0effb..a7b252a77 100644
--- a/numpy/core/src/multiarray/dragon4.c
+++ b/numpy/core/src/multiarray/dragon4.c
@@ -3093,7 +3093,7 @@ Dragon4_Positional_##Type##_opt(npy_type *val, Dragon4_Options *opt)\
free_dragon4_bigint_scratch(scratch);\
return NULL;\
}\
- ret = PyUString_FromString(scratch->repr);\
+ ret = PyUnicode_FromString(scratch->repr);\
free_dragon4_bigint_scratch(scratch);\
return ret;\
}\
@@ -3130,7 +3130,7 @@ Dragon4_Scientific_##Type##_opt(npy_type *val, Dragon4_Options *opt)\
free_dragon4_bigint_scratch(scratch);\
return NULL;\
}\
- ret = PyUString_FromString(scratch->repr);\
+ ret = PyUnicode_FromString(scratch->repr);\
free_dragon4_bigint_scratch(scratch);\
return ret;\
}\
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index 3a58b5849..630bd76f3 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -17,7 +17,6 @@
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
#include <numpy/arrayobject.h>
-#include <numpy/npy_cpu.h>
#include "npy_pycompat.h"
@@ -106,7 +105,7 @@ get_bool_setdstone_transfer_function(npy_intp dst_stride,
/*************************** COPY REFERENCES *******************************/
/* Moves references from src to dst */
-static void
+static int
_strided_to_strided_move_references(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -114,27 +113,28 @@ _strided_to_strided_move_references(char *dst, npy_intp dst_stride,
{
PyObject *src_ref = NULL, *dst_ref = NULL;
while (N > 0) {
- NPY_COPY_PYOBJECT_PTR(&src_ref, src);
- NPY_COPY_PYOBJECT_PTR(&dst_ref, dst);
+ memcpy(&src_ref, src, sizeof(src_ref));
+ memcpy(&dst_ref, dst, sizeof(dst_ref));
/* Release the reference in dst */
NPY_DT_DBG_REFTRACE("dec dst ref", dst_ref);
Py_XDECREF(dst_ref);
/* Move the reference */
NPY_DT_DBG_REFTRACE("move src ref", src_ref);
- NPY_COPY_PYOBJECT_PTR(dst, &src_ref);
+ memcpy(dst, &src_ref, sizeof(src_ref));
/* Set the source reference to NULL */
src_ref = NULL;
- NPY_COPY_PYOBJECT_PTR(src, &src_ref);
+ memcpy(src, &src_ref, sizeof(src_ref));
src += src_stride;
dst += dst_stride;
--N;
}
+ return 0;
}
/* Copies references from src to dst */
-static void
+static int
_strided_to_strided_copy_references(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -142,12 +142,12 @@ _strided_to_strided_copy_references(char *dst, npy_intp dst_stride,
{
PyObject *src_ref = NULL, *dst_ref = NULL;
while (N > 0) {
- NPY_COPY_PYOBJECT_PTR(&src_ref, src);
- NPY_COPY_PYOBJECT_PTR(&dst_ref, dst);
+ memcpy(&src_ref, src, sizeof(src_ref));
+ memcpy(&dst_ref, dst, sizeof(dst_ref));
/* Copy the reference */
NPY_DT_DBG_REFTRACE("copy src ref", src_ref);
- NPY_COPY_PYOBJECT_PTR(dst, &src_ref);
+ memcpy(dst, &src_ref, sizeof(src_ref));
/* Claim the reference */
Py_XINCREF(src_ref);
/* Release the reference in dst */
@@ -158,6 +158,7 @@ _strided_to_strided_copy_references(char *dst, npy_intp dst_stride,
dst += dst_stride;
--N;
}
+ return 0;
}
@@ -188,7 +189,7 @@ static NpyAuxData *_strided_zero_pad_data_clone(NpyAuxData *data)
* Does a strided to strided zero-padded copy for the case where
* dst_itemsize > src_itemsize
*/
-static void
+static int
_strided_to_strided_zero_pad_copy(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -205,13 +206,14 @@ _strided_to_strided_zero_pad_copy(char *dst, npy_intp dst_stride,
dst += dst_stride;
--N;
}
+ return 0;
}
/*
* Does a strided to strided zero-padded copy for the case where
* dst_itemsize < src_itemsize
*/
-static void
+static int
_strided_to_strided_truncate_copy(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -226,13 +228,14 @@ _strided_to_strided_truncate_copy(char *dst, npy_intp dst_stride,
dst += dst_stride;
--N;
}
+ return 0;
}
/*
* Does a strided to strided zero-padded or truncated copy for the case where
* unicode swapping is needed.
*/
-static void
+static int
_strided_to_strided_unicode_copyswap(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -260,6 +263,7 @@ _strided_to_strided_unicode_copyswap(char *dst, npy_intp dst_stride,
dst += dst_stride;
--N;
}
+ return 0;
}
@@ -379,7 +383,7 @@ static NpyAuxData *_align_wrap_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_strided_to_strided_contig_align_wrap(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -395,47 +399,50 @@ _strided_to_strided_contig_align_wrap(char *dst, npy_intp dst_stride,
*todata = d->todata,
*fromdata = d->fromdata;
char *bufferin = d->bufferin, *bufferout = d->bufferout;
- npy_bool init_dest = d->init_dest, out_needs_api = d->out_needs_api;
+ npy_bool init_dest = d->init_dest;
for(;;) {
- /*
- * The caller does not know if a previous call resulted in a Python
- * exception. Much of the Python API is unsafe while an exception is in
- * flight, so just skip all the work. Someone higher in the call stack
- * will check for errors and propagate them.
- */
- if (out_needs_api && PyErr_Occurred()) {
- return;
- }
if (N > NPY_LOWLEVEL_BUFFER_BLOCKSIZE) {
- tobuffer(bufferin, inner_src_itemsize, src, src_stride,
- NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
- src_itemsize, todata);
+ if (tobuffer(
+ bufferin, inner_src_itemsize, src, src_stride,
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE, src_itemsize, todata) < 0) {
+ return -1;
+ }
if (init_dest) {
memset(bufferout, 0,
- dst_itemsize*NPY_LOWLEVEL_BUFFER_BLOCKSIZE);
+ dst_itemsize*NPY_LOWLEVEL_BUFFER_BLOCKSIZE);
+ }
+ if (wrapped(bufferout, dst_itemsize, bufferin, inner_src_itemsize,
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
+ inner_src_itemsize, wrappeddata) < 0) {
+ return -1;
+ }
+ if (frombuffer(dst, dst_stride, bufferout, dst_itemsize,
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
+ dst_itemsize, fromdata) < 0) {
+ return -1;
}
- wrapped(bufferout, dst_itemsize, bufferin, inner_src_itemsize,
- NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
- inner_src_itemsize, wrappeddata);
- frombuffer(dst, dst_stride, bufferout, dst_itemsize,
- NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
- dst_itemsize, fromdata);
N -= NPY_LOWLEVEL_BUFFER_BLOCKSIZE;
src += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*src_stride;
dst += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*dst_stride;
}
else {
- tobuffer(bufferin, inner_src_itemsize, src, src_stride, N,
- src_itemsize, todata);
+ if (tobuffer(bufferin, inner_src_itemsize, src, src_stride,
+ N, src_itemsize, todata) < 0) {
+ return -1;
+ }
if (init_dest) {
memset(bufferout, 0, dst_itemsize*N);
}
- wrapped(bufferout, dst_itemsize, bufferin, inner_src_itemsize, N,
- inner_src_itemsize, wrappeddata);
- frombuffer(dst, dst_stride, bufferout, dst_itemsize, N,
- dst_itemsize, fromdata);
- return;
+ if (wrapped(bufferout, dst_itemsize, bufferin, inner_src_itemsize,
+ N, inner_src_itemsize, wrappeddata) < 0) {
+ return -1;
+ }
+ if (frombuffer(dst, dst_stride, bufferout, dst_itemsize,
+ N, dst_itemsize, fromdata) < 0) {
+ return -1;
+ }
+ return 0;
}
}
}
@@ -538,7 +545,7 @@ static NpyAuxData *_wrap_copy_swap_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_strided_to_strided_wrap_copy_swap(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -546,7 +553,9 @@ _strided_to_strided_wrap_copy_swap(char *dst, npy_intp dst_stride,
{
_wrap_copy_swap_data *d = (_wrap_copy_swap_data *)data;
+ /* We assume that d->copyswapn should not be able to error. */
d->copyswapn(dst, dst_stride, src, src_stride, N, d->swap, d->arr);
+ return 0;
}
/* This only gets used for custom data types and for Unicode when swapping */
@@ -603,6 +612,7 @@ typedef struct {
NpyAuxData base;
PyArray_VectorUnaryFunc *castfunc;
PyArrayObject *aip, *aop;
+ npy_bool needs_api;
} _strided_cast_data;
/* strided cast data free function */
@@ -630,7 +640,7 @@ static NpyAuxData *_strided_cast_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_aligned_strided_to_strided_cast(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -639,17 +649,29 @@ _aligned_strided_to_strided_cast(char *dst, npy_intp dst_stride,
_strided_cast_data *d = (_strided_cast_data *)data;
PyArray_VectorUnaryFunc *castfunc = d->castfunc;
PyArrayObject *aip = d->aip, *aop = d->aop;
+ npy_bool needs_api = d->needs_api;
while (N > 0) {
castfunc(src, dst, 1, aip, aop);
+ /*
+ * Since error handling in ufuncs is not ideal (at the time of
+ * writing this, an error could be in process before calling this
+ * function. For most of NumPy history these checks were completely
+ * missing, so this is hopefully OK for the time being (until ufuncs
+ * are fixed).
+ */
+ if (needs_api && PyErr_Occurred()) {
+ return -1;
+ }
dst += dst_stride;
src += src_stride;
--N;
}
+ return 0;
}
/* This one requires src be of type NPY_OBJECT */
-static void
+static int
_aligned_strided_to_strided_cast_decref_src(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -658,31 +680,49 @@ _aligned_strided_to_strided_cast_decref_src(char *dst, npy_intp dst_stride,
_strided_cast_data *d = (_strided_cast_data *)data;
PyArray_VectorUnaryFunc *castfunc = d->castfunc;
PyArrayObject *aip = d->aip, *aop = d->aop;
+ npy_bool needs_api = d->needs_api;
PyObject *src_ref;
while (N > 0) {
castfunc(src, dst, 1, aip, aop);
-
- /* After casting, decrement the source ref */
- NPY_COPY_PYOBJECT_PTR(&src_ref, src);
- NPY_DT_DBG_REFTRACE("dec src ref (cast object -> not object)", src_ref);
+ /*
+ * See comment in `_aligned_strided_to_strided_cast`, an error could
+ * in principle be set before `castfunc` is called.
+ */
+ if (needs_api && PyErr_Occurred()) {
+ return -1;
+ }
+ /* After casting, decrement the source ref and set it to NULL */
+ memcpy(&src_ref, src, sizeof(src_ref));
Py_XDECREF(src_ref);
+ memset(src, 0, sizeof(PyObject *));
+ NPY_DT_DBG_REFTRACE("dec src ref (cast object -> not object)", src_ref);
dst += dst_stride;
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_aligned_contig_to_contig_cast(char *dst, npy_intp NPY_UNUSED(dst_stride),
char *src, npy_intp NPY_UNUSED(src_stride),
npy_intp N, npy_intp NPY_UNUSED(itemsize),
NpyAuxData *data)
{
_strided_cast_data *d = (_strided_cast_data *)data;
+ npy_bool needs_api = d->needs_api;
d->castfunc(src, dst, N, d->aip, d->aop);
+ /*
+ * See comment in `_aligned_strided_to_strided_cast`, an error could
+ * in principle be set before `castfunc` is called.
+ */
+ if (needs_api && PyErr_Occurred()) {
+ return -1;
+ }
+ return 0;
}
static int
@@ -777,7 +817,7 @@ static NpyAuxData *_strided_datetime_cast_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_strided_to_strided_datetime_general_cast(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -792,12 +832,12 @@ _strided_to_strided_datetime_general_cast(char *dst, npy_intp dst_stride,
if (convert_datetime_to_datetimestruct(&d->src_meta,
dt, &dts) < 0) {
- dt = NPY_DATETIME_NAT;
+ return -1;
}
else {
if (convert_datetimestruct_to_datetime(&d->dst_meta,
&dts, &dt) < 0) {
- dt = NPY_DATETIME_NAT;
+ return -1;
}
}
@@ -807,9 +847,10 @@ _strided_to_strided_datetime_general_cast(char *dst, npy_intp dst_stride,
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_strided_to_strided_datetime_cast(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -838,9 +879,10 @@ _strided_to_strided_datetime_cast(char *dst, npy_intp dst_stride,
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_aligned_strided_to_strided_datetime_cast(char *dst,
npy_intp dst_stride,
char *src, npy_intp src_stride,
@@ -870,9 +912,10 @@ _aligned_strided_to_strided_datetime_cast(char *dst,
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_strided_to_strided_datetime_to_string(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -888,28 +931,26 @@ _strided_to_strided_datetime_to_string(char *dst, npy_intp dst_stride,
if (convert_datetime_to_datetimestruct(&d->src_meta,
dt, &dts) < 0) {
- /* For an error, produce a 'NaT' string */
- dts.year = NPY_DATETIME_NAT;
+ return -1;
}
/* Initialize the destination to all zeros */
memset(dst, 0, dst_itemsize);
- /*
- * This may also raise an error, but the caller needs
- * to use PyErr_Occurred().
- */
- make_iso_8601_datetime(&dts, dst, dst_itemsize,
+ if (make_iso_8601_datetime(&dts, dst, dst_itemsize,
0, 0, d->src_meta.base, -1,
- NPY_UNSAFE_CASTING);
+ NPY_UNSAFE_CASTING) < 0) {
+ return -1;
+ }
dst += dst_stride;
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_strided_to_strided_string_to_datetime(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -934,7 +975,7 @@ _strided_to_strided_string_to_datetime(char *dst, npy_intp dst_stride,
if (parse_iso_8601_datetime(tmp_buffer, src_itemsize,
d->dst_meta.base, NPY_SAME_KIND_CASTING,
&dts, NULL, NULL) < 0) {
- dt = NPY_DATETIME_NAT;
+ return -1;
}
}
/* Otherwise parse the data in place */
@@ -942,7 +983,7 @@ _strided_to_strided_string_to_datetime(char *dst, npy_intp dst_stride,
if (parse_iso_8601_datetime(src, tmp - src,
d->dst_meta.base, NPY_SAME_KIND_CASTING,
&dts, NULL, NULL) < 0) {
- dt = NPY_DATETIME_NAT;
+ return -1;
}
}
@@ -950,7 +991,7 @@ _strided_to_strided_string_to_datetime(char *dst, npy_intp dst_stride,
if (dt != NPY_DATETIME_NAT &&
convert_datetimestruct_to_datetime(&d->dst_meta,
&dts, &dt) < 0) {
- dt = NPY_DATETIME_NAT;
+ return -1;
}
memcpy(dst, &dt, sizeof(dt));
@@ -959,14 +1000,14 @@ _strided_to_strided_string_to_datetime(char *dst, npy_intp dst_stride,
src += src_stride;
--N;
}
+ return 0;
}
/*
* Assumes src_dtype and dst_dtype are both datetimes or both timedeltas
*/
-static int
+NPY_NO_EXPORT int
get_nbo_cast_datetime_transfer_function(int aligned,
- npy_intp src_stride, npy_intp dst_stride,
PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
PyArray_StridedUnaryOp **out_stransfer,
NpyAuxData **out_transferdata)
@@ -1040,12 +1081,10 @@ get_nbo_cast_datetime_transfer_function(int aligned,
return NPY_SUCCEED;
}
-static int
-get_nbo_datetime_to_string_transfer_function(int aligned,
- npy_intp src_stride, npy_intp dst_stride,
- PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
- PyArray_StridedUnaryOp **out_stransfer,
- NpyAuxData **out_transferdata)
+NPY_NO_EXPORT int
+get_nbo_datetime_to_string_transfer_function(
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ PyArray_StridedUnaryOp **out_stransfer, NpyAuxData **out_transferdata)
{
PyArray_DatetimeMetaData *src_meta;
_strided_datetime_cast_data *data;
@@ -1085,7 +1124,7 @@ get_nbo_datetime_to_string_transfer_function(int aligned,
return NPY_SUCCEED;
}
-static int
+NPY_NO_EXPORT int
get_datetime_to_unicode_transfer_function(int aligned,
npy_intp src_stride, npy_intp dst_stride,
PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
@@ -1098,8 +1137,8 @@ get_datetime_to_unicode_transfer_function(int aligned,
PyArray_Descr *str_dtype;
/* Get an ASCII string data type, adapted to match the UNICODE one */
- str_dtype = PyArray_DescrFromType(NPY_STRING);
- str_dtype = PyArray_AdaptFlexibleDType(dst_dtype, str_dtype);
+ str_dtype = PyArray_DescrNewFromType(NPY_STRING);
+ str_dtype->elsize = dst_dtype->elsize / 4;
if (str_dtype == NULL) {
return NPY_FAIL;
}
@@ -1114,10 +1153,9 @@ get_datetime_to_unicode_transfer_function(int aligned,
}
/* Get the NBO datetime to string aligned contig function */
- if (get_nbo_datetime_to_string_transfer_function(1,
- src_dtype->elsize, str_dtype->elsize,
- src_dtype, str_dtype,
- &caststransfer, &castdata) != NPY_SUCCEED) {
+ if (get_nbo_datetime_to_string_transfer_function(
+ src_dtype, str_dtype,
+ &caststransfer, &castdata) != NPY_SUCCEED) {
Py_DECREF(str_dtype);
NPY_AUXDATA_FREE(todata);
return NPY_FAIL;
@@ -1156,12 +1194,10 @@ get_datetime_to_unicode_transfer_function(int aligned,
return NPY_SUCCEED;
}
-static int
-get_nbo_string_to_datetime_transfer_function(int aligned,
- npy_intp src_stride, npy_intp dst_stride,
- PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
- PyArray_StridedUnaryOp **out_stransfer,
- NpyAuxData **out_transferdata)
+NPY_NO_EXPORT int
+get_nbo_string_to_datetime_transfer_function(
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ PyArray_StridedUnaryOp **out_stransfer, NpyAuxData **out_transferdata)
{
PyArray_DatetimeMetaData *dst_meta;
_strided_datetime_cast_data *data;
@@ -1208,7 +1244,7 @@ get_nbo_string_to_datetime_transfer_function(int aligned,
return NPY_SUCCEED;
}
-static int
+NPY_NO_EXPORT int
get_unicode_to_datetime_transfer_function(int aligned,
npy_intp src_stride, npy_intp dst_stride,
PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
@@ -1221,11 +1257,12 @@ get_unicode_to_datetime_transfer_function(int aligned,
PyArray_Descr *str_dtype;
/* Get an ASCII string data type, adapted to match the UNICODE one */
- str_dtype = PyArray_DescrFromType(NPY_STRING);
- str_dtype = PyArray_AdaptFlexibleDType(src_dtype, str_dtype);
+ str_dtype = PyArray_DescrNewFromType(NPY_STRING);
if (str_dtype == NULL) {
return NPY_FAIL;
}
+ assert(src_dtype->type_num == NPY_UNICODE);
+ str_dtype->elsize = src_dtype->elsize / 4;
/* Get the cast operation from src */
if (PyArray_GetDTypeTransferFunction(aligned,
@@ -1239,10 +1276,9 @@ get_unicode_to_datetime_transfer_function(int aligned,
}
/* Get the string to NBO datetime aligned contig function */
- if (get_nbo_string_to_datetime_transfer_function(1,
- str_dtype->elsize, dst_dtype->elsize,
- str_dtype, dst_dtype,
- &caststransfer, &castdata) != NPY_SUCCEED) {
+ if (get_nbo_string_to_datetime_transfer_function(
+ str_dtype, dst_dtype,
+ &caststransfer, &castdata) != NPY_SUCCEED) {
Py_DECREF(str_dtype);
NPY_AUXDATA_FREE(todata);
return NPY_FAIL;
@@ -1280,95 +1316,21 @@ get_unicode_to_datetime_transfer_function(int aligned,
return NPY_SUCCEED;
}
-static int
-get_nbo_cast_transfer_function(int aligned,
- npy_intp src_stride, npy_intp dst_stride,
- PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
- int move_references,
- PyArray_StridedUnaryOp **out_stransfer,
- NpyAuxData **out_transferdata,
- int *out_needs_api,
- int *out_needs_wrap)
+
+NPY_NO_EXPORT int
+get_legacy_dtype_cast_function(
+ int aligned, npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ int move_references,
+ PyArray_StridedUnaryOp **out_stransfer, NpyAuxData **out_transferdata,
+ int *out_needs_api, int *out_needs_wrap)
{
_strided_cast_data *data;
PyArray_VectorUnaryFunc *castfunc;
PyArray_Descr *tmp_dtype;
- npy_intp shape = 1, src_itemsize = src_dtype->elsize,
- dst_itemsize = dst_dtype->elsize;
-
- if (PyTypeNum_ISNUMBER(src_dtype->type_num) &&
- PyTypeNum_ISNUMBER(dst_dtype->type_num)) {
- *out_needs_wrap = !PyArray_ISNBO(src_dtype->byteorder) ||
- !PyArray_ISNBO(dst_dtype->byteorder);
- return get_nbo_cast_numeric_transfer_function(aligned,
- src_stride, dst_stride,
- src_dtype->type_num, dst_dtype->type_num,
- out_stransfer, out_transferdata);
- }
-
- if (src_dtype->type_num == NPY_DATETIME ||
- src_dtype->type_num == NPY_TIMEDELTA ||
- dst_dtype->type_num == NPY_DATETIME ||
- dst_dtype->type_num == NPY_TIMEDELTA) {
- /* A parameterized type, datetime->datetime sometimes needs casting */
- if ((src_dtype->type_num == NPY_DATETIME &&
- dst_dtype->type_num == NPY_DATETIME) ||
- (src_dtype->type_num == NPY_TIMEDELTA &&
- dst_dtype->type_num == NPY_TIMEDELTA)) {
- *out_needs_wrap = !PyArray_ISNBO(src_dtype->byteorder) ||
- !PyArray_ISNBO(dst_dtype->byteorder);
- return get_nbo_cast_datetime_transfer_function(aligned,
- src_stride, dst_stride,
- src_dtype, dst_dtype,
- out_stransfer, out_transferdata);
- }
-
- /*
- * Datetime <-> string conversions can be handled specially.
- * The functions may raise an error if the strings have no
- * space, or can't be parsed properly.
- */
- if (src_dtype->type_num == NPY_DATETIME) {
- switch (dst_dtype->type_num) {
- case NPY_STRING:
- *out_needs_api = 1;
- *out_needs_wrap = !PyArray_ISNBO(src_dtype->byteorder);
- return get_nbo_datetime_to_string_transfer_function(
- aligned,
- src_stride, dst_stride,
- src_dtype, dst_dtype,
- out_stransfer, out_transferdata);
-
- case NPY_UNICODE:
- return get_datetime_to_unicode_transfer_function(
- aligned,
- src_stride, dst_stride,
- src_dtype, dst_dtype,
- out_stransfer, out_transferdata,
- out_needs_api);
- }
- }
- else if (dst_dtype->type_num == NPY_DATETIME) {
- switch (src_dtype->type_num) {
- case NPY_STRING:
- *out_needs_api = 1;
- *out_needs_wrap = !PyArray_ISNBO(dst_dtype->byteorder);
- return get_nbo_string_to_datetime_transfer_function(
- aligned,
- src_stride, dst_stride,
- src_dtype, dst_dtype,
- out_stransfer, out_transferdata);
-
- case NPY_UNICODE:
- return get_unicode_to_datetime_transfer_function(
- aligned,
- src_stride, dst_stride,
- src_dtype, dst_dtype,
- out_stransfer, out_transferdata,
- out_needs_api);
- }
- }
- }
+ npy_intp shape = 1;
+ npy_intp src_itemsize = src_dtype->elsize;
+ npy_intp dst_itemsize = dst_dtype->elsize;
*out_needs_wrap = !aligned ||
!PyArray_ISNBO(src_dtype->byteorder) ||
@@ -1422,6 +1384,7 @@ get_nbo_cast_transfer_function(int aligned,
data->base.free = &_strided_cast_data_free;
data->base.clone = &_strided_cast_data_clone;
data->castfunc = castfunc;
+ data->needs_api = *out_needs_api;
/*
* TODO: This is a hack so the cast functions have an array.
* The cast functions shouldn't need that. Also, since we
@@ -1500,6 +1463,162 @@ get_nbo_cast_transfer_function(int aligned,
return NPY_SUCCEED;
}
+
+static int
+get_nbo_cast_transfer_function(int aligned,
+ npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ int move_references,
+ PyArray_StridedUnaryOp **out_stransfer,
+ NpyAuxData **out_transferdata,
+ int *out_needs_api,
+ int *out_needs_wrap)
+{
+ if (PyTypeNum_ISNUMBER(src_dtype->type_num) &&
+ PyTypeNum_ISNUMBER(dst_dtype->type_num)) {
+ *out_needs_wrap = !PyArray_ISNBO(src_dtype->byteorder) ||
+ !PyArray_ISNBO(dst_dtype->byteorder);
+ return get_nbo_cast_numeric_transfer_function(aligned,
+ src_stride, dst_stride,
+ src_dtype->type_num, dst_dtype->type_num,
+ out_stransfer, out_transferdata);
+ }
+
+ if (src_dtype->type_num == NPY_DATETIME ||
+ src_dtype->type_num == NPY_TIMEDELTA ||
+ dst_dtype->type_num == NPY_DATETIME ||
+ dst_dtype->type_num == NPY_TIMEDELTA) {
+ /* A parameterized type, datetime->datetime sometimes needs casting */
+ if ((src_dtype->type_num == NPY_DATETIME &&
+ dst_dtype->type_num == NPY_DATETIME) ||
+ (src_dtype->type_num == NPY_TIMEDELTA &&
+ dst_dtype->type_num == NPY_TIMEDELTA)) {
+ *out_needs_wrap = !PyArray_ISNBO(src_dtype->byteorder) ||
+ !PyArray_ISNBO(dst_dtype->byteorder);
+ return get_nbo_cast_datetime_transfer_function(aligned,
+ src_dtype, dst_dtype,
+ out_stransfer, out_transferdata);
+ }
+
+ /*
+ * Datetime <-> string conversions can be handled specially.
+ * The functions may raise an error if the strings have no
+ * space, or can't be parsed properly.
+ */
+ if (src_dtype->type_num == NPY_DATETIME) {
+ switch (dst_dtype->type_num) {
+ case NPY_STRING:
+ *out_needs_api = 1;
+ *out_needs_wrap = !PyArray_ISNBO(src_dtype->byteorder);
+ return get_nbo_datetime_to_string_transfer_function(
+ src_dtype, dst_dtype,
+ out_stransfer, out_transferdata);
+
+ case NPY_UNICODE:
+ return get_datetime_to_unicode_transfer_function(
+ aligned,
+ src_stride, dst_stride,
+ src_dtype, dst_dtype,
+ out_stransfer, out_transferdata,
+ out_needs_api);
+ }
+ }
+ else if (dst_dtype->type_num == NPY_DATETIME) {
+ switch (src_dtype->type_num) {
+ case NPY_STRING:
+ *out_needs_api = 1;
+ *out_needs_wrap = !PyArray_ISNBO(dst_dtype->byteorder);
+ return get_nbo_string_to_datetime_transfer_function(
+ src_dtype, dst_dtype,
+ out_stransfer, out_transferdata);
+
+ case NPY_UNICODE:
+ return get_unicode_to_datetime_transfer_function(
+ aligned,
+ src_stride, dst_stride,
+ src_dtype, dst_dtype,
+ out_stransfer, out_transferdata,
+ out_needs_api);
+ }
+ }
+ }
+
+ return get_legacy_dtype_cast_function(
+ aligned, src_stride, dst_stride, src_dtype, dst_dtype,
+ move_references, out_stransfer, out_transferdata,
+ out_needs_api, out_needs_wrap);
+}
+
+
+NPY_NO_EXPORT int
+wrap_aligned_contig_transfer_function_with_copyswapn(
+ int aligned, npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ PyArray_StridedUnaryOp **out_stransfer, NpyAuxData **out_transferdata,
+ int *out_needs_api,
+ PyArray_StridedUnaryOp *caststransfer, NpyAuxData *castdata)
+{
+ NpyAuxData *todata = NULL, *fromdata = NULL;
+ PyArray_StridedUnaryOp *tobuffer = NULL, *frombuffer = NULL;
+ npy_intp src_itemsize = src_dtype->elsize;
+ npy_intp dst_itemsize = dst_dtype->elsize;
+
+ /* Get the copy/swap operation from src */
+ PyArray_GetDTypeCopySwapFn(
+ aligned, src_stride, src_itemsize, src_dtype, &tobuffer, &todata);
+
+ if (!PyDataType_REFCHK(dst_dtype)) {
+ /* Copying from buffer is a simple copy/swap operation */
+ PyArray_GetDTypeCopySwapFn(
+ aligned, dst_itemsize, dst_stride, dst_dtype,
+ &frombuffer, &fromdata);
+ }
+ else {
+ /*
+ * Since the buffer is initialized to NULL, need to move the
+ * references in order to DECREF the existing data.
+ */
+ /* Object types cannot be byte swapped */
+ assert(PyDataType_ISNOTSWAPPED(dst_dtype));
+ /* The loop already needs the python api if this is reached */
+ assert(*out_needs_api);
+
+ if (PyArray_GetDTypeTransferFunction(
+ aligned, dst_itemsize, dst_stride,
+ dst_dtype, dst_dtype, 1,
+ &frombuffer, &fromdata, out_needs_api) != NPY_SUCCEED) {
+ return NPY_FAIL;
+ }
+ }
+
+ if (frombuffer == NULL || tobuffer == NULL) {
+ NPY_AUXDATA_FREE(castdata);
+ NPY_AUXDATA_FREE(todata);
+ NPY_AUXDATA_FREE(fromdata);
+ return NPY_FAIL;
+ }
+
+ *out_stransfer = caststransfer;
+
+ /* Wrap it all up in a new transfer function + data */
+ if (wrap_aligned_contig_transfer_function(
+ src_itemsize, dst_itemsize,
+ tobuffer, todata,
+ frombuffer, fromdata,
+ caststransfer, castdata,
+ PyDataType_FLAGCHK(dst_dtype, NPY_NEEDS_INIT),
+ *out_needs_api,
+ out_stransfer, out_transferdata) != NPY_SUCCEED) {
+ NPY_AUXDATA_FREE(castdata);
+ NPY_AUXDATA_FREE(todata);
+ NPY_AUXDATA_FREE(fromdata);
+ return NPY_FAIL;
+ }
+
+ return NPY_SUCCEED;
+}
+
+
static int
get_cast_transfer_function(int aligned,
npy_intp src_stride, npy_intp dst_stride,
@@ -1510,10 +1629,8 @@ get_cast_transfer_function(int aligned,
int *out_needs_api)
{
PyArray_StridedUnaryOp *caststransfer;
- NpyAuxData *castdata, *todata = NULL, *fromdata = NULL;
+ NpyAuxData *castdata;
int needs_wrap = 0;
- npy_intp src_itemsize = src_dtype->elsize,
- dst_itemsize = dst_dtype->elsize;
if (get_nbo_cast_transfer_function(aligned,
src_stride, dst_stride,
@@ -1538,64 +1655,10 @@ get_cast_transfer_function(int aligned,
}
/* Otherwise, we have to copy and/or swap to aligned temporaries */
else {
- PyArray_StridedUnaryOp *tobuffer, *frombuffer;
-
- /* Get the copy/swap operation from src */
- PyArray_GetDTypeCopySwapFn(aligned,
- src_stride, src_itemsize,
- src_dtype,
- &tobuffer, &todata);
-
- if (!PyDataType_REFCHK(dst_dtype)) {
- /* Copying from buffer is a simple copy/swap operation */
- PyArray_GetDTypeCopySwapFn(aligned,
- dst_itemsize, dst_stride,
- dst_dtype,
- &frombuffer, &fromdata);
- }
- else {
- /*
- * Since the buffer is initialized to NULL, need to move the
- * references in order to DECREF the existing data.
- */
- /* Object types cannot be byte swapped */
- assert(PyDataType_ISNOTSWAPPED(dst_dtype));
- /* The loop already needs the python api if this is reached */
- assert(*out_needs_api);
-
- if (PyArray_GetDTypeTransferFunction(
- aligned, dst_itemsize, dst_stride,
- dst_dtype, dst_dtype, 1,
- &frombuffer, &fromdata, out_needs_api) != NPY_SUCCEED) {
- return NPY_FAIL;
- }
- }
-
- if (frombuffer == NULL || tobuffer == NULL) {
- NPY_AUXDATA_FREE(castdata);
- NPY_AUXDATA_FREE(todata);
- NPY_AUXDATA_FREE(fromdata);
- return NPY_FAIL;
- }
-
- *out_stransfer = caststransfer;
-
- /* Wrap it all up in a new transfer function + data */
- if (wrap_aligned_contig_transfer_function(
- src_itemsize, dst_itemsize,
- tobuffer, todata,
- frombuffer, fromdata,
- caststransfer, castdata,
- PyDataType_FLAGCHK(dst_dtype, NPY_NEEDS_INIT),
- *out_needs_api,
- out_stransfer, out_transferdata) != NPY_SUCCEED) {
- NPY_AUXDATA_FREE(castdata);
- NPY_AUXDATA_FREE(todata);
- NPY_AUXDATA_FREE(fromdata);
- return NPY_FAIL;
- }
-
- return NPY_SUCCEED;
+ return wrap_aligned_contig_transfer_function_with_copyswapn(
+ aligned, src_stride, dst_stride, src_dtype, dst_dtype,
+ out_stransfer, out_transferdata, out_needs_api,
+ caststransfer, castdata);
}
}
@@ -1652,7 +1715,7 @@ static NpyAuxData *_one_to_n_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_strided_to_strided_one_to_n(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -1664,18 +1727,19 @@ _strided_to_strided_one_to_n(char *dst, npy_intp dst_stride,
npy_intp subN = d->N, dst_itemsize = d->dst_itemsize;
while (N > 0) {
- subtransfer(dst, dst_itemsize,
- src, 0,
- subN, src_itemsize,
- subdata);
+ if (subtransfer(
+ dst, dst_itemsize, src, 0, subN, src_itemsize, subdata) < 0) {
+ return -1;
+ }
src += src_stride;
dst += dst_stride;
--N;
}
+ return 0;
}
-static void
+static int
_strided_to_strided_one_to_n_with_finish(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -1688,21 +1752,21 @@ _strided_to_strided_one_to_n_with_finish(char *dst, npy_intp dst_stride,
npy_intp subN = d->N, dst_itemsize = d->dst_itemsize;
while (N > 0) {
- subtransfer(dst, dst_itemsize,
- src, 0,
- subN, src_itemsize,
- subdata);
-
+ if (subtransfer(
+ dst, dst_itemsize, src, 0, subN, src_itemsize, subdata) < 0) {
+ return -1;
+ }
- stransfer_finish_src(NULL, 0,
- src, 0,
- 1, src_itemsize,
- data_finish_src);
+ if (stransfer_finish_src(
+ NULL, 0, src, 0, 1, src_itemsize, data_finish_src) < 0) {
+ return -1;
+ }
src += src_stride;
dst += dst_stride;
--N;
}
+ return 0;
}
/*
@@ -1846,7 +1910,7 @@ static NpyAuxData *_n_to_n_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_strided_to_strided_n_to_n(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -1859,18 +1923,19 @@ _strided_to_strided_n_to_n(char *dst, npy_intp dst_stride,
dst_subitemsize = d->dst_itemsize;
while (N > 0) {
- subtransfer(dst, dst_subitemsize,
- src, src_subitemsize,
- subN, src_subitemsize,
- subdata);
-
+ if (subtransfer(
+ dst, dst_subitemsize, src, src_subitemsize,
+ subN, src_subitemsize, subdata) < 0) {
+ return -1;
+ }
src += src_stride;
dst += dst_stride;
--N;
}
+ return 0;
}
-static void
+static int
_contig_to_contig_n_to_n(char *dst, npy_intp NPY_UNUSED(dst_stride),
char *src, npy_intp NPY_UNUSED(src_stride),
npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -1882,10 +1947,12 @@ _contig_to_contig_n_to_n(char *dst, npy_intp NPY_UNUSED(dst_stride),
npy_intp subN = d->N, src_subitemsize = d->src_itemsize,
dst_subitemsize = d->dst_itemsize;
- subtransfer(dst, dst_subitemsize,
- src, src_subitemsize,
- subN*N, src_subitemsize,
- subdata);
+ if (subtransfer(
+ dst, dst_subitemsize, src, src_subitemsize,
+ subN*N, src_subitemsize, subdata) < 0) {
+ return -1;
+ }
+ return 0;
}
/*
@@ -2049,7 +2116,7 @@ static NpyAuxData *_subarray_broadcast_data_clone( NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_strided_to_strided_subarray_broadcast(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -2072,10 +2139,11 @@ _strided_to_strided_subarray_broadcast(char *dst, npy_intp dst_stride,
count = offsetruns[run].count;
dst_ptr = dst + loop_index*dst_subitemsize;
if (offset != -1) {
- subtransfer(dst_ptr, dst_subitemsize,
- src + offset, src_subitemsize,
- count, src_subitemsize,
- subdata);
+ if (subtransfer(
+ dst_ptr, dst_subitemsize, src + offset, src_subitemsize,
+ count, src_subitemsize, subdata) < 0) {
+ return -1;
+ }
}
else {
memset(dst_ptr, 0, count*dst_subitemsize);
@@ -2087,10 +2155,11 @@ _strided_to_strided_subarray_broadcast(char *dst, npy_intp dst_stride,
dst += dst_stride;
--N;
}
+ return 0;
}
-static void
+static int
_strided_to_strided_subarray_broadcast_withrefs(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -2118,16 +2187,19 @@ _strided_to_strided_subarray_broadcast_withrefs(char *dst, npy_intp dst_stride,
count = offsetruns[run].count;
dst_ptr = dst + loop_index*dst_subitemsize;
if (offset != -1) {
- subtransfer(dst_ptr, dst_subitemsize,
- src + offset, src_subitemsize,
- count, src_subitemsize,
- subdata);
+ if (subtransfer(
+ dst_ptr, dst_subitemsize, src + offset, src_subitemsize,
+ count, src_subitemsize, subdata) < 0) {
+ return -1;
+ }
}
else {
if (stransfer_decdstref != NULL) {
- stransfer_decdstref(NULL, 0, dst_ptr, dst_subitemsize,
- count, dst_subitemsize,
- data_decdstref);
+ if (stransfer_decdstref(
+ NULL, 0, dst_ptr, dst_subitemsize,
+ count, dst_subitemsize, data_decdstref) < 0) {
+ return -1;
+ }
}
memset(dst_ptr, 0, count*dst_subitemsize);
}
@@ -2135,15 +2207,18 @@ _strided_to_strided_subarray_broadcast_withrefs(char *dst, npy_intp dst_stride,
}
if (stransfer_decsrcref != NULL) {
- stransfer_decsrcref(NULL, 0, src, src_subitemsize,
- src_subN, src_subitemsize,
- data_decsrcref);
+ if (stransfer_decsrcref(
+ NULL, 0, src, src_subitemsize,
+ src_subN, src_subitemsize, data_decsrcref) < 0) {
+ return -1;
+ }
}
src += src_stride;
dst += dst_stride;
--N;
}
+ return 0;
}
@@ -2500,7 +2575,7 @@ static NpyAuxData *_field_transfer_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_strided_to_strided_field_transfer(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -2515,11 +2590,13 @@ _strided_to_strided_field_transfer(char *dst, npy_intp dst_stride,
field = &d->fields;
if (N > NPY_LOWLEVEL_BUFFER_BLOCKSIZE) {
for (i = 0; i < field_count; ++i, ++field) {
- field->stransfer(dst + field->dst_offset, dst_stride,
- src + field->src_offset, src_stride,
- NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
- field->src_itemsize,
- field->data);
+ if (field->stransfer(
+ dst + field->dst_offset, dst_stride,
+ src + field->src_offset, src_stride,
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
+ field->src_itemsize, field->data) < 0) {
+ return -1;
+ }
}
N -= NPY_LOWLEVEL_BUFFER_BLOCKSIZE;
src += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*src_stride;
@@ -2527,13 +2604,15 @@ _strided_to_strided_field_transfer(char *dst, npy_intp dst_stride,
}
else {
for (i = 0; i < field_count; ++i, ++field) {
- field->stransfer(dst + field->dst_offset, dst_stride,
- src + field->src_offset, src_stride,
- N,
- field->src_itemsize,
- field->data);
+ if (field->stransfer(
+ dst + field->dst_offset, dst_stride,
+ src + field->src_offset, src_stride,
+ N,
+ field->src_itemsize, field->data) < 0) {
+ return -1;
+ }
}
- return;
+ return 0;
}
}
}
@@ -2947,7 +3026,8 @@ static NpyAuxData *_masked_wrapper_transfer_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void _strided_masked_wrapper_decsrcref_transfer_function(
+static int
+_strided_masked_wrapper_decsrcref_transfer_function(
char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_bool *mask, npy_intp mask_stride,
@@ -2969,8 +3049,11 @@ static void _strided_masked_wrapper_decsrcref_transfer_function(
/* Skip masked values, still calling decsrcref for move_references */
mask = (npy_bool*)npy_memchr((char *)mask, 0, mask_stride, N,
&subloopsize, 1);
- decsrcref_stransfer(NULL, 0, src, src_stride,
- subloopsize, src_itemsize, decsrcref_transferdata);
+ if (decsrcref_stransfer(
+ NULL, 0, src, src_stride,
+ subloopsize, src_itemsize, decsrcref_transferdata) < 0) {
+ return -1;
+ }
dst += subloopsize * dst_stride;
src += subloopsize * src_stride;
N -= subloopsize;
@@ -2981,15 +3064,20 @@ static void _strided_masked_wrapper_decsrcref_transfer_function(
/* Process unmasked values */
mask = (npy_bool*)npy_memchr((char *)mask, 0, mask_stride, N,
&subloopsize, 0);
- unmasked_stransfer(dst, dst_stride, src, src_stride,
- subloopsize, src_itemsize, unmasked_transferdata);
+ if (unmasked_stransfer(
+ dst, dst_stride, src, src_stride,
+ subloopsize, src_itemsize, unmasked_transferdata) < 0) {
+ return -1;
+ }
dst += subloopsize * dst_stride;
src += subloopsize * src_stride;
N -= subloopsize;
}
+ return 0;
}
-static void _strided_masked_wrapper_transfer_function(
+static int
+_strided_masked_wrapper_transfer_function(
char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_bool *mask, npy_intp mask_stride,
@@ -3020,18 +3108,22 @@ static void _strided_masked_wrapper_transfer_function(
/* Process unmasked values */
mask = (npy_bool*)npy_memchr((char *)mask, 0, mask_stride, N,
&subloopsize, 0);
- unmasked_stransfer(dst, dst_stride, src, src_stride,
- subloopsize, src_itemsize, unmasked_transferdata);
+ if (unmasked_stransfer(
+ dst, dst_stride, src, src_stride,
+ subloopsize, src_itemsize, unmasked_transferdata) < 0) {
+ return -1;
+ }
dst += subloopsize * dst_stride;
src += subloopsize * src_stride;
N -= subloopsize;
}
+ return 0;
}
/************************* DEST BOOL SETONE *******************************/
-static void
+static int
_null_to_strided_set_bool_one(char *dst,
npy_intp dst_stride,
char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
@@ -3046,9 +3138,10 @@ _null_to_strided_set_bool_one(char *dst,
dst += dst_stride;
--N;
}
+ return 0;
}
-static void
+static int
_null_to_contig_set_bool_one(char *dst,
npy_intp NPY_UNUSED(dst_stride),
char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
@@ -3058,6 +3151,7 @@ _null_to_contig_set_bool_one(char *dst,
/* bool type is one byte, so can just use the char */
memset(dst, 1, N);
+ return 0;
}
/* Only for the bool type, sets the destination to 1 */
@@ -3101,7 +3195,7 @@ static NpyAuxData *_dst_memset_zero_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_null_to_strided_memset_zero(char *dst,
npy_intp dst_stride,
char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
@@ -3116,9 +3210,10 @@ _null_to_strided_memset_zero(char *dst,
dst += dst_stride;
--N;
}
+ return 0;
}
-static void
+static int
_null_to_contig_memset_zero(char *dst,
npy_intp dst_stride,
char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
@@ -3129,9 +3224,10 @@ _null_to_contig_memset_zero(char *dst,
npy_intp dst_itemsize = d->dst_itemsize;
memset(dst, 0, N*dst_itemsize);
+ return 0;
}
-static void
+static int
_null_to_strided_reference_setzero(char *dst,
npy_intp dst_stride,
char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
@@ -3141,19 +3237,17 @@ _null_to_strided_reference_setzero(char *dst,
PyObject *dst_ref = NULL;
while (N > 0) {
- NPY_COPY_PYOBJECT_PTR(&dst_ref, dst);
+ memcpy(&dst_ref, dst, sizeof(dst_ref));
- /* Release the reference in dst */
+ /* Release the reference in dst and set it to NULL */
NPY_DT_DBG_REFTRACE("dec dest ref (to set zero)", dst_ref);
Py_XDECREF(dst_ref);
-
- /* Set it to zero */
- dst_ref = NULL;
- NPY_COPY_PYOBJECT_PTR(dst, &dst_ref);
+ memset(dst, 0, sizeof(PyObject *));
dst += dst_stride;
--N;
}
+ return 0;
}
NPY_NO_EXPORT int
@@ -3250,7 +3344,7 @@ get_setdstzero_transfer_function(int aligned,
return NPY_SUCCEED;
}
-static void
+static int
_dec_src_ref_nop(char *NPY_UNUSED(dst),
npy_intp NPY_UNUSED(dst_stride),
char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
@@ -3259,9 +3353,10 @@ _dec_src_ref_nop(char *NPY_UNUSED(dst),
NpyAuxData *NPY_UNUSED(data))
{
/* NOP */
+ return 0;
}
-static void
+static int
_strided_to_null_dec_src_ref_reference(char *NPY_UNUSED(dst),
npy_intp NPY_UNUSED(dst_stride),
char *src, npy_intp src_stride,
@@ -3271,15 +3366,16 @@ _strided_to_null_dec_src_ref_reference(char *NPY_UNUSED(dst),
{
PyObject *src_ref = NULL;
while (N > 0) {
- NPY_COPY_PYOBJECT_PTR(&src_ref, src);
-
- /* Release the reference in src */
+ /* Release the reference in src and set it to NULL */
NPY_DT_DBG_REFTRACE("dec src ref (null dst)", src_ref);
+ memcpy(&src_ref, src, sizeof(src_ref));
Py_XDECREF(src_ref);
+ memset(src, 0, sizeof(PyObject *));
src += src_stride;
--N;
}
+ return 0;
}
@@ -3661,6 +3757,53 @@ PyArray_GetDTypeTransferFunction(int aligned,
out_needs_api);
}
+
+/*
+ * Basic version of PyArray_GetDTypeTransferFunction for legacy dtype
+ * support.
+ * It supports only wrapping the copyswapn functions and the legacy
+ * cast functions registered with `PyArray_RegisterCastFunc`.
+ * This function takes the easy way out: It does not wrap
+ */
+NPY_NO_EXPORT int
+PyArray_GetLegacyDTypeTransferFunction(int aligned,
+ npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ int move_references,
+ PyArray_StridedUnaryOp **out_stransfer,
+ NpyAuxData **out_transferdata,
+ int *out_needs_api)
+{
+ /* Note: We ignore `needs_wrap`; needs-wrap is handled by another cast */
+ int needs_wrap = 0;
+
+ if (src_dtype->type_num == dst_dtype->type_num) {
+ /*
+ * This is a cast within the same dtype. For legacy user-dtypes,
+ * it is always valid to handle this using the copy swap function.
+ */
+ return wrap_copy_swap_function(aligned,
+ src_stride, dst_stride,
+ src_dtype,
+ PyArray_ISNBO(src_dtype->byteorder) !=
+ PyArray_ISNBO(dst_dtype->byteorder),
+ out_stransfer, out_transferdata);
+ }
+
+ if (get_legacy_dtype_cast_function(aligned,
+ src_stride, dst_stride,
+ src_dtype, dst_dtype,
+ move_references,
+ out_stransfer,
+ out_transferdata,
+ out_needs_api,
+ &needs_wrap) != NPY_SUCCEED) {
+ return NPY_FAIL;
+ }
+ return NPY_SUCCEED;
+}
+
+
NPY_NO_EXPORT int
PyArray_GetMaskedDTypeTransferFunction(int aligned,
npy_intp src_stride,
diff --git a/numpy/core/src/multiarray/dtypemeta.c b/numpy/core/src/multiarray/dtypemeta.c
index 3026e68e9..4c11723e7 100644
--- a/numpy/core/src/multiarray/dtypemeta.c
+++ b/numpy/core/src/multiarray/dtypemeta.c
@@ -15,6 +15,9 @@
#include "dtypemeta.h"
#include "_datetime.h"
#include "array_coercion.h"
+#include "scalartypes.h"
+#include "convert_datatype.h"
+#include "usertypes.h"
static void
@@ -24,6 +27,7 @@ dtypemeta_dealloc(PyArray_DTypeMeta *self) {
Py_XDECREF(self->scalar_type);
Py_XDECREF(self->singleton);
+ Py_XDECREF(self->castingimpls);
PyType_Type.tp_dealloc((PyObject *) self);
}
@@ -194,7 +198,39 @@ discover_datetime_and_timedelta_from_pyobject(
static PyArray_Descr *
-flexible_default_descr(PyArray_DTypeMeta *cls)
+nonparametric_default_descr(PyArray_DTypeMeta *cls)
+{
+ Py_INCREF(cls->singleton);
+ return cls->singleton;
+}
+
+
+/* Ensure a copy of the singleton (just in case we do adapt it somewhere) */
+static PyArray_Descr *
+datetime_and_timedelta_default_descr(PyArray_DTypeMeta *cls)
+{
+ return PyArray_DescrNew(cls->singleton);
+}
+
+
+static PyArray_Descr *
+void_default_descr(PyArray_DTypeMeta *cls)
+{
+ PyArray_Descr *res = PyArray_DescrNew(cls->singleton);
+ if (res == NULL) {
+ return NULL;
+ }
+ /*
+ * The legacy behaviour for `np.array([], dtype="V")` is to use "V8".
+ * This is because `[]` uses `float64` as dtype, and then that is used
+ * for the size of the requested void.
+ */
+ res->elsize = 8;
+ return res;
+}
+
+static PyArray_Descr *
+string_and_unicode_default_descr(PyArray_DTypeMeta *cls)
{
PyArray_Descr *res = PyArray_DescrNewFromType(cls->type_num);
if (res == NULL) {
@@ -208,6 +244,43 @@ flexible_default_descr(PyArray_DTypeMeta *cls)
}
+static PyArray_Descr *
+string_unicode_common_instance(PyArray_Descr *descr1, PyArray_Descr *descr2)
+{
+ if (descr1->elsize >= descr2->elsize) {
+ return ensure_dtype_nbo(descr1);
+ }
+ else {
+ return ensure_dtype_nbo(descr2);
+ }
+}
+
+
+static PyArray_Descr *
+void_common_instance(PyArray_Descr *descr1, PyArray_Descr *descr2)
+{
+ /*
+ * We currently do not support promotion of void types unless they
+ * are equivalent.
+ */
+ if (!PyArray_CanCastTypeTo(descr1, descr2, NPY_EQUIV_CASTING)) {
+ if (descr1->subarray == NULL && descr1->names == NULL &&
+ descr2->subarray == NULL && descr2->names == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "Invalid type promotion with void datatypes of different "
+ "lengths. Use the `np.bytes_` datatype instead to pad the "
+ "shorter value with trailing zero bytes.");
+ }
+ else {
+ PyErr_SetString(PyExc_TypeError,
+ "invalid type promotion with structured datatype(s).");
+ }
+ return NULL;
+ }
+ Py_INCREF(descr1);
+ return descr1;
+}
+
static int
python_builtins_are_known_scalar_types(
PyArray_DTypeMeta *NPY_UNUSED(cls), PyTypeObject *pytype)
@@ -242,6 +315,18 @@ python_builtins_are_known_scalar_types(
static int
+signed_integers_is_known_scalar_types(
+ PyArray_DTypeMeta *cls, PyTypeObject *pytype)
+{
+ if (python_builtins_are_known_scalar_types(cls, pytype)) {
+ return 1;
+ }
+ /* Convert our scalars (raise on too large unsigned and NaN, etc.) */
+ return PyType_IsSubtype(pytype, &PyGenericArrType_Type);
+}
+
+
+static int
datetime_known_scalar_types(
PyArray_DTypeMeta *cls, PyTypeObject *pytype)
{
@@ -253,7 +338,7 @@ datetime_known_scalar_types(
* must take charge. Otherwise we would attempt casting which does not
* truly support this. Only object arrays are special cased in this way.
*/
- return (PyType_IsSubtype(pytype, &PyString_Type) ||
+ return (PyType_IsSubtype(pytype, &PyBytes_Type) ||
PyType_IsSubtype(pytype, &PyUnicode_Type));
}
@@ -281,6 +366,86 @@ string_known_scalar_types(
}
+/*
+ * The following set of functions define the common dtype operator for
+ * the builtin types.
+ */
+static PyArray_DTypeMeta *
+default_builtin_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other)
+{
+ assert(cls->type_num < NPY_NTYPES);
+ if (!other->legacy || other->type_num > cls->type_num) {
+ /* Let the more generic (larger type number) DType handle this */
+ Py_INCREF(Py_NotImplemented);
+ return (PyArray_DTypeMeta *)Py_NotImplemented;
+ }
+
+    /*
+     * Note: The use of the promotion table should probably be revised at
+     * some point. It may be most useful to remove it entirely and then
+     * consider adding a fast path/cache to `PyArray_CommonDType()` itself.
+     */
+ int common_num = _npy_type_promotion_table[cls->type_num][other->type_num];
+ if (common_num < 0) {
+ Py_INCREF(Py_NotImplemented);
+ return (PyArray_DTypeMeta *)Py_NotImplemented;
+ }
+ return PyArray_DTypeFromTypeNum(common_num);
+}
+
+
+static PyArray_DTypeMeta *
+string_unicode_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other)
+{
+ assert(cls->type_num < NPY_NTYPES);
+ if (!other->legacy || other->type_num > cls->type_num ||
+ other->type_num == NPY_OBJECT) {
+ /* Let the more generic (larger type number) DType handle this */
+ Py_INCREF(Py_NotImplemented);
+ return (PyArray_DTypeMeta *)Py_NotImplemented;
+ }
+ /*
+ * The builtin types are ordered by complexity (aside from object) here.
+ * Arguably, we should not consider numbers and strings "common", but
+ * we currently do.
+ */
+ Py_INCREF(cls);
+ return cls;
+}
+
+static PyArray_DTypeMeta *
+datetime_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other)
+{
+ if (cls->type_num == NPY_DATETIME && other->type_num == NPY_TIMEDELTA) {
+        /*
+         * TODO: We currently do allow promotion here. This is relied
+         *       upon within `np.add(datetime, timedelta)`, while for
+         *       concatenation the cast step will fail.
+         */
+ Py_INCREF(cls);
+ return cls;
+ }
+ return default_builtin_common_dtype(cls, other);
+}
+
+
+
+static PyArray_DTypeMeta *
+object_common_dtype(
+ PyArray_DTypeMeta *cls, PyArray_DTypeMeta *NPY_UNUSED(other))
+{
+ /*
+ * The object DType is special in that it can represent everything,
+ * including all potential user DTypes.
+ * One reason to defer (or error) here might be if the other DType
+ * does not support scalars so that e.g. `arr1d[0]` returns a 0-D array
+ * and `arr.astype(object)` would fail. But object casts are special.
+ */
+ Py_INCREF(cls);
+ return cls;
+}
+
+
/**
* This function takes a PyArray_Descr and replaces its base class with
* a newly created dtype subclass (DTypeMeta instances).
@@ -312,10 +477,28 @@ string_known_scalar_types(
NPY_NO_EXPORT int
dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
{
- if (Py_TYPE(descr) != &PyArrayDescr_Type) {
+ int has_type_set = Py_TYPE(descr) == &PyArrayDescr_Type;
+
+ if (!has_type_set) {
+ /* Accept if the type was filled in from an existing builtin dtype */
+ for (int i = 0; i < NPY_NTYPES; i++) {
+ PyArray_Descr *builtin = PyArray_DescrFromType(i);
+ has_type_set = Py_TYPE(descr) == Py_TYPE(builtin);
+ Py_DECREF(builtin);
+ if (has_type_set) {
+ break;
+ }
+ }
+ }
+ if (!has_type_set) {
PyErr_Format(PyExc_RuntimeError,
"During creation/wrapping of legacy DType, the original class "
- "was not PyArrayDescr_Type (it is replaced in this step).");
+ "was not of PyArrayDescr_Type (it is replaced in this step). "
+ "The extension creating a custom DType for type %S must be "
+ "modified to ensure `Py_TYPE(descr) == &PyArrayDescr_Type` or "
+ "that of an existing dtype (with the assumption it is just "
+ "copied over and can be replaced).",
+ descr->typeobj, Py_TYPE(descr));
return -1;
}
@@ -383,6 +566,12 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
/* Let python finish the initialization (probably unnecessary) */
if (PyType_Ready((PyTypeObject *)dtype_class) < 0) {
+ Py_DECREF(dtype_class);
+ return -1;
+ }
+ dtype_class->castingimpls = PyDict_New();
+ if (dtype_class->castingimpls == NULL) {
+ Py_DECREF(dtype_class);
return -1;
}
@@ -398,36 +587,54 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
dtype_class->f = descr->f;
dtype_class->kind = descr->kind;
- /* Strings and voids have (strange) logic around scalars. */
+ /* Set default functions (correct for most dtypes, override below) */
+ dtype_class->default_descr = nonparametric_default_descr;
+ dtype_class->discover_descr_from_pyobject = (
+ nonparametric_discover_descr_from_pyobject);
dtype_class->is_known_scalar_type = python_builtins_are_known_scalar_types;
+ dtype_class->common_dtype = default_builtin_common_dtype;
+ dtype_class->common_instance = NULL;
+
+ if (PyTypeNum_ISSIGNED(dtype_class->type_num)) {
+ /* Convert our scalars (raise on too large unsigned and NaN, etc.) */
+ dtype_class->is_known_scalar_type = signed_integers_is_known_scalar_types;
+ }
- if (PyTypeNum_ISDATETIME(descr->type_num)) {
+ if (PyTypeNum_ISUSERDEF(descr->type_num)) {
+ dtype_class->common_dtype = legacy_userdtype_common_dtype_function;
+ }
+ else if (descr->type_num == NPY_OBJECT) {
+ dtype_class->common_dtype = object_common_dtype;
+ }
+ else if (PyTypeNum_ISDATETIME(descr->type_num)) {
/* Datetimes are flexible, but were not considered previously */
dtype_class->parametric = NPY_TRUE;
+ dtype_class->default_descr = datetime_and_timedelta_default_descr;
dtype_class->discover_descr_from_pyobject = (
discover_datetime_and_timedelta_from_pyobject);
+ dtype_class->common_dtype = datetime_common_dtype;
+ dtype_class->common_instance = datetime_type_promotion;
if (descr->type_num == NPY_DATETIME) {
dtype_class->is_known_scalar_type = datetime_known_scalar_types;
}
}
else if (PyTypeNum_ISFLEXIBLE(descr->type_num)) {
dtype_class->parametric = NPY_TRUE;
- dtype_class->default_descr = flexible_default_descr;
if (descr->type_num == NPY_VOID) {
+ dtype_class->default_descr = void_default_descr;
dtype_class->discover_descr_from_pyobject = (
void_discover_descr_from_pyobject);
+ dtype_class->common_instance = void_common_instance;
}
else {
+ dtype_class->default_descr = string_and_unicode_default_descr;
dtype_class->is_known_scalar_type = string_known_scalar_types;
dtype_class->discover_descr_from_pyobject = (
string_discover_descr_from_pyobject);
+ dtype_class->common_dtype = string_unicode_common_dtype;
+ dtype_class->common_instance = string_unicode_common_instance;
}
}
- else {
- /* nonparametric case */
- dtype_class->discover_descr_from_pyobject = (
- nonparametric_discover_descr_from_pyobject);
- }
if (_PyArray_MapPyTypeToDType(dtype_class, descr->typeobj,
PyTypeNum_ISUSERDEF(dtype_class->type_num)) < 0) {
diff --git a/numpy/core/src/multiarray/dtypemeta.h b/numpy/core/src/multiarray/dtypemeta.h
index e0909a7eb..83cf7c07e 100644
--- a/numpy/core/src/multiarray/dtypemeta.h
+++ b/numpy/core/src/multiarray/dtypemeta.h
@@ -2,6 +2,22 @@
#define _NPY_DTYPEMETA_H
#define NPY_DTYPE(descr) ((PyArray_DTypeMeta *)Py_TYPE(descr))
+/*
+ * This function will hopefully be phased out or replaced, but was convenient
+ * for incremental implementation of new DTypes based on DTypeMeta.
+ * (Error checking is not required for DescrFromType, assuming that the
+ * type is valid.)
+ */
+static NPY_INLINE PyArray_DTypeMeta *
+PyArray_DTypeFromTypeNum(int typenum)
+{
+ PyArray_Descr *descr = PyArray_DescrFromType(typenum);
+ PyArray_DTypeMeta *dtype = NPY_DTYPE(descr);
+ Py_INCREF(dtype);
+ Py_DECREF(descr);
+ return dtype;
+}
+
NPY_NO_EXPORT int
dtypemeta_wrap_legacy_descriptor(PyArray_Descr *dtypem);
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index 2538e05c6..6ad375f67 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -16,7 +16,6 @@
#define _MULTIARRAYMODULE
#include <numpy/npy_common.h>
#include <numpy/arrayobject.h>
-#include <numpy/halffloat.h>
#include <npy_pycompat.h>
#include <ctype.h>
@@ -25,1898 +24,8 @@
#include "common.h"
#include "ctors.h"
-#ifdef NPY_HAVE_SSE_INTRINSICS
-#define EINSUM_USE_SSE1 1
-#else
-#define EINSUM_USE_SSE1 0
-#endif
-
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-#define EINSUM_USE_SSE2 1
-#else
-#define EINSUM_USE_SSE2 0
-#endif
-
-#if EINSUM_USE_SSE1
-#include <xmmintrin.h>
-#endif
-
-#if EINSUM_USE_SSE2
-#include <emmintrin.h>
-#endif
-
-#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0)
-
-/********** PRINTF DEBUG TRACING **************/
-#define NPY_EINSUM_DBG_TRACING 0
-
-#if NPY_EINSUM_DBG_TRACING
-#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s);
-#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1);
-#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2);
-#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s);
-#else
-#define NPY_EINSUM_DBG_PRINT(s)
-#define NPY_EINSUM_DBG_PRINT1(s, p1)
-#define NPY_EINSUM_DBG_PRINT2(s, p1, p2)
-#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3)
-#endif
-/**********************************************/
-
-/**begin repeat
- * #name = byte, short, int, long, longlong,
- * ubyte, ushort, uint, ulong, ulonglong,
- * half, float, double, longdouble,
- * cfloat, cdouble, clongdouble#
- * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
- * npy_half, npy_float, npy_double, npy_longdouble,
- * npy_cfloat, npy_cdouble, npy_clongdouble#
- * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
- * npy_float, npy_float, npy_double, npy_longdouble,
- * npy_float, npy_double, npy_longdouble#
- * #to = ,,,,,
- * ,,,,,
- * npy_float_to_half,,,,
- * ,,#
- * #from = ,,,,,
- * ,,,,,
- * npy_half_to_float,,,,
- * ,,#
- * #complex = 0*5,
- * 0*5,
- * 0*4,
- * 1*3#
- * #float32 = 0*5,
- * 0*5,
- * 0,1,0,0,
- * 0*3#
- * #float64 = 0*5,
- * 0*5,
- * 0,0,1,0,
- * 0*3#
- */
-
-/**begin repeat1
- * #nop = 1, 2, 3, 1000#
- * #noplabel = one, two, three, any#
- */
-static void
-@name@_sum_of_products_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
- char *data0 = dataptr[0];
- npy_intp stride0 = strides[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3) && !@complex@
- char *data1 = dataptr[1];
- npy_intp stride1 = strides[1];
-#endif
-#if (@nop@ == 3) && !@complex@
- char *data2 = dataptr[2];
- npy_intp stride2 = strides[2];
-#endif
-#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
- char *data_out = dataptr[@nop@];
- npy_intp stride_out = strides[@nop@];
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count);
-
- while (count--) {
-#if !@complex@
-# if @nop@ == 1
- *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) +
- @from@(*(@type@ *)data_out));
- data0 += stride0;
- data_out += stride_out;
-# elif @nop@ == 2
- *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
- @from@(*(@type@ *)data1) +
- @from@(*(@type@ *)data_out));
- data0 += stride0;
- data1 += stride1;
- data_out += stride_out;
-# elif @nop@ == 3
- *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
- @from@(*(@type@ *)data1) *
- @from@(*(@type@ *)data2) +
- @from@(*(@type@ *)data_out));
- data0 += stride0;
- data1 += stride1;
- data2 += stride2;
- data_out += stride_out;
-# else
- @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
- int i;
- for (i = 1; i < nop; ++i) {
- temp *= @from@(*(@type@ *)dataptr[i]);
- }
- *(@type@ *)dataptr[nop] = @to@(temp +
- @from@(*(@type@ *)dataptr[i]));
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += strides[i];
- }
-# endif
-#else /* complex */
-# if @nop@ == 1
- ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] +
- ((@temptype@ *)data_out)[0];
- ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] +
- ((@temptype@ *)data_out)[1];
- data0 += stride0;
- data_out += stride_out;
-# else
-# if @nop@ <= 3
-#define _SUMPROD_NOP @nop@
-# else
-#define _SUMPROD_NOP nop
-# endif
- @temptype@ re, im, tmp;
- int i;
- re = ((@temptype@ *)dataptr[0])[0];
- im = ((@temptype@ *)dataptr[0])[1];
- for (i = 1; i < _SUMPROD_NOP; ++i) {
- tmp = re * ((@temptype@ *)dataptr[i])[0] -
- im * ((@temptype@ *)dataptr[i])[1];
- im = re * ((@temptype@ *)dataptr[i])[1] +
- im * ((@temptype@ *)dataptr[i])[0];
- re = tmp;
- }
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
-
- for (i = 0; i <= _SUMPROD_NOP; ++i) {
- dataptr[i] += strides[i];
- }
-#undef _SUMPROD_NOP
-# endif
-#endif
- }
-}
-
-#if @nop@ == 1
-
-static void
-@name@_sum_of_products_contig_one(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @type@ *data_out = (@type@ *)dataptr[1];
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
-#if !@complex@
- data_out[@i@] = @to@(@from@(data0[@i@]) +
- @from@(data_out[@i@]));
-#else
- ((@temptype@ *)data_out + 2*@i@)[0] =
- ((@temptype@ *)data0 + 2*@i@)[0] +
- ((@temptype@ *)data_out + 2*@i@)[0];
- ((@temptype@ *)data_out + 2*@i@)[1] =
- ((@temptype@ *)data0 + 2*@i@)[1] +
- ((@temptype@ *)data_out + 2*@i@)[1];
-#endif
-/**end repeat2**/
- case 0:
- return;
- }
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-#if !@complex@
- data_out[@i@] = @to@(@from@(data0[@i@]) +
- @from@(data_out[@i@]));
-#else /* complex */
- ((@temptype@ *)data_out + 2*@i@)[0] =
- ((@temptype@ *)data0 + 2*@i@)[0] +
- ((@temptype@ *)data_out + 2*@i@)[0];
- ((@temptype@ *)data_out + 2*@i@)[1] =
- ((@temptype@ *)data0 + 2*@i@)[1] +
- ((@temptype@ *)data_out + 2*@i@)[1];
-#endif
-/**end repeat2**/
- data0 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-#elif @nop@ == 2 && !@complex@
-
-static void
-@name@_sum_of_products_contig_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @type@ *data1 = (@type@ *)dataptr[1];
- @type@ *data_out = (@type@ *)dataptr[2];
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, b;
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, b;
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- data_out[@i@] = @to@(@from@(data0[@i@]) *
- @from@(data1[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
- case 0:
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
- EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
- b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
- _mm_store_ps(data_out+@i@, b);
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
- EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
- b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
- _mm_store_pd(data_out+@i@, b);
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
- b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
- _mm_storeu_ps(data_out+@i@, b);
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
- b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
- _mm_storeu_pd(data_out+@i@, b);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- data_out[@i@] = @to@(@from@(data0[@i@]) *
- @from@(data1[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
-#endif
- data0 += 8;
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-/* Some extra specializations for the two operand case */
-static void
-@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
- @type@ *data1 = (@type@ *)dataptr[1];
- @type@ *data_out = (@type@ *)dataptr[2];
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, b, value0_sse;
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, b, value0_sse;
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- data_out[@i@] = @to@(value0 *
- @from@(data1[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
- case 0:
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- value0_sse = _mm_set_ps1(value0);
-
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@));
- b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
- _mm_store_ps(data_out+@i@, b);
-/**end repeat2**/
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- if (count > 0) {
- goto finish_after_unrolled_loop;
- }
- else {
- return;
- }
- }
-#elif EINSUM_USE_SSE2 && @float64@
- value0_sse = _mm_set1_pd(value0);
-
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@));
- b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
- _mm_store_pd(data_out+@i@, b);
-/**end repeat2**/
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- if (count > 0) {
- goto finish_after_unrolled_loop;
- }
- else {
- return;
- }
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@));
- b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
- _mm_storeu_ps(data_out+@i@, b);
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@));
- b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
- _mm_storeu_pd(data_out+@i@, b);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- data_out[@i@] = @to@(value0 *
- @from@(data1[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
-#endif
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- if (count > 0) {
- goto finish_after_unrolled_loop;
- }
-}
-
-static void
-@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
- @type@ *data_out = (@type@ *)dataptr[2];
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, b, value1_sse;
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, b, value1_sse;
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- data_out[@i@] = @to@(@from@(data0[@i@])*
- value1 +
- @from@(data_out[@i@]));
-/**end repeat2**/
- case 0:
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- value1_sse = _mm_set_ps1(value1);
-
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse);
- b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
- _mm_store_ps(data_out+@i@, b);
-/**end repeat2**/
- data0 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- value1_sse = _mm_set1_pd(value1);
-
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse);
- b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
- _mm_store_pd(data_out+@i@, b);
-/**end repeat2**/
- data0 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse);
- b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
- _mm_storeu_ps(data_out+@i@, b);
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse);
- b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
- _mm_storeu_pd(data_out+@i@, b);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- data_out[@i@] = @to@(@from@(data0[@i@])*
- value1 +
- @from@(data_out[@i@]));
-/**end repeat2**/
-#endif
- data0 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-static void
-@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @type@ *data1 = (@type@ *)dataptr[1];
- @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- accum += @from@(data0[@i@]) * @from@(data1[@i@]);
-/**end repeat2**/
- case 0:
- *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
- accum_sse = _mm_add_ps(accum_sse, a);
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- }
-
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
- accum_sse = _mm_add_pd(accum_sse, a);
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- }
-
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
- accum_sse = _mm_add_ps(accum_sse, a);
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
- accum_sse = _mm_add_pd(accum_sse, a);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- accum += @from@(data0[@i@]) * @from@(data1[@i@]);
-/**end repeat2**/
-#endif
- data0 += 8;
- data1 += 8;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-static void
-@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
- @type@ *data1 = (@type@ *)dataptr[1];
- @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- accum += @from@(data1[@i@]);
-/**end repeat2**/
- case 0:
- *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@));
-/**end repeat2**/
- data1 += 8;
- }
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@));
-/**end repeat2**/
- data1 += 8;
- }
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- accum += @from@(data1[@i@]);
-/**end repeat2**/
-#endif
- data1 += 8;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-static void
-@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
- @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- accum += @from@(data0[@i@]);
-/**end repeat2**/
- case 0:
- *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1);
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- accum += @from@(data0[@i@]);
-/**end repeat2**/
-#endif
- data0 += 8;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-#elif @nop@ == 3 && !@complex@
-
-static void
-@name@_sum_of_products_contig_three(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @type@ *data1 = (@type@ *)dataptr[1];
- @type@ *data2 = (@type@ *)dataptr[2];
- @type@ *data_out = (@type@ *)dataptr[3];
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- data_out[@i@] = @to@(@from@(data0[@i@]) *
- @from@(data1[@i@]) *
- @from@(data2[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- data2 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
-
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- if (count-- == 0) {
- return;
- }
- data_out[@i@] = @to@(@from@(data0[@i@]) *
- @from@(data1[@i@]) *
- @from@(data2[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
-}
-
-#else /* @nop@ > 3 || @complex */
-
-static void
-@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n",
- (int)count);
-
- while (count--) {
-#if !@complex@
- @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
- int i;
- for (i = 1; i < nop; ++i) {
- temp *= @from@(*(@type@ *)dataptr[i]);
- }
- *(@type@ *)dataptr[nop] = @to@(temp +
- @from@(*(@type@ *)dataptr[i]));
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += sizeof(@type@);
- }
-#else /* complex */
-# if @nop@ <= 3
-# define _SUMPROD_NOP @nop@
-# else
-# define _SUMPROD_NOP nop
-# endif
- @temptype@ re, im, tmp;
- int i;
- re = ((@temptype@ *)dataptr[0])[0];
- im = ((@temptype@ *)dataptr[0])[1];
- for (i = 1; i < _SUMPROD_NOP; ++i) {
- tmp = re * ((@temptype@ *)dataptr[i])[0] -
- im * ((@temptype@ *)dataptr[i])[1];
- im = re * ((@temptype@ *)dataptr[i])[1] +
- im * ((@temptype@ *)dataptr[i])[0];
- re = tmp;
- }
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
-
- for (i = 0; i <= _SUMPROD_NOP; ++i) {
- dataptr[i] += sizeof(@type@);
- }
-# undef _SUMPROD_NOP
-#endif
- }
-}
-
-#endif /* functions for various @nop@ */
-
-#if @nop@ == 1
-
-static void
-@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if @complex@
- @temptype@ accum_re = 0, accum_im = 0;
- @temptype@ *data0 = (@temptype@ *)dataptr[0];
-#else
- @temptype@ accum = 0;
- @type@ *data0 = (@type@ *)dataptr[0];
-#endif
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
-#if !@complex@
- accum += @from@(data0[@i@]);
-#else /* complex */
- accum_re += data0[2*@i@+0];
- accum_im += data0[2*@i@+1];
-#endif
-/**end repeat2**/
- case 0:
-#if @complex@
- ((@temptype@ *)dataptr[1])[0] += accum_re;
- ((@temptype@ *)dataptr[1])[1] += accum_im;
-#else
- *((@type@ *)dataptr[1]) = @to@(accum +
- @from@(*((@type@ *)dataptr[1])));
-#endif
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
-
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
-
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-# if !@complex@
- accum += @from@(data0[@i@]);
-# else /* complex */
- accum_re += data0[2*@i@+0];
- accum_im += data0[2*@i@+1];
-# endif
-/**end repeat2**/
-#endif
-
-#if !@complex@
- data0 += 8;
-#else
- data0 += 8*2;
-#endif
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-#endif /* @nop@ == 1 */
-
-static void
-@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if @complex@
- @temptype@ accum_re = 0, accum_im = 0;
-#else
- @temptype@ accum = 0;
-#endif
-
-#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
- char *data0 = dataptr[0];
- npy_intp stride0 = strides[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3) && !@complex@
- char *data1 = dataptr[1];
- npy_intp stride1 = strides[1];
-#endif
-#if (@nop@ == 3) && !@complex@
- char *data2 = dataptr[2];
- npy_intp stride2 = strides[2];
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n",
- (int)count);
-
- while (count--) {
-#if !@complex@
-# if @nop@ == 1
- accum += @from@(*(@type@ *)data0);
- data0 += stride0;
-# elif @nop@ == 2
- accum += @from@(*(@type@ *)data0) *
- @from@(*(@type@ *)data1);
- data0 += stride0;
- data1 += stride1;
-# elif @nop@ == 3
- accum += @from@(*(@type@ *)data0) *
- @from@(*(@type@ *)data1) *
- @from@(*(@type@ *)data2);
- data0 += stride0;
- data1 += stride1;
- data2 += stride2;
-# else
- @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
- int i;
- for (i = 1; i < nop; ++i) {
- temp *= @from@(*(@type@ *)dataptr[i]);
- }
- accum += temp;
- for (i = 0; i < nop; ++i) {
- dataptr[i] += strides[i];
- }
-# endif
-#else /* complex */
-# if @nop@ == 1
- accum_re += ((@temptype@ *)data0)[0];
- accum_im += ((@temptype@ *)data0)[1];
- data0 += stride0;
-# else
-# if @nop@ <= 3
-#define _SUMPROD_NOP @nop@
-# else
-#define _SUMPROD_NOP nop
-# endif
- @temptype@ re, im, tmp;
- int i;
- re = ((@temptype@ *)dataptr[0])[0];
- im = ((@temptype@ *)dataptr[0])[1];
- for (i = 1; i < _SUMPROD_NOP; ++i) {
- tmp = re * ((@temptype@ *)dataptr[i])[0] -
- im * ((@temptype@ *)dataptr[i])[1];
- im = re * ((@temptype@ *)dataptr[i])[1] +
- im * ((@temptype@ *)dataptr[i])[0];
- re = tmp;
- }
- accum_re += re;
- accum_im += im;
- for (i = 0; i < _SUMPROD_NOP; ++i) {
- dataptr[i] += strides[i];
- }
-#undef _SUMPROD_NOP
-# endif
-#endif
- }
-
-#if @complex@
-# if @nop@ <= 3
- ((@temptype@ *)dataptr[@nop@])[0] += accum_re;
- ((@temptype@ *)dataptr[@nop@])[1] += accum_im;
-# else
- ((@temptype@ *)dataptr[nop])[0] += accum_re;
- ((@temptype@ *)dataptr[nop])[1] += accum_im;
-# endif
-#else
-# if @nop@ <= 3
- *((@type@ *)dataptr[@nop@]) = @to@(accum +
- @from@(*((@type@ *)dataptr[@nop@])));
-# else
- *((@type@ *)dataptr[nop]) = @to@(accum +
- @from@(*((@type@ *)dataptr[nop])));
-# endif
-#endif
-
-}
-
-/**end repeat1**/
-
-/**end repeat**/
-
-
-/* Do OR of ANDs for the boolean type */
-
-/**begin repeat
- * #nop = 1, 2, 3, 1000#
- * #noplabel = one, two, three, any#
- */
-
-static void
-bool_sum_of_products_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if (@nop@ <= 3)
- char *data0 = dataptr[0];
- npy_intp stride0 = strides[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3)
- char *data1 = dataptr[1];
- npy_intp stride1 = strides[1];
-#endif
-#if (@nop@ == 3)
- char *data2 = dataptr[2];
- npy_intp stride2 = strides[2];
-#endif
-#if (@nop@ <= 3)
- char *data_out = dataptr[@nop@];
- npy_intp stride_out = strides[@nop@];
-#endif
-
- while (count--) {
-#if @nop@ == 1
- *(npy_bool *)data_out = *(npy_bool *)data0 ||
- *(npy_bool *)data_out;
- data0 += stride0;
- data_out += stride_out;
-#elif @nop@ == 2
- *(npy_bool *)data_out = (*(npy_bool *)data0 &&
- *(npy_bool *)data1) ||
- *(npy_bool *)data_out;
- data0 += stride0;
- data1 += stride1;
- data_out += stride_out;
-#elif @nop@ == 3
- *(npy_bool *)data_out = (*(npy_bool *)data0 &&
- *(npy_bool *)data1 &&
- *(npy_bool *)data2) ||
- *(npy_bool *)data_out;
- data0 += stride0;
- data1 += stride1;
- data2 += stride2;
- data_out += stride_out;
-#else
- npy_bool temp = *(npy_bool *)dataptr[0];
- int i;
- for (i = 1; i < nop; ++i) {
- temp = temp && *(npy_bool *)dataptr[i];
- }
- *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += strides[i];
- }
-#endif
- }
-}
-
-static void
-bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if (@nop@ <= 3)
- char *data0 = dataptr[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3)
- char *data1 = dataptr[1];
-#endif
-#if (@nop@ == 3)
- char *data2 = dataptr[2];
-#endif
-#if (@nop@ <= 3)
- char *data_out = dataptr[@nop@];
-#endif
-
-#if (@nop@ <= 3)
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat1
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
-# if @nop@ == 1
- ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] ||
- ((npy_bool *)data_out)[@i@];
-# elif @nop@ == 2
- ((npy_bool *)data_out)[@i@] =
- (((npy_bool *)data0)[@i@] &&
- ((npy_bool *)data1)[@i@]) ||
- ((npy_bool *)data_out)[@i@];
-# elif @nop@ == 3
- ((npy_bool *)data_out)[@i@] =
- (((npy_bool *)data0)[@i@] &&
- ((npy_bool *)data1)[@i@] &&
- ((npy_bool *)data2)[@i@]) ||
- ((npy_bool *)data_out)[@i@];
-# endif
-/**end repeat1**/
- case 0:
- return;
- }
-#endif
-
-/* Unroll the loop by 8 for fixed-size nop */
-#if (@nop@ <= 3)
- while (count >= 8) {
- count -= 8;
-#else
- while (count--) {
-#endif
-
-# if @nop@ == 1
-/**begin repeat1
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) ||
- (*((npy_bool *)data_out + @i@));
-/**end repeat1**/
- data0 += 8*sizeof(npy_bool);
- data_out += 8*sizeof(npy_bool);
-# elif @nop@ == 2
-/**begin repeat1
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- *((npy_bool *)data_out + @i@) =
- ((*((npy_bool *)data0 + @i@)) &&
- (*((npy_bool *)data1 + @i@))) ||
- (*((npy_bool *)data_out + @i@));
-/**end repeat1**/
- data0 += 8*sizeof(npy_bool);
- data1 += 8*sizeof(npy_bool);
- data_out += 8*sizeof(npy_bool);
-# elif @nop@ == 3
-/**begin repeat1
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- *((npy_bool *)data_out + @i@) =
- ((*((npy_bool *)data0 + @i@)) &&
- (*((npy_bool *)data1 + @i@)) &&
- (*((npy_bool *)data2 + @i@))) ||
- (*((npy_bool *)data_out + @i@));
-/**end repeat1**/
- data0 += 8*sizeof(npy_bool);
- data1 += 8*sizeof(npy_bool);
- data2 += 8*sizeof(npy_bool);
- data_out += 8*sizeof(npy_bool);
-# else
- npy_bool temp = *(npy_bool *)dataptr[0];
- int i;
- for (i = 1; i < nop; ++i) {
- temp = temp && *(npy_bool *)dataptr[i];
- }
- *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += sizeof(npy_bool);
- }
-# endif
- }
-
- /* If the loop was unrolled, we need to finish it off */
-#if (@nop@ <= 3)
- goto finish_after_unrolled_loop;
-#endif
-}
-
-static void
-bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
- npy_bool accum = 0;
-
-#if (@nop@ <= 3)
- char *data0 = dataptr[0];
- npy_intp stride0 = strides[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3)
- char *data1 = dataptr[1];
- npy_intp stride1 = strides[1];
-#endif
-#if (@nop@ == 3)
- char *data2 = dataptr[2];
- npy_intp stride2 = strides[2];
-#endif
-
- while (count--) {
-#if @nop@ == 1
- accum = *(npy_bool *)data0 || accum;
- data0 += stride0;
-#elif @nop@ == 2
- accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum;
- data0 += stride0;
- data1 += stride1;
-#elif @nop@ == 3
- accum = (*(npy_bool *)data0 &&
- *(npy_bool *)data1 &&
- *(npy_bool *)data2) || accum;
- data0 += stride0;
- data1 += stride1;
- data2 += stride2;
-#else
- npy_bool temp = *(npy_bool *)dataptr[0];
- int i;
- for (i = 1; i < nop; ++i) {
- temp = temp && *(npy_bool *)dataptr[i];
- }
- accum = temp || accum;
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += strides[i];
- }
-#endif
- }
-
-# if @nop@ <= 3
- *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]);
-# else
- *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]);
-# endif
-}
-
-/**end repeat**/
-
-typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp);
-
-/* These tables need to match up with the type enum */
-static sum_of_products_fn
-_contig_outstride0_unary_specialization_table[NPY_NTYPES] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 0,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 1, 1, 1,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
- &@name@_sum_of_products_contig_outstride0_one,
-#else
- NULL,
-#endif
-/**end repeat**/
-}; /* End of _contig_outstride0_unary_specialization_table */
-
-static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 0,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 0, 0, 0,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
-{
- &@name@_sum_of_products_stride0_contig_outstride0_two,
- &@name@_sum_of_products_stride0_contig_outcontig_two,
- &@name@_sum_of_products_contig_stride0_outstride0_two,
- &@name@_sum_of_products_contig_stride0_outcontig_two,
- &@name@_sum_of_products_contig_contig_outstride0_two,
-},
-#else
- {NULL, NULL, NULL, NULL, NULL},
-#endif
-/**end repeat**/
-}; /* End of _binary_specialization_table */
-
-static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 1, 1, 1,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
-{
- &@name@_sum_of_products_outstride0_any,
- &@name@_sum_of_products_outstride0_one,
- &@name@_sum_of_products_outstride0_two,
- &@name@_sum_of_products_outstride0_three
-},
-#else
- {NULL, NULL, NULL, NULL},
-#endif
-/**end repeat**/
-}; /* End of _outstride0_specialized_table */
-
-static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 1, 1, 1,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
-{
- &@name@_sum_of_products_contig_any,
- &@name@_sum_of_products_contig_one,
- &@name@_sum_of_products_contig_two,
- &@name@_sum_of_products_contig_three
-},
-#else
- {NULL, NULL, NULL, NULL},
-#endif
-/**end repeat**/
-}; /* End of _allcontig_specialized_table */
-
-static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 1, 1, 1,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
-{
- &@name@_sum_of_products_any,
- &@name@_sum_of_products_one,
- &@name@_sum_of_products_two,
- &@name@_sum_of_products_three
-},
-#else
- {NULL, NULL, NULL, NULL},
-#endif
-/**end repeat**/
-}; /* End of _unnspecialized_table */
-
-static sum_of_products_fn
-get_sum_of_products_function(int nop, int type_num,
- npy_intp itemsize, npy_intp const *fixed_strides)
-{
- int iop;
-
- if (type_num >= NPY_NTYPES) {
- return NULL;
- }
-
- /* contiguous reduction */
- if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) {
- sum_of_products_fn ret =
- _contig_outstride0_unary_specialization_table[type_num];
- if (ret != NULL) {
- return ret;
- }
- }
-
- /* nop of 2 has more specializations */
- if (nop == 2) {
- /* Encode the zero/contiguous strides */
- int code;
- code = (fixed_strides[0] == 0) ? 0 :
- (fixed_strides[0] == itemsize) ? 2*2*1 : 8;
- code += (fixed_strides[1] == 0) ? 0 :
- (fixed_strides[1] == itemsize) ? 2*1 : 8;
- code += (fixed_strides[2] == 0) ? 0 :
- (fixed_strides[2] == itemsize) ? 1 : 8;
- if (code >= 2 && code < 7) {
- sum_of_products_fn ret =
- _binary_specialization_table[type_num][code-2];
- if (ret != NULL) {
- return ret;
- }
- }
- }
-
- /* Inner loop with an output stride of 0 */
- if (fixed_strides[nop] == 0) {
- return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0];
- }
-
- /* Check for all contiguous */
- for (iop = 0; iop < nop + 1; ++iop) {
- if (fixed_strides[iop] != itemsize) {
- break;
- }
- }
-
- /* Contiguous loop */
- if (iop == nop + 1) {
- return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0];
- }
-
- /* None of the above specializations caught it, general loops */
- return _unspecialized_table[type_num][nop <= 3 ? nop : 0];
-}
+#include "einsum_sumprod.h"
+#include "einsum_debug.h"
/*
diff --git a/numpy/core/src/multiarray/einsum_debug.h b/numpy/core/src/multiarray/einsum_debug.h
new file mode 100644
index 000000000..9aa81fcbd
--- /dev/null
+++ b/numpy/core/src/multiarray/einsum_debug.h
@@ -0,0 +1,28 @@
+/*
+ * This file provides debug macros used by the other einsum files.
+ *
+ * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com)
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+#ifndef _NPY_MULTIARRAY_EINSUM_DEBUG_H
+#define _NPY_MULTIARRAY_EINSUM_DEBUG_H
+
+/********** PRINTF DEBUG TRACING **************/
+#define NPY_EINSUM_DBG_TRACING 0
+
+#if NPY_EINSUM_DBG_TRACING
+#include <cstdio>
+#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s);
+#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1);
+#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2);
+#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s);
+#else
+#define NPY_EINSUM_DBG_PRINT(s)
+#define NPY_EINSUM_DBG_PRINT1(s, p1)
+#define NPY_EINSUM_DBG_PRINT2(s, p1, p2)
+#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3)
+#endif
+
+#endif
diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
new file mode 100644
index 000000000..caba0e00a
--- /dev/null
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -0,0 +1,1878 @@
+/*
+ * This file provides optimized sum of product implementations used internally
+ * by einsum.
+ *
+ * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com)
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+
+#include <numpy/npy_common.h>
+#include <numpy/ndarraytypes.h> /* for NPY_NTYPES */
+#include <numpy/halffloat.h>
+
+#include "einsum_sumprod.h"
+#include "einsum_debug.h"
+#include "simd/simd.h"
+#include "common.h"
+
+#ifdef NPY_HAVE_SSE_INTRINSICS
+#define EINSUM_USE_SSE1 1
+#else
+#define EINSUM_USE_SSE1 0
+#endif
+
+#ifdef NPY_HAVE_SSE2_INTRINSICS
+#define EINSUM_USE_SSE2 1
+#else
+#define EINSUM_USE_SSE2 0
+#endif
+
+#if EINSUM_USE_SSE1
+#include <xmmintrin.h>
+#endif
+
+#if EINSUM_USE_SSE2
+#include <emmintrin.h>
+#endif
+
+#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0)
+
+// ARM/Neon don't have instructions for aligned memory access
+#ifdef NPY_HAVE_NEON
+ #define EINSUM_IS_ALIGNED(x) 0
+#else
+ #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH)
+#endif
+
+/**********************************************/
+
+/**begin repeat
+ * #name = byte, short, int, long, longlong,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_float, npy_float, npy_double, npy_longdouble,
+ * npy_float, npy_double, npy_longdouble#
+ * #sfx = s8, s16, s32, long, s64,
+ * u8, u16, u32, ulong, u64,
+ * half, f32, f64, longdouble,
+ * f32, f64, clongdouble#
+ * #to = ,,,,,
+ * ,,,,,
+ * npy_float_to_half,,,,
+ * ,,#
+ * #from = ,,,,,
+ * ,,,,,
+ * npy_half_to_float,,,,
+ * ,,#
+ * #complex = 0*5,
+ * 0*5,
+ * 0*4,
+ * 1*3#
+ * #float32 = 0*5,
+ * 0*5,
+ * 0,1,0,0,
+ * 0*3#
+ * #float64 = 0*5,
+ * 0*5,
+ * 0,0,1,0,
+ * 0*3#
+ * #NPYV_CHK = 0*5,
+ * 0*5,
+ * 0, NPY_SIMD, NPY_SIMD_F64, 0,
+ * 0*3#
+ */
+
+/**begin repeat1
+ * #nop = 1, 2, 3, 1000#
+ * #noplabel = one, two, three, any#
+ */
+static void
+@name@_sum_of_products_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3) && !@complex@
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3) && !@complex@
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+ char *data_out = dataptr[@nop@];
+ npy_intp stride_out = strides[@nop@];
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count);
+
+ while (count--) {
+#if !@complex@
+# if @nop@ == 1
+ *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) +
+ @from@(*(@type@ *)data_out));
+ data0 += stride0;
+ data_out += stride_out;
+# elif @nop@ == 2
+ *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1) +
+ @from@(*(@type@ *)data_out));
+ data0 += stride0;
+ data1 += stride1;
+ data_out += stride_out;
+# elif @nop@ == 3
+ *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1) *
+ @from@(*(@type@ *)data2) +
+ @from@(*(@type@ *)data_out));
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+ data_out += stride_out;
+# else
+ @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp *= @from@(*(@type@ *)dataptr[i]);
+ }
+ *(@type@ *)dataptr[nop] = @to@(temp +
+ @from@(*(@type@ *)dataptr[i]));
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+# endif
+#else /* complex */
+# if @nop@ == 1
+ ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] +
+ ((@temptype@ *)data_out)[0];
+ ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] +
+ ((@temptype@ *)data_out)[1];
+ data0 += stride0;
+ data_out += stride_out;
+# else
+# if @nop@ <= 3
+#define _SUMPROD_NOP @nop@
+# else
+#define _SUMPROD_NOP nop
+# endif
+ @temptype@ re, im, tmp;
+ int i;
+ re = ((@temptype@ *)dataptr[0])[0];
+ im = ((@temptype@ *)dataptr[0])[1];
+ for (i = 1; i < _SUMPROD_NOP; ++i) {
+ tmp = re * ((@temptype@ *)dataptr[i])[0] -
+ im * ((@temptype@ *)dataptr[i])[1];
+ im = re * ((@temptype@ *)dataptr[i])[1] +
+ im * ((@temptype@ *)dataptr[i])[0];
+ re = tmp;
+ }
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
+
+ for (i = 0; i <= _SUMPROD_NOP; ++i) {
+ dataptr[i] += strides[i];
+ }
+#undef _SUMPROD_NOP
+# endif
+#endif
+ }
+}
+
+#if @nop@ == 1
+
+static void
+@name@_sum_of_products_contig_one(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data_out = (@type@ *)dataptr[1];
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+#if !@complex@
+ data_out[@i@] = @to@(@from@(data0[@i@]) +
+ @from@(data_out[@i@]));
+#else
+ ((@temptype@ *)data_out + 2*@i@)[0] =
+ ((@temptype@ *)data0 + 2*@i@)[0] +
+ ((@temptype@ *)data_out + 2*@i@)[0];
+ ((@temptype@ *)data_out + 2*@i@)[1] =
+ ((@temptype@ *)data0 + 2*@i@)[1] +
+ ((@temptype@ *)data_out + 2*@i@)[1];
+#endif
+/**end repeat2**/
+ case 0:
+ return;
+ }
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+#if !@complex@
+ data_out[@i@] = @to@(@from@(data0[@i@]) +
+ @from@(data_out[@i@]));
+#else /* complex */
+ ((@temptype@ *)data_out + 2*@i@)[0] =
+ ((@temptype@ *)data0 + 2*@i@)[0] +
+ ((@temptype@ *)data_out + 2*@i@)[0];
+ ((@temptype@ *)data_out + 2*@i@)[1] =
+ ((@temptype@ *)data0 + 2*@i@)[1] +
+ ((@temptype@ *)data_out + 2*@i@)[1];
+#endif
+/**end repeat2**/
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+#elif @nop@ == 2 && !@complex@
+
+/*
+ * Two-operand, all-contiguous einsum inner loop:
+ *     data_out[i] += data0[i] * data1[i]   for i in [0, count)
+ * Expanded per element type by the numpy .c.src template engine.  When
+ * @NPYV_CHK@ is true, a universal-SIMD (npyv) path is used: a 4-vector
+ * unrolled main loop (aligned or unaligned loads chosen at runtime via
+ * is_aligned), then a partial-vector tail using load_tillz/store_till.
+ * Otherwise a 4x-unrolled scalar loop plus a one-element tail runs.
+ */
+static void
+@name@_sum_of_products_contig_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @type@ *data_out = (@type@ *)dataptr[2];
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
+ (int)count);
+ // NPYV check for @type@
+#if @NPYV_CHK@
+ /* Use aligned instructions if possible */
+ const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+ EINSUM_IS_ALIGNED(data_out);
+ const int vstep = npyv_nlanes_@sfx@;
+
+ /**begin repeat2
+ * #cond = if(is_aligned), else#
+ * #ld = loada, load#
+ * #st = storea, store#
+ */
+ @cond@ {
+ const npy_intp vstepx4 = vstep * 4;
+ for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+ /**begin repeat3
+ * #i = 0, 1, 2, 3#
+ */
+ npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+ npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
+ npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@);
+ /**end repeat3**/
+ /**begin repeat3
+ * #i = 0, 1, 2, 3#
+ */
+ npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, b@i@, c@i@);
+ /**end repeat3**/
+ /**begin repeat3
+ * #i = 0, 1, 2, 3#
+ */
+ npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@);
+ /**end repeat3**/
+ }
+ }
+ /**end repeat2**/
+ /* Partial-vector tail: masked loads fill missing lanes with zero. */
+ for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+ npyv_@sfx@ a = npyv_load_tillz_@sfx@(data0, count);
+ npyv_@sfx@ b = npyv_load_tillz_@sfx@(data1, count);
+ npyv_@sfx@ c = npyv_load_tillz_@sfx@(data_out, count);
+ npyv_store_till_@sfx@(data_out, count, npyv_muladd_@sfx@(a, b, c));
+ }
+ npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+ /* Scalar path, unrolled by 4 (loads, then FMAs, then stores). */
+ for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+ /**begin repeat2
+ * #i = 0, 1, 2, 3#
+ */
+ const @type@ a@i@ = @from@(data0[@i@]);
+ const @type@ b@i@ = @from@(data1[@i@]);
+ const @type@ c@i@ = @from@(data_out[@i@]);
+ /**end repeat2**/
+ /**begin repeat2
+ * #i = 0, 1, 2, 3#
+ */
+ const @type@ abc@i@ = a@i@ * b@i@ + c@i@;
+ /**end repeat2**/
+ /**begin repeat2
+ * #i = 0, 1, 2, 3#
+ */
+ data_out[@i@] = @to@(abc@i@);
+ /**end repeat2**/
+ }
+#endif // !NPY_DISABLE_OPTIMIZATION
+ /* One-element scalar tail. */
+ for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+ const @type@ a = @from@(*data0);
+ const @type@ b = @from@(*data1);
+ const @type@ c = @from@(*data_out);
+ *data_out = @to@(a * b + c);
+ }
+#endif // NPYV check for @type@
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand specialization with operand 0 broadcast (stride 0) and
+ * operand 1 / output contiguous:
+ *     data_out[i] += value0 * data1[i]   for i in [0, count)
+ * The finish_after_unrolled_loop label sits BEFORE the main loops so the
+ * switch on count handles 0..7 leftover elements via case fall-through;
+ * the unrolled loops jump back to it once fewer than 8 elements remain.
+ * SSE1 (float32) / SSE2 (float64) paths process 8 elements per iteration,
+ * using aligned loads when both pointers are 16-byte aligned.
+ */
+static void
+@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @type@ *data_out = (@type@ *)dataptr[2];
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, b, value0_sse;
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, b, value0_sse;
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ data_out[@i@] = @to@(value0 *
+ @from@(data1[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+ case 0:
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ value0_sse = _mm_set_ps1(value0);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@));
+ b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
+ _mm_store_ps(data_out+@i@, b);
+/**end repeat2**/
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ if (count > 0) {
+ goto finish_after_unrolled_loop;
+ }
+ else {
+ return;
+ }
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ value0_sse = _mm_set1_pd(value0);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@));
+ b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
+ _mm_store_pd(data_out+@i@, b);
+/**end repeat2**/
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ if (count > 0) {
+ goto finish_after_unrolled_loop;
+ }
+ else {
+ return;
+ }
+ }
+#endif
+
+ /* Unaligned SSE (or plain scalar) fallback, still unrolled by 8. */
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@));
+ b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
+ _mm_storeu_ps(data_out+@i@, b);
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@));
+ b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
+ _mm_storeu_pd(data_out+@i@, b);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ data_out[@i@] = @to@(value0 *
+ @from@(data1[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+#endif
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ if (count > 0) {
+ goto finish_after_unrolled_loop;
+ }
+}
+
+/*
+ * Mirror of the stride0_contig case: operand 1 is broadcast (stride 0),
+ * operand 0 and output are contiguous:
+ *     data_out[i] += data0[i] * value1   for i in [0, count)
+ * Same structure: switch-based 0..7 tail at finish_after_unrolled_loop,
+ * aligned SSE path, then unaligned/scalar 8x-unrolled fallback.
+ */
+static void
+@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
+ @type@ *data_out = (@type@ *)dataptr[2];
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, b, value1_sse;
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, b, value1_sse;
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ data_out[@i@] = @to@(@from@(data0[@i@])*
+ value1 +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+ case 0:
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ value1_sse = _mm_set_ps1(value1);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse);
+ b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
+ _mm_store_ps(data_out+@i@, b);
+/**end repeat2**/
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ value1_sse = _mm_set1_pd(value1);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse);
+ b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
+ _mm_store_pd(data_out+@i@, b);
+/**end repeat2**/
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse);
+ b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
+ _mm_storeu_ps(data_out+@i@, b);
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse);
+ b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
+ _mm_storeu_pd(data_out+@i@, b);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ data_out[@i@] = @to@(@from@(data0[@i@])*
+ value1 +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+#endif
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+/*
+ * Dot-product specialization: both operands contiguous, output broadcast
+ * (stride 0).  Accumulates sum(data0[i] * data1[i]) into `accum`, then the
+ * `case 0` arm of the tail switch folds it into *dataptr[2] exactly once.
+ * SSE paths keep 4 (SSE1) or 2 (SSE2) partial sums in accum_sse, which is
+ * horizontally reduced into `accum` via shuffles before the tail runs —
+ * note the in-code warnings that this reordering can change float results
+ * slightly versus a strictly sequential sum.
+ */
+static void
+@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @temptype@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ accum += @from@(data0[@i@]) * @from@(data1[@i@]);
+/**end repeat2**/
+ case 0:
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
+ accum_sse = _mm_add_ps(accum_sse, a);
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ }
+
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
+ accum_sse = _mm_add_pd(accum_sse, a);
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ }
+
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
+ accum_sse = _mm_add_ps(accum_sse, a);
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
+ accum_sse = _mm_add_pd(accum_sse, a);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ accum += @from@(data0[@i@]) * @from@(data1[@i@]);
+/**end repeat2**/
+#endif
+ data0 += 8;
+ data1 += 8;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+/*
+ * Operand 0 broadcast, operand 1 contiguous, output broadcast:
+ *     *dataptr[2] += value0 * sum(data1[i])
+ * The multiplication by value0 is factored out of the loop; only data1 is
+ * summed (into accum / accum_sse), and the scale is applied once in the
+ * `case 0` arm of the tail switch.  SSE partial sums are horizontally
+ * reduced before jumping to the tail.
+ */
+static void
+@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @temptype@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ accum += @from@(data1[@i@]);
+/**end repeat2**/
+ case 0:
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@));
+/**end repeat2**/
+ data1 += 8;
+ }
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@));
+/**end repeat2**/
+ data1 += 8;
+ }
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@));
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ accum += @from@(data1[@i@]);
+/**end repeat2**/
+#endif
+ data1 += 8;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+/*
+ * Mirror of the previous case: operand 0 contiguous, operand 1 broadcast,
+ * output broadcast:
+ *     *dataptr[2] += sum(data0[i]) * value1
+ * data0 is summed into accum / accum_sse; the scale by value1 is applied
+ * once in the `case 0` arm of the tail switch.
+ */
+static void
+@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
+ @temptype@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ accum += @from@(data0[@i@]);
+/**end repeat2**/
+ case 0:
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1);
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ accum += @from@(data0[@i@]);
+/**end repeat2**/
+#endif
+ data0 += 8;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+#elif @nop@ == 3 && !@complex@
+
+/*
+ * Three-operand, all-contiguous case:
+ *     data_out[i] += data0[i] * data1[i] * data2[i]
+ * Plain scalar loop unrolled by 8.  The tail repeats the body 8 times
+ * guarded by `if (count-- == 0) return;` — correct because the pointers
+ * were already advanced past the unrolled portion, so indices 0..7 here
+ * address the leftover elements.
+ */
+static void
+@name@_sum_of_products_contig_three(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @type@ *data2 = (@type@ *)dataptr[2];
+ @type@ *data_out = (@type@ *)dataptr[3];
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ data_out[@i@] = @to@(@from@(data0[@i@]) *
+ @from@(data1[@i@]) *
+ @from@(data2[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ data2 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ if (count-- == 0) {
+ return;
+ }
+ data_out[@i@] = @to@(@from@(data0[@i@]) *
+ @from@(data1[@i@]) *
+ @from@(data2[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+}
+
+#else /* @nop@ > 3 || @complex */
+
+/*
+ * Generic contiguous fallback for nop > 3 operands or any complex type:
+ * per element, multiply all nop operand values and add into the output,
+ * then advance every pointer by one element.  Complex values are handled
+ * as (re, im) pairs with explicit complex multiplication.  _SUMPROD_NOP
+ * lets the compiler see a constant operand count when @nop@ <= 3.
+ */
+static void
+@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n",
+ (int)count);
+
+ while (count--) {
+#if !@complex@
+ @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp *= @from@(*(@type@ *)dataptr[i]);
+ }
+ /* After the loop i == nop, so dataptr[i] is the output operand. */
+ *(@type@ *)dataptr[nop] = @to@(temp +
+ @from@(*(@type@ *)dataptr[i]));
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += sizeof(@type@);
+ }
+#else /* complex */
+# if @nop@ <= 3
+# define _SUMPROD_NOP @nop@
+# else
+# define _SUMPROD_NOP nop
+# endif
+ @temptype@ re, im, tmp;
+ int i;
+ re = ((@temptype@ *)dataptr[0])[0];
+ im = ((@temptype@ *)dataptr[0])[1];
+ for (i = 1; i < _SUMPROD_NOP; ++i) {
+ /* (re, im) *= dataptr[i] as a complex number. */
+ tmp = re * ((@temptype@ *)dataptr[i])[0] -
+ im * ((@temptype@ *)dataptr[i])[1];
+ im = re * ((@temptype@ *)dataptr[i])[1] +
+ im * ((@temptype@ *)dataptr[i])[0];
+ re = tmp;
+ }
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
+
+ for (i = 0; i <= _SUMPROD_NOP; ++i) {
+ dataptr[i] += sizeof(@type@);
+ }
+# undef _SUMPROD_NOP
+#endif
+ }
+}
+
+#endif /* functions for various @nop@ */
+
+#if @nop@ == 1
+
+/*
+ * One-operand reduction: sum a contiguous input into a stride-0 output,
+ *     *dataptr[1] += sum(data0[i])
+ * (for complex types, real and imaginary parts are accumulated
+ * separately).  Same tail-switch-first structure as the two-operand
+ * reductions; SSE1/SSE2 paths apply only to the non-complex float32 /
+ * float64 instantiations and horizontally reduce accum_sse before the
+ * tail.  Note the complex scalar loop advances data0 by 8*2 temptype
+ * elements per unrolled iteration (8 complex values).
+ */
+static void
+@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if @complex@
+ @temptype@ accum_re = 0, accum_im = 0;
+ @temptype@ *data0 = (@temptype@ *)dataptr[0];
+#else
+ @temptype@ accum = 0;
+ @type@ *data0 = (@type@ *)dataptr[0];
+#endif
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
+#endif
+
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+#if !@complex@
+ accum += @from@(data0[@i@]);
+#else /* complex */
+ accum_re += data0[2*@i@+0];
+ accum_im += data0[2*@i@+1];
+#endif
+/**end repeat2**/
+ case 0:
+#if @complex@
+ ((@temptype@ *)dataptr[1])[0] += accum_re;
+ ((@temptype@ *)dataptr[1])[1] += accum_im;
+#else
+ *((@type@ *)dataptr[1]) = @to@(accum +
+ @from@(*((@type@ *)dataptr[1])));
+#endif
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+# if !@complex@
+ accum += @from@(data0[@i@]);
+# else /* complex */
+ accum_re += data0[2*@i@+0];
+ accum_im += data0[2*@i@+1];
+# endif
+/**end repeat2**/
+#endif
+
+#if !@complex@
+ data0 += 8;
+#else
+ data0 += 8*2;
+#endif
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+#endif /* @nop@ == 1 */
+
+/*
+ * Strided reduction into a stride-0 output: accumulate the product of all
+ * operands for each element, advancing each operand by its own stride,
+ * then fold the total into dataptr[nop] once at the end.  Specialized
+ * unrolled bodies exist for 1-3 non-complex operands; otherwise a generic
+ * loop over dataptr[] is used (complex values as (re, im) pairs).
+ */
+static void
+@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if @complex@
+ @temptype@ accum_re = 0, accum_im = 0;
+#else
+ @temptype@ accum = 0;
+#endif
+
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3) && !@complex@
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3) && !@complex@
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n",
+ (int)count);
+
+ while (count--) {
+#if !@complex@
+# if @nop@ == 1
+ accum += @from@(*(@type@ *)data0);
+ data0 += stride0;
+# elif @nop@ == 2
+ accum += @from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1);
+ data0 += stride0;
+ data1 += stride1;
+# elif @nop@ == 3
+ accum += @from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1) *
+ @from@(*(@type@ *)data2);
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+# else
+ @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp *= @from@(*(@type@ *)dataptr[i]);
+ }
+ accum += temp;
+ for (i = 0; i < nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+# endif
+#else /* complex */
+# if @nop@ == 1
+ accum_re += ((@temptype@ *)data0)[0];
+ accum_im += ((@temptype@ *)data0)[1];
+ data0 += stride0;
+# else
+# if @nop@ <= 3
+#define _SUMPROD_NOP @nop@
+# else
+#define _SUMPROD_NOP nop
+# endif
+ @temptype@ re, im, tmp;
+ int i;
+ re = ((@temptype@ *)dataptr[0])[0];
+ im = ((@temptype@ *)dataptr[0])[1];
+ for (i = 1; i < _SUMPROD_NOP; ++i) {
+ /* (re, im) *= dataptr[i] as a complex number. */
+ tmp = re * ((@temptype@ *)dataptr[i])[0] -
+ im * ((@temptype@ *)dataptr[i])[1];
+ im = re * ((@temptype@ *)dataptr[i])[1] +
+ im * ((@temptype@ *)dataptr[i])[0];
+ re = tmp;
+ }
+ accum_re += re;
+ accum_im += im;
+ for (i = 0; i < _SUMPROD_NOP; ++i) {
+ dataptr[i] += strides[i];
+ }
+#undef _SUMPROD_NOP
+# endif
+#endif
+ }
+
+/* Fold the accumulated total into the (stride 0) output operand once. */
+#if @complex@
+# if @nop@ <= 3
+ ((@temptype@ *)dataptr[@nop@])[0] += accum_re;
+ ((@temptype@ *)dataptr[@nop@])[1] += accum_im;
+# else
+ ((@temptype@ *)dataptr[nop])[0] += accum_re;
+ ((@temptype@ *)dataptr[nop])[1] += accum_im;
+# endif
+#else
+# if @nop@ <= 3
+ *((@type@ *)dataptr[@nop@]) = @to@(accum +
+ @from@(*((@type@ *)dataptr[@nop@])));
+# else
+ *((@type@ *)dataptr[nop]) = @to@(accum +
+ @from@(*((@type@ *)dataptr[nop])));
+# endif
+#endif
+
+}
+
+/**end repeat1**/
+
+/**end repeat**/
+
+
+/* Do OR of ANDs for the boolean type */
+
+/**begin repeat
+ * #nop = 1, 2, 3, 1000#
+ * #noplabel = one, two, three, any#
+ */
+
+/*
+ * Boolean einsum inner loop (strided): the "sum of products" becomes an
+ * OR of ANDs —
+ *     out = (op0 && op1 && ...) || out
+ * per element, with each operand advanced by its own stride.  Specialized
+ * bodies for 1-3 operands; the generic branch loops over dataptr[] (note
+ * i == nop after that loop, so dataptr[i] is the output).
+ */
+static void
+bool_sum_of_products_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if (@nop@ <= 3)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3)
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+#if (@nop@ <= 3)
+ char *data_out = dataptr[@nop@];
+ npy_intp stride_out = strides[@nop@];
+#endif
+
+ while (count--) {
+#if @nop@ == 1
+ *(npy_bool *)data_out = *(npy_bool *)data0 ||
+ *(npy_bool *)data_out;
+ data0 += stride0;
+ data_out += stride_out;
+#elif @nop@ == 2
+ *(npy_bool *)data_out = (*(npy_bool *)data0 &&
+ *(npy_bool *)data1) ||
+ *(npy_bool *)data_out;
+ data0 += stride0;
+ data1 += stride1;
+ data_out += stride_out;
+#elif @nop@ == 3
+ *(npy_bool *)data_out = (*(npy_bool *)data0 &&
+ *(npy_bool *)data1 &&
+ *(npy_bool *)data2) ||
+ *(npy_bool *)data_out;
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+ data_out += stride_out;
+#else
+ npy_bool temp = *(npy_bool *)dataptr[0];
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp = temp && *(npy_bool *)dataptr[i];
+ }
+ *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+#endif
+ }
+}
+
+/*
+ * Contiguous boolean OR-of-ANDs.  For @nop@ <= 3 the loop is unrolled by
+ * 8 with a switch-based 0..7 tail at finish_after_unrolled_loop (placed
+ * first, same trick as the numeric kernels); for the generic case a plain
+ * one-element loop runs instead.  Note the loop header itself is selected
+ * by the #if/#else around it, so the same brace closes either loop.
+ */
+static void
+bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if (@nop@ <= 3)
+ char *data0 = dataptr[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+ char *data1 = dataptr[1];
+#endif
+#if (@nop@ == 3)
+ char *data2 = dataptr[2];
+#endif
+#if (@nop@ <= 3)
+ char *data_out = dataptr[@nop@];
+#endif
+
+#if (@nop@ <= 3)
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat1
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+# if @nop@ == 1
+ ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] ||
+ ((npy_bool *)data_out)[@i@];
+# elif @nop@ == 2
+ ((npy_bool *)data_out)[@i@] =
+ (((npy_bool *)data0)[@i@] &&
+ ((npy_bool *)data1)[@i@]) ||
+ ((npy_bool *)data_out)[@i@];
+# elif @nop@ == 3
+ ((npy_bool *)data_out)[@i@] =
+ (((npy_bool *)data0)[@i@] &&
+ ((npy_bool *)data1)[@i@] &&
+ ((npy_bool *)data2)[@i@]) ||
+ ((npy_bool *)data_out)[@i@];
+# endif
+/**end repeat1**/
+ case 0:
+ return;
+ }
+#endif
+
+/* Unroll the loop by 8 for fixed-size nop */
+#if (@nop@ <= 3)
+ while (count >= 8) {
+ count -= 8;
+#else
+ while (count--) {
+#endif
+
+# if @nop@ == 1
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) ||
+ (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+ data0 += 8*sizeof(npy_bool);
+ data_out += 8*sizeof(npy_bool);
+# elif @nop@ == 2
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ *((npy_bool *)data_out + @i@) =
+ ((*((npy_bool *)data0 + @i@)) &&
+ (*((npy_bool *)data1 + @i@))) ||
+ (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+ data0 += 8*sizeof(npy_bool);
+ data1 += 8*sizeof(npy_bool);
+ data_out += 8*sizeof(npy_bool);
+# elif @nop@ == 3
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ *((npy_bool *)data_out + @i@) =
+ ((*((npy_bool *)data0 + @i@)) &&
+ (*((npy_bool *)data1 + @i@)) &&
+ (*((npy_bool *)data2 + @i@))) ||
+ (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+ data0 += 8*sizeof(npy_bool);
+ data1 += 8*sizeof(npy_bool);
+ data2 += 8*sizeof(npy_bool);
+ data_out += 8*sizeof(npy_bool);
+# else
+ npy_bool temp = *(npy_bool *)dataptr[0];
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp = temp && *(npy_bool *)dataptr[i];
+ }
+ /* i == nop here, so dataptr[i] is the output operand. */
+ *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += sizeof(npy_bool);
+ }
+# endif
+ }
+
+ /* If the loop was unrolled, we need to finish it off */
+#if (@nop@ <= 3)
+ goto finish_after_unrolled_loop;
+#endif
+}
+
+/*
+ * Boolean reduction into a stride-0 output: OR together the AND of all
+ * operands across `count` elements, then OR the result into the output
+ * operand (dataptr[@nop@] / dataptr[nop]) exactly once at the end.
+ */
+static void
+bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+ npy_bool accum = 0;
+
+#if (@nop@ <= 3)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3)
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+
+ while (count--) {
+#if @nop@ == 1
+ accum = *(npy_bool *)data0 || accum;
+ data0 += stride0;
+#elif @nop@ == 2
+ accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum;
+ data0 += stride0;
+ data1 += stride1;
+#elif @nop@ == 3
+ accum = (*(npy_bool *)data0 &&
+ *(npy_bool *)data1 &&
+ *(npy_bool *)data2) || accum;
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+#else
+ npy_bool temp = *(npy_bool *)dataptr[0];
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp = temp && *(npy_bool *)dataptr[i];
+ }
+ accum = temp || accum;
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+#endif
+ }
+
+# if @nop@ <= 3
+ *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]);
+# else
+ *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]);
+# endif
+}
+
+/**end repeat**/
+
+/* These tables need to match up with the type enum */
+static sum_of_products_fn
+_contig_outstride0_unary_specialization_table[NPY_NTYPES] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 0,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+ &@name@_sum_of_products_contig_outstride0_one,
+#else
+ NULL,
+#endif
+/**end repeat**/
+}; /* End of _contig_outstride0_unary_specialization_table */
+
+static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 0,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_stride0_contig_outstride0_two,
+ &@name@_sum_of_products_stride0_contig_outcontig_two,
+ &@name@_sum_of_products_contig_stride0_outstride0_two,
+ &@name@_sum_of_products_contig_stride0_outcontig_two,
+ &@name@_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+ {NULL, NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _binary_specialization_table */
+
+static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_outstride0_any,
+ &@name@_sum_of_products_outstride0_one,
+ &@name@_sum_of_products_outstride0_two,
+ &@name@_sum_of_products_outstride0_three
+},
+#else
+ {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _outstride0_specialized_table */
+
+static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_contig_any,
+ &@name@_sum_of_products_contig_one,
+ &@name@_sum_of_products_contig_two,
+ &@name@_sum_of_products_contig_three
+},
+#else
+ {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _allcontig_specialized_table */
+
+static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_any,
+ &@name@_sum_of_products_one,
+ &@name@_sum_of_products_two,
+ &@name@_sum_of_products_three
+},
+#else
+ {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _unspecialized_table */
+
+NPY_VISIBILITY_HIDDEN sum_of_products_fn
+get_sum_of_products_function(int nop, int type_num,
+ npy_intp itemsize, npy_intp const *fixed_strides)
+{
+ int iop;
+
+ if (type_num >= NPY_NTYPES) {
+ return NULL;
+ }
+
+ /* contiguous reduction */
+ if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) {
+ sum_of_products_fn ret =
+ _contig_outstride0_unary_specialization_table[type_num];
+ if (ret != NULL) {
+ return ret;
+ }
+ }
+
+ /* nop of 2 has more specializations */
+ if (nop == 2) {
+ /* Encode the zero/contiguous strides */
+ int code;
+ code = (fixed_strides[0] == 0) ? 0 :
+ (fixed_strides[0] == itemsize) ? 2*2*1 : 8;
+ code += (fixed_strides[1] == 0) ? 0 :
+ (fixed_strides[1] == itemsize) ? 2*1 : 8;
+ code += (fixed_strides[2] == 0) ? 0 :
+ (fixed_strides[2] == itemsize) ? 1 : 8;
+ if (code >= 2 && code < 7) {
+ sum_of_products_fn ret =
+ _binary_specialization_table[type_num][code-2];
+ if (ret != NULL) {
+ return ret;
+ }
+ }
+ }
+
+ /* Inner loop with an output stride of 0 */
+ if (fixed_strides[nop] == 0) {
+ return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0];
+ }
+
+ /* Check for all contiguous */
+ for (iop = 0; iop < nop + 1; ++iop) {
+ if (fixed_strides[iop] != itemsize) {
+ break;
+ }
+ }
+
+ /* Contiguous loop */
+ if (iop == nop + 1) {
+ return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0];
+ }
+
+ /* None of the above specializations caught it, general loops */
+ return _unspecialized_table[type_num][nop <= 3 ? nop : 0];
+}
diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h
new file mode 100644
index 000000000..c6cf18ec6
--- /dev/null
+++ b/numpy/core/src/multiarray/einsum_sumprod.h
@@ -0,0 +1,12 @@
+#ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H
+#define _NPY_MULTIARRAY_EINSUM_SUMPROD_H
+
+#include <numpy/npy_common.h>
+
+typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp);
+
+NPY_VISIBILITY_HIDDEN sum_of_products_fn
+get_sum_of_products_function(int nop, int type_num,
+ npy_intp itemsize, npy_intp const *fixed_strides);
+
+#endif
diff --git a/numpy/core/src/multiarray/flagsobject.c b/numpy/core/src/multiarray/flagsobject.c
index d5f24e75a..9b7d8deae 100644
--- a/numpy/core/src/multiarray/flagsobject.c
+++ b/numpy/core/src/multiarray/flagsobject.c
@@ -307,7 +307,7 @@ arrayflags_farray_get(PyArrayFlagsObject *self)
static PyObject *
arrayflags_num_get(PyArrayFlagsObject *self)
{
- return PyInt_FromLong(self->flags);
+ return PyLong_FromLong(self->flags);
}
/* relies on setflags order being write, align, uic */
@@ -711,7 +711,7 @@ arrayflags_print(PyArrayFlagsObject *self)
if (fl & NPY_ARRAY_WARN_ON_WRITE) {
_warn_on_write = " (with WARN_ON_WRITE=True)";
}
- return PyUString_FromFormat(
+ return PyUnicode_FromFormat(
" %s : %s\n %s : %s\n"
" %s : %s\n %s : %s%s\n"
" %s : %s\n %s : %s\n"
diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c
index 5405a25db..3575d6fad 100644
--- a/numpy/core/src/multiarray/getset.c
+++ b/numpy/core/src/multiarray/getset.c
@@ -28,7 +28,7 @@
static PyObject *
array_ndim_get(PyArrayObject *self)
{
- return PyInt_FromLong(PyArray_NDIM(self));
+ return PyLong_FromLong(PyArray_NDIM(self));
}
static PyObject *
@@ -217,7 +217,7 @@ array_protocol_descr_get(PyArrayObject *self)
if (dobj == NULL) {
return NULL;
}
- PyTuple_SET_ITEM(dobj, 0, PyString_FromString(""));
+ PyTuple_SET_ITEM(dobj, 0, PyUnicode_FromString(""));
PyTuple_SET_ITEM(dobj, 1, array_typestr_get(self));
res = PyList_New(1);
if (res == NULL) {
@@ -244,8 +244,9 @@ array_dataptr_get(PyArrayObject *self)
{
return Py_BuildValue("NO",
PyLong_FromVoidPtr(PyArray_DATA(self)),
- (PyArray_FLAGS(self) & NPY_ARRAY_WRITEABLE ? Py_False :
- Py_True));
+ ((PyArray_FLAGS(self) & NPY_ARRAY_WRITEABLE) &&
+ !(PyArray_FLAGS(self) & NPY_ARRAY_WARN_ON_WRITE)) ?
+ Py_False : Py_True);
}
static PyObject *
@@ -274,10 +275,6 @@ array_interface_get(PyArrayObject *self)
return NULL;
}
- if (array_might_be_written(self) < 0) {
- Py_DECREF(dict);
- return NULL;
- }
int ret;
/* dataptr */
@@ -321,7 +318,7 @@ array_interface_get(PyArrayObject *self)
return NULL;
}
- obj = PyInt_FromLong(3);
+ obj = PyLong_FromLong(3);
ret = PyDict_SetItemString(dict, "version", obj);
Py_DECREF(obj);
if (ret < 0) {
@@ -416,7 +413,7 @@ array_data_set(PyArrayObject *self, PyObject *op)
static PyObject *
array_itemsize_get(PyArrayObject *self)
{
- return PyInt_FromLong((long) PyArray_DESCR(self)->elsize);
+ return PyLong_FromLong((long) PyArray_DESCR(self)->elsize);
}
static PyObject *
@@ -424,13 +421,13 @@ array_size_get(PyArrayObject *self)
{
npy_intp size=PyArray_SIZE(self);
#if NPY_SIZEOF_INTP <= NPY_SIZEOF_LONG
- return PyInt_FromLong((long) size);
+ return PyLong_FromLong((long) size);
#else
if (size > NPY_MAX_LONG || size < NPY_MIN_LONG) {
return PyLong_FromLongLong(size);
}
else {
- return PyInt_FromLong((long) size);
+ return PyLong_FromLong((long) size);
}
#endif
}
@@ -440,13 +437,13 @@ array_nbytes_get(PyArrayObject *self)
{
npy_intp nbytes = PyArray_NBYTES(self);
#if NPY_SIZEOF_INTP <= NPY_SIZEOF_LONG
- return PyInt_FromLong((long) nbytes);
+ return PyLong_FromLong((long) nbytes);
#else
if (nbytes > NPY_MAX_LONG || nbytes < NPY_MIN_LONG) {
return PyLong_FromLongLong(nbytes);
}
else {
- return PyInt_FromLong((long) nbytes);
+ return PyLong_FromLong((long) nbytes);
}
#endif
}
@@ -624,13 +621,7 @@ static PyObject *
array_struct_get(PyArrayObject *self)
{
PyArrayInterface *inter;
- PyObject *ret;
- if (PyArray_ISWRITEABLE(self)) {
- if (array_might_be_written(self) < 0) {
- return NULL;
- }
- }
inter = (PyArrayInterface *)PyArray_malloc(sizeof(PyArrayInterface));
if (inter==NULL) {
return PyErr_NoMemory();
@@ -640,6 +631,11 @@ array_struct_get(PyArrayObject *self)
inter->typekind = PyArray_DESCR(self)->kind;
inter->itemsize = PyArray_DESCR(self)->elsize;
inter->flags = PyArray_FLAGS(self);
+ if (inter->flags & NPY_ARRAY_WARN_ON_WRITE) {
+ /* Export a warn-on-write array as read-only */
+ inter->flags = inter->flags & ~NPY_ARRAY_WARN_ON_WRITE;
+ inter->flags = inter->flags & ~NPY_ARRAY_WRITEABLE;
+ }
/* reset unused flags */
inter->flags &= ~(NPY_ARRAY_WRITEBACKIFCOPY | NPY_ARRAY_UPDATEIFCOPY |NPY_ARRAY_OWNDATA);
if (PyArray_ISNOTSWAPPED(self)) inter->flags |= NPY_ARRAY_NOTSWAPPED;
@@ -676,8 +672,14 @@ array_struct_get(PyArrayObject *self)
else {
inter->descr = NULL;
}
+ PyObject *ret = PyCapsule_New(inter, NULL, gentype_struct_free);
+ if (ret == NULL) {
+ return NULL;
+ }
Py_INCREF(self);
- ret = NpyCapsule_FromVoidPtrAndDesc(inter, self, gentype_struct_free);
+ if (PyCapsule_SetContext(ret, self) < 0) {
+ return NULL;
+ }
return ret;
}
diff --git a/numpy/core/src/multiarray/hashdescr.c b/numpy/core/src/multiarray/hashdescr.c
index 0b23b6c21..e9a99cc8f 100644
--- a/numpy/core/src/multiarray/hashdescr.c
+++ b/numpy/core/src/multiarray/hashdescr.c
@@ -132,7 +132,7 @@ static int _array_descr_walk_fields(PyObject *names, PyObject* fields, PyObject*
"(Hash) names and fields inconsistent ???");
return -1;
}
- if (!PyUString_Check(key)) {
+ if (!PyUnicode_Check(key)) {
PyErr_SetString(PyExc_SystemError,
"(Hash) key of dtype dict not a string ???");
return -1;
@@ -165,7 +165,7 @@ static int _array_descr_walk_fields(PyObject *names, PyObject* fields, PyObject*
}
foffset = PyTuple_GET_ITEM(value, 1);
- if (!PyInt_Check(foffset)) {
+ if (!PyLong_Check(foffset)) {
PyErr_SetString(PyExc_SystemError,
"(Hash) Second item in compound dtype tuple not an int ???");
return -1;
@@ -208,7 +208,7 @@ static int _array_descr_walk_subarray(PyArray_ArrayDescr* adescr, PyObject *l)
PyList_Append(l, item);
}
}
- else if (PyInt_Check(adescr->shape)) {
+ else if (PyLong_Check(adescr->shape)) {
PyList_Append(l, adescr->shape);
}
else {
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 8052e24e4..b279ffc2f 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -576,6 +576,10 @@ PyArray_PutMask(PyArrayObject *self, PyObject* values0, PyObject* mask0)
return NULL;
}
+ if (PyArray_FailUnlessWriteable(self, "putmask: output array") < 0) {
+ return NULL;
+ }
+
mask = (PyArrayObject *)PyArray_FROM_OTF(mask0, NPY_BOOL,
NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST);
if (mask == NULL) {
diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c
index ac5b90400..3ebd4c858 100644
--- a/numpy/core/src/multiarray/iterators.c
+++ b/numpy/core/src/multiarray/iterators.c
@@ -61,7 +61,7 @@ parse_index_entry(PyObject *op, npy_intp *step_size,
}
else if (PySlice_Check(op)) {
npy_intp stop;
- if (NpySlice_GetIndicesEx(op, max, &i, &stop, step_size, n_steps) < 0) {
+ if (PySlice_GetIndicesEx(op, max, &i, &stop, step_size, n_steps) < 0) {
goto fail;
}
if (*n_steps <= 0) {
@@ -597,7 +597,7 @@ iter_subscript(PyArrayIterObject *self, PyObject *ind)
}
/* Check for Integer or Slice */
- if (PyLong_Check(ind) || PyInt_Check(ind) || PySlice_Check(ind)) {
+ if (PyLong_Check(ind) || PySlice_Check(ind)) {
start = parse_index_entry(ind, &step_size, &n_steps,
self->size, 0, 1);
if (start == -1) {
@@ -1411,10 +1411,10 @@ static PyObject *
arraymultiter_size_get(PyArrayMultiIterObject *self)
{
#if NPY_SIZEOF_INTP <= NPY_SIZEOF_LONG
- return PyInt_FromLong((long) self->size);
+ return PyLong_FromLong((long) self->size);
#else
if (self->size < NPY_MAX_LONG) {
- return PyInt_FromLong((long) self->size);
+ return PyLong_FromLong((long) self->size);
}
else {
return PyLong_FromLongLong((npy_longlong) self->size);
@@ -1426,10 +1426,10 @@ static PyObject *
arraymultiter_index_get(PyArrayMultiIterObject *self)
{
#if NPY_SIZEOF_INTP <= NPY_SIZEOF_LONG
- return PyInt_FromLong((long) self->index);
+ return PyLong_FromLong((long) self->index);
#else
if (self->size < NPY_MAX_LONG) {
- return PyInt_FromLong((long) self->index);
+ return PyLong_FromLong((long) self->index);
}
else {
return PyLong_FromLongLong((npy_longlong) self->index);
diff --git a/numpy/core/src/multiarray/legacy_dtype_implementation.c b/numpy/core/src/multiarray/legacy_dtype_implementation.c
new file mode 100644
index 000000000..3ce4710fd
--- /dev/null
+++ b/numpy/core/src/multiarray/legacy_dtype_implementation.c
@@ -0,0 +1,716 @@
+/*
+ * This file hosts legacy implementations of certain functions for
+ * which alternatives exists, but the old functions are still required
+ * in certain code paths, or until the code transition is finalized.
+ *
+ * This code should typically not require modification, and if modified
+ * similar changes may be necessary in the new version.
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"
+#include "scalartypes.h"
+#include "_datetime.h"
+#include "datetime_strings.h"
+#include "convert_datatype.h"
+
+#include "legacy_dtype_implementation.h"
+
+
+/*
+ * Compare the field dictionaries for two types.
+ *
+ * Return 1 if the field types and field names of the two descrs are equal and
+ * in the same order, 0 if not.
+ */
+static int
+_equivalent_fields(PyArray_Descr *type1, PyArray_Descr *type2) {
+
+ int val;
+
+ if (type1->fields == type2->fields && type1->names == type2->names) {
+ return 1;
+ }
+ if (type1->fields == NULL || type2->fields == NULL) {
+ return 0;
+ }
+
+ val = PyObject_RichCompareBool(type1->fields, type2->fields, Py_EQ);
+ if (val != 1 || PyErr_Occurred()) {
+ PyErr_Clear();
+ return 0;
+ }
+
+ val = PyObject_RichCompareBool(type1->names, type2->names, Py_EQ);
+ if (val != 1 || PyErr_Occurred()) {
+ PyErr_Clear();
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Compare the subarray data for two types.
+ * Return 1 if they are the same, 0 if not.
+ */
+static int
+_equivalent_subarrays(PyArray_ArrayDescr *sub1, PyArray_ArrayDescr *sub2)
+{
+ int val;
+
+ if (sub1 == sub2) {
+ return 1;
+
+ }
+ if (sub1 == NULL || sub2 == NULL) {
+ return 0;
+ }
+
+ val = PyObject_RichCompareBool(sub1->shape, sub2->shape, Py_EQ);
+ if (val != 1 || PyErr_Occurred()) {
+ PyErr_Clear();
+ return 0;
+ }
+
+ return PyArray_EquivTypes(sub1->base, sub2->base);
+}
+
+
+NPY_NO_EXPORT unsigned char
+PyArray_LegacyEquivTypes(PyArray_Descr *type1, PyArray_Descr *type2)
+{
+ int type_num1, type_num2, size1, size2;
+
+ if (type1 == type2) {
+ return NPY_TRUE;
+ }
+
+ type_num1 = type1->type_num;
+ type_num2 = type2->type_num;
+ size1 = type1->elsize;
+ size2 = type2->elsize;
+
+ if (size1 != size2) {
+ return NPY_FALSE;
+ }
+ if (PyArray_ISNBO(type1->byteorder) != PyArray_ISNBO(type2->byteorder)) {
+ return NPY_FALSE;
+ }
+ if (type1->subarray || type2->subarray) {
+ return ((type_num1 == type_num2)
+ && _equivalent_subarrays(type1->subarray, type2->subarray));
+ }
+ if (type_num1 == NPY_VOID || type_num2 == NPY_VOID) {
+ return ((type_num1 == type_num2) && _equivalent_fields(type1, type2));
+ }
+ if (type_num1 == NPY_DATETIME
+ || type_num1 == NPY_TIMEDELTA
+ || type_num2 == NPY_DATETIME
+ || type_num2 == NPY_TIMEDELTA) {
+ return ((type_num1 == type_num2)
+ && has_equivalent_datetime_metadata(type1, type2));
+ }
+ return type1->kind == type2->kind;
+}
+
+
+NPY_NO_EXPORT unsigned char
+PyArray_LegacyEquivTypenums(int typenum1, int typenum2)
+{
+ PyArray_Descr *d1, *d2;
+ npy_bool ret;
+
+ if (typenum1 == typenum2) {
+ return NPY_SUCCEED;
+ }
+
+ d1 = PyArray_DescrFromType(typenum1);
+ d2 = PyArray_DescrFromType(typenum2);
+ ret = PyArray_LegacyEquivTypes(d1, d2);
+ Py_DECREF(d1);
+ Py_DECREF(d2);
+ return ret;
+}
+
+
+NPY_NO_EXPORT int
+PyArray_LegacyCanCastSafely(int fromtype, int totype)
+{
+ PyArray_Descr *from;
+
+ /* Fast table lookup for small type numbers */
+ if ((unsigned int)fromtype < NPY_NTYPES &&
+ (unsigned int)totype < NPY_NTYPES) {
+ return _npy_can_cast_safely_table[fromtype][totype];
+ }
+
+ /* Identity */
+ if (fromtype == totype) {
+ return 1;
+ }
+
+ from = PyArray_DescrFromType(fromtype);
+ /*
+ * cancastto is a NPY_NOTYPE terminated C-int-array of types that
+ * the data-type can be cast to safely.
+ */
+ if (from->f->cancastto) {
+ int *curtype = from->f->cancastto;
+
+ while (*curtype != NPY_NOTYPE) {
+ if (*curtype++ == totype) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+
+NPY_NO_EXPORT npy_bool
+PyArray_LegacyCanCastTo(PyArray_Descr *from, PyArray_Descr *to)
+{
+ int from_type_num = from->type_num;
+ int to_type_num = to->type_num;
+ npy_bool ret;
+
+ ret = (npy_bool) PyArray_LegacyCanCastSafely(from_type_num, to_type_num);
+ if (ret) {
+ /* Check String and Unicode more closely */
+ if (from_type_num == NPY_STRING) {
+ if (to_type_num == NPY_STRING) {
+ ret = (from->elsize <= to->elsize);
+ }
+ else if (to_type_num == NPY_UNICODE) {
+ ret = (from->elsize << 2 <= to->elsize);
+ }
+ }
+ else if (from_type_num == NPY_UNICODE) {
+ if (to_type_num == NPY_UNICODE) {
+ ret = (from->elsize <= to->elsize);
+ }
+ }
+ /*
+ * For datetime/timedelta, only treat casts moving towards
+ * more precision as safe.
+ */
+ else if (from_type_num == NPY_DATETIME && to_type_num == NPY_DATETIME) {
+ PyArray_DatetimeMetaData *meta1, *meta2;
+ meta1 = get_datetime_metadata_from_dtype(from);
+ if (meta1 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ meta2 = get_datetime_metadata_from_dtype(to);
+ if (meta2 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+
+ return can_cast_datetime64_metadata(meta1, meta2,
+ NPY_SAFE_CASTING);
+ }
+ else if (from_type_num == NPY_TIMEDELTA &&
+ to_type_num == NPY_TIMEDELTA) {
+ PyArray_DatetimeMetaData *meta1, *meta2;
+ meta1 = get_datetime_metadata_from_dtype(from);
+ if (meta1 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ meta2 = get_datetime_metadata_from_dtype(to);
+ if (meta2 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+
+ return can_cast_timedelta64_metadata(meta1, meta2,
+ NPY_SAFE_CASTING);
+ }
+ /*
+ * If to_type_num is STRING or unicode
+ * see if the length is long enough to hold the
+ * stringified value of the object.
+ */
+ else if (to_type_num == NPY_STRING || to_type_num == NPY_UNICODE) {
+ /*
+ * Boolean value cast to string type is 5 characters max
+ * for string 'False'.
+ */
+ int char_size = 1;
+ if (to_type_num == NPY_UNICODE) {
+ char_size = 4;
+ }
+
+ ret = 0;
+ if (PyDataType_ISUNSIZED(to)) {
+ ret = 1;
+ }
+ /*
+ * Need at least 5 characters to convert from boolean
+ * to 'True' or 'False'.
+ */
+ else if (from->kind == 'b' && to->elsize >= 5 * char_size) {
+ ret = 1;
+ }
+ else if (from->kind == 'u') {
+ /* Guard against unexpected integer size */
+ if (from->elsize > 8 || from->elsize < 0) {
+ ret = 0;
+ }
+ else if (to->elsize >=
+ REQUIRED_STR_LEN[from->elsize] * char_size) {
+ ret = 1;
+ }
+ }
+ else if (from->kind == 'i') {
+ /* Guard against unexpected integer size */
+ if (from->elsize > 8 || from->elsize < 0) {
+ ret = 0;
+ }
+ /* Extra character needed for sign */
+ else if (to->elsize >=
+ (REQUIRED_STR_LEN[from->elsize] + 1) * char_size) {
+ ret = 1;
+ }
+ }
+ }
+ }
+ return ret;
+}
+
+
+/*
+ * Compare two field dictionaries for castability.
+ *
+ * Return 1 if 'field1' can be cast to 'field2' according to the rule
+ * 'casting', 0 if not.
+ *
+ * Castability of field dictionaries is defined recursively: 'field1' and
+ * 'field2' must have the same field names (possibly in different
+ * orders), and the corresponding field types must be castable according
+ * to the given casting rule.
+ */
+static int
+can_cast_fields(PyObject *field1, PyObject *field2, NPY_CASTING casting)
+{
+ Py_ssize_t ppos;
+ PyObject *key;
+ PyObject *tuple1, *tuple2;
+
+ if (field1 == field2) {
+ return 1;
+ }
+ if (field1 == NULL || field2 == NULL) {
+ return 0;
+ }
+ if (PyDict_Size(field1) != PyDict_Size(field2)) {
+ return 0;
+ }
+
+ /* Iterate over all the fields and compare for castability */
+ ppos = 0;
+ while (PyDict_Next(field1, &ppos, &key, &tuple1)) {
+ if ((tuple2 = PyDict_GetItem(field2, key)) == NULL) {
+ return 0;
+ }
+ /* Compare the dtype of the field for castability */
+ if (!PyArray_CanCastTypeTo(
+ (PyArray_Descr *)PyTuple_GET_ITEM(tuple1, 0),
+ (PyArray_Descr *)PyTuple_GET_ITEM(tuple2, 0),
+ casting)) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+
+NPY_NO_EXPORT npy_bool
+PyArray_LegacyCanCastTypeTo(PyArray_Descr *from, PyArray_Descr *to,
+ NPY_CASTING casting)
+{
+ /*
+ * Fast paths for equality and for basic types.
+ */
+ if (from == to ||
+ ((NPY_LIKELY(PyDataType_ISNUMBER(from)) ||
+ PyDataType_ISOBJECT(from)) &&
+ NPY_LIKELY(from->type_num == to->type_num) &&
+ NPY_LIKELY(from->byteorder == to->byteorder))) {
+ return 1;
+ }
+ /*
+ * Cases with subarrays and fields need special treatment.
+ */
+ if (PyDataType_HASFIELDS(from)) {
+ /*
+ * If from is a structured data type, then it can be cast to a simple
+ * non-object one only for unsafe casting *and* if it has a single
+ * field; recurse just in case the single field is itself structured.
+ */
+ if (!PyDataType_HASFIELDS(to) && !PyDataType_ISOBJECT(to)) {
+ if (casting == NPY_UNSAFE_CASTING &&
+ PyDict_Size(from->fields) == 1) {
+ Py_ssize_t ppos = 0;
+ PyObject *tuple;
+ PyArray_Descr *field;
+ PyDict_Next(from->fields, &ppos, NULL, &tuple);
+ field = (PyArray_Descr *)PyTuple_GET_ITEM(tuple, 0);
+ /*
+ * For a subarray, we need to get the underlying type;
+ * since we already are casting unsafely, we can ignore
+ * the shape.
+ */
+ if (PyDataType_HASSUBARRAY(field)) {
+ field = field->subarray->base;
+ }
+ return PyArray_LegacyCanCastTypeTo(field, to, casting);
+ }
+ else {
+ return 0;
+ }
+ }
+ /*
+ * Casting from one structured data type to another depends on the fields;
+ * we pass that case on to the EquivTypenums case below.
+ *
+ * TODO: move that part up here? Need to check whether equivalent type
+ * numbers is an additional constraint that is needed.
+ *
+ * TODO/FIXME: For now, always allow structured to structured for unsafe
+ * casting; this is not correct, but needed since the treatment in can_cast
+ * below got out of sync with astype; see gh-13667.
+ */
+ if (casting == NPY_UNSAFE_CASTING) {
+ return 1;
+ }
+ }
+ else if (PyDataType_HASFIELDS(to)) {
+ /*
+ * If "from" is a simple data type and "to" has fields, then only
+ * unsafe casting works (and that works always, even to multiple fields).
+ */
+ return casting == NPY_UNSAFE_CASTING;
+ }
+ /*
+ * Everything else we consider castable for unsafe for now.
+ * FIXME: ensure what we do here is consistent with "astype",
+ * i.e., deal more correctly with subarrays and user-defined dtype.
+ */
+ else if (casting == NPY_UNSAFE_CASTING) {
+ return 1;
+ }
+ /*
+ * Equivalent simple types can be cast with any value of 'casting', but
+ * we need to be careful about structured to structured.
+ */
+ if (PyArray_LegacyEquivTypenums(from->type_num, to->type_num)) {
+ /* For complicated case, use EquivTypes (for now) */
+ if (PyTypeNum_ISUSERDEF(from->type_num) ||
+ from->subarray != NULL) {
+ int ret;
+
+ /* Only NPY_NO_CASTING prevents byte order conversion */
+ if ((casting != NPY_NO_CASTING) &&
+ (!PyArray_ISNBO(from->byteorder) ||
+ !PyArray_ISNBO(to->byteorder))) {
+ PyArray_Descr *nbo_from, *nbo_to;
+
+ nbo_from = PyArray_DescrNewByteorder(from, NPY_NATIVE);
+ nbo_to = PyArray_DescrNewByteorder(to, NPY_NATIVE);
+ if (nbo_from == NULL || nbo_to == NULL) {
+ Py_XDECREF(nbo_from);
+ Py_XDECREF(nbo_to);
+ PyErr_Clear();
+ return 0;
+ }
+ ret = PyArray_LegacyEquivTypes(nbo_from, nbo_to);
+ Py_DECREF(nbo_from);
+ Py_DECREF(nbo_to);
+ }
+ else {
+ ret = PyArray_LegacyEquivTypes(from, to);
+ }
+ return ret;
+ }
+
+ if (PyDataType_HASFIELDS(from)) {
+ switch (casting) {
+ case NPY_EQUIV_CASTING:
+ case NPY_SAFE_CASTING:
+ case NPY_SAME_KIND_CASTING:
+ /*
+ * `from' and `to' must have the same fields, and
+ * corresponding fields must be (recursively) castable.
+ */
+ return can_cast_fields(from->fields, to->fields, casting);
+
+ case NPY_NO_CASTING:
+ default:
+ return PyArray_LegacyEquivTypes(from, to);
+ }
+ }
+
+ switch (from->type_num) {
+ case NPY_DATETIME: {
+ PyArray_DatetimeMetaData *meta1, *meta2;
+ meta1 = get_datetime_metadata_from_dtype(from);
+ if (meta1 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ meta2 = get_datetime_metadata_from_dtype(to);
+ if (meta2 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+
+ if (casting == NPY_NO_CASTING) {
+ return PyArray_ISNBO(from->byteorder) ==
+ PyArray_ISNBO(to->byteorder) &&
+ can_cast_datetime64_metadata(meta1, meta2, casting);
+ }
+ else {
+ return can_cast_datetime64_metadata(meta1, meta2, casting);
+ }
+ }
+ case NPY_TIMEDELTA: {
+ PyArray_DatetimeMetaData *meta1, *meta2;
+ meta1 = get_datetime_metadata_from_dtype(from);
+ if (meta1 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ meta2 = get_datetime_metadata_from_dtype(to);
+ if (meta2 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+
+ if (casting == NPY_NO_CASTING) {
+ return PyArray_ISNBO(from->byteorder) ==
+ PyArray_ISNBO(to->byteorder) &&
+ can_cast_timedelta64_metadata(meta1, meta2, casting);
+ }
+ else {
+ return can_cast_timedelta64_metadata(meta1, meta2, casting);
+ }
+ }
+ default:
+ switch (casting) {
+ case NPY_NO_CASTING:
+ return PyArray_LegacyEquivTypes(from, to);
+ case NPY_EQUIV_CASTING:
+ return (from->elsize == to->elsize);
+ case NPY_SAFE_CASTING:
+ return (from->elsize <= to->elsize);
+ default:
+ return 1;
+ }
+ break;
+ }
+ }
+ /* If safe or same-kind casts are allowed */
+ else if (casting == NPY_SAFE_CASTING || casting == NPY_SAME_KIND_CASTING) {
+ if (PyArray_LegacyCanCastTo(from, to)) {
+ return 1;
+ }
+ else if(casting == NPY_SAME_KIND_CASTING) {
+ /*
+ * Also allow casting from lower to higher kinds, according
+ * to the ordering provided by dtype_kind_to_ordering.
+ * Some kinds, like datetime, don't fit in the hierarchy,
+ * and are special cased as -1.
+ */
+ int from_order, to_order;
+
+ from_order = dtype_kind_to_ordering(from->kind);
+ to_order = dtype_kind_to_ordering(to->kind);
+
+ if (to->kind == 'm') {
+ /* both types being timedelta is already handled before. */
+ int integer_order = dtype_kind_to_ordering('i');
+ return (from_order != -1) && (from_order <= integer_order);
+ }
+
+ return (from_order != -1) && (from_order <= to_order);
+ }
+ else {
+ return 0;
+ }
+ }
+ /* NPY_NO_CASTING or NPY_EQUIV_CASTING was specified */
+ else {
+ return 0;
+ }
+}
+
+
+/*
+ * Legacy function to find the correct dtype when casting from any built-in
+ * dtype to NPY_STRING, NPY_UNICODE, NPY_VOID, and NPY_DATETIME with generic
+ * units.
+ *
+ * This function returns a dtype based on flex_dtype and the values in
+ * data_dtype. It also calls Py_DECREF on the flex_dtype. If the
+ * flex_dtype is not flexible, it returns it as-is.
+ *
+ * Usually, if data_obj is not an array, dtype should be the result
+ * given by the PyArray_GetArrayParamsFromObject function.
+ *
+ * If *flex_dtype is NULL, returns immediately, without setting an
+ * exception, leaving any previous error handling intact.
+ */
+NPY_NO_EXPORT PyArray_Descr *
+PyArray_AdaptFlexibleDType(PyArray_Descr *data_dtype, PyArray_Descr *flex_dtype)
+{
+ PyArray_DatetimeMetaData *meta;
+ PyArray_Descr *retval = NULL;
+ int flex_type_num;
+
+ if (flex_dtype == NULL) {
+ return retval;
+ }
+
+ flex_type_num = flex_dtype->type_num;
+
+ /* Flexible types with expandable size */
+ if (PyDataType_ISUNSIZED(flex_dtype)) {
+ /* First replace the flex_dtype */
+ retval = PyArray_DescrNew(flex_dtype);
+ Py_DECREF(flex_dtype);
+ if (retval == NULL) {
+ return retval;
+ }
+
+ if (data_dtype->type_num == flex_type_num ||
+ flex_type_num == NPY_VOID) {
+ (retval)->elsize = data_dtype->elsize;
+ }
+ else if (flex_type_num == NPY_STRING || flex_type_num == NPY_UNICODE) {
+ npy_intp size = 8;
+
+ /*
+ * Get a string-size estimate of the input. These
+ * are generally the size needed, rounded up to
+ * a multiple of eight.
+ */
+ switch (data_dtype->type_num) {
+ case NPY_BOOL:
+ case NPY_UBYTE:
+ case NPY_BYTE:
+ case NPY_USHORT:
+ case NPY_SHORT:
+ case NPY_UINT:
+ case NPY_INT:
+ case NPY_ULONG:
+ case NPY_LONG:
+ case NPY_ULONGLONG:
+ case NPY_LONGLONG:
+ if (data_dtype->kind == 'b') {
+ /* 5 chars needed for cast to 'True' or 'False' */
+ size = 5;
+ }
+ else if (data_dtype->elsize > 8 ||
+ data_dtype->elsize < 0) {
+ /*
+ * Element size should never be greater than 8 or
+ * less than 0 for integer type, but just in case...
+ */
+ break;
+ }
+ else if (data_dtype->kind == 'u') {
+ size = REQUIRED_STR_LEN[data_dtype->elsize];
+ }
+ else if (data_dtype->kind == 'i') {
+ /* Add character for sign symbol */
+ size = REQUIRED_STR_LEN[data_dtype->elsize] + 1;
+ }
+ break;
+ case NPY_HALF:
+ case NPY_FLOAT:
+ case NPY_DOUBLE:
+ size = 32;
+ break;
+ case NPY_LONGDOUBLE:
+ size = 48;
+ break;
+ case NPY_CFLOAT:
+ case NPY_CDOUBLE:
+ size = 2 * 32;
+ break;
+ case NPY_CLONGDOUBLE:
+ size = 2 * 48;
+ break;
+ case NPY_OBJECT:
+ size = 64;
+ break;
+ case NPY_STRING:
+ case NPY_VOID:
+ size = data_dtype->elsize;
+ break;
+ case NPY_UNICODE:
+ size = data_dtype->elsize / 4;
+ break;
+ case NPY_DATETIME:
+ meta = get_datetime_metadata_from_dtype(data_dtype);
+ if (meta == NULL) {
+ Py_DECREF(retval);
+ return NULL;
+ }
+ size = get_datetime_iso_8601_strlen(0, meta->base);
+ break;
+ case NPY_TIMEDELTA:
+ size = 21;
+ break;
+ }
+
+ if (flex_type_num == NPY_STRING) {
+ retval->elsize = size;
+ }
+ else if (flex_type_num == NPY_UNICODE) {
+ retval->elsize = size * 4;
+ }
+ }
+ else {
+ /*
+ * We should never get here, but just in case someone adds
+ * a new flex dtype...
+ */
+ PyErr_SetString(PyExc_TypeError,
+ "don't know how to adapt flex dtype");
+ Py_DECREF(retval);
+ return NULL;
+ }
+ }
+ /* Flexible type with generic time unit that adapts */
+ else if (flex_type_num == NPY_DATETIME ||
+ flex_type_num == NPY_TIMEDELTA) {
+ meta = get_datetime_metadata_from_dtype(flex_dtype);
+ retval = flex_dtype;
+ if (meta == NULL) {
+ return NULL;
+ }
+
+ if (meta->base == NPY_FR_GENERIC) {
+ if (data_dtype->type_num == NPY_DATETIME ||
+ data_dtype->type_num == NPY_TIMEDELTA) {
+ meta = get_datetime_metadata_from_dtype(data_dtype);
+ if (meta == NULL) {
+ return NULL;
+ }
+
+ retval = create_datetime_dtype(flex_type_num, meta);
+ Py_DECREF(flex_dtype);
+ }
+ }
+ }
+ else {
+ retval = flex_dtype;
+ }
+ return retval;
+}
diff --git a/numpy/core/src/multiarray/legacy_dtype_implementation.h b/numpy/core/src/multiarray/legacy_dtype_implementation.h
new file mode 100644
index 000000000..ca171d773
--- /dev/null
+++ b/numpy/core/src/multiarray/legacy_dtype_implementation.h
@@ -0,0 +1,40 @@
+#ifndef _NPY_LEGACY_DTYPE_IMPLEMENTATION_H
+#define _NPY_LEGACY_DTYPE_IMPLEMENTATION_H
+
+
+NPY_NO_EXPORT unsigned char
+PyArray_LegacyEquivTypes(PyArray_Descr *type1, PyArray_Descr *type2);
+
+NPY_NO_EXPORT unsigned char
+PyArray_LegacyEquivTypenums(int typenum1, int typenum2);
+
+NPY_NO_EXPORT int
+PyArray_LegacyCanCastSafely(int fromtype, int totype);
+
+NPY_NO_EXPORT npy_bool
+PyArray_LegacyCanCastTo(PyArray_Descr *from, PyArray_Descr *to);
+
+NPY_NO_EXPORT npy_bool
+PyArray_LegacyCanCastTypeTo(PyArray_Descr *from, PyArray_Descr *to,
+ NPY_CASTING casting);
+
+/*
+ * This function calls Py_DECREF on flex_dtype, and replaces it with
+ * a new dtype that has been adapted based on the values in data_dtype
+ * and data_obj. If the flex_dtype is not flexible, it returns it as-is.
+ *
+ * Usually, if data_obj is not an array, dtype should be the result
+ * given by the PyArray_GetArrayParamsFromObject function.
+ *
+ * The data_obj may be NULL if just a dtype is known for the source.
+ *
+ * If *flex_dtype is NULL, returns immediately, without setting an
+ * exception, leaving any previous error handling intact.
+ *
+ * The current flexible dtypes include NPY_STRING, NPY_UNICODE, NPY_VOID,
+ * and NPY_DATETIME with generic units.
+ */
+NPY_NO_EXPORT PyArray_Descr *
+PyArray_AdaptFlexibleDType(PyArray_Descr *data_dtype, PyArray_Descr *flex_dtype);
+
+#endif /*_NPY_LEGACY_DTYPE_IMPLEMENTATION_H*/
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index d234c366c..0590558be 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -110,7 +110,7 @@
* if not it can decrease performance
* tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
*/
-static void
+static int
#if @is_aligned@ && @is_swap@ == 0 && @elsize@ <= NPY_SIZEOF_INTP
NPY_GCC_UNROLL_LOOPS
#endif
@@ -171,6 +171,7 @@ static void
--N;
}
+ return 0;
}
#endif
@@ -182,7 +183,7 @@ static void
* but it profits from vectorization enabled with -O3
*/
#if (@src_contig@ == 0) && @is_aligned@
-static NPY_GCC_OPT_3 void
+static NPY_GCC_OPT_3 int
@prefix@_@oper@_size@elsize@_srcstride0(char *dst,
npy_intp dst_stride,
char *src, npy_intp NPY_UNUSED(src_stride),
@@ -197,7 +198,7 @@ static NPY_GCC_OPT_3 void
npy_uint64 temp0, temp1;
#endif
if (N == 0) {
- return;
+ return 0;
}
#if @is_aligned@ && @elsize@ != 16
/* sanity check */
@@ -238,6 +239,7 @@ static NPY_GCC_OPT_3 void
--N;
}
#endif/* @elsize == 1 && @dst_contig@ -- else */
+ return 0;
}
#endif/* (@src_contig@ == 0) && @is_aligned@ */
@@ -247,7 +249,7 @@ static NPY_GCC_OPT_3 void
/**end repeat1**/
/**end repeat**/
-static void
+static int
_strided_to_strided(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -259,9 +261,10 @@ _strided_to_strided(char *dst, npy_intp dst_stride,
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_swap_strided_to_strided(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -284,9 +287,10 @@ _swap_strided_to_strided(char *dst, npy_intp dst_stride,
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_swap_pair_strided_to_strided(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -319,15 +323,17 @@ _swap_pair_strided_to_strided(char *dst, npy_intp dst_stride,
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_contig_to_contig(char *dst, npy_intp NPY_UNUSED(dst_stride),
char *src, npy_intp NPY_UNUSED(src_stride),
npy_intp N, npy_intp src_itemsize,
NpyAuxData *NPY_UNUSED(data))
{
memmove(dst, src, src_itemsize*N);
+ return 0;
}
@@ -787,7 +793,7 @@ NPY_NO_EXPORT PyArray_StridedUnaryOp *
#endif
-static NPY_GCC_OPT_3 void
+static NPY_GCC_OPT_3 int
@prefix@_cast_@name1@_to_@name2@(
char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
@@ -873,6 +879,7 @@ static NPY_GCC_OPT_3 void
src += src_stride;
#endif
}
+ return 0;
}
#undef _CONVERT_FN
@@ -989,10 +996,14 @@ PyArray_TransferNDimToStrided(npy_intp ndim,
src_stride0 = src_strides[0];
N = shape0 - coord0;
if (N >= count) {
- stransfer(dst, dst_stride, src, src_stride0, count, src_itemsize, data);
- return 0;
+ return stransfer(dst, dst_stride, src, src_stride0,
+ count, src_itemsize, data);
+ }
+ int res = stransfer(dst, dst_stride, src, src_stride0,
+ N, src_itemsize, data);
+ if (res < 0) {
+ return -1;
}
- stransfer(dst, dst_stride, src, src_stride0, N, src_itemsize, data);
count -= N;
/* If it's 1-dimensional, there's no more to copy */
@@ -1012,13 +1023,15 @@ PyArray_TransferNDimToStrided(npy_intp ndim,
N = shape0*M;
for (i = 0; i < M; ++i) {
if (shape0 >= count) {
- stransfer(dst, dst_stride, src, src_stride0,
- count, src_itemsize, data);
- return 0;
+ return stransfer(dst, dst_stride, src, src_stride0,
+ count, src_itemsize, data);
}
else {
- stransfer(dst, dst_stride, src, src_stride0,
- shape0, src_itemsize, data);
+ res = stransfer(dst, dst_stride, src, src_stride0,
+ shape0, src_itemsize, data);
+ if (res < 0) {
+ return -1;
+ }
}
count -= shape0;
src += src_stride1;
@@ -1073,13 +1086,15 @@ PyArray_TransferNDimToStrided(npy_intp ndim,
/* A loop for dimensions 0 and 1 */
for (i = 0; i < shape1; ++i) {
if (shape0 >= count) {
- stransfer(dst, dst_stride, src, src_stride0,
- count, src_itemsize, data);
- return 0;
+ return stransfer(dst, dst_stride, src, src_stride0,
+ count, src_itemsize, data);
}
else {
- stransfer(dst, dst_stride, src, src_stride0,
- shape0, src_itemsize, data);
+ res = stransfer(dst, dst_stride, src, src_stride0,
+ shape0, src_itemsize, data);
+ if (res < 0) {
+ return -1;
+ }
}
count -= shape0;
src += src_stride1;
@@ -1108,10 +1123,14 @@ PyArray_TransferStridedToNDim(npy_intp ndim,
dst_stride0 = dst_strides[0];
N = shape0 - coord0;
if (N >= count) {
- stransfer(dst, dst_stride0, src, src_stride, count, src_itemsize, data);
- return 0;
+ return stransfer(dst, dst_stride0, src, src_stride,
+ count, src_itemsize, data);
+ }
+ int res = stransfer(dst, dst_stride0, src, src_stride,
+ N, src_itemsize, data);
+ if (res < 0) {
+ return -1;
}
- stransfer(dst, dst_stride0, src, src_stride, N, src_itemsize, data);
count -= N;
/* If it's 1-dimensional, there's no more to copy */
@@ -1131,13 +1150,15 @@ PyArray_TransferStridedToNDim(npy_intp ndim,
N = shape0*M;
for (i = 0; i < M; ++i) {
if (shape0 >= count) {
- stransfer(dst, dst_stride0, src, src_stride,
- count, src_itemsize, data);
- return 0;
+ return stransfer(dst, dst_stride0, src, src_stride,
+ count, src_itemsize, data);
}
else {
- stransfer(dst, dst_stride0, src, src_stride,
- shape0, src_itemsize, data);
+ res = stransfer(dst, dst_stride0, src, src_stride,
+ shape0, src_itemsize, data);
+ if (res < 0) {
+ return -1;
+ }
}
count -= shape0;
dst += dst_stride1;
@@ -1192,13 +1213,15 @@ PyArray_TransferStridedToNDim(npy_intp ndim,
/* A loop for dimensions 0 and 1 */
for (i = 0; i < shape1; ++i) {
if (shape0 >= count) {
- stransfer(dst, dst_stride0, src, src_stride,
- count, src_itemsize, data);
- return 0;
+ return stransfer(dst, dst_stride0, src, src_stride,
+ count, src_itemsize, data);
}
else {
- stransfer(dst, dst_stride0, src, src_stride,
- shape0, src_itemsize, data);
+ res = stransfer(dst, dst_stride0, src, src_stride,
+ shape0, src_itemsize, data);
+ if (res < 0) {
+ return -1;
+ }
}
count -= shape0;
dst += dst_stride1;
@@ -1228,16 +1251,18 @@ PyArray_TransferMaskedStridedToNDim(npy_intp ndim,
dst_stride0 = dst_strides[0];
N = shape0 - coord0;
if (N >= count) {
- stransfer(dst, dst_stride0,
- src, src_stride,
- mask, mask_stride,
- count, src_itemsize, data);
- return 0;
- }
- stransfer(dst, dst_stride0,
- src, src_stride,
+ return stransfer(
+ dst, dst_stride0, src, src_stride,
mask, mask_stride,
- N, src_itemsize, data);
+ count, src_itemsize, data);
+ }
+ int res = stransfer(
+ dst, dst_stride0, src, src_stride,
+ mask, mask_stride,
+ N, src_itemsize, data);
+ if (res < 0) {
+ return -1;
+ }
count -= N;
/* If it's 1-dimensional, there's no more to copy */
@@ -1258,17 +1283,19 @@ PyArray_TransferMaskedStridedToNDim(npy_intp ndim,
N = shape0*M;
for (i = 0; i < M; ++i) {
if (shape0 >= count) {
- stransfer(dst, dst_stride0,
- src, src_stride,
- mask, mask_stride,
- count, src_itemsize, data);
- return 0;
+ return stransfer(
+ dst, dst_stride0, src, src_stride,
+ mask, mask_stride,
+ count, src_itemsize, data);
}
else {
- stransfer(dst, dst_stride0,
- src, src_stride,
- mask, mask_stride,
- shape0, src_itemsize, data);
+ int res = stransfer(
+ dst, dst_stride0, src, src_stride,
+ mask, mask_stride,
+ shape0, src_itemsize, data);
+ if (res < 0) {
+ return -1;
+ }
}
count -= shape0;
dst += dst_stride1;
@@ -1324,17 +1351,19 @@ PyArray_TransferMaskedStridedToNDim(npy_intp ndim,
/* A loop for dimensions 0 and 1 */
for (i = 0; i < shape1; ++i) {
if (shape0 >= count) {
- stransfer(dst, dst_stride0,
- src, src_stride,
- mask, mask_stride,
- count, src_itemsize, data);
- return 0;
+ return stransfer(
+ dst, dst_stride0, src, src_stride,
+ mask, mask_stride,
+ count, src_itemsize, data);
}
else {
- stransfer(dst, dst_stride0,
- src, src_stride,
- mask, mask_stride,
- shape0, src_itemsize, data);
+ res = stransfer(
+ dst, dst_stride0, src, src_stride,
+ mask, mask_stride,
+ shape0, src_itemsize, data);
+ if (res < 0) {
+ return -1;
+ }
}
count -= shape0;
dst += dst_stride1;
@@ -1760,13 +1789,23 @@ mapiter_@name@(PyArrayMapIterObject *mit)
do {
#if @isget@
- stransfer(subspace_ptrs[1], subspace_strides[1],
- subspace_ptrs[0], subspace_strides[0],
- *counter, src_itemsize, transferdata);
+ if (NPY_UNLIKELY(stransfer(
+ subspace_ptrs[1], subspace_strides[1],
+ subspace_ptrs[0], subspace_strides[0],
+ *counter, src_itemsize, transferdata) < 0)) {
+ NPY_END_THREADS;
+ NPY_AUXDATA_FREE(transferdata);
+ return -1;
+ }
#else
- stransfer(subspace_ptrs[0], subspace_strides[0],
- subspace_ptrs[1], subspace_strides[1],
- *counter, src_itemsize, transferdata);
+ if (NPY_UNLIKELY(stransfer(
+ subspace_ptrs[0], subspace_strides[0],
+ subspace_ptrs[1], subspace_strides[1],
+ *counter, src_itemsize, transferdata) < 0)) {
+ NPY_END_THREADS;
+ NPY_AUXDATA_FREE(transferdata);
+ return -1;
+ }
#endif
} while (mit->subspace_next(mit->subspace_iter));
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index c27e0c391..d64962f87 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -233,7 +233,7 @@ unpack_indices(PyObject *index, PyObject **result, npy_intp result_n)
|| PySlice_Check(index)
|| PyArray_Check(index)
|| !PySequence_Check(index)
- || PyBaseString_Check(index)) {
+ || PyUnicode_Check(index)) {
return unpack_scalar(index, result, result_n);
}
@@ -539,22 +539,22 @@ prepare_index(PyArrayObject *self, PyObject *index,
/*
* There are two types of boolean indices (which are equivalent,
* for the most part though). A single boolean index of matching
- * dimensionality and size is a boolean index.
- * If this is not the case, it is instead expanded into (multiple)
- * integer array indices.
+ * shape is a boolean index. If this is not the case, it is
+ * instead expanded into (multiple) integer array indices.
*/
PyArrayObject *nonzero_result[NPY_MAXDIMS];
if ((index_ndim == 1) && allow_boolean) {
/*
- * If ndim and size match, this can be optimized as a single
- * boolean index. The size check is necessary only to support
- * old non-matching sizes by using fancy indexing instead.
- * The reason for that is that fancy indexing uses nonzero,
- * and only the result of nonzero is checked for legality.
+ * If shapes match exactly, this can be optimized as a single
+ * boolean index. When the dimensions are identical but the shapes are not,
+ * this is always an error. The check ensures that these errors are raised
+ * and match those of the generic path.
*/
if ((PyArray_NDIM(arr) == PyArray_NDIM(self))
- && PyArray_SIZE(arr) == PyArray_SIZE(self)) {
+ && PyArray_CompareLists(PyArray_DIMS(arr),
+ PyArray_DIMS(self),
+ PyArray_NDIM(arr))) {
index_type = HAS_BOOL;
indices[curr_idx].type = HAS_BOOL;
@@ -946,9 +946,9 @@ get_view_from_index(PyArrayObject *self, PyArrayObject **view,
}
break;
case HAS_SLICE:
- if (NpySlice_GetIndicesEx(indices[i].object,
- PyArray_DIMS(self)[orig_dim],
- &start, &stop, &step, &n_steps) < 0) {
+ if (PySlice_GetIndicesEx(indices[i].object,
+ PyArray_DIMS(self)[orig_dim],
+ &start, &stop, &step, &n_steps) < 0) {
return -1;
}
if (n_steps <= 0) {
@@ -1091,6 +1091,7 @@ array_boolean_subscript(PyArrayObject *self,
self_stride = innerstrides[0];
bmask_stride = innerstrides[1];
+ int res = 0;
do {
innersize = *NpyIter_GetInnerLoopSizePtr(iter);
self_data = dataptrs[0];
@@ -1105,8 +1106,11 @@ array_boolean_subscript(PyArrayObject *self,
/* Process unmasked values */
bmask_data = npy_memchr(bmask_data, 0, bmask_stride, innersize,
&subloopsize, 0);
- stransfer(ret_data, itemsize, self_data, self_stride,
- subloopsize, itemsize, transferdata);
+ res = stransfer(ret_data, itemsize, self_data, self_stride,
+ subloopsize, itemsize, transferdata);
+ if (res < 0) {
+ break;
+ }
innersize -= subloopsize;
self_data += subloopsize * self_stride;
ret_data += subloopsize * itemsize;
@@ -1115,8 +1119,15 @@ array_boolean_subscript(PyArrayObject *self,
NPY_END_THREADS;
- NpyIter_Deallocate(iter);
+ if (!NpyIter_Deallocate(iter)) {
+ res = -1;
+ }
NPY_AUXDATA_FREE(transferdata);
+ if (res < 0) {
+ /* Should be practically impossible, since there is no cast */
+ Py_DECREF(ret);
+ return NULL;
+ }
}
if (!PyArray_CheckExact(self)) {
@@ -1209,6 +1220,7 @@ array_assign_boolean_subscript(PyArrayObject *self,
v_data = PyArray_DATA(v);
/* Create an iterator for the data */
+ int res = 0;
if (size > 0) {
NpyIter *iter;
PyArrayObject *op[2] = {self, bmask};
@@ -1253,7 +1265,7 @@ array_assign_boolean_subscript(PyArrayObject *self,
/* Get a dtype transfer function */
NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
if (PyArray_GetDTypeTransferFunction(
- IsUintAligned(self) && IsAligned(self) &&
+ IsUintAligned(self) && IsAligned(self) &&
IsUintAligned(v) && IsAligned(v),
v_stride, fixed_strides[0],
PyArray_DESCR(v), PyArray_DESCR(self),
@@ -1282,8 +1294,11 @@ array_assign_boolean_subscript(PyArrayObject *self,
/* Process unmasked values */
bmask_data = npy_memchr(bmask_data, 0, bmask_stride, innersize,
&subloopsize, 0);
- stransfer(self_data, self_stride, v_data, v_stride,
- subloopsize, src_itemsize, transferdata);
+ res = stransfer(self_data, self_stride, v_data, v_stride,
+ subloopsize, src_itemsize, transferdata);
+ if (res < 0) {
+ break;
+ }
innersize -= subloopsize;
self_data += subloopsize * self_stride;
v_data += subloopsize * v_stride;
@@ -1295,22 +1310,12 @@ array_assign_boolean_subscript(PyArrayObject *self,
}
NPY_AUXDATA_FREE(transferdata);
- NpyIter_Deallocate(iter);
- }
-
- if (needs_api) {
- /*
- * FIXME?: most assignment operations stop after the first occurrence
- * of an error. Boolean does not currently, but should at least
- * report the error. (This is only relevant for things like str->int
- * casts which call into python)
- */
- if (PyErr_Occurred()) {
- return -1;
+ if (!NpyIter_Deallocate(iter)) {
+ res = -1;
}
}
- return 0;
+ return res;
}
@@ -1402,7 +1407,7 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
*view = NULL;
/* first check for a single field name */
- if (PyBaseString_Check(ind)) {
+ if (PyUnicode_Check(ind)) {
PyObject *tup;
PyArray_Descr *fieldtype;
npy_intp offset;
@@ -1413,10 +1418,7 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
return 0;
}
else if (tup == NULL){
- PyObject *errmsg = PyUString_FromString("no field of name ");
- PyUString_Concat(&errmsg, ind);
- PyErr_SetObject(PyExc_ValueError, errmsg);
- Py_DECREF(errmsg);
+ PyErr_Format(PyExc_ValueError, "no field of name %S", ind);
return 0;
}
if (_unpack_field(tup, &fieldtype, &offset) < 0) {
@@ -1466,7 +1468,7 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
PyErr_Clear();
return -1;
}
- is_string = PyBaseString_Check(item);
+ is_string = PyUnicode_Check(item);
Py_DECREF(item);
if (!is_string) {
return -1;
@@ -2340,7 +2342,6 @@ mapiter_fill_info(PyArrayMapIterObject *mit, npy_index_info *indices,
int consec_status = -1;
int axis, broadcast_axis;
npy_intp dimension;
- PyObject *errmsg, *tmp;
for (i = 0; i < mit->nd_fancy; i++) {
mit->dimensions[i] = 1;
@@ -2428,35 +2429,38 @@ mapiter_fill_info(PyArrayMapIterObject *mit, npy_index_info *indices,
return 0;
- broadcast_error:
+broadcast_error: ; // Declarations cannot follow labels, add empty statement.
/*
* Attempt to set a meaningful exception. Could also find out
* if a boolean index was converted.
*/
- errmsg = PyUString_FromString("shape mismatch: indexing arrays could not "
- "be broadcast together with shapes ");
+ PyObject *errmsg = PyUnicode_FromString("");
if (errmsg == NULL) {
return -1;
}
-
for (i = 0; i < index_num; i++) {
if (!(indices[i].type & HAS_FANCY)) {
continue;
}
- tmp = convert_shape_to_string(
- PyArray_NDIM((PyArrayObject *)indices[i].object),
- PyArray_SHAPE((PyArrayObject *)indices[i].object),
- " ");
+
+ int ndim = PyArray_NDIM((PyArrayObject *)indices[i].object);
+ npy_intp *shape = PyArray_SHAPE((PyArrayObject *)indices[i].object);
+ PyObject *tmp = convert_shape_to_string(ndim, shape, " ");
if (tmp == NULL) {
+ Py_DECREF(errmsg);
return -1;
}
- PyUString_ConcatAndDel(&errmsg, tmp);
+
+ Py_SETREF(errmsg, PyUnicode_Concat(errmsg, tmp));
+ Py_DECREF(tmp);
if (errmsg == NULL) {
return -1;
}
}
- PyErr_SetObject(PyExc_IndexError, errmsg);
+ PyErr_Format(PyExc_IndexError,
+ "shape mismatch: indexing arrays could not "
+ "be broadcast together with shapes %S", errmsg);
Py_DECREF(errmsg);
return -1;
}
@@ -2648,7 +2652,6 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type,
npy_uint32 extra_op_flags, PyArrayObject *extra_op,
PyArray_Descr *extra_op_dtype)
{
- PyObject *errmsg, *tmp;
/* For shape reporting on error */
PyArrayObject *original_extra_op = extra_op;
@@ -3178,45 +3181,30 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type,
goto finish;
broadcast_error:
- errmsg = PyUString_FromString("shape mismatch: value array "
- "of shape ");
- if (errmsg == NULL) {
- goto finish;
- }
-
/* Report the shape of the original array if it exists */
if (original_extra_op == NULL) {
original_extra_op = extra_op;
}
- tmp = convert_shape_to_string(PyArray_NDIM(original_extra_op),
- PyArray_DIMS(original_extra_op), " ");
- if (tmp == NULL) {
- goto finish;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ int extra_ndim = PyArray_NDIM(original_extra_op);
+ npy_intp *extra_dims = PyArray_DIMS(original_extra_op);
+ PyObject *shape1 = convert_shape_to_string(extra_ndim, extra_dims, " ");
+ if (shape1 == NULL) {
goto finish;
}
- tmp = PyUString_FromString("could not be broadcast to indexing "
- "result of shape ");
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ PyObject *shape2 = convert_shape_to_string(mit->nd, mit->dimensions, "");
+ if (shape2 == NULL) {
+ Py_DECREF(shape1);
goto finish;
}
- tmp = convert_shape_to_string(mit->nd, mit->dimensions, "");
- if (tmp == NULL) {
- goto finish;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
- goto finish;
- }
+ PyErr_Format(PyExc_ValueError,
+ "shape mismatch: value array of shape %S could not be broadcast "
+ "to indexing result of shape %S", shape1, shape2);
- PyErr_SetObject(PyExc_ValueError, errmsg);
- Py_DECREF(errmsg);
+ Py_DECREF(shape1);
+ Py_DECREF(shape2);
finish:
Py_XDECREF(extra_op);
@@ -3315,7 +3303,7 @@ PyArray_MapIterArrayCopyIfOverlap(PyArrayObject * a, PyObject * index,
Py_XDECREF(a_copy);
Py_XDECREF(subspace);
Py_XDECREF((PyObject *)mit);
- for (i=0; i < index_num; i++) {
+ for (i = 0; i < index_num; i++) {
Py_XDECREF(indices[i].object);
}
return NULL;
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index a2db8042f..9c8bb4135 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -578,6 +578,28 @@ array_tostring(PyArrayObject *self, PyObject *args, PyObject *kwds)
return PyArray_ToString(self, order);
}
+/* Like PyArray_ToFile but takes the file as a python object */
+static int
+PyArray_ToFileObject(PyArrayObject *self, PyObject *file, char *sep, char *format)
+{
+ npy_off_t orig_pos = 0;
+ FILE *fd = npy_PyFile_Dup2(file, "wb", &orig_pos);
+
+ if (fd == NULL) {
+ return -1;
+ }
+
+ int write_ret = PyArray_ToFile(self, fd, sep, format);
+ PyObject *err_type, *err_value, *err_traceback;
+ PyErr_Fetch(&err_type, &err_value, &err_traceback);
+ int close_ret = npy_PyFile_DupClose2(file, fd, orig_pos);
+ npy_PyErr_ChainExceptions(err_type, err_value, err_traceback);
+
+ if (write_ret || close_ret) {
+ return -1;
+ }
+ return 0;
+}
/* This should grow an order= keyword to be consistent
*/
@@ -587,10 +609,8 @@ array_tofile(PyArrayObject *self, PyObject *args, PyObject *kwds)
{
int own;
PyObject *file;
- FILE *fd;
char *sep = "";
char *format = "";
- npy_off_t orig_pos = 0;
static char *kwlist[] = {"file", "sep", "format", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|ss:tofile", kwlist,
@@ -615,25 +635,22 @@ array_tofile(PyArrayObject *self, PyObject *args, PyObject *kwds)
own = 0;
}
- fd = npy_PyFile_Dup2(file, "wb", &orig_pos);
- if (fd == NULL) {
- goto fail;
- }
- if (PyArray_ToFile(self, fd, sep, format) < 0) {
- goto fail;
- }
- if (npy_PyFile_DupClose2(file, fd, orig_pos) < 0) {
- goto fail;
- }
- if (own && npy_PyFile_CloseFile(file) < 0) {
- goto fail;
+ int file_ret = PyArray_ToFileObject(self, file, sep, format);
+ int close_ret = 0;
+
+ if (own) {
+ PyObject *err_type, *err_value, *err_traceback;
+ PyErr_Fetch(&err_type, &err_value, &err_traceback);
+ close_ret = npy_PyFile_CloseFile(file);
+ npy_PyErr_ChainExceptions(err_type, err_value, err_traceback);
}
- Py_DECREF(file);
- Py_RETURN_NONE;
-fail:
Py_DECREF(file);
- return NULL;
+
+ if (file_ret || close_ret) {
+ return NULL;
+ }
+ Py_RETURN_NONE;
}
static PyObject *
@@ -844,6 +861,21 @@ array_astype(PyArrayObject *self, PyObject *args, PyObject *kwds)
if (ret == NULL) {
return NULL;
}
+ /* NumPy 1.20, 2020-10-01 */
+ if ((PyArray_NDIM(self) != PyArray_NDIM(ret)) &&
+ DEPRECATE_FUTUREWARNING(
+ "casting an array to a subarray dtype "
+ "will not using broadcasting in the future, but cast each "
+ "element to the new dtype and then append the dtype's shape "
+ "to the new array. You can opt-in to the new behaviour, by "
+ "additional field to the cast: "
+ "`arr.astype(np.dtype([('f', dtype)]))['f']`.\n"
+ "This may lead to a different result or to current failures "
+ "succeeding. "
+ "(FutureWarning since NumPy 1.20)") < 0) {
+ Py_DECREF(ret);
+ return NULL;
+ }
if (PyArray_CopyInto(ret, self) < 0) {
Py_DECREF(ret);
@@ -1508,14 +1540,14 @@ _deepcopy_call(char *iptr, char *optr, PyArray_Descr *dtype,
else {
PyObject *itemp, *otemp;
PyObject *res;
- NPY_COPY_PYOBJECT_PTR(&itemp, iptr);
- NPY_COPY_PYOBJECT_PTR(&otemp, optr);
+ memcpy(&itemp, iptr, sizeof(itemp));
+ memcpy(&otemp, optr, sizeof(otemp));
Py_XINCREF(itemp);
/* call deepcopy on this argument */
res = PyObject_CallFunctionObjArgs(deepcopy, itemp, visit, NULL);
Py_XDECREF(itemp);
Py_XDECREF(otemp);
- NPY_COPY_PYOBJECT_PTR(optr, &res);
+ memcpy(optr, &res, sizeof(res));
}
}
@@ -1616,7 +1648,7 @@ _getlist_pkl(PyArrayObject *self)
}
while (iter->index < iter->size) {
theobject = getitem(iter->dataptr, self);
- PyList_SET_ITEM(list, (int) iter->index, theobject);
+ PyList_SET_ITEM(list, iter->index, theobject);
PyArray_ITER_NEXT(iter);
}
Py_DECREF(iter);
@@ -1636,7 +1668,7 @@ _setlist_pkl(PyArrayObject *self, PyObject *list)
return -1;
}
while(iter->index < iter->size) {
- theobject = PyList_GET_ITEM(list, (int) iter->index);
+ theobject = PyList_GET_ITEM(list, iter->index);
setitem(theobject, iter->dataptr, self);
PyArray_ITER_NEXT(iter);
}
@@ -1676,7 +1708,7 @@ array_reduce(PyArrayObject *self, PyObject *NPY_UNUSED(args))
Py_BuildValue("ONc",
(PyObject *)Py_TYPE(self),
Py_BuildValue("(N)",
- PyInt_FromLong(0)),
+ PyLong_FromLong(0)),
/* dummy data-type */
'b'));
@@ -1701,7 +1733,7 @@ array_reduce(PyArrayObject *self, PyObject *NPY_UNUSED(args))
Py_DECREF(ret);
return NULL;
}
- PyTuple_SET_ITEM(state, 0, PyInt_FromLong(version));
+ PyTuple_SET_ITEM(state, 0, PyLong_FromLong(version));
PyTuple_SET_ITEM(state, 1, PyObject_GetAttrString((PyObject *)self,
"shape"));
descr = PyArray_DESCR(self);
@@ -1763,7 +1795,7 @@ array_reduce_ex_picklebuffer(PyArrayObject *self, int protocol)
#if PY_VERSION_HEX >= 0x03080000
/* we expect protocol 5 to be available in Python 3.8 */
pickle_module = PyImport_ImportModule("pickle");
-#elif PY_VERSION_HEX >= 0x03060000
+#else
pickle_module = PyImport_ImportModule("pickle5");
if (pickle_module == NULL) {
/* for protocol 5, raise a clear ImportError if pickle5 is not found
@@ -1772,10 +1804,6 @@ array_reduce_ex_picklebuffer(PyArrayObject *self, int protocol)
"requires the pickle5 module for Python >=3.6 and <3.8");
return NULL;
}
-#else
- PyErr_SetString(PyExc_ValueError, "pickle protocol 5 is not available "
- "for Python < 3.6");
- return NULL;
#endif
if (pickle_module == NULL){
return NULL;
@@ -2152,7 +2180,7 @@ static PyObject *
array_sizeof(PyArrayObject *self)
{
/* object + dimension and strides */
- Py_ssize_t nbytes = NPY_SIZEOF_PYARRAYOBJECT +
+ Py_ssize_t nbytes = Py_TYPE(self)->tp_basicsize +
PyArray_NDIM(self) * sizeof(npy_intp) * 2;
if (PyArray_CHKFLAGS(self, NPY_ARRAY_OWNDATA)) {
nbytes += PyArray_NBYTES(self);
@@ -2585,9 +2613,10 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args))
PyArrayObject *arr;
PyArray_Descr *dtype;
PyObject *c;
+
if (PyArray_SIZE(self) != 1) {
- PyErr_SetString(PyExc_TypeError, "only length-1 arrays can "\
- "be converted to Python scalars");
+ PyErr_SetString(PyExc_TypeError,
+ "only length-1 arrays can be converted to Python scalars");
return NULL;
}
@@ -2598,38 +2627,18 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args))
if (!PyArray_CanCastArrayTo(self, dtype, NPY_SAME_KIND_CASTING) &&
!(PyArray_TYPE(self) == NPY_OBJECT)) {
- PyObject *err, *msg_part;
+ PyObject *descr = (PyObject*)PyArray_DESCR(self);
+
Py_DECREF(dtype);
- err = PyString_FromString("unable to convert ");
- if (err == NULL) {
- return NULL;
- }
- msg_part = PyObject_Repr((PyObject*)PyArray_DESCR(self));
- if (msg_part == NULL) {
- Py_DECREF(err);
- return NULL;
- }
- PyString_ConcatAndDel(&err, msg_part);
- if (err == NULL) {
- return NULL;
- }
- msg_part = PyString_FromString(", to complex.");
- if (msg_part == NULL) {
- Py_DECREF(err);
- return NULL;
- }
- PyString_ConcatAndDel(&err, msg_part);
- if (err == NULL) {
- return NULL;
- }
- PyErr_SetObject(PyExc_TypeError, err);
- Py_DECREF(err);
+ PyErr_Format(PyExc_TypeError,
+ "Unable to convert %R to complex", descr);
return NULL;
}
if (PyArray_TYPE(self) == NPY_OBJECT) {
/* let python try calling __complex__ on the object. */
PyObject *args, *res;
+
Py_DECREF(dtype);
args = Py_BuildValue("(O)", *((PyObject**)PyArray_DATA(self)));
if (args == NULL) {
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 9a34685f4..32c5ac0dc 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -30,6 +30,8 @@
#include "npy_config.h"
#include "npy_pycompat.h"
#include "npy_import.h"
+#include "convert_datatype.h"
+#include "legacy_dtype_implementation.h"
NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
@@ -65,7 +67,6 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
#include "templ_common.h" /* for npy_mul_with_overflow_intp */
#include "compiled_base.h"
#include "mem_overlap.h"
-#include "alloc.h"
#include "typeinfo.h"
#include "get_attr_string.h"
@@ -363,7 +364,8 @@ PyArray_GetSubType(int narrays, PyArrayObject **arrays) {
*/
NPY_NO_EXPORT PyArrayObject *
PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
- PyArrayObject* ret)
+ PyArrayObject* ret, PyArray_Descr *dtype,
+ NPY_CASTING casting)
{
int iarrays, idim, ndim;
npy_intp shape[NPY_MAXDIMS];
@@ -427,6 +429,7 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
}
if (ret != NULL) {
+ assert(dtype == NULL);
if (PyArray_NDIM(ret) != ndim) {
PyErr_SetString(PyExc_ValueError,
"Output array has wrong dimensionality");
@@ -446,10 +449,16 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
/* Get the priority subtype for the array */
PyTypeObject *subtype = PyArray_GetSubType(narrays, arrays);
- /* Get the resulting dtype from combining all the arrays */
- PyArray_Descr *dtype = PyArray_ResultType(narrays, arrays, 0, NULL);
if (dtype == NULL) {
- return NULL;
+ /* Get the resulting dtype from combining all the arrays */
+ dtype = (PyArray_Descr *)PyArray_ResultType(
+ narrays, arrays, 0, NULL);
+ if (dtype == NULL) {
+ return NULL;
+ }
+ }
+ else {
+ Py_INCREF(dtype);
}
/*
@@ -495,7 +504,7 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
/* Copy the data for this array */
if (PyArray_AssignArray((PyArrayObject *)sliding_view, arrays[iarrays],
- NULL, NPY_SAME_KIND_CASTING) < 0) {
+ NULL, casting) < 0) {
Py_DECREF(sliding_view);
Py_DECREF(ret);
return NULL;
@@ -515,7 +524,9 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
*/
NPY_NO_EXPORT PyArrayObject *
PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
- NPY_ORDER order, PyArrayObject *ret)
+ NPY_ORDER order, PyArrayObject *ret,
+ PyArray_Descr *dtype, NPY_CASTING casting,
+ npy_bool casting_not_passed)
{
int iarrays;
npy_intp shape = 0;
@@ -542,7 +553,10 @@ PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
}
}
+ int out_passed = 0;
if (ret != NULL) {
+ assert(dtype == NULL);
+ out_passed = 1;
if (PyArray_NDIM(ret) != 1) {
PyErr_SetString(PyExc_ValueError,
"Output array must be 1D");
@@ -561,10 +575,16 @@ PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
/* Get the priority subtype for the array */
PyTypeObject *subtype = PyArray_GetSubType(narrays, arrays);
- /* Get the resulting dtype from combining all the arrays */
- PyArray_Descr *dtype = PyArray_ResultType(narrays, arrays, 0, NULL);
if (dtype == NULL) {
- return NULL;
+ /* Get the resulting dtype from combining all the arrays */
+ dtype = (PyArray_Descr *)PyArray_ResultType(
+ narrays, arrays, 0, NULL);
+ if (dtype == NULL) {
+ return NULL;
+ }
+ }
+ else {
+ Py_INCREF(dtype);
}
stride = dtype->elsize;
@@ -594,10 +614,37 @@ PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
return NULL;
}
+ int give_deprecation_warning = 1; /* To give warning for just one input array. */
for (iarrays = 0; iarrays < narrays; ++iarrays) {
/* Adjust the window dimensions for this array */
sliding_view->dimensions[0] = PyArray_SIZE(arrays[iarrays]);
+ if (!PyArray_CanCastArrayTo(
+ arrays[iarrays], PyArray_DESCR(ret), casting)) {
+ /* This should be an error, but was previously allowed here. */
+ if (casting_not_passed && out_passed) {
+ /* NumPy 1.20, 2020-09-03 */
+ if (give_deprecation_warning && DEPRECATE(
+ "concatenate() with `axis=None` will use same-kind "
+ "casting by default in the future. Please use "
+ "`casting='unsafe'` to retain the old behaviour. "
+ "In the future this will be a TypeError.") < 0) {
+ Py_DECREF(sliding_view);
+ Py_DECREF(ret);
+ return NULL;
+ }
+ give_deprecation_warning = 0;
+ }
+ else {
+ npy_set_invalid_cast_error(
+ PyArray_DESCR(arrays[iarrays]), PyArray_DESCR(ret),
+ casting, PyArray_NDIM(arrays[iarrays]) == 0);
+ Py_DECREF(sliding_view);
+ Py_DECREF(ret);
+ return NULL;
+ }
+ }
+
/* Copy the data for this array */
if (PyArray_CopyAsFlat((PyArrayObject *)sliding_view, arrays[iarrays],
order) < 0) {
@@ -615,8 +662,21 @@ PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
return ret;
}
+
+/**
+ * Implementation for np.concatenate
+ *
+ * @param op Sequence of arrays to concatenate
+ * @param axis Axis to concatenate along
+ * @param ret output array to fill
+ * @param dtype Forced output array dtype (cannot be combined with ret)
+ * @param casting Casting mode used
+ * @param casting_not_passed Deprecation helper
+ */
NPY_NO_EXPORT PyObject *
-PyArray_ConcatenateInto(PyObject *op, int axis, PyArrayObject *ret)
+PyArray_ConcatenateInto(PyObject *op,
+ int axis, PyArrayObject *ret, PyArray_Descr *dtype,
+ NPY_CASTING casting, npy_bool casting_not_passed)
{
int iarrays, narrays;
PyArrayObject **arrays;
@@ -626,6 +686,12 @@ PyArray_ConcatenateInto(PyObject *op, int axis, PyArrayObject *ret)
"The first input argument needs to be a sequence");
return NULL;
}
+ if (ret != NULL && dtype != NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "concatenate() only takes `out` or `dtype` as an "
+ "argument, but both were provided.");
+ return NULL;
+ }
/* Convert the input list into arrays */
narrays = PySequence_Size(op);
@@ -652,10 +718,13 @@ PyArray_ConcatenateInto(PyObject *op, int axis, PyArrayObject *ret)
}
if (axis >= NPY_MAXDIMS) {
- ret = PyArray_ConcatenateFlattenedArrays(narrays, arrays, NPY_CORDER, ret);
+ ret = PyArray_ConcatenateFlattenedArrays(
+ narrays, arrays, NPY_CORDER, ret, dtype,
+ casting, casting_not_passed);
}
else {
- ret = PyArray_ConcatenateArrays(narrays, arrays, axis, ret);
+ ret = PyArray_ConcatenateArrays(
+ narrays, arrays, axis, ret, dtype, casting);
}
for (iarrays = 0; iarrays < narrays; ++iarrays) {
@@ -687,7 +756,16 @@ fail:
NPY_NO_EXPORT PyObject *
PyArray_Concatenate(PyObject *op, int axis)
{
- return PyArray_ConcatenateInto(op, axis, NULL);
+ /* retain legacy behaviour for casting */
+ NPY_CASTING casting;
+ if (axis >= NPY_MAXDIMS) {
+ casting = NPY_UNSAFE_CASTING;
+ }
+ else {
+ casting = NPY_SAME_KIND_CASTING;
+ }
+ return PyArray_ConcatenateInto(
+ op, axis, NULL, NULL, casting, 0);
}
static int
@@ -1404,65 +1482,6 @@ array_putmask(PyObject *NPY_UNUSED(module), PyObject *args, PyObject *kwds)
return PyArray_PutMask((PyArrayObject *)array, values, mask);
}
-/*
- * Compare the field dictionaries for two types.
- *
- * Return 1 if the field types and field names of the two descrs are equal and
- * in the same order, 0 if not.
- */
-static int
-_equivalent_fields(PyArray_Descr *type1, PyArray_Descr *type2) {
-
- int val;
-
- if (type1->fields == type2->fields && type1->names == type2->names) {
- return 1;
- }
- if (type1->fields == NULL || type2->fields == NULL) {
- return 0;
- }
-
- val = PyObject_RichCompareBool(type1->fields, type2->fields, Py_EQ);
- if (val != 1 || PyErr_Occurred()) {
- PyErr_Clear();
- return 0;
- }
-
- val = PyObject_RichCompareBool(type1->names, type2->names, Py_EQ);
- if (val != 1 || PyErr_Occurred()) {
- PyErr_Clear();
- return 0;
- }
-
- return 1;
-}
-
-/*
- * Compare the subarray data for two types.
- * Return 1 if they are the same, 0 if not.
- */
-static int
-_equivalent_subarrays(PyArray_ArrayDescr *sub1, PyArray_ArrayDescr *sub2)
-{
- int val;
-
- if (sub1 == sub2) {
- return 1;
-
- }
- if (sub1 == NULL || sub2 == NULL) {
- return 0;
- }
-
- val = PyObject_RichCompareBool(sub1->shape, sub2->shape, Py_EQ);
- if (val != 1 || PyErr_Occurred()) {
- PyErr_Clear();
- return 0;
- }
-
- return PyArray_EquivTypes(sub1->base, sub2->base);
-}
-
/*NUMPY_API
*
@@ -1472,40 +1491,24 @@ _equivalent_subarrays(PyArray_ArrayDescr *sub1, PyArray_ArrayDescr *sub2)
NPY_NO_EXPORT unsigned char
PyArray_EquivTypes(PyArray_Descr *type1, PyArray_Descr *type2)
{
- int type_num1, type_num2, size1, size2;
-
- if (type1 == type2) {
- return NPY_TRUE;
- }
-
- type_num1 = type1->type_num;
- type_num2 = type2->type_num;
- size1 = type1->elsize;
- size2 = type2->elsize;
-
- if (size1 != size2) {
- return NPY_FALSE;
- }
- if (PyArray_ISNBO(type1->byteorder) != PyArray_ISNBO(type2->byteorder)) {
- return NPY_FALSE;
- }
- if (type1->subarray || type2->subarray) {
- return ((type_num1 == type_num2)
- && _equivalent_subarrays(type1->subarray, type2->subarray));
- }
- if (type_num1 == NPY_VOID || type_num2 == NPY_VOID) {
- return ((type_num1 == type_num2) && _equivalent_fields(type1, type2));
- }
- if (type_num1 == NPY_DATETIME
- || type_num1 == NPY_TIMEDELTA
- || type_num2 == NPY_DATETIME
- || type_num2 == NPY_TIMEDELTA) {
- return ((type_num1 == type_num2)
- && has_equivalent_datetime_metadata(type1, type2));
+#if NPY_USE_NEW_CASTINGIMPL
+ /*
+ * Do not use PyArray_CanCastTypeTo because it supports legacy flexible
+ * dtypes as input.
+ */
+ NPY_CASTING safety = PyArray_GetCastSafety(type1, type2, NULL);
+ if (safety < 0) {
+ PyErr_Clear();
+ return 0;
}
- return type1->kind == type2->kind;
+ /* If casting is "no casting" the dtypes are considered equivalent. */
+ return PyArray_MinCastSafety(safety, NPY_NO_CASTING) == NPY_NO_CASTING;
+#else
+ return PyArray_LegacyEquivTypes(type1, type2);
+#endif
}
+
/*NUMPY_API*/
NPY_NO_EXPORT unsigned char
PyArray_EquivTypenums(int typenum1, int typenum2)
@@ -1582,13 +1585,16 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws)
npy_bool subok = NPY_FALSE;
npy_bool copy = NPY_TRUE;
int ndmin = 0, nd;
+ PyObject* like;
PyArray_Descr *type = NULL;
PyArray_Descr *oldtype = NULL;
NPY_ORDER order = NPY_KEEPORDER;
int flags = 0;
- static char *kwd[]= {"object", "dtype", "copy", "order", "subok",
- "ndmin", NULL};
+ PyObject* array_function_result = NULL;
+
+ static char *kwd[] = {"object", "dtype", "copy", "order", "subok",
+ "ndmin", "like", NULL};
if (PyTuple_GET_SIZE(args) > 2) {
PyErr_Format(PyExc_TypeError,
@@ -1597,6 +1603,12 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws)
return NULL;
}
+ array_function_result = array_implement_c_array_function_creation(
+ "array", args, kws);
+ if (array_function_result != Py_NotImplemented) {
+ return array_function_result;
+ }
+
/* super-fast path for ndarray argument calls */
if (PyTuple_GET_SIZE(args) == 0) {
goto full_path;
@@ -1674,13 +1686,14 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws)
}
full_path:
- if (!PyArg_ParseTupleAndKeywords(args, kws, "O|O&O&O&O&i:array", kwd,
+ if (!PyArg_ParseTupleAndKeywords(args, kws, "O|O&O&O&O&i$O:array", kwd,
&op,
PyArray_DescrConverter2, &type,
PyArray_BoolConverter, &copy,
PyArray_OrderConverter, &order,
PyArray_BoolConverter, &subok,
- &ndmin)) {
+ &ndmin,
+ &like)) {
goto clean_type;
}
@@ -1817,20 +1830,29 @@ static PyObject *
array_empty(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
{
- static char *kwlist[] = {"shape", "dtype", "order", NULL};
+ static char *kwlist[] = {"shape", "dtype", "order", "like", NULL};
PyArray_Descr *typecode = NULL;
PyArray_Dims shape = {NULL, 0};
NPY_ORDER order = NPY_CORDER;
+ PyObject *like = NULL;
npy_bool is_f_order;
+ PyObject *array_function_result = NULL;
PyArrayObject *ret = NULL;
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&:empty", kwlist,
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&$O:empty", kwlist,
PyArray_IntpConverter, &shape,
PyArray_DescrConverter, &typecode,
- PyArray_OrderConverter, &order)) {
+ PyArray_OrderConverter, &order,
+ &like)) {
goto fail;
}
+ array_function_result = array_implement_c_array_function_creation(
+ "empty", args, kwds);
+ if (array_function_result != Py_NotImplemented) {
+ return array_function_result;
+ }
+
switch (order) {
case NPY_CORDER:
is_f_order = NPY_FALSE;
@@ -1908,20 +1930,41 @@ array_scalar(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
int alloc = 0;
void *dptr;
PyObject *ret;
-
+ PyObject *base = NULL;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!|O:scalar", kwlist,
&PyArrayDescr_Type, &typecode, &obj)) {
return NULL;
}
if (PyDataType_FLAGCHK(typecode, NPY_LIST_PICKLE)) {
- if (!PySequence_Check(obj)) {
- PyErr_SetString(PyExc_TypeError,
- "found non-sequence while unpickling scalar with "
- "NPY_LIST_PICKLE set");
+ if (typecode->type_num == NPY_OBJECT) {
+ /* Deprecated 2020-11-24, NumPy 1.20 */
+ if (DEPRECATE(
+ "Unpickling a scalar with object dtype is deprecated. "
+ "Object scalars should never be created. If this was a "
+ "properly created pickle, please open a NumPy issue. In "
+ "a best effort this returns the original object.") < 0) {
+ return NULL;
+ }
+ Py_INCREF(obj);
+ return obj;
+ }
+ /* We store the full array to unpack it here: */
+ if (!PyArray_CheckExact(obj)) {
+ /* We pickle structured voids as arrays currently */
+ PyErr_SetString(PyExc_RuntimeError,
+ "Unpickling NPY_LIST_PICKLE (structured void) scalar "
+ "requires an array. The pickle file may be corrupted?");
return NULL;
}
- dptr = &obj;
+ if (!PyArray_EquivTypes(PyArray_DESCR((PyArrayObject *)obj), typecode)) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "Pickled array is not compatible with requested scalar "
+ "dtype. The pickle file may be corrupted?");
+ return NULL;
+ }
+ base = obj;
+ dptr = PyArray_BYTES((PyArrayObject *)obj);
}
else if (PyDataType_FLAGCHK(typecode, NPY_ITEM_IS_POINTER)) {
@@ -1956,22 +1999,22 @@ array_scalar(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
return NULL;
}
}
- if (!PyString_Check(obj)) {
+ if (!PyBytes_Check(obj)) {
PyErr_SetString(PyExc_TypeError,
- "initializing object must be a string");
+ "initializing object must be a bytes object");
Py_XDECREF(tmpobj);
return NULL;
}
- if (PyString_GET_SIZE(obj) < typecode->elsize) {
+ if (PyBytes_GET_SIZE(obj) < typecode->elsize) {
PyErr_SetString(PyExc_ValueError,
"initialization string is too small");
Py_XDECREF(tmpobj);
return NULL;
}
- dptr = PyString_AS_STRING(obj);
+ dptr = PyBytes_AS_STRING(obj);
}
}
- ret = PyArray_Scalar(dptr, typecode, NULL);
+ ret = PyArray_Scalar(dptr, typecode, base);
/* free dptr which contains zeros */
if (alloc) {
@@ -1984,20 +2027,29 @@ array_scalar(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
static PyObject *
array_zeros(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
{
- static char *kwlist[] = {"shape", "dtype", "order", NULL};
+ static char *kwlist[] = {"shape", "dtype", "order", "like", NULL};
PyArray_Descr *typecode = NULL;
PyArray_Dims shape = {NULL, 0};
NPY_ORDER order = NPY_CORDER;
+ PyObject *like = NULL;
npy_bool is_f_order = NPY_FALSE;
+ PyObject *array_function_result = NULL;
PyArrayObject *ret = NULL;
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&:zeros", kwlist,
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&$O:zeros", kwlist,
PyArray_IntpConverter, &shape,
PyArray_DescrConverter, &typecode,
- PyArray_OrderConverter, &order)) {
+ PyArray_OrderConverter, &order,
+ &like)) {
goto fail;
}
+ array_function_result = array_implement_c_array_function_creation(
+ "zeros", args, kwds);
+ if (array_function_result != Py_NotImplemented) {
+ return array_function_result;
+ }
+
switch (order) {
case NPY_CORDER:
is_f_order = NPY_FALSE;
@@ -2050,16 +2102,24 @@ array_fromstring(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds
Py_ssize_t nin = -1;
char *sep = NULL;
Py_ssize_t s;
- static char *kwlist[] = {"string", "dtype", "count", "sep", NULL};
+ static char *kwlist[] = {"string", "dtype", "count", "sep", "like", NULL};
+ PyObject *like = NULL;
PyArray_Descr *descr = NULL;
+ PyObject *array_function_result = NULL;
if (!PyArg_ParseTupleAndKeywords(args, keywds,
- "s#|O&" NPY_SSIZE_T_PYFMT "s:fromstring", kwlist,
- &data, &s, PyArray_DescrConverter, &descr, &nin, &sep)) {
+ "s#|O&" NPY_SSIZE_T_PYFMT "s$O:fromstring", kwlist,
+ &data, &s, PyArray_DescrConverter, &descr, &nin, &sep, &like)) {
Py_XDECREF(descr);
return NULL;
}
+ array_function_result = array_implement_c_array_function_creation(
+ "fromstring", args, keywds);
+ if (array_function_result != Py_NotImplemented) {
+ return array_function_result;
+ }
+
/* binary mode, condition copied from PyArray_FromString */
if (sep == NULL || strlen(sep) == 0) {
/* Numpy 1.14, 2017-10-19 */
@@ -2082,19 +2142,27 @@ array_fromfile(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds)
PyObject *err_type = NULL, *err_value = NULL, *err_traceback = NULL;
char *sep = "";
Py_ssize_t nin = -1;
- static char *kwlist[] = {"file", "dtype", "count", "sep", "offset", NULL};
+ static char *kwlist[] = {"file", "dtype", "count", "sep", "offset", "like", NULL};
+ PyObject *like = NULL;
PyArray_Descr *type = NULL;
+ PyObject *array_function_result = NULL;
int own;
npy_off_t orig_pos = 0, offset = 0;
FILE *fp;
if (!PyArg_ParseTupleAndKeywords(args, keywds,
- "O|O&" NPY_SSIZE_T_PYFMT "s" NPY_OFF_T_PYFMT ":fromfile", kwlist,
- &file, PyArray_DescrConverter, &type, &nin, &sep, &offset)) {
+ "O|O&" NPY_SSIZE_T_PYFMT "s" NPY_OFF_T_PYFMT "$O:fromfile", kwlist,
+ &file, PyArray_DescrConverter, &type, &nin, &sep, &offset, &like)) {
Py_XDECREF(type);
return NULL;
}
+ array_function_result = array_implement_c_array_function_creation(
+ "fromfile", args, keywds);
+ if (array_function_result != Py_NotImplemented) {
+ return array_function_result;
+ }
+
file = NpyPath_PathlikeToFspath(file);
if (file == NULL) {
return NULL;
@@ -2106,7 +2174,7 @@ array_fromfile(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds)
Py_DECREF(file);
return NULL;
}
- if (PyString_Check(file) || PyUnicode_Check(file)) {
+ if (PyBytes_Check(file) || PyUnicode_Check(file)) {
Py_SETREF(file, npy_PyFile_OpenFile(file, "rb"));
if (file == NULL) {
Py_XDECREF(type);
@@ -2161,15 +2229,25 @@ array_fromiter(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds)
{
PyObject *iter;
Py_ssize_t nin = -1;
- static char *kwlist[] = {"iter", "dtype", "count", NULL};
+ static char *kwlist[] = {"iter", "dtype", "count", "like", NULL};
+ PyObject *like = NULL;
PyArray_Descr *descr = NULL;
+ PyObject *array_function_result = NULL;
if (!PyArg_ParseTupleAndKeywords(args, keywds,
- "OO&|" NPY_SSIZE_T_PYFMT ":fromiter", kwlist,
- &iter, PyArray_DescrConverter, &descr, &nin)) {
+ "OO&|" NPY_SSIZE_T_PYFMT "$O:fromiter", kwlist,
+ &iter, PyArray_DescrConverter, &descr, &nin, &like)) {
Py_XDECREF(descr);
return NULL;
}
+
+ array_function_result = array_implement_c_array_function_creation(
+ "fromiter", args, keywds);
+ if (array_function_result != Py_NotImplemented) {
+ Py_DECREF(descr);
+ return array_function_result;
+ }
+
return PyArray_FromIter(iter, descr, (npy_intp)nin);
}
@@ -2178,15 +2256,24 @@ array_frombuffer(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds
{
PyObject *obj = NULL;
Py_ssize_t nin = -1, offset = 0;
- static char *kwlist[] = {"buffer", "dtype", "count", "offset", NULL};
+ static char *kwlist[] = {"buffer", "dtype", "count", "offset", "like", NULL};
+ PyObject *like = NULL;
PyArray_Descr *type = NULL;
+ PyObject *array_function_result = NULL;
if (!PyArg_ParseTupleAndKeywords(args, keywds,
- "O|O&" NPY_SSIZE_T_PYFMT NPY_SSIZE_T_PYFMT ":frombuffer", kwlist,
- &obj, PyArray_DescrConverter, &type, &nin, &offset)) {
+ "O|O&" NPY_SSIZE_T_PYFMT NPY_SSIZE_T_PYFMT "$O:frombuffer", kwlist,
+ &obj, PyArray_DescrConverter, &type, &nin, &offset, &like)) {
Py_XDECREF(type);
return NULL;
}
+
+ array_function_result = array_implement_c_array_function_creation(
+ "frombuffer", args, keywds);
+ if (array_function_result != Py_NotImplemented) {
+ return array_function_result;
+ }
+
if (type == NULL) {
type = PyArray_DescrFromType(NPY_DEFAULT_TYPE);
}
@@ -2198,11 +2285,27 @@ array_concatenate(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
{
PyObject *a0;
PyObject *out = NULL;
+ PyArray_Descr *dtype = NULL;
+ NPY_CASTING casting = NPY_SAME_KIND_CASTING;
+ PyObject *casting_obj = NULL;
+ PyObject *res;
int axis = 0;
- static char *kwlist[] = {"seq", "axis", "out", NULL};
-
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&O:concatenate", kwlist,
- &a0, PyArray_AxisConverter, &axis, &out)) {
+ static char *kwlist[] = {"seq", "axis", "out", "dtype", "casting", NULL};
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&O$O&O:concatenate", kwlist,
+ &a0, PyArray_AxisConverter, &axis, &out,
+ PyArray_DescrConverter2, &dtype, &casting_obj)) {
+ return NULL;
+ }
+ int casting_not_passed = 0;
+ if (casting_obj == NULL) {
+ /*
+ * Casting was not passed in; this flag is needed only for the deprecation.
+ * This should be simplified once the deprecation is finished.
+ */
+ casting_not_passed = 1;
+ }
+ else if (!PyArray_CastingConverter(casting_obj, &casting)) {
+ Py_XDECREF(dtype);
return NULL;
}
if (out != NULL) {
@@ -2211,10 +2314,14 @@ array_concatenate(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
}
else if (!PyArray_Check(out)) {
PyErr_SetString(PyExc_TypeError, "'out' must be an array");
+ Py_XDECREF(dtype);
return NULL;
}
}
- return PyArray_ConcatenateInto(a0, axis, (PyArrayObject *)out);
+ res = PyArray_ConcatenateInto(a0, axis, (PyArrayObject *)out, dtype,
+ casting, casting_not_passed);
+ Py_XDECREF(dtype);
+ return res;
}
static PyObject *
@@ -2635,7 +2742,7 @@ array_einsum(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
arg0 = PyTuple_GET_ITEM(args, 0);
/* einsum('i,j', a, b), einsum('i,j->ij', a, b) */
- if (PyString_Check(arg0) || PyUnicode_Check(arg0)) {
+ if (PyBytes_Check(arg0) || PyUnicode_Check(arg0)) {
nop = einsum_sub_op_from_str(args, &str_obj, &subscripts, op);
}
/* einsum(a, [0], b, [1]), einsum(a, [0], b, [1], [0,1]) */
@@ -2766,17 +2873,41 @@ array_correlate2(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
static PyObject *
array_arange(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws) {
PyObject *o_start = NULL, *o_stop = NULL, *o_step = NULL, *range=NULL;
- static char *kwd[]= {"start", "stop", "step", "dtype", NULL};
+ PyObject *like = NULL;
+ PyObject *array_function_result = NULL;
+ static char *kwd[] = {"start", "stop", "step", "dtype", "like", NULL};
PyArray_Descr *typecode = NULL;
- if (!PyArg_ParseTupleAndKeywords(args, kws, "O|OOO&:arange", kwd,
+ if (!PyArg_ParseTupleAndKeywords(args, kws, "|OOOO&$O:arange", kwd,
&o_start,
&o_stop,
&o_step,
- PyArray_DescrConverter2, &typecode)) {
+ PyArray_DescrConverter2, &typecode,
+ &like)) {
Py_XDECREF(typecode);
return NULL;
}
+
+ if (o_stop == NULL) {
+ if (args == NULL || PyTuple_GET_SIZE(args) == 0){
+ PyErr_SetString(PyExc_TypeError,
+ "arange() requires stop to be specified.");
+ Py_XDECREF(typecode);
+ return NULL;
+ }
+ }
+ else if (o_start == NULL) {
+ o_start = o_stop;
+ o_stop = NULL;
+ }
+
+ array_function_result = array_implement_c_array_function_creation(
+ "arange", args, kws);
+ if (array_function_result != Py_NotImplemented) {
+ Py_XDECREF(typecode);
+ return array_function_result;
+ }
+
range = PyArray_ArangeObj(o_start, o_stop, o_step, typecode);
Py_XDECREF(typecode);
@@ -2810,7 +2941,7 @@ array__get_ndarray_c_version(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObje
if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist )) {
return NULL;
}
- return PyInt_FromLong( (long) PyArray_GetNDArrayCVersion() );
+ return PyLong_FromLong( (long) PyArray_GetNDArrayCVersion() );
}
/*NUMPY_API
@@ -3708,7 +3839,7 @@ _vec_string(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUSED(kw
}
if (PyArray_TYPE(char_array) == NPY_STRING) {
- method = PyObject_GetAttr((PyObject *)&PyString_Type, method_name);
+ method = PyObject_GetAttr((PyObject *)&PyBytes_Type, method_name);
}
else if (PyArray_TYPE(char_array) == NPY_UNICODE) {
method = PyObject_GetAttr((PyObject *)&PyUnicode_Type, method_name);
@@ -3804,36 +3935,6 @@ _PyArray_GetSigintBuf(void)
static PyObject *
-test_interrupt(PyObject *NPY_UNUSED(self), PyObject *args)
-{
- int kind = 0;
- int a = 0;
-
- if (!PyArg_ParseTuple(args, "|i:test_interrupt", &kind)) {
- return NULL;
- }
- if (kind) {
- Py_BEGIN_ALLOW_THREADS;
- while (a >= 0) {
- if ((a % 1000 == 0) && PyOS_InterruptOccurred()) {
- break;
- }
- a += 1;
- }
- Py_END_ALLOW_THREADS;
- }
- else {
- NPY_SIGINT_ON
- while(a >= 0) {
- a += 1;
- }
- NPY_SIGINT_OFF
- }
- return PyInt_FromLong(a);
-}
-
-
-static PyObject *
array_shares_memory_impl(PyObject *args, PyObject *kwds, Py_ssize_t default_max_work,
int raise_exceptions)
{
@@ -3980,7 +4081,7 @@ normalize_axis_index(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
return NULL;
}
- return PyInt_FromLong(axis);
+ return PyLong_FromLong(axis);
}
@@ -4128,9 +4229,6 @@ static struct PyMethodDef array_module_methods[] = {
{"_vec_string",
(PyCFunction)_vec_string,
METH_VARARGS | METH_KEYWORDS, NULL},
- {"test_interrupt",
- (PyCFunction)test_interrupt,
- METH_VARARGS, NULL},
{"_insert", (PyCFunction)arr_insert,
METH_VARARGS | METH_KEYWORDS,
"Insert vals sequentially into equivalent 1-d positions "
@@ -4162,6 +4260,8 @@ static struct PyMethodDef array_module_methods[] = {
METH_VARARGS, NULL},
{"_discover_array_parameters", (PyCFunction)_discover_array_parameters,
METH_VARARGS | METH_KEYWORDS, NULL},
+ {"_get_castingimpl", (PyCFunction)_get_castingimpl,
+ METH_VARARGS | METH_KEYWORDS, NULL},
/* from umath */
{"frompyfunc",
(PyCFunction) ufunc_frompyfunc,
@@ -4180,6 +4280,7 @@ static struct PyMethodDef array_module_methods[] = {
};
#include "__multiarray_api.c"
+#include "array_method.h"
/* Establish scalar-type hierarchy
*
@@ -4202,7 +4303,7 @@ setup_scalartypes(PyObject *NPY_UNUSED(dict))
if (PyType_Ready(&PyComplex_Type) < 0) {
return -1;
}
- if (PyType_Ready(&PyString_Type) < 0) {
+ if (PyType_Ready(&PyBytes_Type) < 0) {
return -1;
}
if (PyType_Ready(&PyUnicode_Type) < 0) {
@@ -4274,12 +4375,6 @@ setup_scalartypes(PyObject *NPY_UNUSED(dict))
/* Timedelta is an integer with an associated unit */
SINGLE_INHERIT(Timedelta, SignedInteger);
- /*
- fprintf(stderr,
- "tp_free = %p, PyObject_Del = %p, int_tp_free = %p, base.tp_free = %p\n",
- PyIntArrType_Type.tp_free, PyObject_Del, PyInt_Type.tp_free,
- PySignedIntegerArrType_Type.tp_free);
- */
SINGLE_INHERIT(UByte, UnsignedInteger);
SINGLE_INHERIT(UShort, UnsignedInteger);
SINGLE_INHERIT(UInt, UnsignedInteger);
@@ -4325,13 +4420,13 @@ set_flaginfo(PyObject *d)
newd = PyDict_New();
#define _addnew(key, val, one) \
- PyDict_SetItemString(newd, #key, s=PyInt_FromLong(val)); \
+ PyDict_SetItemString(newd, #key, s=PyLong_FromLong(val)); \
Py_DECREF(s); \
- PyDict_SetItemString(newd, #one, s=PyInt_FromLong(val)); \
+ PyDict_SetItemString(newd, #one, s=PyLong_FromLong(val)); \
Py_DECREF(s)
#define _addone(key, val) \
- PyDict_SetItemString(newd, #key, s=PyInt_FromLong(val)); \
+ PyDict_SetItemString(newd, #key, s=PyLong_FromLong(val)); \
Py_DECREF(s)
_addnew(OWNDATA, NPY_ARRAY_OWNDATA, O);
@@ -4364,28 +4459,33 @@ NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_dtype = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_ndmin = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_axis1 = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_axis2 = NULL;
+NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_like = NULL;
+NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_numpy = NULL;
static int
intern_strings(void)
{
- npy_ma_str_array = PyUString_InternFromString("__array__");
- npy_ma_str_array_prepare = PyUString_InternFromString("__array_prepare__");
- npy_ma_str_array_wrap = PyUString_InternFromString("__array_wrap__");
- npy_ma_str_array_finalize = PyUString_InternFromString("__array_finalize__");
- npy_ma_str_ufunc = PyUString_InternFromString("__array_ufunc__");
- npy_ma_str_implementation = PyUString_InternFromString("_implementation");
- npy_ma_str_order = PyUString_InternFromString("order");
- npy_ma_str_copy = PyUString_InternFromString("copy");
- npy_ma_str_dtype = PyUString_InternFromString("dtype");
- npy_ma_str_ndmin = PyUString_InternFromString("ndmin");
- npy_ma_str_axis1 = PyUString_InternFromString("axis1");
- npy_ma_str_axis2 = PyUString_InternFromString("axis2");
+ npy_ma_str_array = PyUnicode_InternFromString("__array__");
+ npy_ma_str_array_prepare = PyUnicode_InternFromString("__array_prepare__");
+ npy_ma_str_array_wrap = PyUnicode_InternFromString("__array_wrap__");
+ npy_ma_str_array_finalize = PyUnicode_InternFromString("__array_finalize__");
+ npy_ma_str_ufunc = PyUnicode_InternFromString("__array_ufunc__");
+ npy_ma_str_implementation = PyUnicode_InternFromString("_implementation");
+ npy_ma_str_order = PyUnicode_InternFromString("order");
+ npy_ma_str_copy = PyUnicode_InternFromString("copy");
+ npy_ma_str_dtype = PyUnicode_InternFromString("dtype");
+ npy_ma_str_ndmin = PyUnicode_InternFromString("ndmin");
+ npy_ma_str_axis1 = PyUnicode_InternFromString("axis1");
+ npy_ma_str_axis2 = PyUnicode_InternFromString("axis2");
+ npy_ma_str_like = PyUnicode_InternFromString("like");
+ npy_ma_str_numpy = PyUnicode_InternFromString("numpy");
return npy_ma_str_array && npy_ma_str_array_prepare &&
npy_ma_str_array_wrap && npy_ma_str_array_finalize &&
npy_ma_str_ufunc && npy_ma_str_implementation &&
npy_ma_str_order && npy_ma_str_copy && npy_ma_str_dtype &&
- npy_ma_str_ndmin && npy_ma_str_axis1 && npy_ma_str_axis2;
+ npy_ma_str_ndmin && npy_ma_str_axis1 && npy_ma_str_axis2 &&
+ npy_ma_str_like && npy_ma_str_numpy;
}
static struct PyModuleDef moduledef = {
@@ -4510,14 +4610,14 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
goto err;
}
- c_api = NpyCapsule_FromVoidPtr((void *)PyArray_API, NULL);
+ c_api = PyCapsule_New((void *)PyArray_API, NULL, NULL);
if (c_api == NULL) {
goto err;
}
PyDict_SetItemString(d, "_ARRAY_API", c_api);
Py_DECREF(c_api);
- c_api = NpyCapsule_FromVoidPtr((void *)PyUFunc_API, NULL);
+ c_api = PyCapsule_New((void *)PyUFunc_API, NULL, NULL);
if (c_api == NULL) {
goto err;
}
@@ -4535,11 +4635,11 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
*/
PyDict_SetItemString (d, "error", PyExc_Exception);
- s = PyInt_FromLong(NPY_TRACE_DOMAIN);
+ s = PyLong_FromLong(NPY_TRACE_DOMAIN);
PyDict_SetItemString(d, "tracemalloc_domain", s);
Py_DECREF(s);
- s = PyUString_FromString("3.1");
+ s = PyUnicode_FromString("3.1");
PyDict_SetItemString(d, "__version__", s);
Py_DECREF(s);
@@ -4573,7 +4673,7 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
}
Py_DECREF(s);
- s = NpyCapsule_FromVoidPtr((void *)_datetime_strings, NULL);
+ s = PyCapsule_New((void *)_datetime_strings, NULL, NULL);
if (s == NULL) {
goto err;
}
@@ -4581,7 +4681,7 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
Py_DECREF(s);
#define ADDCONST(NAME) \
- s = PyInt_FromLong(NPY_##NAME); \
+ s = PyLong_FromLong(NPY_##NAME); \
PyDict_SetItemString(d, #NAME, s); \
Py_DECREF(s)
@@ -4631,9 +4731,20 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
if (set_typeinfo(d) != 0) {
goto err;
}
+ if (PyType_Ready(&PyArrayMethod_Type) < 0) {
+ goto err;
+ }
+ if (PyType_Ready(&PyBoundArrayMethod_Type) < 0) {
+ goto err;
+ }
if (initialize_and_map_pytypes_to_dtypes() < 0) {
goto err;
}
+
+ if (PyArray_InitializeCasts() < 0) {
+ goto err;
+ }
+
if (initumath(m) != 0) {
goto err;
}
diff --git a/numpy/core/src/multiarray/multiarraymodule.h b/numpy/core/src/multiarray/multiarraymodule.h
index dd437e091..d3ee3337c 100644
--- a/numpy/core/src/multiarray/multiarraymodule.h
+++ b/numpy/core/src/multiarray/multiarraymodule.h
@@ -13,5 +13,7 @@ NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_dtype;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_ndmin;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_axis1;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_axis2;
+NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_like;
+NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_numpy;
#endif
diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c
index a5b5e5c51..059f2c437 100644
--- a/numpy/core/src/multiarray/nditer_api.c
+++ b/numpy/core/src/multiarray/nditer_api.c
@@ -229,13 +229,22 @@ NpyIter_EnableExternalLoop(NpyIter *iter)
return NpyIter_Reset(iter, NULL);
}
+
+static char *_reset_cast_error = (
+ "Iterator reset failed due to a casting failure. "
+ "This error is set as a Python error.");
+
/*NUMPY_API
* Resets the iterator to its initial state
*
+ * The use of errmsg is discouraged; it cannot be guaranteed that the GIL
+ * will not be grabbed on casting errors even when this is passed.
+ *
* If errmsg is non-NULL, it should point to a variable which will
* receive the error message, and no Python exception will be set.
* This is so that the function can be called from code not holding
- * the GIL.
+ * the GIL. Note that cast errors may still lead to the GIL being
+ * grabbed temporarily.
*/
NPY_NO_EXPORT int
NpyIter_Reset(NpyIter *iter, char **errmsg)
@@ -250,6 +259,9 @@ NpyIter_Reset(NpyIter *iter, char **errmsg)
/* If buffer allocation was delayed, do it now */
if (itflags&NPY_ITFLAG_DELAYBUF) {
if (!npyiter_allocate_buffers(iter, errmsg)) {
+ if (errmsg != NULL) {
+ *errmsg = _reset_cast_error;
+ }
return NPY_FAIL;
}
NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_DELAYBUF;
@@ -257,7 +269,7 @@ NpyIter_Reset(NpyIter *iter, char **errmsg)
else {
/*
* If the iterindex is already right, no need to
- * do anything
+ * do anything (and no cast error has previously occurred).
*/
bufferdata = NIT_BUFFERDATA(iter);
if (NIT_ITERINDEX(iter) == NIT_ITERSTART(iter) &&
@@ -265,9 +277,12 @@ NpyIter_Reset(NpyIter *iter, char **errmsg)
NBF_SIZE(bufferdata) > 0) {
return NPY_SUCCEED;
}
-
- /* Copy any data from the buffers back to the arrays */
- npyiter_copy_from_buffers(iter);
+ if (npyiter_copy_from_buffers(iter) < 0) {
+ if (errmsg != NULL) {
+ *errmsg = _reset_cast_error;
+ }
+ return NPY_FAIL;
+ }
}
}
@@ -275,7 +290,12 @@ NpyIter_Reset(NpyIter *iter, char **errmsg)
if (itflags&NPY_ITFLAG_BUFFER) {
/* Prepare the next buffers and set iterend/size */
- npyiter_copy_to_buffers(iter, NULL);
+ if (npyiter_copy_to_buffers(iter, NULL) < 0) {
+ if (errmsg != NULL) {
+ *errmsg = _reset_cast_error;
+ }
+ return NPY_FAIL;
+ }
}
return NPY_SUCCEED;
@@ -288,7 +308,8 @@ NpyIter_Reset(NpyIter *iter, char **errmsg)
* If errmsg is non-NULL, it should point to a variable which will
* receive the error message, and no Python exception will be set.
* This is so that the function can be called from code not holding
- * the GIL.
+ * the GIL. Note that cast errors may still lead to the GIL being
+ * grabbed temporarily.
*/
NPY_NO_EXPORT int
NpyIter_ResetBasePointers(NpyIter *iter, char **baseptrs, char **errmsg)
@@ -309,8 +330,12 @@ NpyIter_ResetBasePointers(NpyIter *iter, char **baseptrs, char **errmsg)
NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_DELAYBUF;
}
else {
- /* Copy any data from the buffers back to the arrays */
- npyiter_copy_from_buffers(iter);
+ if (npyiter_copy_from_buffers(iter) < 0) {
+ if (errmsg != NULL) {
+ *errmsg = _reset_cast_error;
+ }
+ return NPY_FAIL;
+ }
}
}
@@ -323,7 +348,12 @@ NpyIter_ResetBasePointers(NpyIter *iter, char **baseptrs, char **errmsg)
if (itflags&NPY_ITFLAG_BUFFER) {
/* Prepare the next buffers and set iterend/size */
- npyiter_copy_to_buffers(iter, NULL);
+ if (npyiter_copy_to_buffers(iter, NULL) < 0) {
+ if (errmsg != NULL) {
+ *errmsg = _reset_cast_error;
+ }
+ return NPY_FAIL;
+ }
}
return NPY_SUCCEED;
@@ -335,7 +365,8 @@ NpyIter_ResetBasePointers(NpyIter *iter, char **baseptrs, char **errmsg)
* If errmsg is non-NULL, it should point to a variable which will
* receive the error message, and no Python exception will be set.
* This is so that the function can be called from code not holding
- * the GIL.
+ * the GIL. Note that cast errors may still lead to the GIL being
+ * grabbed temporarily.
*/
NPY_NO_EXPORT int
NpyIter_ResetToIterIndexRange(NpyIter *iter,
@@ -633,12 +664,16 @@ NpyIter_GotoIterIndex(NpyIter *iter, npy_intp iterindex)
/* Start the buffer at the provided iterindex */
else {
/* Write back to the arrays */
- npyiter_copy_from_buffers(iter);
+ if (npyiter_copy_from_buffers(iter) < 0) {
+ return NPY_FAIL;
+ }
npyiter_goto_iterindex(iter, iterindex);
/* Prepare the next buffers and set iterend/size */
- npyiter_copy_to_buffers(iter, NULL);
+ if (npyiter_copy_to_buffers(iter, NULL) < 0) {
+ return NPY_FAIL;
+ }
}
}
else {
@@ -1376,6 +1411,7 @@ NpyIter_GetInnerLoopSizePtr(NpyIter *iter)
}
}
+
/*NUMPY_API
* For debugging
*/
@@ -1828,7 +1864,7 @@ npyiter_goto_iterindex(NpyIter *iter, npy_intp iterindex)
* their data needs to be written back to the arrays. The multi-index
* must be positioned for the beginning of the buffer.
*/
-NPY_NO_EXPORT void
+NPY_NO_EXPORT int
npyiter_copy_from_buffers(NpyIter *iter)
{
npy_uint32 itflags = NIT_ITFLAGS(iter);
@@ -1861,7 +1897,7 @@ npyiter_copy_from_buffers(NpyIter *iter)
/* If we're past the end, nothing to copy */
if (NBF_SIZE(bufferdata) == 0) {
- return;
+ return 0;
}
NPY_IT_DBG_PRINT("Iterator: Copying buffers to outputs\n");
@@ -1968,7 +2004,7 @@ npyiter_copy_from_buffers(NpyIter *iter)
maskptr = (npy_bool *)ad_ptrs[maskop];
}
- PyArray_TransferMaskedStridedToNDim(ndim_transfer,
+ if (PyArray_TransferMaskedStridedToNDim(ndim_transfer,
ad_ptrs[iop], dst_strides, axisdata_incr,
buffer, src_stride,
maskptr, strides[maskop],
@@ -1976,18 +2012,22 @@ npyiter_copy_from_buffers(NpyIter *iter)
dst_shape, axisdata_incr,
op_transfersize, dtypes[iop]->elsize,
(PyArray_MaskedStridedUnaryOp *)stransfer,
- transferdata);
+ transferdata) < 0) {
+ return -1;
+ }
}
/* Regular operand */
else {
- PyArray_TransferStridedToNDim(ndim_transfer,
+ if (PyArray_TransferStridedToNDim(ndim_transfer,
ad_ptrs[iop], dst_strides, axisdata_incr,
buffer, src_stride,
dst_coords, axisdata_incr,
dst_shape, axisdata_incr,
op_transfersize, dtypes[iop]->elsize,
stransfer,
- transferdata);
+ transferdata) < 0) {
+ return -1;
+ }
}
}
/* If there's no copy back, we may have to decrement refs. In
@@ -2002,9 +2042,13 @@ npyiter_copy_from_buffers(NpyIter *iter)
NPY_IT_DBG_PRINT1("Iterator: Freeing refs and zeroing buffer "
"of operand %d\n", (int)iop);
/* Decrement refs */
- stransfer(NULL, 0, buffer, dtypes[iop]->elsize,
- transfersize, dtypes[iop]->elsize,
- transferdata);
+ if (stransfer(NULL, 0, buffer, dtypes[iop]->elsize,
+ transfersize, dtypes[iop]->elsize,
+ transferdata) < 0) {
+ /* Since this should only decrement, it should never error */
+ assert(0);
+ return -1;
+ }
/*
* Zero out the memory for safety. For instance,
* if during iteration some Python code copied an
@@ -2016,6 +2060,7 @@ npyiter_copy_from_buffers(NpyIter *iter)
}
NPY_IT_DBG_PRINT("Iterator: Finished copying buffers to outputs\n");
+ return 0;
}
/*
@@ -2023,7 +2068,7 @@ npyiter_copy_from_buffers(NpyIter *iter)
* for the start of a buffer. It decides which operands need a buffer,
* and copies the data into the buffers.
*/
-NPY_NO_EXPORT void
+NPY_NO_EXPORT int
npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs)
{
npy_uint32 itflags = NIT_ITFLAGS(iter);
@@ -2142,7 +2187,7 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs)
NBF_BUFITEREND(bufferdata) = iterindex + reduce_innersize;
if (reduce_innersize == 0) {
NBF_REDUCE_OUTERSIZE(bufferdata) = 0;
- return;
+ return 0;
}
else {
NBF_REDUCE_OUTERSIZE(bufferdata) = transfersize/reduce_innersize;
@@ -2508,14 +2553,15 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs)
"buffer (%d items)\n",
(int)iop, (int)op_transfersize);
- PyArray_TransferNDimToStrided(ndim_transfer,
- ptrs[iop], dst_stride,
- ad_ptrs[iop], src_strides, axisdata_incr,
- src_coords, axisdata_incr,
- src_shape, axisdata_incr,
- op_transfersize, src_itemsize,
- stransfer,
- transferdata);
+ if (PyArray_TransferNDimToStrided(
+ ndim_transfer, ptrs[iop], dst_stride,
+ ad_ptrs[iop], src_strides, axisdata_incr,
+ src_coords, axisdata_incr,
+ src_shape, axisdata_incr,
+ op_transfersize, src_itemsize,
+ stransfer, transferdata) < 0) {
+ return -1;
+ }
}
}
else if (ptrs[iop] == buffers[iop]) {
@@ -2551,8 +2597,80 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs)
NPY_IT_DBG_PRINT1("Iterator: Finished copying inputs to buffers "
"(buffered size is %d)\n", (int)NBF_SIZE(bufferdata));
+ return 0;
}
+
+/**
+ * This function clears any references still held by the buffers and should
+ * only be used to discard buffers if an error occurred.
+ *
+ * @param iter Iterator
+ */
+NPY_NO_EXPORT void
+npyiter_clear_buffers(NpyIter *iter)
+{
+ int nop = iter->nop;
+ NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+
+ if (NBF_SIZE(bufferdata) == 0) {
+ /* if the buffers are empty already, there is nothing to do */
+ return;
+ }
+
+ if (!(NIT_ITFLAGS(iter) & NPY_ITFLAG_NEEDSAPI)) {
+ /* Buffers do not require clearing, but should not be copied back */
+ NBF_SIZE(bufferdata) = 0;
+ return;
+ }
+
+ /*
+ * The iterator may be using a dtype with references, which always
+ * requires the API. In that case, further cleanup may be necessary.
+ *
+ * TODO: At this time, we assume that a dtype having references
+ * implies the need to hold the GIL at all times. In theory
+ * we could broaden this definition for a new
+ * `PyArray_Item_XDECREF` API and the assumption may become
+ * incorrect.
+ */
+ PyObject *type, *value, *traceback;
+ PyErr_Fetch(&type, &value, &traceback);
+
+ /* Cleanup any buffers with references */
+ char **buffers = NBF_BUFFERS(bufferdata);
+ PyArray_Descr **dtypes = NIT_DTYPES(iter);
+ for (int iop = 0; iop < nop; ++iop, ++buffers) {
+ /*
+ * We may want to find a better way to do this, on the other hand,
+ * this cleanup seems rare and fairly special. A dtype using
+ * references (right now only us) must always keep the buffer in
+ * a well defined state (either NULL or owning the reference).
+ * Only we implement cleanup
+ */
+ if (!PyDataType_REFCHK(dtypes[iop])) {
+ continue;
+ }
+ if (*buffers == 0) {
+ continue;
+ }
+ int itemsize = dtypes[iop]->elsize;
+ for (npy_intp i = 0; i < NBF_SIZE(bufferdata); i++) {
+ /*
+ * See above comment, if this API is expanded the GIL assumption
+ * could become incorrect.
+ */
+ PyArray_Item_XDECREF(*buffers + (itemsize * i), dtypes[iop]);
+ }
+ /* Clear out the buffer just to be sure */
+ memset(*buffers, 0, NBF_SIZE(bufferdata) * itemsize);
+ }
+ /* Signal that the buffers are empty */
+ NBF_SIZE(bufferdata) = 0;
+ PyErr_Restore(type, value, traceback);
+}
+
+
/*
* This checks how much space can be buffered without encountering the
* same value twice, or for operands whose innermost stride is zero,
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index 7da17eafe..b379a28ac 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -476,7 +476,10 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
}
/* Prepare the next buffers and set iterend/size */
- npyiter_copy_to_buffers(iter, NULL);
+ if (npyiter_copy_to_buffers(iter, NULL) < 0) {
+ NpyIter_Deallocate(iter);
+ return NULL;
+ }
}
}
@@ -642,21 +645,27 @@ NpyIter_Copy(NpyIter *iter)
}
/*NUMPY_API
- * Deallocate an iterator
+ * Deallocate an iterator.
+ *
+ * To correctly work when an error is in progress, we have to check
+ * `PyErr_Occurred()`. This is necessary when buffers are not finalized
+ * or WritebackIfCopy is used. We could avoid that check by exposing a new
+ * function which is passed in whether or not a Python error is already set.
*/
NPY_NO_EXPORT int
NpyIter_Deallocate(NpyIter *iter)
{
+ int success = PyErr_Occurred() == NULL;
+
npy_uint32 itflags;
/*int ndim = NIT_NDIM(iter);*/
int iop, nop;
PyArray_Descr **dtype;
PyArrayObject **object;
npyiter_opitflags *op_itflags;
- npy_bool resolve = 1;
if (iter == NULL) {
- return NPY_SUCCEED;
+ return success;
}
itflags = NIT_ITFLAGS(iter);
@@ -667,13 +676,23 @@ NpyIter_Deallocate(NpyIter *iter)
/* Deallocate any buffers and buffering data */
if (itflags & NPY_ITFLAG_BUFFER) {
+ /* Ensure no data is held by the buffers before they are cleared */
+ if (success) {
+ if (npyiter_copy_from_buffers(iter) < 0) {
+ success = NPY_FAIL;
+ }
+ }
+ else {
+ npyiter_clear_buffers(iter);
+ }
+
NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
char **buffers;
NpyAuxData **transferdata;
/* buffers */
buffers = NBF_BUFFERS(bufferdata);
- for(iop = 0; iop < nop; ++iop, ++buffers) {
+ for (iop = 0; iop < nop; ++iop, ++buffers) {
PyArray_free(*buffers);
}
/* read bufferdata */
@@ -694,12 +713,12 @@ NpyIter_Deallocate(NpyIter *iter)
/*
* Deallocate all the dtypes and objects that were iterated and resolve
- * any writeback buffers created by the iterator
+ * any writeback buffers created by the iterator.
*/
- for(iop = 0; iop < nop; ++iop, ++dtype, ++object) {
+ for (iop = 0; iop < nop; ++iop, ++dtype, ++object) {
if (op_itflags[iop] & NPY_OP_ITFLAG_HAS_WRITEBACK) {
- if (resolve && PyArray_ResolveWritebackIfCopy(*object) < 0) {
- resolve = 0;
+ if (success && PyArray_ResolveWritebackIfCopy(*object) < 0) {
+ success = 0;
}
else {
PyArray_DiscardWritebackIfCopy(*object);
@@ -711,12 +730,10 @@ NpyIter_Deallocate(NpyIter *iter)
/* Deallocate the iterator memory */
PyObject_Free(iter);
- if (resolve == 0) {
- return NPY_FAIL;
- }
- return NPY_SUCCEED;
+ return success;
}
+
/* Checks 'flags' for (C|F)_ORDER_INDEX, MULTI_INDEX, and EXTERNAL_LOOP,
* setting the appropriate internal flags in 'itflags'.
*
@@ -1733,73 +1750,70 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itf
return 1;
broadcast_error: {
- PyObject *errmsg, *tmp;
npy_intp remdims[NPY_MAXDIMS];
- char *tmpstr;
if (op_axes == NULL) {
- errmsg = PyUString_FromString("operands could not be broadcast "
- "together with shapes ");
- if (errmsg == NULL) {
+ PyObject *shape1 = PyUnicode_FromString("");
+ if (shape1 == NULL) {
return 0;
}
for (iop = 0; iop < nop; ++iop) {
if (op[iop] != NULL) {
- tmp = convert_shape_to_string(PyArray_NDIM(op[iop]),
- PyArray_DIMS(op[iop]),
- " ");
+ int ndims = PyArray_NDIM(op[iop]);
+ npy_intp *dims = PyArray_DIMS(op[iop]);
+ PyObject *tmp = convert_shape_to_string(ndims, dims, " ");
if (tmp == NULL) {
- Py_DECREF(errmsg);
+ Py_DECREF(shape1);
return 0;
}
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ Py_SETREF(shape1, PyUnicode_Concat(shape1, tmp));
+ Py_DECREF(tmp);
+ if (shape1 == NULL) {
return 0;
}
}
}
- if (itershape != NULL) {
- tmp = PyUString_FromString("and requested shape ");
- if (tmp == NULL) {
- Py_DECREF(errmsg);
- return 0;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
- return 0;
- }
-
- tmp = convert_shape_to_string(ndim, itershape, "");
- if (tmp == NULL) {
- Py_DECREF(errmsg);
- return 0;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ if (itershape == NULL) {
+ PyErr_Format(PyExc_ValueError,
+ "operands could not be broadcast together with "
+ "shapes %S", shape1);
+ Py_DECREF(shape1);
+ return 0;
+ }
+ else {
+ PyObject *shape2 = convert_shape_to_string(ndim, itershape, "");
+ if (shape2 == NULL) {
+ Py_DECREF(shape1);
return 0;
}
-
+ PyErr_Format(PyExc_ValueError,
+ "operands could not be broadcast together with "
+ "shapes %S and requested shape %S", shape1, shape2);
+ Py_DECREF(shape1);
+ Py_DECREF(shape2);
+ return 0;
}
- PyErr_SetObject(PyExc_ValueError, errmsg);
- Py_DECREF(errmsg);
}
else {
- errmsg = PyUString_FromString("operands could not be broadcast "
- "together with remapped shapes "
- "[original->remapped]: ");
+ PyObject *shape1 = PyUnicode_FromString("");
+ if (shape1 == NULL) {
+ return 0;
+ }
for (iop = 0; iop < nop; ++iop) {
if (op[iop] != NULL) {
int *axes = op_axes[iop];
+ int ndims = PyArray_NDIM(op[iop]);
+ npy_intp *dims = PyArray_DIMS(op[iop]);
+ char *tmpstr = (axes == NULL) ? " " : "->";
- tmpstr = (axes == NULL) ? " " : "->";
- tmp = convert_shape_to_string(PyArray_NDIM(op[iop]),
- PyArray_DIMS(op[iop]),
- tmpstr);
+ PyObject *tmp = convert_shape_to_string(ndims, dims, tmpstr);
if (tmp == NULL) {
+ Py_DECREF(shape1);
return 0;
}
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ Py_SETREF(shape1, PyUnicode_Concat(shape1, tmp));
+ Py_DECREF(tmp);
+ if (shape1 == NULL) {
return 0;
}
@@ -1814,80 +1828,83 @@ broadcast_error: {
remdims[idim] = -1;
}
}
- tmp = convert_shape_to_string(ndim, remdims, " ");
+ PyObject *tmp = convert_shape_to_string(ndim, remdims, " ");
if (tmp == NULL) {
+ Py_DECREF(shape1);
return 0;
}
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ Py_SETREF(shape1, PyUnicode_Concat(shape1, tmp));
+ Py_DECREF(tmp);
+ if (shape1 == NULL) {
return 0;
}
}
}
}
- if (itershape != NULL) {
- tmp = PyUString_FromString("and requested shape ");
- if (tmp == NULL) {
- Py_DECREF(errmsg);
- return 0;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
- return 0;
- }
-
- tmp = convert_shape_to_string(ndim, itershape, "");
- if (tmp == NULL) {
- Py_DECREF(errmsg);
- return 0;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ if (itershape == NULL) {
+ PyErr_Format(PyExc_ValueError,
+ "operands could not be broadcast together with "
+ "remapped shapes [original->remapped]: %S", shape1);
+ Py_DECREF(shape1);
+ return 0;
+ }
+ else {
+ PyObject *shape2 = convert_shape_to_string(ndim, itershape, "");
+ if (shape2 == NULL) {
+ Py_DECREF(shape1);
return 0;
}
-
+ PyErr_Format(PyExc_ValueError,
+ "operands could not be broadcast together with "
+ "remapped shapes [original->remapped]: %S and "
+ "requested shape %S", shape1, shape2);
+ Py_DECREF(shape1);
+ Py_DECREF(shape2);
+ return 0;
}
- PyErr_SetObject(PyExc_ValueError, errmsg);
- Py_DECREF(errmsg);
}
-
- return 0;
}
operand_different_than_broadcast: {
- npy_intp remdims[NPY_MAXDIMS];
- PyObject *errmsg, *tmp;
-
- /* Start of error message */
- if (op_flags[iop] & NPY_ITER_READONLY) {
- errmsg = PyUString_FromString("non-broadcastable operand "
- "with shape ");
- }
- else {
- errmsg = PyUString_FromString("non-broadcastable output "
- "operand with shape ");
- }
- if (errmsg == NULL) {
+ /* operand shape */
+ int ndims = PyArray_NDIM(op[iop]);
+ npy_intp *dims = PyArray_DIMS(op[iop]);
+ PyObject *shape1 = convert_shape_to_string(ndims, dims, "");
+ if (shape1 == NULL) {
return 0;
}
- /* Operand shape */
- tmp = convert_shape_to_string(PyArray_NDIM(op[iop]),
- PyArray_DIMS(op[iop]), "");
- if (tmp == NULL) {
+ /* Broadcast shape */
+ PyObject *shape2 = convert_shape_to_string(ndim, broadcast_shape, "");
+ if (shape2 == NULL) {
+ Py_DECREF(shape1);
return 0;
}
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+
+ if (op_axes == NULL || op_axes[iop] == NULL) {
+ /* operand shape not remapped */
+
+ if (op_flags[iop] & NPY_ITER_READONLY) {
+ PyErr_Format(PyExc_ValueError,
+ "non-broadcastable operand with shape %S doesn't "
+ "match the broadcast shape %S", shape1, shape2);
+ }
+ else {
+ PyErr_Format(PyExc_ValueError,
+ "non-broadcastable output operand with shape %S doesn't "
+ "match the broadcast shape %S", shape1, shape2);
+ }
+ Py_DECREF(shape1);
+ Py_DECREF(shape2);
return 0;
}
- /* Remapped operand shape */
- if (op_axes != NULL && op_axes[iop] != NULL) {
- int *axes = op_axes[iop];
+ else {
+ /* operand shape remapped */
+ npy_intp remdims[NPY_MAXDIMS];
+ int *axes = op_axes[iop];
for (idim = 0; idim < ndim; ++idim) {
- npy_intp i = axes[ndim-idim-1];
-
+ npy_intp i = axes[ndim - idim - 1];
if (i >= 0 && i < PyArray_NDIM(op[iop])) {
remdims[idim] = PyArray_DIM(op[iop], i);
}
@@ -1896,48 +1913,30 @@ operand_different_than_broadcast: {
}
}
- tmp = PyUString_FromString(" [remapped to ");
- if (tmp == NULL) {
- return 0;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ PyObject *shape3 = convert_shape_to_string(ndim, remdims, "");
+ if (shape3 == NULL) {
+ Py_DECREF(shape1);
+ Py_DECREF(shape2);
return 0;
}
- tmp = convert_shape_to_string(ndim, remdims, "]");
- if (tmp == NULL) {
- return 0;
+ if (op_flags[iop] & NPY_ITER_READONLY) {
+ PyErr_Format(PyExc_ValueError,
+ "non-broadcastable operand with shape %S "
+ "[remapped to %S] doesn't match the broadcast shape %S",
+ shape1, shape3, shape2);
}
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
- return 0;
+ else {
+ PyErr_Format(PyExc_ValueError,
+ "non-broadcastable output operand with shape %S "
+ "[remapped to %S] doesn't match the broadcast shape %S",
+ shape1, shape3, shape2);
}
- }
-
- tmp = PyUString_FromString(" doesn't match the broadcast shape ");
- if (tmp == NULL) {
- return 0;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
- return 0;
- }
-
- /* Broadcast shape */
- tmp = convert_shape_to_string(ndim, broadcast_shape, "");
- if (tmp == NULL) {
+ Py_DECREF(shape1);
+ Py_DECREF(shape2);
+ Py_DECREF(shape3);
return 0;
}
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
- return 0;
- }
-
- PyErr_SetObject(PyExc_ValueError, errmsg);
- Py_DECREF(errmsg);
-
- return 0;
}
}
diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h
index 1477c8631..378d6f711 100644
--- a/numpy/core/src/multiarray/nditer_impl.h
+++ b/numpy/core/src/multiarray/nditer_impl.h
@@ -269,7 +269,7 @@ struct NpyIter_AD {
#define NAD_STRIDES(axisdata) ( \
&(axisdata)->ad_flexdata + 0)
#define NAD_PTRS(axisdata) ((char **) \
- &(axisdata)->ad_flexdata + 1*(nop+1))
+ (&(axisdata)->ad_flexdata + 1*(nop+1)))
#define NAD_NSTRIDES() \
((nop) + ((itflags&NPY_ITFLAG_HASINDEX) ? 1 : 0))
@@ -342,10 +342,11 @@ NPY_NO_EXPORT int
npyiter_allocate_buffers(NpyIter *iter, char **errmsg);
NPY_NO_EXPORT void
npyiter_goto_iterindex(NpyIter *iter, npy_intp iterindex);
-NPY_NO_EXPORT void
+NPY_NO_EXPORT int
npyiter_copy_from_buffers(NpyIter *iter);
-NPY_NO_EXPORT void
+NPY_NO_EXPORT int
npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs);
-
+NPY_NO_EXPORT void
+npyiter_clear_buffers(NpyIter *iter);
#endif
diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c
index 7f31a5096..8839d1be7 100644
--- a/numpy/core/src/multiarray/nditer_pywrap.c
+++ b/numpy/core/src/multiarray/nditer_pywrap.c
@@ -894,7 +894,7 @@ NpyIter_NestedIters(PyObject *NPY_UNUSED(self),
Py_DECREF(item);
return NULL;
}
- axis = PyInt_AsLong(v);
+ axis = PyLong_AsLong(v);
Py_DECREF(v);
if (axis < 0 || axis >= NPY_MAXDIMS) {
PyErr_SetString(PyExc_ValueError,
@@ -1142,7 +1142,7 @@ npyiter_dealloc(NewNpyArrayIterObject *self)
"results.", 1) < 0) {
PyObject *s;
- s = PyUString_FromString("npyiter_dealloc");
+ s = PyUnicode_FromString("npyiter_dealloc");
if (s) {
PyErr_WriteUnraisable(s);
Py_DECREF(s);
@@ -1268,6 +1268,10 @@ npyiter_iternext(NewNpyArrayIterObject *self)
Py_RETURN_TRUE;
}
else {
+ if (PyErr_Occurred()) {
+ /* casting error, buffer cleanup will occur at reset or dealloc */
+ return NULL;
+ }
self->finished = 1;
Py_RETURN_FALSE;
}
@@ -1483,6 +1487,10 @@ npyiter_next(NewNpyArrayIterObject *self)
*/
if (self->started) {
if (!self->iternext(self->iter)) {
+ /*
+ * A casting error may be set here (or no error causing a
+ * StopIteration). Buffers may only be cleaned up later.
+ */
self->finished = 1;
return NULL;
}
@@ -1514,7 +1522,7 @@ static PyObject *npyiter_shape_get(NewNpyArrayIterObject *self)
if (ret != NULL) {
for (idim = 0; idim < ndim; ++idim) {
PyTuple_SET_ITEM(ret, idim,
- PyInt_FromLong(shape[idim]));
+ PyLong_FromLong(shape[idim]));
}
return ret;
}
@@ -1543,7 +1551,7 @@ static PyObject *npyiter_multi_index_get(NewNpyArrayIterObject *self)
}
for (idim = 0; idim < ndim; ++idim) {
PyTuple_SET_ITEM(ret, idim,
- PyInt_FromLong(multi_index[idim]));
+ PyLong_FromLong(multi_index[idim]));
}
return ret;
}
@@ -1597,7 +1605,7 @@ npyiter_multi_index_set(NewNpyArrayIterObject *self, PyObject *value)
}
for (idim = 0; idim < ndim; ++idim) {
PyObject *v = PySequence_GetItem(value, idim);
- multi_index[idim] = PyInt_AsLong(v);
+ multi_index[idim] = PyLong_AsLong(v);
if (error_converting(multi_index[idim])) {
Py_XDECREF(v);
return -1;
@@ -1633,7 +1641,7 @@ static PyObject *npyiter_index_get(NewNpyArrayIterObject *self)
if (NpyIter_HasIndex(self->iter)) {
npy_intp ind = *NpyIter_GetIndexPtr(self->iter);
- return PyInt_FromLong(ind);
+ return PyLong_FromLong(ind);
}
else {
PyErr_SetString(PyExc_ValueError,
@@ -1657,7 +1665,7 @@ static int npyiter_index_set(NewNpyArrayIterObject *self, PyObject *value)
if (NpyIter_HasIndex(self->iter)) {
npy_intp ind;
- ind = PyInt_AsLong(value);
+ ind = PyLong_AsLong(value);
if (error_converting(ind)) {
return -1;
}
@@ -1689,7 +1697,7 @@ static PyObject *npyiter_iterindex_get(NewNpyArrayIterObject *self)
return NULL;
}
- return PyInt_FromLong(NpyIter_GetIterIndex(self->iter));
+ return PyLong_FromLong(NpyIter_GetIterIndex(self->iter));
}
static int npyiter_iterindex_set(NewNpyArrayIterObject *self, PyObject *value)
@@ -1707,7 +1715,7 @@ static int npyiter_iterindex_set(NewNpyArrayIterObject *self, PyObject *value)
return -1;
}
- iterindex = PyInt_AsLong(value);
+ iterindex = PyLong_AsLong(value);
if (error_converting(iterindex)) {
return -1;
}
@@ -1743,8 +1751,8 @@ static PyObject *npyiter_iterrange_get(NewNpyArrayIterObject *self)
return NULL;
}
- PyTuple_SET_ITEM(ret, 0, PyInt_FromLong(istart));
- PyTuple_SET_ITEM(ret, 1, PyInt_FromLong(iend));
+ PyTuple_SET_ITEM(ret, 0, PyLong_FromLong(istart));
+ PyTuple_SET_ITEM(ret, 1, PyLong_FromLong(iend));
return ret;
}
@@ -1892,7 +1900,7 @@ static PyObject *npyiter_ndim_get(NewNpyArrayIterObject *self)
return NULL;
}
- return PyInt_FromLong(NpyIter_GetNDim(self->iter));
+ return PyLong_FromLong(NpyIter_GetNDim(self->iter));
}
static PyObject *npyiter_nop_get(NewNpyArrayIterObject *self)
@@ -1903,7 +1911,7 @@ static PyObject *npyiter_nop_get(NewNpyArrayIterObject *self)
return NULL;
}
- return PyInt_FromLong(NpyIter_GetNOp(self->iter));
+ return PyLong_FromLong(NpyIter_GetNOp(self->iter));
}
static PyObject *npyiter_itersize_get(NewNpyArrayIterObject *self)
@@ -1914,7 +1922,7 @@ static PyObject *npyiter_itersize_get(NewNpyArrayIterObject *self)
return NULL;
}
- return PyInt_FromLong(NpyIter_GetIterSize(self->iter));
+ return PyLong_FromLong(NpyIter_GetIterSize(self->iter));
}
static PyObject *npyiter_finished_get(NewNpyArrayIterObject *self)
@@ -2213,7 +2221,7 @@ npyiter_subscript(NewNpyArrayIterObject *self, PyObject *op)
return NULL;
}
- if (PyInt_Check(op) || PyLong_Check(op) ||
+ if (PyLong_Check(op) ||
(PyIndex_Check(op) && !PySequence_Check(op))) {
npy_intp i = PyArray_PyIntAsIntp(op);
if (error_converting(i)) {
@@ -2223,8 +2231,8 @@ npyiter_subscript(NewNpyArrayIterObject *self, PyObject *op)
}
else if (PySlice_Check(op)) {
Py_ssize_t istart = 0, iend = 0, istep = 0, islicelength;
- if (NpySlice_GetIndicesEx(op, NpyIter_GetNOp(self->iter),
- &istart, &iend, &istep, &islicelength) < 0) {
+ if (PySlice_GetIndicesEx(op, NpyIter_GetNOp(self->iter),
+ &istart, &iend, &istep, &islicelength) < 0) {
return NULL;
}
if (istep != 1) {
@@ -2262,7 +2270,7 @@ npyiter_ass_subscript(NewNpyArrayIterObject *self, PyObject *op,
return -1;
}
- if (PyInt_Check(op) || PyLong_Check(op) ||
+ if (PyLong_Check(op) ||
(PyIndex_Check(op) && !PySequence_Check(op))) {
npy_intp i = PyArray_PyIntAsIntp(op);
if (error_converting(i)) {
@@ -2272,8 +2280,8 @@ npyiter_ass_subscript(NewNpyArrayIterObject *self, PyObject *op,
}
else if (PySlice_Check(op)) {
Py_ssize_t istart = 0, iend = 0, istep = 0, islicelength = 0;
- if (NpySlice_GetIndicesEx(op, NpyIter_GetNOp(self->iter),
- &istart, &iend, &istep, &islicelength) < 0) {
+ if (PySlice_GetIndicesEx(op, NpyIter_GetNOp(self->iter),
+ &istart, &iend, &istep, &islicelength) < 0) {
return -1;
}
if (istep != 1) {
diff --git a/numpy/core/src/multiarray/nditer_templ.c.src b/numpy/core/src/multiarray/nditer_templ.c.src
index 0f0d59972..05ce6ae75 100644
--- a/numpy/core/src/multiarray/nditer_templ.c.src
+++ b/numpy/core/src/multiarray/nditer_templ.c.src
@@ -249,7 +249,10 @@ npyiter_buffered_reduce_iternext_iters@tag_nop@(NpyIter *iter)
memcpy(prev_dataptrs, NAD_PTRS(axisdata), NPY_SIZEOF_INTP*nop);
/* Write back to the arrays */
- npyiter_copy_from_buffers(iter);
+ if (npyiter_copy_from_buffers(iter) < 0) {
+ npyiter_clear_buffers(iter);
+ return 0;
+ }
/* Check if we're past the end */
if (NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
@@ -262,7 +265,10 @@ npyiter_buffered_reduce_iternext_iters@tag_nop@(NpyIter *iter)
}
/* Prepare the next buffers and set iterend/size */
- npyiter_copy_to_buffers(iter, prev_dataptrs);
+ if (npyiter_copy_to_buffers(iter, prev_dataptrs) < 0) {
+ npyiter_clear_buffers(iter);
+ return 0;
+ }
return 1;
}
@@ -303,7 +309,10 @@ npyiter_buffered_iternext(NpyIter *iter)
}
/* Write back to the arrays */
- npyiter_copy_from_buffers(iter);
+ if (npyiter_copy_from_buffers(iter) < 0) {
+ npyiter_clear_buffers(iter);
+ return 0;
+ }
/* Check if we're past the end */
if (NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
@@ -316,7 +325,10 @@ npyiter_buffered_iternext(NpyIter *iter)
}
/* Prepare the next buffers and set iterend/size */
- npyiter_copy_to_buffers(iter, NULL);
+ if (npyiter_copy_to_buffers(iter, NULL) < 0) {
+ npyiter_clear_buffers(iter);
+ return 0;
+ }
return 1;
}
diff --git a/numpy/core/src/multiarray/npy_buffer.h b/numpy/core/src/multiarray/npy_buffer.h
index 5ff8b6c2c..d10f1a020 100644
--- a/numpy/core/src/multiarray/npy_buffer.h
+++ b/numpy/core/src/multiarray/npy_buffer.h
@@ -3,8 +3,8 @@
extern NPY_NO_EXPORT PyBufferProcs array_as_buffer;
-NPY_NO_EXPORT void
-_dealloc_cached_buffer_info(PyObject *self);
+NPY_NO_EXPORT int
+_buffer_info_free(void *buffer_info, PyObject *obj);
NPY_NO_EXPORT PyArray_Descr*
_descriptor_from_pep3118_format(char const *s);
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index 19ac7d7f9..a629dfe97 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -397,14 +397,21 @@ is_scalar_with_conversion(PyObject *o2, double* out_exponent)
PyObject *temp;
const int optimize_fpexps = 1;
- if (PyInt_Check(o2)) {
- *out_exponent = (double)PyInt_AsLong(o2);
+ if (PyLong_Check(o2)) {
+ long tmp = PyLong_AsLong(o2);
+ if (error_converting(tmp)) {
+ PyErr_Clear();
+ return NPY_NOSCALAR;
+ }
+ *out_exponent = (double)tmp;
return NPY_INTPOS_SCALAR;
}
+
if (optimize_fpexps && PyFloat_Check(o2)) {
*out_exponent = PyFloat_AsDouble(o2);
return NPY_FLOAT_SCALAR;
}
+
if (PyArray_Check(o2)) {
if ((PyArray_NDIM((PyArrayObject *)o2) == 0) &&
((PyArray_ISINTEGER((PyArrayObject *)o2) ||
@@ -442,13 +449,13 @@ is_scalar_with_conversion(PyObject *o2, double* out_exponent)
else if (PyIndex_Check(o2)) {
PyObject* value = PyNumber_Index(o2);
Py_ssize_t val;
- if (value==NULL) {
+ if (value == NULL) {
if (PyErr_Occurred()) {
PyErr_Clear();
}
return NPY_NOSCALAR;
}
- val = PyInt_AsSsize_t(value);
+ val = PyLong_AsSsize_t(value);
if (error_converting(val)) {
PyErr_Clear();
return NPY_NOSCALAR;
@@ -826,7 +833,7 @@ _array_nonzero(PyArrayObject *mp)
n = PyArray_SIZE(mp);
if (n == 1) {
int res;
- if (Npy_EnterRecursiveCall(" while converting array to bool")) {
+ if (Py_EnterRecursiveCall(" while converting array to bool")) {
return -1;
}
res = PyArray_DESCR(mp)->f->nonzero(PyArray_DATA(mp), mp);
@@ -880,7 +887,7 @@ array_scalar_forward(PyArrayObject *v,
/* Need to guard against recursion if our array holds references */
if (PyDataType_REFCHK(PyArray_DESCR(v))) {
PyObject *res;
- if (Npy_EnterRecursiveCall(where) != 0) {
+ if (Py_EnterRecursiveCall(where) != 0) {
Py_DECREF(scalar);
return NULL;
}
diff --git a/numpy/core/src/multiarray/refcount.c b/numpy/core/src/multiarray/refcount.c
index c869b5eea..41dd059b0 100644
--- a/numpy/core/src/multiarray/refcount.c
+++ b/numpy/core/src/multiarray/refcount.c
@@ -36,7 +36,7 @@ PyArray_Item_INCREF(char *data, PyArray_Descr *descr)
return;
}
if (descr->type_num == NPY_OBJECT) {
- NPY_COPY_PYOBJECT_PTR(&temp, data);
+ memcpy(&temp, data, sizeof(temp));
Py_XINCREF(temp);
}
else if (PyDataType_HASFIELDS(descr)) {
@@ -98,7 +98,7 @@ PyArray_Item_XDECREF(char *data, PyArray_Descr *descr)
}
if (descr->type_num == NPY_OBJECT) {
- NPY_COPY_PYOBJECT_PTR(&temp, data);
+ memcpy(&temp, data, sizeof(temp));
Py_XDECREF(temp);
}
else if (PyDataType_HASFIELDS(descr)) {
@@ -181,7 +181,7 @@ PyArray_INCREF(PyArrayObject *mp)
}
else {
for( i = 0; i < n; i++, data++) {
- NPY_COPY_PYOBJECT_PTR(&temp, data);
+ memcpy(&temp, data, sizeof(temp));
Py_XINCREF(temp);
}
}
@@ -192,7 +192,7 @@ PyArray_INCREF(PyArrayObject *mp)
return -1;
}
while(it->index < it->size) {
- NPY_COPY_PYOBJECT_PTR(&temp, it->dataptr);
+ memcpy(&temp, it->dataptr, sizeof(temp));
Py_XINCREF(temp);
PyArray_ITER_NEXT(it);
}
@@ -238,7 +238,7 @@ PyArray_XDECREF(PyArrayObject *mp)
}
else {
for (i = 0; i < n; i++, data++) {
- NPY_COPY_PYOBJECT_PTR(&temp, data);
+ memcpy(&temp, data, sizeof(temp));
Py_XDECREF(temp);
}
}
@@ -246,7 +246,7 @@ PyArray_XDECREF(PyArrayObject *mp)
else { /* handles misaligned data too */
PyArray_RawIterBaseInit(&it, mp);
while(it.index < it.size) {
- NPY_COPY_PYOBJECT_PTR(&temp, it.dataptr);
+ memcpy(&temp, it.dataptr, sizeof(temp));
Py_XDECREF(temp);
PyArray_ITER_NEXT(&it);
}
@@ -292,24 +292,26 @@ static void
_fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype)
{
if (!PyDataType_FLAGCHK(dtype, NPY_ITEM_REFCOUNT)) {
- if ((obj == Py_None) || (PyInt_Check(obj) && PyInt_AsLong(obj)==0)) {
+ PyObject *arr;
+
+ if ((obj == Py_None) ||
+ (PyLong_Check(obj) && PyLong_AsLong(obj) == 0)) {
return;
}
- else {
- PyObject *arr;
- Py_INCREF(dtype);
- arr = PyArray_NewFromDescr(&PyArray_Type, dtype,
- 0, NULL, NULL, NULL,
- 0, NULL);
- if (arr!=NULL) {
- dtype->f->setitem(obj, optr, arr);
- }
- Py_XDECREF(arr);
+ /* Clear possible long conversion error */
+ PyErr_Clear();
+ Py_INCREF(dtype);
+ arr = PyArray_NewFromDescr(&PyArray_Type, dtype,
+ 0, NULL, NULL, NULL,
+ 0, NULL);
+ if (arr!=NULL) {
+ dtype->f->setitem(obj, optr, arr);
}
+ Py_XDECREF(arr);
}
if (dtype->type_num == NPY_OBJECT) {
Py_XINCREF(obj);
- NPY_COPY_PYOBJECT_PTR(optr, &obj);
+ memcpy(optr, &obj, sizeof(obj));
}
else if (PyDataType_HASFIELDS(dtype)) {
PyObject *key, *value, *title = NULL;
diff --git a/numpy/core/src/multiarray/scalarapi.c b/numpy/core/src/multiarray/scalarapi.c
index 6f3d102a4..0e93cbbe9 100644
--- a/numpy/core/src/multiarray/scalarapi.c
+++ b/numpy/core/src/multiarray/scalarapi.c
@@ -35,7 +35,7 @@ scalar_value(PyObject *scalar, PyArray_Descr *descr)
{
int type_num;
int align;
- npy_intp memloc;
+ uintptr_t memloc;
if (descr == NULL) {
descr = PyArray_DescrFromScalar(scalar);
type_num = descr->type_num;
@@ -138,7 +138,7 @@ scalar_value(PyObject *scalar, PyArray_Descr *descr)
}
else if (_CHK(Flexible)) {
if (_CHK(String)) {
- return (void *)PyString_AS_STRING(scalar);
+ return (void *)PyBytes_AS_STRING(scalar);
}
if (_CHK(Unicode)) {
/* Treat this the same as the NPY_UNICODE base class */
@@ -168,7 +168,7 @@ scalar_value(PyObject *scalar, PyArray_Descr *descr)
* Use the alignment flag to figure out where the data begins
* after a PyObject_HEAD
*/
- memloc = (npy_intp)scalar;
+ memloc = (uintptr_t)scalar;
memloc += sizeof(PyObject);
/* now round-up to the nearest alignment value */
align = descr->alignment;
@@ -373,14 +373,15 @@ PyArray_FromScalar(PyObject *scalar, PyArray_Descr *outcode)
NPY_NO_EXPORT PyObject *
PyArray_ScalarFromObject(PyObject *object)
{
- PyObject *ret=NULL;
+ PyObject *ret = NULL;
+
if (PyArray_IsZeroDim(object)) {
return PyArray_ToScalar(PyArray_DATA((PyArrayObject *)object),
(PyArrayObject *)object);
}
/*
* Booleans in Python are implemented as a subclass of integers,
- * so PyBool_Check must be called before PyInt_Check.
+ * so PyBool_Check must be called before PyLong_Check.
*/
if (PyBool_Check(object)) {
if (object == Py_True) {
@@ -390,42 +391,49 @@ PyArray_ScalarFromObject(PyObject *object)
PyArrayScalar_RETURN_FALSE;
}
}
- else if (PyInt_Check(object)) {
- ret = PyArrayScalar_New(Long);
- if (ret == NULL) {
- return NULL;
+ else if (PyLong_Check(object)) {
+ /* Check if fits in long */
+ npy_long val_long = PyLong_AsLong(object);
+ if (!error_converting(val_long)) {
+ ret = PyArrayScalar_New(Long);
+ if (ret != NULL) {
+ PyArrayScalar_VAL(ret, Long) = val_long;
+ }
+ return ret;
}
- PyArrayScalar_VAL(ret, Long) = PyInt_AS_LONG(object);
+ PyErr_Clear();
+
+ /* Check if fits in long long */
+ npy_longlong val_longlong = PyLong_AsLongLong(object);
+ if (!error_converting(val_longlong)) {
+ ret = PyArrayScalar_New(LongLong);
+ if (ret != NULL) {
+ PyArrayScalar_VAL(ret, LongLong) = val_longlong;
+ }
+ return ret;
+ }
+ PyErr_Clear();
+
+ return NULL;
}
else if (PyFloat_Check(object)) {
ret = PyArrayScalar_New(Double);
- if (ret == NULL) {
- return NULL;
+ if (ret != NULL) {
+ PyArrayScalar_VAL(ret, Double) = PyFloat_AS_DOUBLE(object);
}
- PyArrayScalar_VAL(ret, Double) = PyFloat_AS_DOUBLE(object);
+ return ret;
}
else if (PyComplex_Check(object)) {
ret = PyArrayScalar_New(CDouble);
- if (ret == NULL) {
- return NULL;
+ if (ret != NULL) {
+ PyArrayScalar_VAL(ret, CDouble).real = PyComplex_RealAsDouble(object);
+ PyArrayScalar_VAL(ret, CDouble).imag = PyComplex_ImagAsDouble(object);
}
- PyArrayScalar_VAL(ret, CDouble).real = PyComplex_RealAsDouble(object);
- PyArrayScalar_VAL(ret, CDouble).imag = PyComplex_ImagAsDouble(object);
+ return ret;
}
- else if (PyLong_Check(object)) {
- npy_longlong val;
- val = PyLong_AsLongLong(object);
- if (error_converting(val)) {
- PyErr_Clear();
- return NULL;
- }
- ret = PyArrayScalar_New(LongLong);
- if (ret == NULL) {
- return NULL;
- }
- PyArrayScalar_VAL(ret, LongLong) = val;
+ else {
+ return NULL;
}
- return ret;
}
/*New reference */
@@ -613,7 +621,7 @@ PyArray_DescrFromScalar(PyObject *sc)
PyArray_DESCR_REPLACE(descr);
type_num = descr->type_num;
if (type_num == NPY_STRING) {
- descr->elsize = PyString_GET_SIZE(sc);
+ descr->elsize = PyBytes_GET_SIZE(sc);
}
else if (type_num == NPY_UNICODE) {
descr->elsize = PyUnicode_GET_LENGTH(sc) * 4;
@@ -755,8 +763,8 @@ PyArray_Scalar(void *data, PyArray_Descr *descr, PyObject *base)
}
if (PyTypeNum_ISFLEXIBLE(type_num)) {
if (type_num == NPY_STRING) {
- destptr = PyString_AS_STRING(obj);
- ((PyStringObject *)obj)->ob_shash = -1;
+ destptr = PyBytes_AS_STRING(obj);
+ ((PyBytesObject *)obj)->ob_shash = -1;
memcpy(destptr, data, itemsize);
return obj;
}
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index 088b380aa..e480628e7 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -67,8 +67,11 @@ gentype_alloc(PyTypeObject *type, Py_ssize_t nitems)
const size_t size = _PyObject_VAR_SIZE(type, nitems + 1);
obj = (PyObject *)PyObject_Malloc(size);
+ if (obj == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
/*
- * Fixme. Need to check for no memory.
* If we don't need to zero memory, we could use
* PyObject_{New, NewVar} for this whole function.
*/
@@ -149,7 +152,7 @@ static PyObject *
gentype_add(PyObject *m1, PyObject* m2)
{
/* special case str.__radd__, which should not call array_add */
- if (PyString_Check(m1) || PyUnicode_Check(m1)) {
+ if (PyBytes_Check(m1) || PyUnicode_Check(m1)) {
Py_INCREF(Py_NotImplemented);
return Py_NotImplemented;
}
@@ -274,7 +277,8 @@ gentype_format(PyObject *self, PyObject *args)
if (Py_TYPE(self) == &PyBoolArrType_Type) {
obj = PyBool_FromLong(PyArrayScalar_VAL(self, Bool));
}
- else if (PyArray_IsScalar(self, Integer)) {
+ else if (PyArray_IsScalar(self, Integer)
+ && !PyArray_IsScalar(self, Timedelta)) {
obj = Py_TYPE(self)->tp_as_number->nb_int(self);
}
else if (PyArray_IsScalar(self, Floating)) {
@@ -447,7 +451,7 @@ _void_to_hex(const char* argbuf, const Py_ssize_t arglen,
}
memcpy(&retbuf[j], echars, strlen(echars));
- retval = PyUString_FromStringAndSize(retbuf, slen);
+ retval = PyUnicode_FromStringAndSize(retbuf, slen);
PyMem_Free(retbuf);
return retval;
@@ -518,21 +522,15 @@ datetimetype_repr(PyObject *self)
*/
if ((scal->obmeta.num == 1 && scal->obmeta.base != NPY_FR_h) ||
scal->obmeta.base == NPY_FR_GENERIC) {
- ret = PyUString_FromString("numpy.datetime64('");
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString(iso));
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString("')"));
+ ret = PyUnicode_FromFormat("numpy.datetime64('%s')", iso);
}
else {
- ret = PyUString_FromString("numpy.datetime64('");
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString(iso));
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString("','"));
- ret = append_metastr_to_string(&scal->obmeta, 1, ret);
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString("')"));
+ PyObject *meta = metastr_to_unicode(&scal->obmeta, 1);
+ if (meta == NULL) {
+ return NULL;
+ }
+ ret = PyUnicode_FromFormat("numpy.datetime64('%s','%S')", iso, meta);
+ Py_DECREF(meta);
}
return ret;
@@ -542,7 +540,7 @@ static PyObject *
timedeltatype_repr(PyObject *self)
{
PyTimedeltaScalarObject *scal;
- PyObject *ret;
+ PyObject *val, *ret;
if (!PyArray_IsScalar(self, Timedelta)) {
PyErr_SetString(PyExc_RuntimeError,
@@ -554,32 +552,34 @@ timedeltatype_repr(PyObject *self)
/* The value */
if (scal->obval == NPY_DATETIME_NAT) {
- ret = PyUString_FromString("numpy.timedelta64('NaT'");
+ val = PyUnicode_FromString("'NaT'");
}
else {
- /*
- * Can't use "%lld" if HAVE_LONG_LONG is not defined
- */
+ /* Can't use "%lld" if HAVE_LONG_LONG is not defined */
#if defined(HAVE_LONG_LONG)
- ret = PyUString_FromFormat("numpy.timedelta64(%lld",
- (long long)scal->obval);
+ val = PyUnicode_FromFormat("%lld", (long long)scal->obval);
#else
- ret = PyUString_FromFormat("numpy.timedelta64(%ld",
- (long)scal->obval);
+ val = PyUnicode_FromFormat("%ld", (long)scal->obval);
#endif
}
+ if (val == NULL) {
+ return NULL;
+ }
+
/* The metadata unit */
if (scal->obmeta.base == NPY_FR_GENERIC) {
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString(")"));
+ ret = PyUnicode_FromFormat("numpy.timedelta64(%S)", val);
}
else {
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString(",'"));
- ret = append_metastr_to_string(&scal->obmeta, 1, ret);
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString("')"));
+ PyObject *meta = metastr_to_unicode(&scal->obmeta, 1);
+ if (meta == NULL) {
+ Py_DECREF(val);
+ return NULL;
+ }
+ ret = PyUnicode_FromFormat("numpy.timedelta64(%S,'%S')", val, meta);
+ Py_DECREF(meta);
}
+ Py_DECREF(val);
return ret;
}
@@ -611,7 +611,7 @@ datetimetype_str(PyObject *self)
return NULL;
}
- return PyUString_FromString(iso);
+ return PyUnicode_FromString(iso);
}
static char *_datetime_verbose_strings[NPY_DATETIME_NUMUNITS] = {
@@ -657,21 +657,19 @@ timedeltatype_str(PyObject *self)
}
if (scal->obval == NPY_DATETIME_NAT) {
- ret = PyUString_FromString("NaT");
+ ret = PyUnicode_FromString("NaT");
}
else {
/*
* Can't use "%lld" if HAVE_LONG_LONG is not defined
*/
#if defined(HAVE_LONG_LONG)
- ret = PyUString_FromFormat("%lld ",
- (long long)(scal->obval * scal->obmeta.num));
+ ret = PyUnicode_FromFormat("%lld %s",
+ (long long)(scal->obval * scal->obmeta.num), basestr);
#else
- ret = PyUString_FromFormat("%ld ",
- (long)(scal->obval * scal->obmeta.num));
+ ret = PyUnicode_FromFormat("%ld %s",
+ (long)(scal->obval * scal->obmeta.num), basestr);
#endif
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString(basestr));
}
return ret;
@@ -795,7 +793,7 @@ legacy_@name@_format@kind@(@type@ val)
PyOS_snprintf(buf, sizeof(buf), "(%s%sj)", re, im);
}
- return PyUString_FromString(buf);
+ return PyUnicode_FromString(buf);
}
#undef _FMT1
@@ -836,7 +834,7 @@ legacy_@name@_format@kind@(npy_@name@ val){
strcpy(&buf[cnt],".0");
}
- return PyUString_FromString(buf);
+ return PyUnicode_FromString(buf);
}
#undef _FMT1
@@ -890,7 +888,7 @@ static PyObject *
static PyObject *
c@name@type_@kind@(PyObject *self)
{
- PyObject *rstr, *istr, *ret;
+ PyObject *rstr, *istr;
npy_c@name@ val = PyArrayScalar_VAL(self, C@Name@);
TrimMode trim = TrimMode_DptZeros;
@@ -903,47 +901,47 @@ c@name@type_@kind@(PyObject *self)
if (istr == NULL) {
return NULL;
}
-
- PyUString_ConcatAndDel(&istr, PyUString_FromString("j"));
- return istr;
+ PyObject *ret = PyUnicode_FromFormat("%Sj", istr);
+ Py_DECREF(istr);
+ return ret;
}
if (npy_isfinite(val.real)) {
rstr = @name@type_@kind@_either(val.real, trim, trim, 0);
- if (rstr == NULL) {
- return NULL;
- }
}
else if (npy_isnan(val.real)) {
- rstr = PyUString_FromString("nan");
+ rstr = PyUnicode_FromString("nan");
}
else if (val.real > 0){
- rstr = PyUString_FromString("inf");
+ rstr = PyUnicode_FromString("inf");
}
else {
- rstr = PyUString_FromString("-inf");
+ rstr = PyUnicode_FromString("-inf");
+ }
+ if (rstr == NULL) {
+ return NULL;
}
if (npy_isfinite(val.imag)) {
istr = @name@type_@kind@_either(val.imag, trim, trim, 1);
- if (istr == NULL) {
- return NULL;
- }
}
else if (npy_isnan(val.imag)) {
- istr = PyUString_FromString("+nan");
+ istr = PyUnicode_FromString("+nan");
}
else if (val.imag > 0){
- istr = PyUString_FromString("+inf");
+ istr = PyUnicode_FromString("+inf");
}
else {
- istr = PyUString_FromString("-inf");
+ istr = PyUnicode_FromString("-inf");
+ }
+ if (istr == NULL) {
+ Py_DECREF(rstr);
+ return NULL;
}
- ret = PyUString_FromString("(");
- PyUString_ConcatAndDel(&ret, rstr);
- PyUString_ConcatAndDel(&ret, istr);
- PyUString_ConcatAndDel(&ret, PyUString_FromString("j)"));
+ PyObject *ret = PyUnicode_FromFormat("(%S%Sj)", rstr, istr);
+ Py_DECREF(rstr);
+ Py_DECREF(istr);
return ret;
}
@@ -1058,7 +1056,7 @@ gentype_richcompare(PyObject *self, PyObject *other, int cmp_op)
static PyObject *
gentype_ndim_get(PyObject *NPY_UNUSED(self))
{
- return PyInt_FromLong(0);
+ return PyLong_FromLong(0);
}
static PyObject *
@@ -1099,7 +1097,7 @@ inttype_numerator_get(PyObject *self)
static PyObject *
inttype_denominator_get(PyObject *self)
{
- return PyInt_FromLong(1);
+ return PyLong_FromLong(1);
}
@@ -1119,7 +1117,7 @@ gentype_itemsize_get(PyObject *self)
typecode = PyArray_DescrFromScalar(self);
elsize = typecode->elsize;
- ret = PyInt_FromLong((long) elsize);
+ ret = PyLong_FromLong((long) elsize);
Py_DECREF(typecode);
return ret;
}
@@ -1127,7 +1125,7 @@ gentype_itemsize_get(PyObject *self)
static PyObject *
gentype_size_get(PyObject *NPY_UNUSED(self))
{
- return PyInt_FromLong(1);
+ return PyLong_FromLong(1);
}
static PyObject *
@@ -1147,12 +1145,16 @@ gentype_sizeof(PyObject *self)
NPY_NO_EXPORT void
gentype_struct_free(PyObject *ptr)
{
- PyArrayInterface *arrif;
- PyObject *context;
-
- arrif = (PyArrayInterface*)PyCapsule_GetPointer(ptr, NULL);
- context = (PyObject *)PyCapsule_GetContext(ptr);
- Py_DECREF(context);
+ PyArrayInterface *arrif = (PyArrayInterface*)PyCapsule_GetPointer(ptr, NULL);
+ if (arrif == NULL) {
+ PyErr_WriteUnraisable(ptr);
+ return;
+ }
+ PyObject *context = (PyObject *)PyCapsule_GetContext(ptr);
+ if (context == NULL && PyErr_Occurred()) {
+ PyErr_WriteUnraisable(ptr);
+ }
+ Py_XDECREF(context);
Py_XDECREF(arrif->descr);
PyArray_free(arrif->shape);
PyArray_free(arrif);
@@ -1307,7 +1309,7 @@ gentype_imag_get(PyObject *self)
ret = PyObject_GetAttrString(obj, "imag");
if (ret == NULL) {
PyErr_Clear();
- obj = PyInt_FromLong(0);
+ obj = PyLong_FromLong(0);
newtype = PyArray_DescrFromType(NPY_OBJECT);
ret = PyArray_Scalar((char *)&obj, newtype, NULL);
Py_DECREF(newtype);
@@ -1743,13 +1745,8 @@ gentype_reduce(PyObject *self, PyObject *NPY_UNUSED(args))
if (arr == NULL) {
return NULL;
}
- /* arr.item() */
- PyObject *val = PyArray_GETITEM(arr, PyArray_DATA(arr));
- Py_DECREF(arr);
- if (val == NULL) {
- return NULL;
- }
- PyObject *tup = Py_BuildValue("NN", obj, val);
+ /* Use the whole array which handles structured void correctly */
+ PyObject *tup = Py_BuildValue("NN", obj, arr);
if (tup == NULL) {
return NULL;
}
@@ -2312,7 +2309,7 @@ voidtype_ass_subscript(PyVoidScalarObject *self, PyObject *ind, PyObject *val)
return -1;
}
- if (PyBaseString_Check(ind)) {
+ if (PyUnicode_Check(ind)) {
/*
* Much like in voidtype_setfield, we cannot simply use ndarray's
* __setitem__ since assignment to void scalars should not broadcast
@@ -2385,6 +2382,55 @@ static PySequenceMethods voidtype_as_sequence = {
};
+/*
+ * This function implements simple buffer export for user defined subclasses
+ * of `np.generic`. All other scalar types override the buffer export.
+ */
+static int
+gentype_arrtype_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+ if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) {
+ PyErr_Format(PyExc_TypeError,
+ "NumPy scalar %R can only be exported as a buffer without format.",
+ self);
+ return -1;
+ }
+ if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+ PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+ return -1;
+ }
+ PyArray_Descr *descr = PyArray_DescrFromScalar(self);
+ if (descr == NULL) {
+ return -1;
+ }
+ if (!PyDataType_ISUSERDEF(descr)) {
+ /* This path would also reject the (hopefully) impossible "object" */
+ PyErr_Format(PyExc_TypeError,
+ "user-defined scalar %R registered for built-in dtype %S? "
+ "This should be impossible.",
+ self, descr);
+ return -1;
+ }
+ view->ndim = 0;
+ view->len = descr->elsize;
+ view->itemsize = descr->elsize;
+ view->shape = NULL;
+ view->strides = NULL;
+ view->suboffsets = NULL;
+ view->readonly = 1; /* assume general (user) scalars are readonly. */
+ Py_INCREF(self);
+ view->obj = self;
+ view->buf = scalar_value(self, descr);
+ Py_DECREF(descr);
+ view->format = NULL;
+ return 0;
+}
+
+
+static PyBufferProcs gentype_arrtype_as_buffer = {
+ .bf_getbuffer = (getbufferproc)gentype_arrtype_getbuffer,
+};
+
/**begin repeat
* #name = bool, byte, short, int, long, longlong, ubyte, ushort, uint, ulong,
@@ -2403,6 +2449,7 @@ static int
@name@_getbuffer(PyObject *self, Py_buffer *view, int flags)
{
if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+ PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
return -1;
}
Py@Name@ScalarObject *scalar = (Py@Name@ScalarObject *)self;
@@ -2415,6 +2462,7 @@ static int
view->shape = NULL;
view->strides = NULL;
view->suboffsets = NULL;
+ view->readonly = 1;
Py_INCREF(self);
view->obj = self;
view->buf = &(scalar->obval);
@@ -2441,6 +2489,7 @@ static int
unicode_getbuffer(PyObject *self, Py_buffer *view, int flags)
{
if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+ PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
return -1;
}
PyUnicodeScalarObject *scalar = (PyUnicodeScalarObject *)self;
@@ -2452,6 +2501,7 @@ unicode_getbuffer(PyObject *self, Py_buffer *view, int flags)
view->shape = NULL;
view->strides = NULL;
view->suboffsets = NULL;
+ view->readonly = 1;
Py_INCREF(self);
view->obj = self;
@@ -2481,7 +2531,7 @@ unicode_getbuffer(PyObject *self, Py_buffer *view, int flags)
view->format = scalar->buffer_fmt;
}
else {
- scalar->buffer_fmt = PyObject_Malloc(22);
+ scalar->buffer_fmt = PyMem_Malloc(22);
if (scalar->buffer_fmt == NULL) {
Py_SETREF(view->obj, NULL);
return -1;
@@ -2508,6 +2558,7 @@ static int
@name@_getbuffer(PyObject *self, Py_buffer *view, int flags)
{
if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+ PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
return -1;
}
Py@Name@ScalarObject *scalar = (Py@Name@ScalarObject *)self;
@@ -2519,6 +2570,7 @@ static int
view->shape = &length;
view->strides = NULL;
view->suboffsets = NULL;
+ view->readonly = 1;
Py_INCREF(self);
view->obj = self;
@@ -2558,19 +2610,46 @@ NPY_NO_EXPORT PyTypeObject PyGenericArrType_Type = {
.tp_basicsize = sizeof(PyObject),
};
+
static void
void_dealloc(PyVoidScalarObject *v)
{
- _dealloc_cached_buffer_info((PyObject *)v);
-
if (v->flags & NPY_ARRAY_OWNDATA) {
npy_free_cache(v->obval, Py_SIZE(v));
}
Py_XDECREF(v->descr);
Py_XDECREF(v->base);
+ if (_buffer_info_free(v->_buffer_info, (PyObject *)v) < 0) {
+ PyErr_WriteUnraisable(NULL);
+ }
Py_TYPE(v)->tp_free(v);
}
+
+static PyObject *
+object_arrtype_alloc(PyTypeObject *type, Py_ssize_t items)
+{
+ /*
+ * Object scalars should not actually exist, if they exist we should
+ * consider it to be a bug.
+ */
+ static PyObject *visibleDeprecationWarning = NULL;
+ npy_cache_import("numpy", "VisibleDeprecationWarning",
+ &visibleDeprecationWarning);
+ if (visibleDeprecationWarning == NULL) {
+ return NULL;
+ }
+ if (PyErr_WarnEx(visibleDeprecationWarning,
+ "Creating a NumPy object scalar. NumPy object scalars should "
+ "never be created. If you see this message please inform the "
+ "NumPy developers. Since this message should never be shown "
+ "this will raise a TypeError in the future.", 1) < 0) {
+ return NULL;
+ }
+ return gentype_alloc(type, items);
+}
+
+
static void
object_arrtype_dealloc(PyObject *v)
{
@@ -2583,6 +2662,7 @@ unicode_arrtype_dealloc(PyObject *v)
{
/* note: may be null if it was never requested */
PyMem_Free(PyArrayScalar_VAL(v, Unicode));
+ PyMem_Free(((PyUnicodeScalarObject *)v)->buffer_fmt);
/* delegate to the base class */
PyUnicode_Type.tp_dealloc(v);
}
@@ -2868,7 +2948,7 @@ bool_arrtype_nonzero(PyObject *a)
* ulong, ulonglong#
* #Name = Byte, Short, Int, Long, UByte, UShort, LongLong, UInt,
* ULong, ULongLong#
- * #type = PyInt_FromLong*6, PyLong_FromLongLong*1,
+ * #type = PyLong_FromLong*6, PyLong_FromLongLong*1,
* PyLong_FromUnsignedLong*2, PyLong_FromUnsignedLongLong#
*/
static PyNumberMethods @name@_arrtype_as_number;
@@ -2897,7 +2977,7 @@ bool_index(PyObject *a)
return NULL;
}
else {
- return PyInt_FromLong(PyArrayScalar_VAL(a, Bool));
+ return PyLong_FromLong(PyArrayScalar_VAL(a, Bool));
}
}
@@ -2923,7 +3003,7 @@ void_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
* For a VOID scalar first see if obj is an integer or long
* and create new memory of that size (filled with 0) for the scalar
*/
- if (PyLong_Check(obj) || PyInt_Check(obj) ||
+ if (PyLong_Check(obj) ||
PyArray_IsScalar(obj, Integer) ||
(PyArray_Check(obj) &&
PyArray_NDIM((PyArrayObject *)obj)==0 &&
@@ -3298,6 +3378,7 @@ NPY_NO_EXPORT PyTypeObject PyObjectArrType_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
.tp_name = "numpy.object_",
.tp_basicsize = sizeof(PyObjectScalarObject),
+ .tp_alloc = object_arrtype_alloc,
.tp_dealloc = (destructor)object_arrtype_dealloc,
.tp_as_sequence = &object_arrtype_as_sequence,
.tp_as_mapping = &object_arrtype_as_mapping,
@@ -3770,6 +3851,7 @@ initialize_numeric_types(void)
PyGenericArrType_Type.tp_alloc = gentype_alloc;
PyGenericArrType_Type.tp_free = (freefunc)gentype_free;
PyGenericArrType_Type.tp_richcompare = gentype_richcompare;
+ PyGenericArrType_Type.tp_as_buffer = &gentype_arrtype_as_buffer;
PyBoolArrType_Type.tp_as_number = &bool_arrtype_as_number;
/*
diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c
index 30507112d..02c349759 100644
--- a/numpy/core/src/multiarray/shape.c
+++ b/numpy/core/src/multiarray/shape.c
@@ -133,7 +133,7 @@ PyArray_Resize(PyArrayObject *self, PyArray_Dims *newshape, int refcheck,
if (newnbytes > oldnbytes && PyArray_ISWRITEABLE(self)) {
/* Fill new memory with zeros */
if (PyDataType_FLAGCHK(PyArray_DESCR(self), NPY_ITEM_REFCOUNT)) {
- PyObject *zero = PyInt_FromLong(0);
+ PyObject *zero = PyLong_FromLong(0);
char *optr;
optr = PyArray_BYTES(self) + oldnbytes;
npy_intp n_new = newsize - oldsize;
@@ -332,7 +332,7 @@ _putzero(char *optr, PyObject *zero, PyArray_Descr *dtype)
for (i = 0; i < nsize; i++) {
Py_INCREF(zero);
- NPY_COPY_PYOBJECT_PTR(optr, &zero);
+ memcpy(optr, &zero, sizeof(zero));
optr += sizeof(zero);
}
}
@@ -458,14 +458,12 @@ _attempt_nocopy_reshape(PyArrayObject *self, int newnd, const npy_intp *newdims,
static void
raise_reshape_size_mismatch(PyArray_Dims *newshape, PyArrayObject *arr)
{
- PyObject *msg = PyUString_FromFormat("cannot reshape array of size %zd "
- "into shape ", PyArray_SIZE(arr));
PyObject *tmp = convert_shape_to_string(newshape->len, newshape->ptr, "");
-
- PyUString_ConcatAndDel(&msg, tmp);
- if (msg != NULL) {
- PyErr_SetObject(PyExc_ValueError, msg);
- Py_DECREF(msg);
+ if (tmp != NULL) {
+ PyErr_Format(PyExc_ValueError,
+ "cannot reshape array of size %zd into shape %S",
+ PyArray_SIZE(arr), tmp);
+ Py_DECREF(tmp);
}
}
@@ -979,55 +977,6 @@ PyArray_Flatten(PyArrayObject *a, NPY_ORDER order)
return (PyObject *)ret;
}
-/* See shape.h for parameters documentation */
-NPY_NO_EXPORT PyObject *
-build_shape_string(npy_intp n, npy_intp const *vals)
-{
- npy_intp i;
- PyObject *ret, *tmp;
-
- /*
- * Negative dimension indicates "newaxis", which can
- * be discarded for printing if it's a leading dimension.
- * Find the first non-"newaxis" dimension.
- */
- i = 0;
- while (i < n && vals[i] < 0) {
- ++i;
- }
-
- if (i == n) {
- return PyUString_FromFormat("()");
- }
- else {
- ret = PyUString_FromFormat("(%" NPY_INTP_FMT, vals[i++]);
- if (ret == NULL) {
- return NULL;
- }
- }
-
- for (; i < n; ++i) {
- if (vals[i] < 0) {
- tmp = PyUString_FromString(",newaxis");
- }
- else {
- tmp = PyUString_FromFormat(",%" NPY_INTP_FMT, vals[i]);
- }
- if (tmp == NULL) {
- Py_DECREF(ret);
- return NULL;
- }
-
- PyUString_ConcatAndDel(&ret, tmp);
- if (ret == NULL) {
- return NULL;
- }
- }
-
- tmp = PyUString_FromFormat(")");
- PyUString_ConcatAndDel(&ret, tmp);
- return ret;
-}
/*NUMPY_API
*
diff --git a/numpy/core/src/multiarray/shape.h b/numpy/core/src/multiarray/shape.h
index d25292556..875b5430f 100644
--- a/numpy/core/src/multiarray/shape.h
+++ b/numpy/core/src/multiarray/shape.h
@@ -2,13 +2,6 @@
#define _NPY_ARRAY_SHAPE_H_
/*
- * Builds a string representation of the shape given in 'vals'.
- * A negative value in 'vals' gets interpreted as newaxis.
- */
-NPY_NO_EXPORT PyObject *
-build_shape_string(npy_intp n, npy_intp const *vals);
-
-/*
* Creates a sorted stride perm matching the KEEPORDER behavior
* of the NpyIter object. Because this operates based on multiple
* input strides, the 'stride' member of the npy_stride_sort_item
diff --git a/numpy/core/src/multiarray/strfuncs.c b/numpy/core/src/multiarray/strfuncs.c
index 363cbdba2..d9d9b7c0a 100644
--- a/numpy/core/src/multiarray/strfuncs.c
+++ b/numpy/core/src/multiarray/strfuncs.c
@@ -3,14 +3,25 @@
#include <Python.h>
#include <numpy/arrayobject.h>
-
#include "npy_pycompat.h"
-
+#include "npy_import.h"
#include "strfuncs.h"
static PyObject *PyArray_StrFunction = NULL;
static PyObject *PyArray_ReprFunction = NULL;
+
+static void
+npy_PyErr_SetStringChained(PyObject *type, const char *message)
+{
+ PyObject *exc, *val, *tb;
+
+ PyErr_Fetch(&exc, &val, &tb);
+ PyErr_SetString(type, message);
+ npy_PyErr_ChainExceptionsCause(exc, val, tb);
+}
+
+
/*NUMPY_API
* Set the array print function to be a Python function.
*/
@@ -36,164 +47,52 @@ PyArray_SetStringFunction(PyObject *op, int repr)
}
-/*
- * Extend string. On failure, returns NULL and leaves *strp alone.
- * XXX we do this in multiple places; time for a string library?
- */
-static char *
-extend_str(char **strp, Py_ssize_t n, Py_ssize_t *maxp)
-{
- char *str = *strp;
- Py_ssize_t new_cap;
-
- if (n >= *maxp - 16) {
- new_cap = *maxp * 2;
-
- if (new_cap <= *maxp) { /* overflow */
- return NULL;
- }
- str = PyArray_realloc(*strp, new_cap);
- if (str != NULL) {
- *strp = str;
- *maxp = new_cap;
- }
- }
- return str;
-}
-
-
-static int
-dump_data(char **string, Py_ssize_t *n, Py_ssize_t *max_n, char *data, int nd,
- npy_intp const *dimensions, npy_intp const *strides, PyArrayObject* self)
-{
- PyObject *op = NULL, *sp = NULL;
- char *ostring;
- npy_intp i, N, ret = 0;
-
-#define CHECK_MEMORY do { \
- if (extend_str(string, *n, max_n) == NULL) { \
- ret = -1; \
- goto end; \
- } \
- } while (0)
-
- if (nd == 0) {
- if ((op = PyArray_GETITEM(self, data)) == NULL) {
- return -1;
- }
- sp = PyObject_Repr(op);
- if (sp == NULL) {
- ret = -1;
- goto end;
- }
- ostring = PyString_AsString(sp);
- N = PyString_Size(sp)*sizeof(char);
- *n += N;
- CHECK_MEMORY;
- memmove(*string + (*n - N), ostring, N);
- }
- else {
- CHECK_MEMORY;
- (*string)[*n] = '[';
- *n += 1;
- for (i = 0; i < dimensions[0]; i++) {
- if (dump_data(string, n, max_n,
- data + (*strides)*i,
- nd - 1, dimensions + 1,
- strides + 1, self) < 0) {
- return -1;
- }
- CHECK_MEMORY;
- if (i < dimensions[0] - 1) {
- (*string)[*n] = ',';
- (*string)[*n+1] = ' ';
- *n += 2;
- }
- }
- CHECK_MEMORY;
- (*string)[*n] = ']';
- *n += 1;
- }
-
-#undef CHECK_MEMORY
-
-end:
- Py_XDECREF(op);
- Py_XDECREF(sp);
- return ret;
-}
-
-
-static PyObject *
-array_repr_builtin(PyArrayObject *self, int repr)
-{
- PyObject *ret;
- char *string;
- /* max_n initial value is arbitrary, dump_data will extend it */
- Py_ssize_t n = 0, max_n = PyArray_NBYTES(self) * 4 + 7;
-
- if ((string = PyArray_malloc(max_n)) == NULL) {
- return PyErr_NoMemory();
- }
-
- if (dump_data(&string, &n, &max_n, PyArray_DATA(self),
- PyArray_NDIM(self), PyArray_DIMS(self),
- PyArray_STRIDES(self), self) < 0) {
- PyArray_free(string);
- return NULL;
- }
-
- if (repr) {
- if (PyArray_ISEXTENDED(self)) {
- ret = PyUString_FromFormat("array(%s, '%c%d')",
- string,
- PyArray_DESCR(self)->type,
- PyArray_DESCR(self)->elsize);
- }
- else {
- ret = PyUString_FromFormat("array(%s, '%c')",
- string,
- PyArray_DESCR(self)->type);
- }
- }
- else {
- ret = PyUString_FromStringAndSize(string, n);
- }
-
- PyArray_free(string);
- return ret;
-}
-
-
NPY_NO_EXPORT PyObject *
array_repr(PyArrayObject *self)
{
- PyObject *s;
+ static PyObject *repr = NULL;
- if (PyArray_ReprFunction == NULL) {
- s = array_repr_builtin(self, 1);
+ if (PyArray_ReprFunction != NULL) {
+ return PyObject_CallFunctionObjArgs(PyArray_ReprFunction, self, NULL);
}
- else {
- s = PyObject_CallFunctionObjArgs(PyArray_ReprFunction, self, NULL);
+
+ /*
+ * We need to do a delayed import here as initialization on module load
+ * leads to circular import problems.
+ */
+ npy_cache_import("numpy.core.arrayprint", "_default_array_repr", &repr);
+ if (repr == NULL) {
+ npy_PyErr_SetStringChained(PyExc_RuntimeError,
+ "Unable to configure default ndarray.__repr__");
+ return NULL;
}
- return s;
+ return PyObject_CallFunctionObjArgs(repr, self, NULL);
}
NPY_NO_EXPORT PyObject *
array_str(PyArrayObject *self)
{
- PyObject *s;
+ static PyObject *str = NULL;
- if (PyArray_StrFunction == NULL) {
- s = array_repr_builtin(self, 0);
+ if (PyArray_StrFunction != NULL) {
+ return PyObject_CallFunctionObjArgs(PyArray_StrFunction, self, NULL);
}
- else {
- s = PyObject_CallFunctionObjArgs(PyArray_StrFunction, self, NULL);
+
+ /*
+ * We need to do a delayed import here as initialization on module load leads
+ * to circular import problems.
+ */
+ npy_cache_import("numpy.core.arrayprint", "_default_array_str", &str);
+ if (str == NULL) {
+ npy_PyErr_SetStringChained(PyExc_RuntimeError,
+ "Unable to configure default ndarray.__str__");
+ return NULL;
}
- return s;
+ return PyObject_CallFunctionObjArgs(str, self, NULL);
}
+
NPY_NO_EXPORT PyObject *
array_format(PyArrayObject *self, PyObject *args)
{
@@ -221,4 +120,3 @@ array_format(PyArrayObject *self, PyObject *args)
);
}
}
-
diff --git a/numpy/core/src/multiarray/temp_elide.c b/numpy/core/src/multiarray/temp_elide.c
index 09b948218..b19dee418 100644
--- a/numpy/core/src/multiarray/temp_elide.c
+++ b/numpy/core/src/multiarray/temp_elide.c
@@ -62,12 +62,8 @@
#define NPY_ELIDE_DEBUG 0
#define NPY_MAX_STACKSIZE 10
-#if PY_VERSION_HEX >= 0x03060000
/* TODO can pep523 be used to somehow? */
#define PYFRAMEEVAL_FUNC "_PyEval_EvalFrameDefault"
-#else
-#define PYFRAMEEVAL_FUNC "PyEval_EvalFrameEx"
-#endif
/*
* Heuristic size of the array in bytes at which backtrace overhead generation
* becomes less than speed gained by in-place operations. Depends on stack depth
diff --git a/numpy/core/src/multiarray/usertypes.c b/numpy/core/src/multiarray/usertypes.c
index bc320138d..3eaf99196 100644
--- a/numpy/core/src/multiarray/usertypes.c
+++ b/numpy/core/src/multiarray/usertypes.c
@@ -38,6 +38,11 @@ maintainer email: oliphant.travis@ieee.org
#include "usertypes.h"
#include "dtypemeta.h"
+#include "scalartypes.h"
+#include "array_method.h"
+#include "convert_datatype.h"
+#include "legacy_dtype_implementation.h"
+
NPY_NO_EXPORT PyArray_Descr **userdescrs=NULL;
@@ -127,6 +132,9 @@ PyArray_InitArrFuncs(PyArray_ArrFuncs *f)
f->scalarkind = NULL;
f->cancastscalarkindto = NULL;
f->cancastto = NULL;
+ f->fastclip = NULL;
+ f->fastputmask = NULL;
+ f->fasttake = NULL;
}
@@ -192,7 +200,7 @@ PyArray_RegisterDataType(PyArray_Descr *descr)
}
}
typenum = NPY_USERDEF + NPY_NUMUSERTYPES;
- descr->type_num = typenum;
+ descr->type_num = -1;
if (PyDataType_ISUNSIZED(descr)) {
PyErr_SetString(PyExc_ValueError, "cannot register a" \
"flexible data-type");
@@ -215,6 +223,27 @@ PyArray_RegisterDataType(PyArray_Descr *descr)
PyErr_SetString(PyExc_ValueError, "missing typeobject");
return -1;
}
+ if (descr->flags & (NPY_ITEM_IS_POINTER | NPY_ITEM_REFCOUNT)) {
+ /*
+ * User dtype can't actually do reference counting, however, there
+ * are existing hacks (e.g. xpress), which use a structured one:
+ * dtype((xpress.var, [('variable', 'O')]))
+ * so we have to support this. But such a structure must be constant
+ * (i.e. fixed at registration time, this is the case for `xpress`).
+ */
+ if (descr->names == NULL || descr->fields == NULL ||
+ !PyDict_CheckExact(descr->fields)) {
+ PyErr_Format(PyExc_ValueError,
+ "Failed to register dtype for %S: Legacy user dtypes "
+ "using `NPY_ITEM_IS_POINTER` or `NPY_ITEM_REFCOUNT` are "
+ "unsupported. It is possible to create such a dtype only "
+ "if it is a structured dtype with names and fields "
+ "hardcoded at registration time.\n"
+ "Please contact the NumPy developers if this used to work "
+ "but now fails.", descr->typeobj);
+ return -1;
+ }
+ }
if (test_deprecated_arrfuncs_members(f) < 0) {
return -1;
@@ -226,9 +255,13 @@ PyArray_RegisterDataType(PyArray_Descr *descr)
PyErr_SetString(PyExc_MemoryError, "RegisterDataType");
return -1;
}
+
userdescrs[NPY_NUMUSERTYPES++] = descr;
+ descr->type_num = typenum;
if (dtypemeta_wrap_legacy_descriptor(descr) < 0) {
+ descr->type_num = -1;
+ NPY_NUMUSERTYPES--;
return -1;
}
@@ -260,11 +293,11 @@ PyArray_RegisterCastFunc(PyArray_Descr *descr, int totype,
return -1;
}
}
- key = PyInt_FromLong(totype);
+ key = PyLong_FromLong(totype);
if (PyErr_Occurred()) {
return -1;
}
- cobj = NpyCapsule_FromVoidPtr((void *)castfunc, NULL);
+ cobj = PyCapsule_New((void *)castfunc, NULL, NULL);
if (cobj == NULL) {
Py_DECREF(key);
return -1;
@@ -291,7 +324,7 @@ PyArray_RegisterCanCast(PyArray_Descr *descr, int totype,
if (!PyTypeNum_ISUSERDEF(descr->type_num) &&
!PyTypeNum_ISUSERDEF(totype)) {
PyErr_SetString(PyExc_ValueError,
- "At least one of the types provided to"
+ "At least one of the types provided to "
"RegisterCanCast must be user-defined.");
return -1;
}
@@ -339,3 +372,185 @@ PyArray_RegisterCanCast(PyArray_Descr *descr, int totype,
return _append_new(&descr->f->cancastscalarkindto[scalar], totype);
}
}
+
+
+/*
+ * Legacy user DTypes implemented the common DType operation
+ * (as used in type promotion/result_type, and e.g. the type for
+ * concatenation), by using "safe cast" logic.
+ *
+ * New DTypes do have this behaviour generally, but we use can-cast
+ * when legacy user dtypes are involved.
+ */
+NPY_NO_EXPORT PyArray_DTypeMeta *
+legacy_userdtype_common_dtype_function(
+ PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other)
+{
+ int skind1 = NPY_NOSCALAR, skind2 = NPY_NOSCALAR, skind;
+
+ if (!other->legacy) {
+ /* legacy DTypes can always defer to new style ones */
+ Py_INCREF(Py_NotImplemented);
+ return (PyArray_DTypeMeta *)Py_NotImplemented;
+ }
+ /* Defer so that only one of the types handles the cast */
+ if (cls->type_num < other->type_num) {
+ Py_INCREF(Py_NotImplemented);
+ return (PyArray_DTypeMeta *)Py_NotImplemented;
+ }
+
+ /* Check whether casting is possible from one type to the other */
+ if (PyArray_CanCastSafely(cls->type_num, other->type_num)) {
+ Py_INCREF(other);
+ return other;
+ }
+ if (PyArray_CanCastSafely(other->type_num, cls->type_num)) {
+ Py_INCREF(cls);
+ return cls;
+ }
+
+ /*
+ * The following code used to be part of PyArray_PromoteTypes().
+ * We can expect that this code is never used.
+ * In principle, it allows for promotion of two different user dtypes
+ * to a single NumPy dtype of the same "kind". In practice
+ * using the same `kind` as NumPy was never possible due to a
+ * simplification where `PyArray_EquivTypes(descr1, descr2)` will
+ * return True if both kind and element size match (e.g. bfloat16 and
+ * float16 would be equivalent).
+ * The option is also very obscure and not used in the examples.
+ */
+
+ /* Convert the 'kind' char into a scalar kind */
+ switch (cls->kind) {
+ case 'b':
+ skind1 = NPY_BOOL_SCALAR;
+ break;
+ case 'u':
+ skind1 = NPY_INTPOS_SCALAR;
+ break;
+ case 'i':
+ skind1 = NPY_INTNEG_SCALAR;
+ break;
+ case 'f':
+ skind1 = NPY_FLOAT_SCALAR;
+ break;
+ case 'c':
+ skind1 = NPY_COMPLEX_SCALAR;
+ break;
+ }
+ switch (other->kind) {
+ case 'b':
+ skind2 = NPY_BOOL_SCALAR;
+ break;
+ case 'u':
+ skind2 = NPY_INTPOS_SCALAR;
+ break;
+ case 'i':
+ skind2 = NPY_INTNEG_SCALAR;
+ break;
+ case 'f':
+ skind2 = NPY_FLOAT_SCALAR;
+ break;
+ case 'c':
+ skind2 = NPY_COMPLEX_SCALAR;
+ break;
+ }
+
+ /* If both are scalars, there may be a promotion possible */
+ if (skind1 != NPY_NOSCALAR && skind2 != NPY_NOSCALAR) {
+
+ /* Start with the larger scalar kind */
+ skind = (skind1 > skind2) ? skind1 : skind2;
+ int ret_type_num = _npy_smallest_type_of_kind_table[skind];
+
+ for (;;) {
+
+ /* If there is no larger type of this kind, try a larger kind */
+ if (ret_type_num < 0) {
+ ++skind;
+ /* Use -1 to signal no promoted type found */
+ if (skind < NPY_NSCALARKINDS) {
+ ret_type_num = _npy_smallest_type_of_kind_table[skind];
+ }
+ else {
+ break;
+ }
+ }
+
+ /* If we found a type to which we can promote both, done! */
+ if (PyArray_CanCastSafely(cls->type_num, ret_type_num) &&
+ PyArray_CanCastSafely(other->type_num, ret_type_num)) {
+ return PyArray_DTypeFromTypeNum(ret_type_num);
+ }
+
+ /* Try the next larger type of this kind */
+ ret_type_num = _npy_next_larger_type_table[ret_type_num];
+ }
+ }
+
+ Py_INCREF(Py_NotImplemented);
+ return (PyArray_DTypeMeta *)Py_NotImplemented;
+}
+
+
+/**
+ * This function wraps a legacy cast into an array-method. This is mostly
+ * used for legacy user-dtypes, but for example numeric to/from datetime
+ * casts were only defined that way as well.
+ *
+ * @param from
+ * @param to
+ * @param casting If `NPY_NO_CASTING` will check the legacy registered cast,
+ * otherwise uses the provided cast.
+ */
+NPY_NO_EXPORT int
+PyArray_AddLegacyWrapping_CastingImpl(
+ PyArray_DTypeMeta *from, PyArray_DTypeMeta *to, NPY_CASTING casting)
+{
+ if (casting < 0) {
+ if (from == to) {
+ casting = NPY_NO_CASTING;
+ }
+ else if (PyArray_LegacyCanCastTypeTo(
+ from->singleton, to->singleton, NPY_SAFE_CASTING)) {
+ casting = NPY_SAFE_CASTING;
+ }
+ else if (PyArray_LegacyCanCastTypeTo(
+ from->singleton, to->singleton, NPY_SAME_KIND_CASTING)) {
+ casting = NPY_SAME_KIND_CASTING;
+ }
+ else {
+ casting = NPY_UNSAFE_CASTING;
+ }
+ }
+
+ PyArray_DTypeMeta *dtypes[2] = {from, to};
+ PyArrayMethod_Spec spec = {
+ /* Name is not actually used, but allows identifying these. */
+ .name = "legacy_cast",
+ .nin = 1,
+ .nout = 1,
+ .casting = casting,
+ .dtypes = dtypes,
+ };
+
+ if (from == to) {
+ spec.flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_SUPPORTS_UNALIGNED;
+ PyType_Slot slots[] = {
+ {NPY_METH_get_loop, NULL},
+ {NPY_METH_resolve_descriptors, &legacy_same_dtype_resolve_descriptors},
+ {0, NULL}};
+ spec.slots = slots;
+ return PyArray_AddCastingImplementation_FromSpec(&spec, 1);
+ }
+ else {
+ spec.flags = NPY_METH_REQUIRES_PYAPI;
+ PyType_Slot slots[] = {
+ {NPY_METH_get_loop, NULL},
+ {NPY_METH_resolve_descriptors, &simple_cast_resolve_descriptors},
+ {0, NULL}};
+ spec.slots = slots;
+ return PyArray_AddCastingImplementation_FromSpec(&spec, 1);
+ }
+}
diff --git a/numpy/core/src/multiarray/usertypes.h b/numpy/core/src/multiarray/usertypes.h
index b3e386c5c..8b2fc80e6 100644
--- a/numpy/core/src/multiarray/usertypes.h
+++ b/numpy/core/src/multiarray/usertypes.h
@@ -1,6 +1,8 @@
#ifndef _NPY_PRIVATE_USERTYPES_H_
#define _NPY_PRIVATE_USERTYPES_H_
+#include "array_method.h"
+
extern NPY_NO_EXPORT PyArray_Descr **userdescrs;
NPY_NO_EXPORT void
@@ -17,4 +19,12 @@ NPY_NO_EXPORT int
PyArray_RegisterCastFunc(PyArray_Descr *descr, int totype,
PyArray_VectorUnaryFunc *castfunc);
+NPY_NO_EXPORT PyArray_DTypeMeta *
+legacy_userdtype_common_dtype_function(
+ PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other);
+
+NPY_NO_EXPORT int
+PyArray_AddLegacyWrapping_CastingImpl(
+ PyArray_DTypeMeta *from, PyArray_DTypeMeta *to, NPY_CASTING casting);
+
#endif
diff --git a/numpy/core/src/npymath/npy_math_internal.h.src b/numpy/core/src/npymath/npy_math_internal.h.src
index 18b6d1434..ff4663dc3 100644
--- a/numpy/core/src/npymath/npy_math_internal.h.src
+++ b/numpy/core/src/npymath/npy_math_internal.h.src
@@ -398,8 +398,8 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x)
/**end repeat1**/
/**begin repeat1
- * #kind = atan2,hypot,pow,fmod,copysign#
- * #KIND = ATAN2,HYPOT,POW,FMOD,COPYSIGN#
+ * #kind = atan2,hypot,pow,copysign#
+ * #KIND = ATAN2,HYPOT,POW,COPYSIGN#
*/
#ifdef @kind@@c@
#undef @kind@@c@
@@ -412,6 +412,32 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
#endif
/**end repeat1**/
+/**begin repeat1
+ * #kind = fmod#
+ * #KIND = FMOD#
+ */
+#ifdef @kind@@c@
+#undef @kind@@c@
+#endif
+#ifndef HAVE_MODF@C@
+NPY_INPLACE @type@
+npy_@kind@@c@(@type@ x, @type@ y)
+{
+ int are_inputs_inf = (npy_isinf(x) && npy_isinf(y));
+ /* force set invalid flag, doesn't raise by default on gcc < 8 */
+ if (npy_isnan(x) || npy_isnan(y)) {
+ npy_set_floatstatus_invalid();
+ }
+ if (are_inputs_inf || !y) {
+ if (!npy_isnan(x)) {
+ npy_set_floatstatus_invalid();
+ }
+ }
+ return (@type@) npy_@kind@((double)x, (double) y);
+}
+#endif
+/**end repeat1**/
+
#ifdef modf@c@
#undef modf@c@
#endif
@@ -473,8 +499,8 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x)
/**end repeat1**/
/**begin repeat1
- * #kind = atan2,hypot,pow,fmod,copysign#
- * #KIND = ATAN2,HYPOT,POW,FMOD,COPYSIGN#
+ * #kind = atan2,hypot,pow,copysign#
+ * #KIND = ATAN2,HYPOT,POW,COPYSIGN#
*/
#ifdef HAVE_@KIND@@C@
NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
@@ -484,6 +510,29 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
#endif
/**end repeat1**/
+/**begin repeat1
+ * #kind = fmod#
+ * #KIND = FMOD#
+ */
+#ifdef HAVE_FMOD@C@
+NPY_INPLACE @type@
+npy_@kind@@c@(@type@ x, @type@ y)
+{
+ int are_inputs_inf = (npy_isinf(x) && npy_isinf(y));
+ /* force set invalid flag, doesn't raise by default on gcc < 8 */
+ if (npy_isnan(x) || npy_isnan(y)) {
+ npy_set_floatstatus_invalid();
+ }
+ if (are_inputs_inf || !y) {
+ if (!npy_isnan(x)) {
+ npy_set_floatstatus_invalid();
+ }
+ }
+ return @kind@@c@(x, y);
+}
+#endif
+/**end repeat1**/
+
#ifdef HAVE_MODF@C@
NPY_INPLACE @type@ npy_modf@c@(@type@ x, @type@ *iptr)
{
@@ -625,6 +674,38 @@ NPY_INPLACE @type@ npy_logaddexp2@c@(@type@ x, @type@ y)
}
/*
+ * Wrapper function for remainder edge cases
+ * Internally calls npy_divmod*
+ */
+NPY_INPLACE @type@
+npy_remainder@c@(@type@ a, @type@ b)
+{
+ @type@ mod;
+ if (NPY_UNLIKELY(!b)) {
+ mod = npy_fmod@c@(a, b);
+ } else {
+ npy_divmod@c@(a, b, &mod);
+ }
+ return mod;
+}
+
+NPY_INPLACE @type@
+npy_floor_divide@c@(@type@ a, @type@ b) {
+ @type@ div, mod;
+ if (NPY_UNLIKELY(!b)) {
+ div = a / b;
+ if (!a || npy_isnan(a)) {
+ npy_set_floatstatus_invalid();
+ } else {
+ npy_set_floatstatus_divbyzero();
+ }
+ } else {
+ div = npy_divmod@c@(a, b, &mod);
+ }
+ return div;
+}
+
+/*
* Python version of divmod.
*
* The implementation is mostly copied from cpython 3.5.
@@ -634,12 +715,19 @@ npy_divmod@c@(@type@ a, @type@ b, @type@ *modulus)
{
@type@ div, mod, floordiv;
+ /* force set invalid flag, doesn't raise by default on gcc < 8 */
+ if (npy_isnan(a) || npy_isnan(b)) {
+ npy_set_floatstatus_invalid();
+ }
mod = npy_fmod@c@(a, b);
-
- if (!b) {
+ if (NPY_UNLIKELY(!b)) {
+ div = a / b;
+ if (a && !npy_isnan(a)) {
+ npy_set_floatstatus_divbyzero();
+ }
/* If b == 0, return result of fmod. For IEEE is nan */
*modulus = mod;
- return mod;
+ return div;
}
/* a - mod should be very nearly an integer multiple of b */
diff --git a/numpy/core/src/npymath/npy_math_private.h b/numpy/core/src/npymath/npy_math_private.h
index e4a919db6..212d11a0b 100644
--- a/numpy/core/src/npymath/npy_math_private.h
+++ b/numpy/core/src/npymath/npy_math_private.h
@@ -25,7 +25,6 @@
#include "npy_fpmath.h"
#include "numpy/npy_math.h"
-#include "numpy/npy_cpu.h"
#include "numpy/npy_endian.h"
#include "numpy/npy_common.h"
diff --git a/numpy/core/src/npysort/binsearch.c.src b/numpy/core/src/npysort/binsearch.c.src
index c04e197b7..41165897b 100644
--- a/numpy/core/src/npysort/binsearch.c.src
+++ b/numpy/core/src/npysort/binsearch.c.src
@@ -35,7 +35,7 @@
* #CMP = LT, LTE#
*/
-NPY_VISIBILITY_HIDDEN void
+NPY_NO_EXPORT void
binsearch_@side@_@suff@(const char *arr, const char *key, char *ret,
npy_intp arr_len, npy_intp key_len,
npy_intp arr_str, npy_intp key_str, npy_intp ret_str,
@@ -81,7 +81,7 @@ binsearch_@side@_@suff@(const char *arr, const char *key, char *ret,
}
}
-NPY_VISIBILITY_HIDDEN int
+NPY_NO_EXPORT int
argbinsearch_@side@_@suff@(const char *arr, const char *key,
const char *sort, char *ret,
npy_intp arr_len, npy_intp key_len,
@@ -153,7 +153,7 @@ argbinsearch_@side@_@suff@(const char *arr, const char *key,
* #CMP = <, <=#
*/
-NPY_VISIBILITY_HIDDEN void
+NPY_NO_EXPORT void
npy_binsearch_@side@(const char *arr, const char *key, char *ret,
npy_intp arr_len, npy_intp key_len,
npy_intp arr_str, npy_intp key_str, npy_intp ret_str,
@@ -195,7 +195,7 @@ npy_binsearch_@side@(const char *arr, const char *key, char *ret,
}
}
-NPY_VISIBILITY_HIDDEN int
+NPY_NO_EXPORT int
npy_argbinsearch_@side@(const char *arr, const char *key,
const char *sort, char *ret,
npy_intp arr_len, npy_intp key_len,
diff --git a/numpy/core/src/npysort/heapsort.c.src b/numpy/core/src/npysort/heapsort.c.src
index c2e3b63cb..4bfea1388 100644
--- a/numpy/core/src/npysort/heapsort.c.src
+++ b/numpy/core/src/npysort/heapsort.c.src
@@ -60,7 +60,7 @@
* npy_cdouble, npy_clongdouble, npy_datetime, npy_timedelta#
*/
-int
+NPY_NO_EXPORT int
heapsort_@suff@(void *start, npy_intp n, void *NOT_USED)
{
@type@ tmp, *a;
@@ -111,7 +111,7 @@ heapsort_@suff@(void *start, npy_intp n, void *NOT_USED)
}
-int
+NPY_NO_EXPORT int
aheapsort_@suff@(void *vv, npy_intp *tosort, npy_intp n, void *NOT_USED)
{
@type@ *v = vv;
@@ -177,7 +177,7 @@ aheapsort_@suff@(void *vv, npy_intp *tosort, npy_intp n, void *NOT_USED)
* #type = npy_char, npy_ucs4#
*/
-int
+NPY_NO_EXPORT int
heapsort_@suff@(void *start, npy_intp n, void *varr)
{
PyArrayObject *arr = varr;
@@ -231,7 +231,7 @@ heapsort_@suff@(void *start, npy_intp n, void *varr)
}
-int
+NPY_NO_EXPORT int
aheapsort_@suff@(void *vv, npy_intp *tosort, npy_intp n, void *varr)
{
@type@ *v = vv;
@@ -291,7 +291,7 @@ aheapsort_@suff@(void *vv, npy_intp *tosort, npy_intp n, void *varr)
*/
-int
+NPY_NO_EXPORT int
npy_heapsort(void *start, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -348,7 +348,7 @@ npy_heapsort(void *start, npy_intp num, void *varr)
}
-int
+NPY_NO_EXPORT int
npy_aheapsort(void *vv, npy_intp *tosort, npy_intp n, void *varr)
{
char *v = vv;
diff --git a/numpy/core/src/npysort/mergesort.c.src b/numpy/core/src/npysort/mergesort.c.src
index 6f659617a..f83fbf758 100644
--- a/numpy/core/src/npysort/mergesort.c.src
+++ b/numpy/core/src/npysort/mergesort.c.src
@@ -103,7 +103,7 @@ mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw)
}
-int
+NPY_NO_EXPORT int
mergesort_@suff@(void *start, npy_intp num, void *NOT_USED)
{
@type@ *pl, *pr, *pw;
@@ -166,7 +166,7 @@ amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw)
}
-int
+NPY_NO_EXPORT int
amergesort_@suff@(void *v, npy_intp *tosort, npy_intp num, void *NOT_USED)
{
npy_intp *pl, *pr, *pw;
@@ -245,7 +245,7 @@ mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw, @type@ *vp, size_t len)
}
-int
+NPY_NO_EXPORT int
mergesort_@suff@(void *start, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -326,7 +326,7 @@ amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw, size_t l
}
-int
+NPY_NO_EXPORT int
amergesort_@suff@(void *v, npy_intp *tosort, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -407,7 +407,7 @@ npy_mergesort0(char *pl, char *pr, char *pw, char *vp, npy_intp elsize,
}
-int
+NPY_NO_EXPORT int
npy_mergesort(void *start, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -485,7 +485,7 @@ npy_amergesort0(npy_intp *pl, npy_intp *pr, char *v, npy_intp *pw,
}
-int
+NPY_NO_EXPORT int
npy_amergesort(void *v, npy_intp *tosort, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
diff --git a/numpy/core/src/npysort/quicksort.c.src b/numpy/core/src/npysort/quicksort.c.src
index 49a2c4906..933f75808 100644
--- a/numpy/core/src/npysort/quicksort.c.src
+++ b/numpy/core/src/npysort/quicksort.c.src
@@ -85,7 +85,7 @@
* npy_cdouble, npy_clongdouble, npy_datetime, npy_timedelta#
*/
-int
+NPY_NO_EXPORT int
quicksort_@suff@(void *start, npy_intp num, void *NOT_USED)
{
@type@ vp;
@@ -160,7 +160,7 @@ stack_pop:
}
-int
+NPY_NO_EXPORT int
aquicksort_@suff@(void *vv, npy_intp* tosort, npy_intp num, void *NOT_USED)
{
@type@ *v = vv;
@@ -253,7 +253,7 @@ stack_pop:
* #type = npy_char, npy_ucs4#
*/
-int
+NPY_NO_EXPORT int
quicksort_@suff@(void *start, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -341,7 +341,7 @@ stack_pop:
}
-int
+NPY_NO_EXPORT int
aquicksort_@suff@(void *vv, npy_intp* tosort, npy_intp num, void *varr)
{
@type@ *v = vv;
@@ -434,7 +434,7 @@ stack_pop:
*/
-int
+NPY_NO_EXPORT int
npy_quicksort(void *start, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -539,7 +539,7 @@ stack_pop:
}
-int
+NPY_NO_EXPORT int
npy_aquicksort(void *vv, npy_intp* tosort, npy_intp num, void *varr)
{
char *v = vv;
diff --git a/numpy/core/src/npysort/radixsort.c.src b/numpy/core/src/npysort/radixsort.c.src
index 72887d7e4..99d8ed42a 100644
--- a/numpy/core/src/npysort/radixsort.c.src
+++ b/numpy/core/src/npysort/radixsort.c.src
@@ -46,7 +46,7 @@ nth_byte_@suff@(@type@ key, npy_intp l) {
return (key >> (l << 3)) & 0xFF;
}
-@type@*
+static @type@*
radixsort0_@suff@(@type@ *arr, @type@ *aux, npy_intp num)
{
npy_intp cnt[sizeof(@type@)][1 << 8] = { { 0 } };
@@ -95,7 +95,7 @@ radixsort0_@suff@(@type@ *arr, @type@ *aux, npy_intp num)
return arr;
}
-int
+NPY_NO_EXPORT int
radixsort_@suff@(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
void *sorted;
@@ -136,7 +136,7 @@ radixsort_@suff@(void *start, npy_intp num, void *NPY_UNUSED(varr))
return 0;
}
-npy_intp*
+static npy_intp*
aradixsort0_@suff@(@type@ *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
{
npy_intp cnt[sizeof(@type@)][1 << 8] = { { 0 } };
@@ -185,7 +185,7 @@ aradixsort0_@suff@(@type@ *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
return tosort;
}
-int
+NPY_NO_EXPORT int
aradixsort_@suff@(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
{
npy_intp *sorted;
diff --git a/numpy/core/src/npysort/selection.c.src b/numpy/core/src/npysort/selection.c.src
index be645450f..0e285b320 100644
--- a/numpy/core/src/npysort/selection.c.src
+++ b/numpy/core/src/npysort/selection.c.src
@@ -280,7 +280,7 @@ static int
* kth 8: 0 1 2 3 4 5 6 [8 7] -> stack []
*
*/
-int
+NPY_NO_EXPORT int
@name@introselect_@suff@(@type@ *v,
#if @arg@
npy_intp* tosort,
@@ -323,7 +323,8 @@ int
store_pivot(kth, kth, pivots, npiv);
return 0;
}
- else if (@inexact@ && kth == num - 1) {
+ // Parenthesis around @inexact@ tells clang dead code as intentional
+ else if ((@inexact@) && kth == num - 1) {
/* useful to check if NaN present via partition(d, (x, -1)) */
npy_intp k;
npy_intp maxidx = low;
diff --git a/numpy/core/src/npysort/timsort.c.src b/numpy/core/src/npysort/timsort.c.src
index 26313ca5b..3fdd46f61 100644
--- a/numpy/core/src/npysort/timsort.c.src
+++ b/numpy/core/src/npysort/timsort.c.src
@@ -42,7 +42,7 @@
-npy_intp compute_min_run(npy_intp num)
+static npy_intp compute_min_run(npy_intp num)
{
npy_intp r = 0;
@@ -476,7 +476,7 @@ force_collapse_@suff@(@type@ *arr, run *stack, npy_intp *stack_ptr,
}
-int
+NPY_NO_EXPORT int
timsort_@suff@(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
int ret;
@@ -854,7 +854,7 @@ aforce_collapse_@suff@(@type@ *arr, npy_intp *tosort, run *stack,
}
-int
+NPY_NO_EXPORT int
atimsort_@suff@(void *v, npy_intp *tosort, npy_intp num,
void *NPY_UNUSED(varr))
{
@@ -904,7 +904,7 @@ cleanup:
* run length to reduce the cost of insertion sort.
*/
-npy_intp compute_min_run_short(npy_intp num)
+static npy_intp compute_min_run_short(npy_intp num)
{
npy_intp r = 0;
@@ -1303,7 +1303,7 @@ force_collapse_@suff@(@type@ *arr, run *stack, npy_intp *stack_ptr,
}
-int
+NPY_NO_EXPORT int
timsort_@suff@(void *start, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -1691,7 +1691,7 @@ aforce_collapse_@suff@(@type@ *arr, npy_intp *tosort, run *stack,
}
-int
+NPY_NO_EXPORT int
atimsort_@suff@(void *start, npy_intp *tosort, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -2128,7 +2128,7 @@ npy_force_collapse(char *arr, run *stack, npy_intp *stack_ptr,
}
-int
+NPY_NO_EXPORT int
npy_timsort(void *start, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -2524,7 +2524,7 @@ npy_aforce_collapse(char *arr, npy_intp *tosort, run *stack,
}
-int
+NPY_NO_EXPORT int
npy_atimsort(void *start, npy_intp *tosort, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
diff --git a/numpy/core/src/umath/_rational_tests.c.src b/numpy/core/src/umath/_rational_tests.c.src
index 13e33d0a5..7b1e5627a 100644
--- a/numpy/core/src/umath/_rational_tests.c.src
+++ b/numpy/core/src/umath/_rational_tests.c.src
@@ -406,8 +406,9 @@ pyrational_new(PyTypeObject* type, PyObject* args, PyObject* kwds) {
Py_INCREF(x[0]);
return x[0];
}
- else if (PyString_Check(x[0])) {
- const char* s = PyString_AS_STRING(x[0]);
+ // TODO: allow construction from unicode strings
+ else if (PyBytes_Check(x[0])) {
+ const char* s = PyBytes_AS_STRING(x[0]);
rational x;
if (scan_rational(&s,&x)) {
const char* p;
@@ -429,7 +430,7 @@ pyrational_new(PyTypeObject* type, PyObject* args, PyObject* kwds) {
PyObject* y;
int eq;
x[i] = PyTuple_GET_ITEM(args, i);
- n[i] = PyInt_AsLong(x[i]);
+ n[i] = PyLong_AsLong(x[i]);
if (error_converting(n[i])) {
if (PyErr_ExceptionMatches(PyExc_TypeError)) {
PyErr_Format(PyExc_TypeError,
@@ -440,7 +441,7 @@ pyrational_new(PyTypeObject* type, PyObject* args, PyObject* kwds) {
return 0;
}
/* Check that we had an exact integer */
- y = PyInt_FromLong(n[i]);
+ y = PyLong_FromLong(n[i]);
if (!y) {
return 0;
}
@@ -477,7 +478,7 @@ pyrational_new(PyTypeObject* type, PyObject* args, PyObject* kwds) {
else { \
PyObject* y_; \
int eq_; \
- long n_ = PyInt_AsLong(object); \
+ long n_ = PyLong_AsLong(object); \
if (error_converting(n_)) { \
if (PyErr_ExceptionMatches(PyExc_TypeError)) { \
PyErr_Clear(); \
@@ -486,7 +487,7 @@ pyrational_new(PyTypeObject* type, PyObject* args, PyObject* kwds) {
} \
return 0; \
} \
- y_ = PyInt_FromLong(n_); \
+ y_ = PyLong_FromLong(n_); \
if (!y_) { \
return 0; \
} \
@@ -526,11 +527,11 @@ static PyObject*
pyrational_repr(PyObject* self) {
rational x = ((PyRational*)self)->r;
if (d(x)!=1) {
- return PyUString_FromFormat(
+ return PyUnicode_FromFormat(
"rational(%ld,%ld)",(long)x.n,(long)d(x));
}
else {
- return PyUString_FromFormat(
+ return PyUnicode_FromFormat(
"rational(%ld)",(long)x.n);
}
}
@@ -539,11 +540,11 @@ static PyObject*
pyrational_str(PyObject* self) {
rational x = ((PyRational*)self)->r;
if (d(x)!=1) {
- return PyUString_FromFormat(
+ return PyUnicode_FromFormat(
"%ld/%ld",(long)x.n,(long)d(x));
}
else {
- return PyUString_FromFormat(
+ return PyUnicode_FromFormat(
"%ld",(long)x.n);
}
}
@@ -590,7 +591,7 @@ RATIONAL_BINOP_2(floor_divide,
}
RATIONAL_UNOP(negative,rational,rational_negative(x),PyRational_FromRational)
RATIONAL_UNOP(absolute,rational,rational_abs(x),PyRational_FromRational)
-RATIONAL_UNOP(int,long,rational_int(x),PyInt_FromLong)
+RATIONAL_UNOP(int,long,rational_int(x),PyLong_FromLong)
RATIONAL_UNOP(float,double,rational_double(x),PyFloat_FromDouble)
static PyObject*
@@ -646,12 +647,12 @@ static PyNumberMethods pyrational_as_number = {
static PyObject*
pyrational_n(PyObject* self, void* closure) {
- return PyInt_FromLong(((PyRational*)self)->r.n);
+ return PyLong_FromLong(((PyRational*)self)->r.n);
}
static PyObject*
pyrational_d(PyObject* self, void* closure) {
- return PyInt_FromLong(d(((PyRational*)self)->r));
+ return PyLong_FromLong(d(((PyRational*)self)->r));
}
static PyGetSetDef pyrational_getset[] = {
@@ -662,7 +663,7 @@ static PyGetSetDef pyrational_getset[] = {
static PyTypeObject PyRational_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
- "rational", /* tp_name */
+ "numpy.core._rational_tests.rational", /* tp_name */
sizeof(PyRational), /* tp_basicsize */
0, /* tp_itemsize */
0, /* tp_dealloc */
@@ -726,17 +727,17 @@ npyrational_setitem(PyObject* item, void* data, void* arr) {
r = ((PyRational*)item)->r;
}
else {
- long n = PyInt_AsLong(item);
+ long long n = PyLong_AsLongLong(item);
PyObject* y;
int eq;
if (error_converting(n)) {
return -1;
}
- y = PyInt_FromLong(n);
+ y = PyLong_FromLongLong(n);
if (!y) {
return -1;
}
- eq = PyObject_RichCompareBool(item,y,Py_EQ);
+ eq = PyObject_RichCompareBool(item, y, Py_EQ);
Py_DECREF(y);
if (eq<0) {
return -1;
@@ -748,7 +749,7 @@ npyrational_setitem(PyObject* item, void* data, void* arr) {
}
r = make_rational_int(n);
}
- memcpy(data,&r,sizeof(rational));
+ memcpy(data, &r, sizeof(rational));
return 0;
}
@@ -1126,7 +1127,7 @@ PyMODINIT_FUNC PyInit__rational_tests(void) {
if (PyErr_Occurred()) {
goto fail;
}
- numpy_str = PyUString_FromString("numpy");
+ numpy_str = PyUnicode_FromString("numpy");
if (!numpy_str) {
goto fail;
}
diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src
index d08aabd64..750fbeb92 100644
--- a/numpy/core/src/umath/_umath_tests.c.src
+++ b/numpy/core/src/umath/_umath_tests.c.src
@@ -461,6 +461,15 @@ addUfuncs(PyObject *dictionary) {
PyDict_SetItemString(dictionary, "cross1d", f);
Py_DECREF(f);
+ f = PyUFunc_FromFuncAndDataAndSignature(NULL, NULL,
+ NULL, 0, 0, 0, PyUFunc_None, "_pickleable_module_global.ufunc",
+ "A dotted name for pickle testing, does nothing.", 0, NULL);
+ if (f == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(dictionary, "_pickleable_module_global_ufunc", f);
+ Py_DECREF(f);
+
return 0;
}
@@ -480,7 +489,7 @@ UMath_Tests_test_signature(PyObject *NPY_UNUSED(dummy), PyObject *args)
return NULL;
}
- if (PyString_Check(signature)) {
+ if (PyBytes_Check(signature)) {
sig_str = signature;
} else if (PyUnicode_Check(signature)) {
sig_str = PyUnicode_AsUTF8String(signature);
@@ -493,7 +502,7 @@ UMath_Tests_test_signature(PyObject *NPY_UNUSED(dummy), PyObject *args)
NULL, NULL, NULL,
0, nin, nout, PyUFunc_None, "no name",
"doc:none",
- 1, PyString_AS_STRING(sig_str));
+ 1, PyBytes_AS_STRING(sig_str));
if (sig_str != signature) {
Py_DECREF(sig_str);
}
@@ -588,11 +597,11 @@ static PyObject *
UMath_Tests_test_dispatch(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(dummy2))
{
const char *highest_func, *highest_var;
- NPY_CPU_DISPATCH_CALL(highest_func = _umath_tests_dispatch_func, ())
- NPY_CPU_DISPATCH_CALL(highest_var = _umath_tests_dispatch_var)
+ NPY_CPU_DISPATCH_CALL(highest_func = _umath_tests_dispatch_func, ());
+ NPY_CPU_DISPATCH_CALL(highest_var = _umath_tests_dispatch_var);
const char *highest_func_xb = "nobase", *highest_var_xb = "nobase";
- NPY_CPU_DISPATCH_CALL_XB(highest_func_xb = _umath_tests_dispatch_func, ())
- NPY_CPU_DISPATCH_CALL_XB(highest_var_xb = _umath_tests_dispatch_var)
+ NPY_CPU_DISPATCH_CALL_XB(highest_func_xb = _umath_tests_dispatch_func, ());
+ NPY_CPU_DISPATCH_CALL_XB(highest_var_xb = _umath_tests_dispatch_var);
PyObject *dict = PyDict_New(), *item;
if (dict == NULL) {
@@ -610,7 +619,7 @@ UMath_Tests_test_dispatch(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(dumm
if (item == NULL || PyDict_SetItemString(dict, "all", item) < 0) {
goto err;
}
- NPY_CPU_DISPATCH_CALL_ALL(_umath_tests_dispatch_attach, (item))
+ NPY_CPU_DISPATCH_CALL_ALL(_umath_tests_dispatch_attach, (item));
if (PyErr_Occurred()) {
goto err;
}
@@ -671,7 +680,7 @@ PyMODINIT_FUNC PyInit__umath_tests(void) {
d = PyModule_GetDict(m);
- version = PyString_FromString("0.1");
+ version = PyUnicode_FromString("0.1");
PyDict_SetItemString(d, "__version__", version);
Py_DECREF(version);
diff --git a/numpy/core/src/umath/extobj.c b/numpy/core/src/umath/extobj.c
index 3404a0c6a..cd81f7734 100644
--- a/numpy/core/src/umath/extobj.c
+++ b/numpy/core/src/umath/extobj.c
@@ -109,8 +109,8 @@ _error_handler(int method, PyObject *errobj, char *errtype, int retstatus, int *
errtype, name);
goto fail;
}
- args = Py_BuildValue("NN", PyUString_FromString(errtype),
- PyInt_FromLong((long) retstatus));
+ args = Py_BuildValue("NN", PyUnicode_FromString(errtype),
+ PyLong_FromLong((long) retstatus));
if (args == NULL) {
goto fail;
}
@@ -212,7 +212,7 @@ _extract_pyvals(PyObject *ref, const char *name, int *bufsize,
}
if (bufsize != NULL) {
- *bufsize = PyInt_AsLong(PyList_GET_ITEM(ref, 0));
+ *bufsize = PyLong_AsLong(PyList_GET_ITEM(ref, 0));
if (error_converting(*bufsize)) {
return -1;
}
@@ -229,7 +229,7 @@ _extract_pyvals(PyObject *ref, const char *name, int *bufsize,
}
if (errmask != NULL) {
- *errmask = PyInt_AsLong(PyList_GET_ITEM(ref, 1));
+ *errmask = PyLong_AsLong(PyList_GET_ITEM(ref, 1));
if (*errmask < 0) {
if (PyErr_Occurred()) {
return -1;
diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index 74bf01643..5c22c6f1c 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -46,14 +46,20 @@ abs_ptrdiff(char *a, char *b)
npy_intp i;\
for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)
-/** (ip1, ip2) -> (op1) */
-#define BINARY_LOOP\
+#define BINARY_DEFS\
char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
npy_intp n = dimensions[0];\
npy_intp i;\
+
+#define BINARY_LOOP_SLIDING\
for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1)
+/** (ip1, ip2) -> (op1) */
+#define BINARY_LOOP\
+ BINARY_DEFS\
+ BINARY_LOOP_SLIDING
+
/** (ip1, ip2) -> (op1, op2) */
#define BINARY_LOOP_TWO_OUT\
char *ip1 = args[0], *ip2 = args[1], *op1 = args[2], *op2 = args[3];\
@@ -155,10 +161,7 @@ abs_ptrdiff(char *a, char *b)
#define IVDEP_LOOP
#endif
#define BASE_BINARY_LOOP_INP(tin, tout, op) \
- char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
- npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
- npy_intp n = dimensions[0];\
- npy_intp i;\
+ BINARY_DEFS\
IVDEP_LOOP \
for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \
const tin in1 = *(tin *)ip1; \
diff --git a/numpy/core/src/umath/funcs.inc.src b/numpy/core/src/umath/funcs.inc.src
index 273779ee8..9b04dc779 100644
--- a/numpy/core/src/umath/funcs.inc.src
+++ b/numpy/core/src/umath/funcs.inc.src
@@ -26,13 +26,13 @@ Py_square(PyObject *o)
static PyObject *
Py_get_one(PyObject *NPY_UNUSED(o))
{
- return PyInt_FromLong(1);
+ return PyLong_FromLong(1);
}
static PyObject *
Py_reciprocal(PyObject *o)
{
- PyObject *one = PyInt_FromLong(1);
+ PyObject *one = PyLong_FromLong(1);
PyObject *result;
if (!one) {
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 0cfa1cea7..6403efaee 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -20,6 +20,9 @@
#include <string.h> /* for memchr */
+/* Use Libdivide for faster division */
+#include "numpy/libdivide/libdivide.h"
+
/*
* cutoff blocksize for pairwise summation
* decreasing it decreases errors slightly as more pairs are summed but
@@ -570,7 +573,7 @@ NPY_NO_EXPORT void
/**begin repeat1
* #isa = , _avx2#
* #ISA = , AVX2#
- * #CHK = 1, HAVE_ATTRIBUTE_TARGET_AVX2#
+ * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)#
* #ATTR = , NPY_GCC_TARGET_AVX2#
*/
@@ -840,28 +843,88 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
}
+/* Libdivide only supports 32 and 64 bit types
+ * We try to pick the best possible one */
+#if NPY_BITSOF_@TYPE@ <= 32
+#define libdivide_@type@_t libdivide_s32_t
+#define libdivide_@type@_gen libdivide_s32_gen
+#define libdivide_@type@_do libdivide_s32_do
+#else
+#define libdivide_@type@_t libdivide_s64_t
+#define libdivide_@type@_gen libdivide_s64_gen
+#define libdivide_@type@_do libdivide_s64_do
+#endif
+
NPY_NO_EXPORT void
@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- BINARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
+ BINARY_DEFS
+
+ /* When the divisor is a constant, use libdivide for faster division */
+ if (steps[1] == 0) {
+ /* In case of empty array, just return */
+ if (n == 0) {
+ return;
+ }
+
const @type@ in2 = *(@type@ *)ip2;
- /*
- * FIXME: On x86 at least, dividing the smallest representable integer
- * by -1 causes a SIFGPE (division overflow). We treat this case here
- * (to avoid a SIGFPE crash at python level), but a good solution would
- * be to treat integer division problems separately from FPU exceptions
- * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
- */
- if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
+
+ /* If divisor is 0, we need not compute anything */
+ if (in2 == 0) {
npy_set_floatstatus_divbyzero();
- *((@type@ *)op1) = 0;
- }
- else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
- *((@type@ *)op1) = in1/in2 - 1;
+ BINARY_LOOP_SLIDING {
+ *((@type@ *)op1) = 0;
+ }
}
else {
- *((@type@ *)op1) = in1/in2;
+ struct libdivide_@type@_t fast_d = libdivide_@type@_gen(in2);
+ BINARY_LOOP_SLIDING {
+ const @type@ in1 = *(@type@ *)ip1;
+ /*
+ * FIXME: On x86 at least, dividing the smallest representable integer
+ * by -1 causes a SIGFPE (division overflow). We treat this case here
+ * (to avoid a SIGFPE crash at python level), but a good solution would
+ * be to treat integer division problems separately from FPU exceptions
+ * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+ */
+ if (in1 == NPY_MIN_@TYPE@ && in2 == -1) {
+ npy_set_floatstatus_divbyzero();
+ *((@type@ *)op1) = 0;
+ }
+ else {
+ *((@type@ *)op1) = libdivide_@type@_do(in1, &fast_d);
+
+ /* Negative quotients need to be rounded down */
+ if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) {
+ *((@type@ *)op1) = *((@type@ *)op1) - 1;
+ }
+ }
+ }
+ }
+ }
+ else {
+ BINARY_LOOP_SLIDING {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ /*
+ * FIXME: On x86 at least, dividing the smallest representable integer
+ * by -1 causes a SIGFPE (division overflow). We treat this case here
+ * (to avoid a SIGFPE crash at python level), but a good solution would
+ * be to treat integer division problems separately from FPU exceptions
+ * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+ */
+ if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
+ npy_set_floatstatus_divbyzero();
+ *((@type@ *)op1) = 0;
+ }
+ else {
+ *((@type@ *)op1) = in1/in2;
+
+ /* Negative quotients need to be rounded down */
+ if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) {
+ *((@type@ *)op1) = *((@type@ *)op1) - 1;
+ }
+ }
}
}
}
@@ -1352,14 +1415,48 @@ TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const
NPY_NO_EXPORT void
TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- BINARY_LOOP {
- const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ /* NOTE: This code is similar to array floor divide */
+ BINARY_DEFS
+
+ /* When the divisor is a constant, use libdivide for faster division */
+ if (steps[1] == 0) {
+ /* In case of empty array, just return */
+ if (n == 0) {
+ return;
+ }
+
const npy_int64 in2 = *(npy_int64 *)ip2;
- if (in1 == NPY_DATETIME_NAT || in2 == 0) {
- *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+
+ /* If divisor is 0, we need not compute anything */
+ if (in2 == 0) {
+ npy_set_floatstatus_divbyzero();
+ BINARY_LOOP_SLIDING {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
}
else {
- *((npy_timedelta *)op1) = in1 / in2;
+ struct libdivide_s64_t fast_d = libdivide_s64_gen(in2);
+ BINARY_LOOP_SLIDING {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ if (in1 == NPY_DATETIME_NAT) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+                    *((npy_timedelta *)op1) = libdivide_s64_do(in1, &fast_d);
+ }
+ }
+ }
+ }
+ else {
+ BINARY_LOOP_SLIDING {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const npy_int64 in2 = *(npy_int64 *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == 0) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ *((npy_timedelta *)op1) = in1 / in2;
+ }
}
}
}
@@ -1431,23 +1528,69 @@ TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const
NPY_NO_EXPORT void
TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- BINARY_LOOP {
- const npy_timedelta in1 = *(npy_timedelta *)ip1;
- const npy_timedelta in2 = *(npy_timedelta *)ip2;
- if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
- npy_set_floatstatus_invalid();
- *((npy_int64 *)op1) = 0;
+ /* NOTE: This code is similar to array floor divide */
+ BINARY_DEFS
+
+ /* When the divisor is a constant, use libdivide for faster division */
+ if (steps[1] == 0) {
+ /* In case of empty array, just return */
+ if (n == 0) {
+ return;
}
- else if (in2 == 0) {
+
+ const npy_timedelta in2 = *(npy_timedelta *)ip2;
+
+ /* If divisor is 0 or NAT, we need not compute anything */
+ if (in2 == 0) {
npy_set_floatstatus_divbyzero();
- *((npy_int64 *)op1) = 0;
+ BINARY_LOOP_SLIDING {
+ *((npy_int64 *)op1) = 0;
+ }
+ }
+ else if (in2 == NPY_DATETIME_NAT) {
+ npy_set_floatstatus_invalid();
+ BINARY_LOOP_SLIDING {
+ *((npy_int64 *)op1) = 0;
+ }
}
else {
- if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
- *((npy_int64 *)op1) = in1/in2 - 1;
+ struct libdivide_s64_t fast_d = libdivide_s64_gen(in2);
+ BINARY_LOOP_SLIDING {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ if (in1 == NPY_DATETIME_NAT) {
+ npy_set_floatstatus_invalid();
+ *((npy_int64 *)op1) = 0;
+ }
+ else {
+ *((npy_int64 *)op1) = libdivide_s64_do(in1, &fast_d);
+
+                    /* Negative quotients need to be rounded down */
+ if (((in1 > 0) != (in2 > 0)) && (*((npy_int64 *)op1) * in2 != in1)) {
+ *((npy_int64 *)op1) = *((npy_int64 *)op1) - 1;
+ }
+ }
+ }
+ }
+ }
+ else {
+ BINARY_LOOP_SLIDING {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const npy_timedelta in2 = *(npy_timedelta *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
+ npy_set_floatstatus_invalid();
+ *((npy_int64 *)op1) = 0;
+ }
+ else if (in2 == 0) {
+ npy_set_floatstatus_divbyzero();
+ *((npy_int64 *)op1) = 0;
}
else {
*((npy_int64 *)op1) = in1/in2;
+
+                /* Negative quotients need to be rounded down */
+ if (((in1 > 0) != (in2 > 0)) && (*((npy_int64 *)op1) * in2 != in1)) {
+ *((npy_int64 *)op1) = *((npy_int64 *)op1) - 1;
+ }
}
}
}
@@ -1491,26 +1634,6 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
*/
/**begin repeat
- * Float types
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #scalarf = npy_sqrtf, npy_sqrt#
- */
-
-NPY_NO_EXPORT void
-@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (!run_unary_simd_sqrt_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = @scalarf@(in1);
- }
- }
-}
-
-/**end repeat**/
-
-/**begin repeat
* #func = rint, ceil, floor, trunc#
* #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
*/
@@ -1558,6 +1681,14 @@ DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void
*(npy_double *)op1 = npy_exp(in1);
}
}
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP {
+ const npy_double in1 = *(npy_double *)ip1;
+ *(npy_double *)op1 = npy_log(in1);
+ }
+}
/**begin repeat
* #isa = avx512f, fma#
@@ -1571,53 +1702,6 @@ DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void
* #typesub = f, #
*/
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_sqrt_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = npy_sqrt@typesub@(in1);
- }
- }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_absolute_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- const @type@ tmp = in1 > 0 ? in1 : -in1;
- /* add 0 to clear -0.0 */
- *((@type@ *)op1) = tmp + 0;
- }
- }
- npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_square_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = in1*in1;
- }
- }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_reciprocal_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = 1.0f/in1;
- }
- }
-}
-
/**begin repeat2
* #func = rint, ceil, floor, trunc#
* #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
@@ -1700,6 +1784,16 @@ DOUBLE_exp_avx512f(char **args, npy_intp const *dimensions, npy_intp const *step
}
}
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ if (!run_unary_avx512f_log_DOUBLE(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const npy_double in1 = *(npy_double *)ip1;
+ *(npy_double *)op1 = npy_log(in1);
+ }
+ }
+}
/**begin repeat
* Float types
@@ -2004,8 +2098,7 @@ NPY_NO_EXPORT void
BINARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
const @type@ in2 = *(@type@ *)ip2;
- @type@ mod;
- *((@type@ *)op1) = npy_divmod@c@(in1, in2, &mod);
+ *((@type@ *)op1) = npy_floor_divide@c@(in1, in2);
}
}
@@ -2015,7 +2108,7 @@ NPY_NO_EXPORT void
BINARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
const @type@ in2 = *(@type@ *)ip2;
- npy_divmod@c@(in1, in2, (@type@ *)op1);
+ *((@type@ *) op1) = npy_remainder@c@(in1, in2);
}
}
@@ -2030,33 +2123,6 @@ NPY_NO_EXPORT void
}
NPY_NO_EXPORT void
-@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- char * margs[] = {args[0], args[0], args[1]};
- npy_intp msteps[] = {steps[0], steps[0], steps[1]};
- if (!run_binary_simd_multiply_@TYPE@(margs, dimensions, msteps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *((@type@ *)op1) = in1*in1;
- }
- }
-}
-
-NPY_NO_EXPORT void
-@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- @type@ one = 1.@c@;
- char * margs[] = {(char*)&one, args[0], args[1]};
- npy_intp msteps[] = {0, steps[0], steps[1]};
- if (!run_binary_simd_divide_@TYPE@(margs, dimensions, msteps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *((@type@ *)op1) = 1/in1;
- }
- }
-}
-
-NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
OUTPUT_LOOP {
@@ -2074,20 +2140,6 @@ NPY_NO_EXPORT void
}
NPY_NO_EXPORT void
-@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (!run_unary_simd_absolute_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- const @type@ tmp = in1 > 0 ? in1 : -in1;
- /* add 0 to clear -0.0 */
- *((@type@ *)op1) = tmp + 0;
- }
- }
- npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
-NPY_NO_EXPORT void
@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
if (!run_unary_simd_negative_@TYPE@(args, dimensions, steps)) {
@@ -2198,6 +2250,42 @@ NPY_NO_EXPORT void
/*
*****************************************************************************
+ ** LONGDOUBLE LOOPS **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void
+LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP {
+ const npy_longdouble in1 = *(npy_longdouble*)ip1;
+ *((npy_longdouble *)op1) = 1/in1;
+ }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const npy_longdouble in1 = *(npy_longdouble *)ip1;
+ const npy_longdouble tmp = in1 > 0 ? in1 : -in1;
+ /* add 0 to clear -0.0 */
+ *((npy_longdouble *)op1) = tmp + 0;
+ }
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP {
+ const npy_longdouble in1 = *(npy_longdouble *)ip1;
+ *((npy_longdouble *)op1) = in1*in1;
+ }
+}
+
+/*
+ *****************************************************************************
** HALF-FLOAT LOOPS **
*****************************************************************************
*/
@@ -2360,8 +2448,13 @@ HALF_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps
BINARY_LOOP {
const npy_half in1 = *(npy_half *)ip1;
const npy_half in2 = *(npy_half *)ip2;
- npy_half mod;
- *((npy_half *)op1) = npy_half_divmod(in1, in2, &mod);
+
+ float fh1 = npy_half_to_float(in1);
+ float fh2 = npy_half_to_float(in2);
+ float div;
+
+ div = npy_floor_dividef(fh1, fh2);
+ *((npy_half *)op1) = npy_float_to_half(div);
}
}
@@ -2371,7 +2464,11 @@ HALF_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, v
BINARY_LOOP {
const npy_half in1 = *(npy_half *)ip1;
const npy_half in2 = *(npy_half *)ip2;
- npy_half_divmod(in1, in2, (npy_half *)op1);
+ float fh1 = npy_half_to_float(in1);
+ float fh2 = npy_half_to_float(in2);
+ float mod;
+ mod = npy_remainderf(fh1, fh2);
+ *((npy_half *)op1) = npy_float_to_half(mod);
}
}
@@ -2629,7 +2726,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n,
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- if (IS_BINARY_REDUCE && @PW@) {
+    // Parentheses around @PW@ tell clang dead code is intentional
+ if (IS_BINARY_REDUCE && (@PW@)) {
npy_intp n = dimensions[0];
@ftype@ * or = ((@ftype@ *)args[0]);
@ftype@ * oi = ((@ftype@ *)args[0]) + 1;
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 5dd49c465..a0b68d168 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -6,6 +6,10 @@
#ifndef _NPY_UMATH_LOOPS_H_
#define _NPY_UMATH_LOOPS_H_
+#ifndef NPY_NO_EXPORT
+ #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
+#endif
+
#define BOOL_invert BOOL_logical_not
#define BOOL_add BOOL_logical_or
#define BOOL_bitwise_and BOOL_logical_and
@@ -167,32 +171,29 @@ NPY_NO_EXPORT void
** FLOAT LOOPS **
*****************************************************************************
*/
-
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_unary_fp.dispatch.h"
+#endif
/**begin repeat
* #TYPE = FLOAT, DOUBLE#
*/
-NPY_NO_EXPORT void
-@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
/**begin repeat1
- * #func = maximum, minimum#
+ * #kind = sqrt, absolute, square, reciprocal#
*/
-NPY_NO_EXPORT void
-@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
/**end repeat1**/
+/**end repeat**/
-/**begin repeat1
- * #isa = avx512f, fma#
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
*/
-
-/**begin repeat2
- * #func = sqrt, absolute, square, reciprocal#
+/**begin repeat1
+ * #func = maximum, minimum#
*/
NPY_NO_EXPORT void
-@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-/**end repeat2**/
/**end repeat1**/
/**end repeat**/
@@ -202,6 +203,12 @@ DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void
NPY_NO_EXPORT void
DOUBLE_exp_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
/**begin repeat
* #func = sin, cos, exp, log#
*/
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
new file mode 100644
index 000000000..3a1ea82f9
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -0,0 +1,219 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 vsx2 neon
+ **/
+/**
+ * Force use SSE only on x86, even if AVX2 or AVX512F are enabled
+ * through the baseline, since scatter(AVX512F) and gather very costly
+ * to handle non-contiguous memory access comparing with SSE for
+ * such small operations that this file covers.
+*/
+#define NPY_SIMD_FORCE_128
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+/**********************************************************
+ ** Scalars
+ **********************************************************/
+#if !NPY_SIMD
+NPY_FINLINE float c_recip_f32(float a)
+{ return 1.0f / a; }
+NPY_FINLINE float c_abs_f32(float a)
+{
+ const float tmp = a > 0 ? a : -a;
+ /* add 0 to clear -0.0 */
+ return tmp + 0;
+}
+NPY_FINLINE float c_square_f32(float a)
+{ return a * a; }
+#endif // !NPY_SIMD
+
+#if !NPY_SIMD_F64
+NPY_FINLINE double c_recip_f64(double a)
+{ return 1.0 / a; }
+NPY_FINLINE double c_abs_f64(double a)
+{
+ const double tmp = a > 0 ? a : -a;
+ /* add 0 to clear -0.0 */
+ return tmp + 0;
+}
+NPY_FINLINE double c_square_f64(double a)
+{ return a * a; }
+#endif // !NPY_SIMD_F64
+/**
+ * MSVC(32-bit mode) requires a clarified contiguous loop
+ * in order to use SSE, otherwise it uses a soft version of square root
+ * that doesn't raise a domain error.
+ */
+#if defined(_MSC_VER) && defined(_M_IX86) && !NPY_SIMD
+ #include <emmintrin.h>
+ NPY_FINLINE float c_sqrt_f32(float _a)
+ {
+ __m128 a = _mm_load_ss(&_a);
+ __m128 lower = _mm_sqrt_ss(a);
+ return _mm_cvtss_f32(lower);
+ }
+ NPY_FINLINE double c_sqrt_f64(double _a)
+ {
+ __m128d a = _mm_load_sd(&_a);
+ __m128d lower = _mm_sqrt_pd(a);
+ return _mm_cvtsd_f64(lower);
+ }
+#else
+ #define c_sqrt_f32 npy_sqrtf
+ #define c_sqrt_f64 npy_sqrt
+#endif
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/** Notes:
+ * - avoid the use of libmath to unify fp/domain errors
+ * for both scalars and vectors among all compilers/architectures.
+ * - use intrinsic npyv_load_till_* instead of npyv_load_tillz_
+ * to fill the remaining lanes with 1.0 to avoid divide by zero fp
+ * exception in reciprocal.
+ */
+#define CONTIG 0
+#define NCONTIG 1
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ */
+#if @VCHK@
+/**begin repeat1
+ * #kind = sqrt, absolute, square, reciprocal#
+ * #intr = sqrt, abs, square, recip#
+ * #repl_0w1 = 0, 0, 0, 1#
+ */
+/**begin repeat2
+ * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
+ * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG#
+ * #unroll = 4, 4, 2, 2#
+ */
+static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+ const npyv_lanetype_@sfx@ *src = _src;
+ npyv_lanetype_@sfx@ *dst = _dst;
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * @unroll@;
+ for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+ /**begin repeat3
+ * #N = 0, 1, 2, 3#
+ */
+ #if @unroll@ > @N@
+ #if @STYPE@ == CONTIG
+ npyv_@sfx@ v_src@N@ = npyv_load_@sfx@(src + vstep*@N@);
+ #else
+ npyv_@sfx@ v_src@N@ = npyv_loadn_@sfx@(src + ssrc*vstep*@N@, ssrc);
+ #endif
+ npyv_@sfx@ v_unary@N@ = npyv_@intr@_@sfx@(v_src@N@);
+ #endif
+ /**end repeat3**/
+ /**begin repeat3
+ * #N = 0, 1, 2, 3#
+ */
+ #if @unroll@ > @N@
+ #if @DTYPE@ == CONTIG
+ npyv_store_@sfx@(dst + vstep*@N@, v_unary@N@);
+ #else
+ npyv_storen_@sfx@(dst + sdst*vstep*@N@, sdst, v_unary@N@);
+ #endif
+ #endif
+ /**end repeat3**/
+ }
+ for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ #if @STYPE@ == CONTIG
+ #if @repl_0w1@
+ npyv_@sfx@ v_src0 = npyv_load_till_@sfx@(src, len, 1);
+ #else
+ npyv_@sfx@ v_src0 = npyv_load_tillz_@sfx@(src, len);
+ #endif
+ #else
+ #if @repl_0w1@
+ npyv_@sfx@ v_src0 = npyv_loadn_till_@sfx@(src, ssrc, len, 1);
+ #else
+ npyv_@sfx@ v_src0 = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+ #endif
+ #endif
+ npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
+ #if @DTYPE@ == CONTIG
+ npyv_store_till_@sfx@(dst, len, v_unary0);
+ #else
+ npyv_storen_till_@sfx@(dst, sdst, len, v_unary0);
+ #endif
+ }
+ npyv_cleanup();
+}
+/**end repeat2**/
+/**end repeat1**/
+#endif // @VCHK@
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ */
+/**begin repeat1
+ * #kind = sqrt, absolute, square, reciprocal#
+ * #intr = sqrt, abs, square, recip#
+ * #clear = 0, 1, 0, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ const char *src = args[0]; char *dst = args[1];
+ const npy_intp src_step = steps[0];
+ const npy_intp dst_step = steps[1];
+ npy_intp len = dimensions[0];
+#if @VCHK@
+ const int lsize = sizeof(npyv_lanetype_@sfx@);
+ assert(src_step % lsize == 0 && dst_step % lsize == 0);
+ if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+ goto no_unroll;
+ }
+ const npy_intp ssrc = src_step / lsize;
+ const npy_intp sdst = dst_step / lsize;
+ if (!npyv_loadable_stride_@sfx@(ssrc) || !npyv_storable_stride_@sfx@(sdst)) {
+ goto no_unroll;
+ }
+ if (ssrc == 1 && sdst == 1) {
+ simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 1, dst, 1, len);
+ }
+ else if (sdst == 1) {
+ simd_@TYPE@_@kind@_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+ }
+ else if (ssrc == 1) {
+ simd_@TYPE@_@kind@_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+ } else {
+ simd_@TYPE@_@kind@_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+ }
+ goto clear;
+no_unroll:
+#endif // @VCHK@
+ for (; len > 0; --len, src += src_step, dst += dst_step) {
+ #if @VCHK@
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+ simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 0, dst, 0, 1);
+ #else
+ const npyv_lanetype_@sfx@ src0 = *(npyv_lanetype_@sfx@*)src;
+ *(npyv_lanetype_@sfx@*)dst = c_@intr@_@sfx@(src0);
+ #endif
+ }
+#if @VCHK@
+clear:;
+#endif
+#if @clear@
+ npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat1**/
+/**end repeat**/
diff --git a/numpy/core/src/umath/loops_utils.h b/numpy/core/src/umath/loops_utils.h
new file mode 100644
index 000000000..f5540bdae
--- /dev/null
+++ b/numpy/core/src/umath/loops_utils.h
@@ -0,0 +1,42 @@
+#ifndef _NPY_UMATH_LOOPS_UTILS_H_
+#define _NPY_UMATH_LOOPS_UTILS_H_
+
+#include "numpy/npy_common.h" // NPY_FINLINE
+/*
+ * nomemoverlap - returns false if two strided arrays have an overlapping
+ * region in memory, except when both regions are fully identical (safe
+ * in-place operation). ip_size/op_size = size of the arrays which can be
+ * negative indicating negative steps.
+ */
+NPY_FINLINE npy_bool
+nomemoverlap(char *ip, npy_intp ip_size, char *op, npy_intp op_size)
+{
+ char *ip_start, *ip_end, *op_start, *op_end;
+ if (ip_size < 0) {
+ ip_start = ip + ip_size;
+ ip_end = ip;
+ }
+ else {
+ ip_start = ip;
+ ip_end = ip + ip_size;
+ }
+ if (op_size < 0) {
+ op_start = op + op_size;
+ op_end = op;
+ }
+ else {
+ op_start = op;
+ op_end = op + op_size;
+ }
+ return (ip_start == op_start && op_end == ip_end) ||
+ (ip_start > op_end) || (op_start > ip_end);
+}
+
+// returns true if two strided arrays have an overlapping region in memory
+// logical negation of `nomemoverlap()`, but takes array length and step sizes
+NPY_FINLINE npy_bool
+is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst_step, npy_intp len)
+{
+ return !(nomemoverlap((char*)src, src_step*len, (char*)dst, dst_step*len));
+}
+
+#endif // _NPY_UMATH_LOOPS_UTILS_H_
diff --git a/numpy/core/src/umath/matmul.c.src b/numpy/core/src/umath/matmul.c.src
index 5cbb6e94d..0e47d1ab5 100644
--- a/numpy/core/src/umath/matmul.c.src
+++ b/numpy/core/src/umath/matmul.c.src
@@ -27,6 +27,7 @@
*****************************************************************************
*/
+#if defined(HAVE_CBLAS)
/*
* -1 to be conservative, in case blas internally uses a for loop with an
* inclusive upper bound
@@ -61,7 +62,6 @@ is_blasable2d(npy_intp byte_stride1, npy_intp byte_stride2,
return NPY_FALSE;
}
-#if defined(HAVE_CBLAS)
static const npy_cdouble oneD = {1.0, 0.0}, zeroD = {0.0, 0.0};
static const npy_cfloat oneF = {1.0, 0.0}, zeroF = {0.0, 0.0};
diff --git a/numpy/core/src/umath/npy_simd_data.h b/numpy/core/src/umath/npy_simd_data.h
index 36c8b6c03..45487d0a8 100644
--- a/numpy/core/src/umath/npy_simd_data.h
+++ b/numpy/core/src/umath/npy_simd_data.h
@@ -1,6 +1,7 @@
#ifndef __NPY_SIMD_DATA_H_
#define __NPY_SIMD_DATA_H_
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
/*
* Constants used in vector implementation of float64 exp(x)
*/
@@ -85,6 +86,7 @@ static npy_uint64 EXP_Table_tail[32] = {
0x3C99D3E12DD8A18B,
};
#endif
+#endif
/*
* Constants used in vector implementation of exp(x)
@@ -134,4 +136,156 @@ static npy_uint64 EXP_Table_tail[32] = {
#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f
#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f
+/*
+ * Lookup table of log(c_k)
+ * Reference form: Tang, Ping-Tak Peter. "Table-driven implementation of the
+ * logarithm function in IEEE floating-point arithmetic." ACM Transactions
+ * on Mathematical Software (TOMS) 16.4 (1990): 378-400.
+ */
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
+static npy_uint64 LOG_TABLE_TOP[64] = {
+ 0x0000000000000000,
+ 0x3F8FC0A8B1000000,
+ 0x3F9F829B0E780000,
+ 0x3FA77458F6340000,
+ 0x3FAF0A30C0100000,
+ 0x3FB341D7961C0000,
+ 0x3FB6F0D28AE60000,
+ 0x3FBA926D3A4A0000,
+ 0x3FBE27076E2A0000,
+ 0x3FC0D77E7CD10000,
+ 0x3FC29552F8200000,
+ 0x3FC44D2B6CCB0000,
+ 0x3FC5FF3070A80000,
+ 0x3FC7AB8902110000,
+ 0x3FC9525A9CF40000,
+ 0x3FCAF3C94E810000,
+ 0x3FCC8FF7C79B0000,
+ 0x3FCE27076E2B0000,
+ 0x3FCFB9186D5E0000,
+ 0x3FD0A324E2738000,
+ 0x3FD1675CABAB8000,
+ 0x3FD22941FBCF8000,
+ 0x3FD2E8E2BAE10000,
+ 0x3FD3A64C55698000,
+ 0x3FD4618BC21C8000,
+ 0x3FD51AAD872E0000,
+ 0x3FD5D1BDBF580000,
+ 0x3FD686C81E9B0000,
+ 0x3FD739D7F6BC0000,
+ 0x3FD7EAF83B828000,
+ 0x3FD89A3386C18000,
+ 0x3FD947941C210000,
+ 0x3FD9F323ECBF8000,
+ 0x3FDA9CEC9A9A0000,
+ 0x3FDB44F77BCC8000,
+ 0x3FDBEB4D9DA70000,
+ 0x3FDC8FF7C79A8000,
+ 0x3FDD32FE7E010000,
+ 0x3FDDD46A04C20000,
+ 0x3FDE744261D68000,
+ 0x3FDF128F5FAF0000,
+ 0x3FDFAF588F790000,
+ 0x3FE02552A5A5C000,
+ 0x3FE0723E5C1CC000,
+ 0x3FE0BE72E4254000,
+ 0x3FE109F39E2D4000,
+ 0x3FE154C3D2F4C000,
+ 0x3FE19EE6B467C000,
+ 0x3FE1E85F5E704000,
+ 0x3FE23130D7BEC000,
+ 0x3FE2795E1289C000,
+ 0x3FE2C0E9ED448000,
+ 0x3FE307D7334F0000,
+ 0x3FE34E289D9D0000,
+ 0x3FE393E0D3564000,
+ 0x3FE3D9026A714000,
+ 0x3FE41D8FE8468000,
+ 0x3FE4618BC21C4000,
+ 0x3FE4A4F85DB04000,
+ 0x3FE4E7D811B74000,
+ 0x3FE52A2D265BC000,
+ 0x3FE56BF9D5B40000,
+ 0x3FE5AD404C358000,
+ 0x3FE5EE02A9240000,
+};
+
+static npy_uint64 LOG_TABLE_TAIL[64] = {
+ 0x0000000000000000,
+ 0xBD5FE0E183092C59,
+ 0x3D2980267C7E09E4,
+ 0xBD62303B9CB0D5E1,
+ 0x3D662A6617CC9717,
+ 0xBD4717B6B33E44F8,
+ 0xBD62968C836CC8C2,
+ 0x3D6AAC6CA17A4554,
+ 0x3D6E5CBD3D50FFFC,
+ 0xBD6C69A65A23A170,
+ 0xBD35B967F4471DFC,
+ 0x3D6F4799F4F6543E,
+ 0xBD6B0B0DE3077D7E,
+ 0xBD537B720E4A694B,
+ 0x3D65AD1D904C1D4E,
+ 0xBD600349CC67F9B2,
+ 0xBD697794F689F843,
+ 0xBD3A342C2AF0003C,
+ 0x3D5F1546AAA3361C,
+ 0x3D50E35F73F7A018,
+ 0x3D630701CE63EAB9,
+ 0xBD3A6976F5EB0963,
+ 0x3D5D309C2CC91A85,
+ 0xBD6D0B1C68651946,
+ 0xBD609EC17A426426,
+ 0xBD3F4BD8DB0A7CC1,
+ 0x3D4394A11B1C1EE4,
+ 0x3D54AEC442BE1015,
+ 0xBD67FCB18ED9D603,
+ 0x3D67E1B259D2F3DA,
+ 0xBD6ED2A52C73BF78,
+ 0x3D56FABA4CDD147D,
+ 0x3D584BF2B68D766F,
+ 0x3D40931A909FEA5E,
+ 0x3D4EC5197DDB55D3,
+ 0x3D5B7BF7861D37AC,
+ 0x3D5A21AC25DB1EF3,
+ 0xBD542A9E21373414,
+ 0xBD6DAFA08CECADB1,
+ 0x3D3E1F8DF68DBCF3,
+ 0x3D3BB2CD720EC44C,
+ 0xBD49C24CA098362B,
+ 0x3D60FEC69C695D7F,
+ 0x3D6F404E57963891,
+ 0xBD657D49676844CC,
+ 0x3D592DFBC7D93617,
+ 0x3D65E9A98F33A396,
+ 0x3D52DD98B97BAEF0,
+ 0x3D1A07BD8B34BE7C,
+ 0xBD17AFA4392F1BA7,
+ 0xBD5DCA290F818480,
+ 0x3D5D1772F5386374,
+ 0x3D60BE1FB590A1F5,
+ 0xBD6E2CE9146D271A,
+ 0xBD65E6563BBD9FC9,
+ 0x3D66FAA404263D0B,
+ 0xBD5AA33736867A17,
+ 0x3D6EC27D0B7B37B3,
+ 0xBD244FDD840B8591,
+ 0x3D6BB09CB0985646,
+ 0x3D46ABB9DF22BC57,
+ 0xBD58CD7DC73BD194,
+ 0x3D6F2CFB29AAA5F0,
+ 0x3D66757006095FD2,
+};
+
+#define NPY_TANG_LOG_A1 0x1.55555555554e6p-4
+#define NPY_TANG_LOG_A2 0x1.9999999bac6d4p-7
+#define NPY_TANG_LOG_A3 0x1.2492307f1519fp-9
+#define NPY_TANG_LOG_A4 0x1.c8034c85dfffp-12
+
+#define NPY_TANG_LOG_LN2HI 0x1.62e42fefa4p-1
+#define NPY_TANG_LOG_LN2LO -0x1.8432a1b0e2634p-43
+#endif
+#endif
+
#endif
diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c
index bf6e5a698..a0090e302 100644
--- a/numpy/core/src/umath/override.c
+++ b/numpy/core/src/umath/override.c
@@ -605,7 +605,7 @@ PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method,
goto fail;
}
- method_name = PyUString_FromString(method);
+ method_name = PyUnicode_FromString(method);
if (method_name == NULL) {
goto fail;
}
diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c
index 4037a4757..f1423d8b9 100644
--- a/numpy/core/src/umath/reduction.c
+++ b/numpy/core/src/umath/reduction.c
@@ -16,7 +16,6 @@
#include "npy_config.h"
#include <numpy/arrayobject.h>
-#include "npy_config.h"
#include "npy_pycompat.h"
#include "ctors.h"
@@ -254,7 +253,6 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
}
op_flags[2] = NPY_ITER_READONLY;
}
-
/* Set up result array axes mapping, operand and wheremask use default */
int result_axes[NPY_MAXDIMS];
int *op_axes[3] = {result_axes, NULL, NULL};
@@ -363,7 +361,6 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
if (loop(iter, dataptr, strideptr, countptr,
iternext, needs_api, skip_first_count, data) < 0) {
-
goto fail;
}
}
@@ -379,7 +376,10 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
}
Py_INCREF(result);
- NpyIter_Deallocate(iter);
+ if (!NpyIter_Deallocate(iter)) {
+ Py_DECREF(result);
+ return NULL;
+ }
return result;
fail:
diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src
index 90cc7a513..86dade0f1 100644
--- a/numpy/core/src/umath/scalarmath.c.src
+++ b/numpy/core/src/umath/scalarmath.c.src
@@ -285,7 +285,11 @@ static void
@name@_ctype_floor_divide(@type@ a, @type@ b, @type@ *out) {
@type@ mod;
- *out = npy_divmod@c@(a, b, &mod);
+ if (!b) {
+ *out = a / b;
+ } else {
+ *out = npy_divmod@c@(a, b, &mod);
+ }
}
@@ -318,7 +322,11 @@ static void
half_ctype_floor_divide(npy_half a, npy_half b, npy_half *out) {
npy_half mod;
- *out = npy_half_divmod(a, b, &mod);
+ if (!b) {
+ *out = a / b;
+ } else {
+ *out = npy_half_divmod(a, b, &mod);
+ }
}
@@ -794,15 +802,8 @@ static PyObject *
{
PyObject *ret;
@type@ arg1, arg2;
- /*
- * NOTE: In gcc >= 4.1, the compiler will reorder floating point
- * operations and floating point error state checks. In
- * particular, the arithmetic operations were being reordered
- * so that the errors weren't caught. Declaring this output
- * variable volatile was the minimal fix for the issue.
- * (Ticket #1671)
- */
- volatile @otype@ out;
+ @otype@ out;
+
#if @twoout@
@otype@ out2;
PyObject *obj;
@@ -932,96 +933,14 @@ static PyObject *
* Double, LongDouble,
* CFloat, CDouble, CLongDouble#
*
- * #isint = (1,0)*5,0*7#
+ * #isint = 1*10,0*7#
+ * #isuint = (0,1)*5,0*7#
* #cmplx = 0*14,1*3#
* #iszero = _IS_ZERO*10, npy_half_iszero, _IS_ZERO*6#
* #zero = 0*10, NPY_HALF_ZERO, 0*6#
* #one = 1*10, NPY_HALF_ONE, 1*6#
*/
-#if @cmplx@
-static PyObject *
-@name@_power(PyObject *a, PyObject *b, PyObject *modulo)
-{
- PyObject *ret;
- @type@ arg1, arg2;
- int retstatus;
- int first;
- @type@ out = {@zero@, @zero@};
-
- BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, @name@_power);
-
- switch(_@name@_convert2_to_ctypes(a, &arg1, b, &arg2)) {
- case 0:
- break;
- case -1:
- /* can't cast both safely mixed-types? */
- return PyArray_Type.tp_as_number->nb_power(a,b,modulo);
- case -2:
- /* use default handling */
- if (PyErr_Occurred()) {
- return NULL;
- }
- return PyGenericArrType_Type.tp_as_number->nb_power(a,b,modulo);
- case -3:
- default:
- /*
- * special case for longdouble and clongdouble
- * because they have a recursive getitem in their dtype
- */
- Py_INCREF(Py_NotImplemented);
- return Py_NotImplemented;
- }
-
- if (modulo != Py_None) {
- /* modular exponentiation is not implemented (gh-8804) */
- Py_INCREF(Py_NotImplemented);
- return Py_NotImplemented;
- }
-
- npy_clear_floatstatus_barrier((char*)&out);
-
- /*
- * here we do the actual calculation with arg1 and arg2
- * as a function call.
- */
- if (@iszero@(arg2.real) && @iszero@(arg2.imag)) {
- out.real = @one@;
- out.imag = @zero@;
- }
- else {
- @name@_ctype_power(arg1, arg2, &out);
- }
-
- /* Check status flag. If it is set, then look up what to do */
- retstatus = npy_get_floatstatus_barrier((char*)&out);
- if (retstatus) {
- int bufsize, errmask;
- PyObject *errobj;
-
- if (PyUFunc_GetPyValues("@name@_scalars", &bufsize, &errmask,
- &errobj) < 0) {
- return NULL;
- }
- first = 1;
- if (PyUFunc_handlefperr(errmask, errobj, retstatus, &first)) {
- Py_XDECREF(errobj);
- return NULL;
- }
- Py_XDECREF(errobj);
- }
-
- ret = PyArrayScalar_New(@Name@);
- if (ret == NULL) {
- return NULL;
- }
- PyArrayScalar_ASSIGN(ret, @Name@, out);
-
- return ret;
-}
-
-#elif @isint@
-
static PyObject *
@name@_power(PyObject *a, PyObject *b, PyObject *modulo)
{
@@ -1058,85 +977,25 @@ static PyObject *
return Py_NotImplemented;
}
+#if !@isint@
npy_clear_floatstatus_barrier((char*)&out);
-
+#endif
/*
* here we do the actual calculation with arg1 and arg2
* as a function call.
*/
+#if @isint@ && !@isuint@
if (arg2 < 0) {
PyErr_SetString(PyExc_ValueError,
"Integers to negative integer powers are not allowed.");
return NULL;
}
+#endif
@name@_ctype_power(arg1, arg2, &out);
- ret = PyArrayScalar_New(@Name@);
- if (ret == NULL) {
- return NULL;
- }
- PyArrayScalar_ASSIGN(ret, @Name@, out);
-
- return ret;
-}
-
-#else
-
-static PyObject *
-@name@_power(PyObject *a, PyObject *b, PyObject *modulo)
-{
- PyObject *ret;
- @type@ arg1, arg2;
- int retstatus;
- int first;
-
- @type@ out = @zero@;
-
- BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, @name@_power);
-
- switch(_@name@_convert2_to_ctypes(a, &arg1, b, &arg2)) {
- case 0:
- break;
- case -1:
- /* can't cast both safely mixed-types? */
- return PyArray_Type.tp_as_number->nb_power(a,b,modulo);
- case -2:
- /* use default handling */
- if (PyErr_Occurred()) {
- return NULL;
- }
- return PyGenericArrType_Type.tp_as_number->nb_power(a,b,modulo);
- case -3:
- default:
- /*
- * special case for longdouble and clongdouble
- * because they have a recursive getitem in their dtype
- */
- Py_INCREF(Py_NotImplemented);
- return Py_NotImplemented;
- }
-
- if (modulo != Py_None) {
- /* modular exponentiation is not implemented (gh-8804) */
- Py_INCREF(Py_NotImplemented);
- return Py_NotImplemented;
- }
-
- npy_clear_floatstatus_barrier((char*)&out);
-
- /*
- * here we do the actual calculation with arg1 and arg2
- * as a function call.
- */
- if (@iszero@(arg2)) {
- out = @one@;
- }
- else {
- @name@_ctype_power(arg1, arg2, &out);
- }
-
+#if !@isint@
/* Check status flag. If it is set, then look up what to do */
- retstatus = npy_get_floatstatus_barrier((char*)&out);
+ int retstatus = npy_get_floatstatus_barrier((char*)&out);
if (retstatus) {
int bufsize, errmask;
PyObject *errobj;
@@ -1145,13 +1004,14 @@ static PyObject *
&errobj) < 0) {
return NULL;
}
- first = 1;
+ int first = 1;
if (PyUFunc_handlefperr(errmask, errobj, retstatus, &first)) {
Py_XDECREF(errobj);
return NULL;
}
Py_XDECREF(errobj);
}
+#endif
ret = PyArrayScalar_New(@Name@);
if (ret == NULL) {
@@ -1162,7 +1022,6 @@ static PyObject *
return ret;
}
-#endif
/**end repeat**/
#undef _IS_ZERO
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 8f01d33fa..a118fb0d0 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -28,6 +28,8 @@
#undef __AVX512F__
#endif
#endif
+#include "simd/simd.h"
+#include "loops_utils.h" // nomemoverlap
#include <assert.h>
#include <stdlib.h>
#include <float.h>
@@ -50,37 +52,6 @@
*/
#define MAX_STEP_SIZE 2097152
-/*
- * nomemoverlap - returns true if two strided arrays have an overlapping
- * region in memory. ip_size/op_size = size of the arrays which can be negative
- * indicating negative steps.
- */
-static NPY_INLINE npy_bool
-nomemoverlap(char *ip,
- npy_intp ip_size,
- char *op,
- npy_intp op_size)
-{
- char *ip_start, *ip_end, *op_start, *op_end;
- if (ip_size < 0) {
- ip_start = ip + ip_size;
- ip_end = ip;
- }
- else {
- ip_start = ip;
- ip_end = ip + ip_size;
- }
- if (op_size < 0) {
- op_start = op + op_size;
- op_end = op;
- }
- else {
- op_start = op;
- op_end = op + op_size;
- }
- return (ip_start > op_end) | (op_start > ip_end);
-}
-
#define IS_BINARY_STRIDE_ONE(esize, vsize) \
((steps[0] == esize) && \
(steps[1] == esize) && \
@@ -114,16 +85,16 @@ nomemoverlap(char *ip,
* should have no overlap in memory.
*/
#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
- ((abs(steps[0]) < MAX_STEP_SIZE) && \
- (abs(steps[1]) < MAX_STEP_SIZE) && \
- (abs(steps[2]) < MAX_STEP_SIZE) && \
+ ((labs(steps[0]) < MAX_STEP_SIZE) && \
+ (labs(steps[1]) < MAX_STEP_SIZE) && \
+ (labs(steps[2]) < MAX_STEP_SIZE) && \
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
(nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))
#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
- ((abs(steps[0]) < MAX_STEP_SIZE) && \
- (abs(steps[1]) < MAX_STEP_SIZE) && \
- (abs(steps[2]) < MAX_STEP_SIZE) && \
+ ((labs(steps[0]) < MAX_STEP_SIZE) && \
+ (labs(steps[1]) < MAX_STEP_SIZE) && \
+ (labs(steps[2]) < MAX_STEP_SIZE) && \
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
(nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0])))
@@ -134,7 +105,7 @@ nomemoverlap(char *ip,
*/
#define IS_OUTPUT_BLOCKABLE_UNARY(esizein, esizeout, vsize) \
((steps[0] & (esizein-1)) == 0 && \
- steps[1] == (esizeout) && abs(steps[0]) < MAX_STEP_SIZE && \
+ steps[1] == (esizeout) && labs(steps[0]) < MAX_STEP_SIZE && \
(nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0])))
#define IS_BLOCKABLE_REDUCE(esize, vsize) \
@@ -389,7 +360,7 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c
*/
/**begin repeat2
- * #func = sqrt, absolute, square, reciprocal, rint, floor, ceil, trunc#
+ * #func = rint, floor, ceil, trunc#
*/
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -480,17 +451,38 @@ run_unary_avx512f_exp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp c
return 0;
}
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE void
+AVX512F_log_DOUBLE(npy_double *, npy_double *, const npy_intp n, const npy_intp stride);
+#endif
+static NPY_INLINE int
+run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
+ if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), sizeof(npy_double), 64)) {
+ AVX512F_log_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]);
+ return 1;
+ }
+ else
+ return 0;
+#endif
+#endif
+ return 0;
+}
+
/**begin repeat
* Float types
* #type = npy_float, npy_double, npy_longdouble#
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
* #vector = 1, 1, 0#
+ * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
*/
/**begin repeat1
- * #func = sqrt, absolute, negative, minimum, maximum#
- * #check = IS_BLOCKABLE_UNARY*3, IS_BLOCKABLE_REDUCE*2 #
- * #name = unary*3, unary_reduce*2#
+ * #func = absolute, negative, minimum, maximum#
+ * #check = IS_BLOCKABLE_UNARY*2, IS_BLOCKABLE_REDUCE*2 #
+ * #name = unary*2, unary_reduce*2#
*/
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -533,6 +525,18 @@ static void
sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
npy_intp n);
+#elif @VECTOR@
+
+static void
+simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+static void
+simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+static void
+simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+
#endif
static NPY_INLINE int
@@ -564,6 +568,25 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp
sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
+#elif @VECTOR@
+ @type@ * ip1 = (@type@ *)args[0];
+ @type@ * ip2 = (@type@ *)args[1];
+ @type@ * op = (@type@ *)args[2];
+ npy_intp n = dimensions[0];
+ /* argument one scalar */
+ if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ /* argument two scalar */
+ else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
#endif
return 0;
}
@@ -841,7 +864,7 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
}
}
}
-#elif __AVX2__
+#elif defined __AVX2__
const npy_intp vector_size_bytes = 32;
LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
op[i] = ip1[i] @OP@ ip2[i];
@@ -983,7 +1006,7 @@ sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
}
-#elif __AVX2__
+#elif defined __AVX2__
const npy_intp vector_size_bytes = 32;
const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
@@ -1050,7 +1073,7 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
}
}
-#elif __AVX2__
+#elif defined __AVX2__
const npy_intp vector_size_bytes = 32;
const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
@@ -1304,33 +1327,6 @@ sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy
}
/**end repeat1**/
-static void
-sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
-{
- /* align output to VECTOR_SIZE_BYTES bytes */
- LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
- op[i] = @scalarf@(ip[i]);
- }
- assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
- npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
- if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
- @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
- }
- }
- else {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
- @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
- }
- }
- LOOP_BLOCKED_END {
- op[i] = @scalarf@(ip[i]);
- }
-}
-
-
static NPY_INLINE
@type@ scalar_abs_@type@(@type@ v)
{
@@ -1795,14 +1791,27 @@ avx512_permute_x4var_pd(__m512d t0,
__m512d t3,
__m512i index)
{
-
- __mmask8 lut_mask = _mm512_cmp_epi64_mask(index, _mm512_set1_epi64(15),
- _MM_CMPINT_GT);
+ __mmask8 lut_mask = _mm512_cmp_epi64_mask(
+ _mm512_and_epi64(_mm512_set1_epi64(0x10ULL), index),
+ _mm512_set1_epi64(0), _MM_CMPINT_GT);
__m512d res1 = _mm512_permutex2var_pd(t0, index, t1);
__m512d res2 = _mm512_permutex2var_pd(t2, index, t3);
return _mm512_mask_blend_pd(lut_mask, res1, res2);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
+ __m512d t4, __m512d t5, __m512d t6, __m512d t7,
+ __m512i index)
+{
+ __mmask8 lut_mask = _mm512_cmp_epi64_mask(
+ _mm512_and_epi64(_mm512_set1_epi64(0x20ULL), index),
+ _mm512_set1_epi64(0), _MM_CMPINT_GT);
+ __m512d res1 = avx512_permute_x4var_pd(t0, t1, t2, t3, index);
+ __m512d res2 = avx512_permute_x4var_pd(t4, t5, t6, t7, index);
+ return _mm512_mask_blend_pd(lut_mask, res1, res2);
+}
+
/**begin repeat
* #vsub = ps, pd#
* #type= npy_float, npy_double#
@@ -2107,7 +2116,7 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co
x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip, vindex_ip, load_mask);
}
#if @is_signbit@
- x1 = _mm512_and_@vsuffix@(x1,signbit);
+ x1 = _mm512_and_@vsuffix@(x1,signbit);
#endif
@mask@ fpclassmask = _mm512_fpclass_@vsuffix@_mask(x1, @IMM8@);
@@ -2200,7 +2209,7 @@ AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const
}
@vtype1@ out = _mm512_scalef_@vsuffix@(x1, _mm512_cvtepi32_@vsuffix@(x2));
-
+
if (stride_op == 1) {
_mm512_mask_storeu_@vsuffix@(op, load_mask, out);
}
@@ -2392,9 +2401,8 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
*/
/**begin repeat1
- * #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
- * #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
- * #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ * #func = rint, ceil, floor, trunc#
+ * #vectorf = rint, ceil, floor, trunc#
*/
#if defined @CHK@
@@ -2409,10 +2417,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
npy_intp num_remaining_elements = array_size;
@vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
@mask@ load_mask = @isa@_get_full_load_mask_ps();
-#if @replace_0_with_1@
- @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
-#endif
-
/*
* Note: while generally indices are npy_intp, we ensure that our maximum index
* will fit in an int32 as a precondition for this function via
@@ -2429,20 +2433,10 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
if (num_remaining_elements < num_lanes) {
load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
num_lanes);
-#if @replace_0_with_1@
- inv_load_mask = @isa@_invert_mask_ps(load_mask);
-#endif
}
@vtype@ x;
if (stride == 1) {
x = @isa@_masked_load_ps(load_mask, ip);
-#if @replace_0_with_1@
- /*
- * Replace masked elements with 1.0f to avoid divide by zero fp
- * exception in reciprocal
- */
- x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask);
-#endif
}
else {
x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask);
@@ -2478,9 +2472,8 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
*/
/**begin repeat1
- * #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
- * #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
- * #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ * #func = rint, ceil, floor, trunc#
+ * #vectorf = rint, ceil, floor, trunc#
*/
#if defined @CHK@
@@ -2494,9 +2487,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double);
npy_intp num_remaining_elements = array_size;
@mask@ load_mask = @isa@_get_full_load_mask_pd();
-#if @replace_0_with_1@
- @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
-#endif
@vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
/*
@@ -2514,20 +2504,10 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
if (num_remaining_elements < num_lanes) {
load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
num_lanes);
-#if @replace_0_with_1@
- inv_load_mask = @isa@_invert_mask_pd(load_mask);
-#endif
}
@vtype@ x;
if (stride == 1) {
x = @isa@_masked_load_pd(load_mask, ip);
-#if @replace_0_with_1@
- /*
- * Replace masked elements with 1.0f to avoid divide by zero fp
- * exception in reciprocal
- */
- x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask));
-#endif
}
else {
x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask));
@@ -2550,6 +2530,7 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
* #vtype = __m256, __m512#
* #vsize = 256, 512#
* #BYTES = 32, 64#
+ * #NUM_LANES = 8, 16#
* #mask = __m256, __mmask16#
* #vsub = , _mask#
* #or_masks =_mm256_or_ps, _mm512_kor#
@@ -2593,7 +2574,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
NPY_TRIG_OP my_trig_op)
{
const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
- const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
+ const npy_int num_lanes = @NUM_LANES@;
npy_float large_number = 71476.0625f;
if (my_trig_op == npy_compute_sin) {
large_number = 117435.992f;
@@ -2642,12 +2623,12 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
num_lanes);
}
- @vtype@ x;
+ @vtype@ x_in;
if (stride == 1) {
- x = @isa@_masked_load_ps(load_mask, ip);
+ x_in = @isa@_masked_load_ps(load_mask, ip);
}
else {
- x = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask);
+ x_in = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask);
}
/*
@@ -2656,10 +2637,10 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
* these numbers
*/
- glibc_mask = @isa@_in_range_mask(x, large_number,-large_number);
+ glibc_mask = @isa@_in_range_mask(x_in, large_number,-large_number);
glibc_mask = @and_masks@(load_mask, glibc_mask);
- nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
- x = @isa@_set_masked_lanes_ps(x, zero_f, @or_masks@(nan_mask, glibc_mask));
+ nan_mask = _mm@vsize@_cmp_ps@vsub@(x_in, x_in, _CMP_NEQ_UQ);
+ @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zero_f, @or_masks@(nan_mask, glibc_mask));
npy_int iglibc_mask = @mask_to_int@(glibc_mask);
if (iglibc_mask != @full_mask@) {
@@ -2698,20 +2679,23 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
}
/* process elements using glibc for large elements */
- if (my_trig_op == npy_compute_cos) {
- for (int ii = 0, jj = 0; iglibc_mask != 0; ii++, jj += stride) {
- if (iglibc_mask & 0x01) {
- op[ii] = npy_cosf(ip[jj]);
+ if (iglibc_mask != 0) {
+ float NPY_DECL_ALIGNED(@BYTES@) ip_fback[@NUM_LANES@];
+ _mm@vsize@_store_ps(ip_fback, x_in);
+
+ if (my_trig_op == npy_compute_cos) {
+ for (int ii = 0; ii < num_lanes; ++ii, iglibc_mask >>= 1) {
+ if (iglibc_mask & 0x01) {
+ op[ii] = npy_cosf(ip_fback[ii]);
+ }
}
- iglibc_mask = iglibc_mask >> 1;
}
- }
- else {
- for (int ii = 0, jj = 0; iglibc_mask != 0; ii++, jj += stride) {
- if (iglibc_mask & 0x01) {
- op[ii] = npy_sinf(ip[jj]);
+ else {
+ for (int ii = 0; ii < num_lanes; ++ii, iglibc_mask >>= 1) {
+ if (iglibc_mask & 0x01) {
+ op[ii] = npy_sinf(ip_fback[ii]);
+ }
}
- iglibc_mask = iglibc_mask >> 1;
}
}
ip += num_lanes*stride;
@@ -3051,7 +3035,7 @@ AVX512F_exp_DOUBLE(npy_double * op,
__m512d mTable_tail_1 = _mm512_loadu_pd(&(EXP_Table_tail[8*1]));
__m512d mTable_tail_2 = _mm512_loadu_pd(&(EXP_Table_tail[8*2]));
__m512d mTable_tail_3 = _mm512_loadu_pd(&(EXP_Table_tail[8*3]));
-
+
__mmask8 overflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
__mmask8 load_mask = avx512_get_full_load_mask_pd();
__mmask8 xmin_mask, xmax_mask, inf_mask, nan_mask, nearzero_mask;
@@ -3075,12 +3059,12 @@ AVX512F_exp_DOUBLE(npy_double * op,
xmax_mask = _mm512_cmp_pd_mask(x, mTH_max, _CMP_GT_OQ);
xmin_mask = _mm512_cmp_pd_mask(x, mTH_min, _CMP_LT_OQ);
inf_mask = _mm512_cmp_pd_mask(x, mTH_inf, _CMP_EQ_OQ);
- __m512i x_abs = _mm512_and_epi64(_mm512_castpd_si512(x),
+ __m512i x_abs = _mm512_and_epi64(_mm512_castpd_si512(x),
_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF));
- nearzero_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(x_abs),
+ nearzero_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(x_abs),
mTH_nearzero, _CMP_LT_OQ);
nearzero_mask = _mm512_kxor(nearzero_mask, nan_mask);
- overflow_mask = _mm512_kor(overflow_mask,
+ overflow_mask = _mm512_kor(overflow_mask,
_mm512_kxor(xmax_mask, inf_mask));
x = avx512_set_masked_lanes_pd(x, zeros_d,
_mm512_kor(_mm512_kor(nan_mask, xmin_mask),
@@ -3088,7 +3072,7 @@ AVX512F_exp_DOUBLE(npy_double * op,
/* z = x * 32/ln2 */
__m512d z = _mm512_mul_pd(x, InvLn2N);
-
+
/* round to nearest */
__m512d kd = _mm512_add_pd(z, mShift);
__m512i ki = _mm512_castpd_si512(kd);
@@ -3115,9 +3099,9 @@ AVX512F_exp_DOUBLE(npy_double * op,
__m512d tail = avx512_permute_x4var_pd(mTable_tail_0, mTable_tail_1,
mTable_tail_2, mTable_tail_3, j);
- /*
+ /*
* s = top + tail;
- * exp(x) = 2^m * (top + (tail + s * p));
+ * exp(x) = 2^m * (top + (tail + s * p));
*/
__m512d s = _mm512_add_pd(top, tail);
__m512d res = _mm512_fmadd_pd(s, p, tail);
@@ -3125,9 +3109,9 @@ AVX512F_exp_DOUBLE(npy_double * op,
res= _mm512_scalef_pd(res, _mm512_div_pd(kd, _mm512_set1_pd(32)));
/* return special cases */
- res = avx512_set_masked_lanes_pd(res, _mm512_add_pd(x, ones_d),
+ res = avx512_set_masked_lanes_pd(res, _mm512_add_pd(x, ones_d),
nearzero_mask);
- res = avx512_set_masked_lanes_pd(res, _mm512_set1_pd(NPY_NAN),
+ res = avx512_set_masked_lanes_pd(res, _mm512_set1_pd(NPY_NAN),
nan_mask);
res = avx512_set_masked_lanes_pd(res, mTH_inf, xmax_mask);
res = avx512_set_masked_lanes_pd(res, zeros_d, xmin_mask);
@@ -3145,6 +3129,209 @@ AVX512F_exp_DOUBLE(npy_double * op,
#endif
#endif
+/*
+ * Vectorized implementation of log double using AVX512
+ * Reference:
+ * [1] Tang, Ping Tak Peter. Table-lookup algorithms for elementary functions
+ * and their error analysis. No. CONF-9106103-1. Argonne National Lab.,
+ * IL (USA), 1991.
+ * [2] Tang, Ping-Tak Peter. "Table-driven implementation of the logarithm
+ * function in IEEE floating-point arithmetic." ACM Transactions on
+ * Mathematical Software (TOMS) 16.4 (1990): 378-400.
+ * [3] Muller, Jean-Michel. "Elementary functions: algorithms and
+ * implementation." (2016).
+ * 1) if x = 0; return -INF
+ * 2) if x < 0; return NAN
+ * 3) if x is INF; return INF
+ * 4) if x is NAN; return NAN
+ * 5) if x on (1.0 - 0x1p-4, 1.0 + 0x1.09p-4), calling npy_log()
+ * 6) Range reduction:
+ * log(x) = log(2^m * z)
+ * = mln2 + log(z)
+ * 7) log(z) = log(z / c_k) + log(c_k);
+ * where c_k = 1 + k/64, k = 0,1,...,64
+ * s.t. |x - c_k| <= 1/128 when x on[1,2].
+ * 8) r = 2(x - c_k)/(x + c_k)
+ * log(x/c_k) = log((1 + r/2) / (1 - r/2))
+ * = p(r)
+ * = 2((r/2) + 1/3*(r/2)^3 + 1/5*(r/2)^5 + ...)
+ */
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
+#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
+static NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F void
+AVX512F_log_DOUBLE(npy_double * op,
+ npy_double * ip,
+ const npy_intp array_size,
+ const npy_intp steps)
+{
+ npy_intp num_remaining_elements = array_size;
+ const npy_intp stride = steps / (npy_intp)sizeof(npy_double);
+ const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double);
+ npy_int32 indexarr[8];
+ for (npy_int32 ii = 0; ii < 8; ii++) {
+ indexarr[ii] = ii*stride;
+ }
+
+ __m512d zeros_d = _mm512_set1_pd(0.0f);
+ __m512d ones_d = _mm512_set1_pd(1.0f);
+ __m512d mInf = _mm512_set1_pd(NPY_INFINITY);
+ __m512d mInv64 = (__m512d)(_mm512_set1_epi64(0x3f90000000000000));
+ __m512d mNeg_nan = _mm512_set1_pd(-NPY_NAN);
+ __m512d mNan = _mm512_set1_pd(NPY_NAN);
+ __m512d mNeg_inf = _mm512_set1_pd(-NPY_INFINITY);
+ __m512d mA1 = _mm512_set1_pd(NPY_TANG_LOG_A1);
+ __m512d mA2 = _mm512_set1_pd(NPY_TANG_LOG_A2);
+ __m512d mA3 = _mm512_set1_pd(NPY_TANG_LOG_A3);
+ __m512d mA4 = _mm512_set1_pd(NPY_TANG_LOG_A4);
+ __m512d mLN2HI = _mm512_set1_pd(NPY_TANG_LOG_LN2HI);
+ __m512d mLN2LO = _mm512_set1_pd(NPY_TANG_LOG_LN2LO);
+
+ __m512d mTo_glibc_min = _mm512_set1_pd(1.0 - 0x1p-4);
+ __m512d mTo_glibc_max = _mm512_set1_pd(1.0 + 0x1.09p-4);
+ __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]);
+
+ /* Load lookup table data */
+ /**begin repeat
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+
+ __m512d mLUT_TOP_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*@i@]));
+ __m512d mLUT_TAIL_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*@i@]));
+
+ /**end repeat**/
+
+ __mmask8 load_mask = avx512_get_full_load_mask_pd();
+ __mmask8 invalid_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
+ __mmask8 divide_by_zero_mask = invalid_mask;
+
+ __mmask8 inf_mask, nan_mask, zero_mask, negx_mask, denormal_mask,
+ glibc_mask;
+
+ __m512d x_in;
+ while (num_remaining_elements > 0) {
+ if (num_remaining_elements < num_lanes) {
+ load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements,
+ num_lanes);
+ }
+
+ if (1 == stride) {
+ x_in = avx512_masked_load_pd(load_mask, ip);
+ }
+ else {
+ x_in = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask);
+ }
+
+ /* call glibc when x on [1.0 - 0x1p-4, 1.0 + 0x1.09p-4] */
+ __mmask8 m1 = _mm512_cmp_pd_mask(x_in, mTo_glibc_max, _CMP_LT_OQ);
+ __mmask8 m2 = _mm512_cmp_pd_mask(x_in, mTo_glibc_min, _CMP_GT_OQ);
+ glibc_mask = m1 & m2;
+
+ if (glibc_mask != 0xFF) {
+ zero_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_EQ_OQ);
+ inf_mask = _mm512_cmp_pd_mask(x_in, mInf, _CMP_EQ_OQ);
+ negx_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_LT_OQ);
+ nan_mask = _mm512_cmp_pd_mask(x_in, x_in, _CMP_NEQ_UQ);
+
+ divide_by_zero_mask = divide_by_zero_mask | (zero_mask & load_mask);
+ invalid_mask = invalid_mask | negx_mask;
+
+ __m512d x = avx512_set_masked_lanes_pd(x_in, zeros_d, negx_mask);
+ __m512i ix = (__m512i)x;
+
+ /* Normalize x when it is denormal */
+ __m512i top12 = _mm512_and_epi64(ix,
+ _mm512_set1_epi64(0xfff0000000000000));
+ denormal_mask = _mm512_cmp_epi64_mask(top12, _mm512_set1_epi64(0),
+ _CMP_EQ_OQ);
+ denormal_mask = (~zero_mask) & denormal_mask;
+ ix = (__m512i)_mm512_mask_mul_pd(x, denormal_mask,
+ x, _mm512_set1_pd(0x1p52));
+ ix = _mm512_mask_sub_epi64(ix, denormal_mask,
+ ix, _mm512_set1_epi64(52ULL << 52));
+
+ /*
+ * x = 2^k * z; where z in range [1,2]
+ */
+ __m512i tmp = _mm512_sub_epi64(ix,
+ _mm512_set1_epi64(0x3ff0000000000000));
+ __m512i i = _mm512_and_epi64(_mm512_srai_epi64(tmp, 52 - 6),
+ _mm512_set1_epi64(0x3fULL));
+ __m512i ik = _mm512_srai_epi64(tmp, 52);
+ __m512d z = (__m512d)(_mm512_sub_epi64(ix, _mm512_and_epi64(tmp,
+ _mm512_set1_epi64(0xfff0000000000000))));
+ /* c = i/64 + 1 */
+ __m256i i_32 = _mm512_cvtepi64_epi32(i);
+ __m512d c = _mm512_fmadd_pd(_mm512_cvtepi32_pd(i_32), mInv64, ones_d);
+
+ /* u = 2 * (z - c) / (z + c) */
+ __m512d u = _mm512_div_pd(_mm512_sub_pd(z, c), _mm512_add_pd(z, c));
+ u = _mm512_mul_pd(_mm512_set1_pd(2.0), u);
+
+ /* v = u * u */
+ __m512d v = _mm512_mul_pd(u,u);
+
+ /* log(z/c) = u + u*v*(A1 + v*(A2 + v*(A3 + v*A4))) */
+ __m512d res = _mm512_fmadd_pd(v, mA4, mA3);
+ res = _mm512_fmadd_pd(v, res, mA2);
+ res = _mm512_fmadd_pd(v, res, mA1);
+ res = _mm512_mul_pd(v, res);
+ res = _mm512_fmadd_pd(u, res, u);
+
+ /* Load lookup table data */
+ __m512d c_hi = avx512_permute_x8var_pd(mLUT_TOP_0, mLUT_TOP_1,
+ mLUT_TOP_2, mLUT_TOP_3, mLUT_TOP_4, mLUT_TOP_5,
+ mLUT_TOP_6, mLUT_TOP_7, i);
+ __m512d c_lo = avx512_permute_x8var_pd(mLUT_TAIL_0, mLUT_TAIL_1,
+ mLUT_TAIL_2, mLUT_TAIL_3, mLUT_TAIL_4, mLUT_TAIL_5,
+ mLUT_TAIL_6, mLUT_TAIL_7, i);
+
+ /*
+ * log(x) = k * ln2_hi + c_hi +
+ * k * ln2_lo + c_lo +
+ * log(z/c)
+ */
+ __m256i ik_32 = _mm512_cvtepi64_epi32(ik);
+ __m512d k = _mm512_cvtepi32_pd(ik_32);
+ __m512d tt = _mm512_fmadd_pd(k, mLN2HI, c_hi);
+ __m512d tt2 = _mm512_fmadd_pd(k, mLN2LO, c_lo);
+ tt = _mm512_add_pd(tt, tt2);
+ res = _mm512_add_pd(tt, res);
+
+ /* return special cases */
+ res = avx512_set_masked_lanes_pd(res, mNan, nan_mask);
+ res = avx512_set_masked_lanes_pd(res, mNeg_nan, negx_mask);
+ res = avx512_set_masked_lanes_pd(res, mNeg_inf, zero_mask);
+ res = avx512_set_masked_lanes_pd(res, mInf, inf_mask);
+
+ _mm512_mask_storeu_pd(op, load_mask, res);
+ }
+
+ /* call glibc's log func when x around 1.0f */
+ if (glibc_mask != 0) {
+ double NPY_DECL_ALIGNED(64) ip_fback[8];
+ _mm512_store_pd(ip_fback, x_in);
+
+ for (int ii = 0; ii < 8; ++ii, glibc_mask >>= 1) {
+ if (glibc_mask & 0x01) {
+ op[ii] = npy_log(ip_fback[ii]);
+ }
+ }
+ }
+ ip += num_lanes * stride;
+ op += num_lanes;
+ num_remaining_elements -= num_lanes;
+ }
+
+ if (invalid_mask) {
+ npy_set_floatstatus_invalid();
+ }
+ if (divide_by_zero_mask) {
+ npy_set_floatstatus_divbyzero();
+ }
+}
+#endif
+#endif
+
/**begin repeat
* #TYPE = CFLOAT, CDOUBLE#
* #type = npy_float, npy_double#
@@ -3317,7 +3504,7 @@ AVX512F_absolute_@TYPE@(@type@ * op,
ip += 2*@num_lanes@*stride_ip1;
num_remaining_elements -= 2*@num_lanes@;
}
- npy_clear_floatstatus_barrier((char*)op);
+ npy_clear_floatstatus_barrier((char*)&num_remaining_elements);
}
#endif
@@ -3461,7 +3648,86 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
/**end repeat**/
#undef VECTOR_SIZE_BYTES
+#else /* NPY_HAVE_SSE2_INTRINSICS */
+
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #CHK = , _F64#
+ */
+
+#if NPY_SIMD@CHK@
+
+/**begin repeat1
+* Arithmetic
+* # kind = add, subtract, multiply, divide#
+* # OP = +, -, *, /#
+* # VOP = add, sub, mul, div#
+*/
+
+static void
+simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+ op[i] = ip1[i] @OP@ ip2[i];
+ }
+ /* lots of specializations, to squeeze out max performance */
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
+ npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a);
+ npyv_store_@sfx@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
+ npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]);
+ npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b);
+ npyv_store_@sfx@(&op[i], c);
+ }
+ }
+ LOOP_BLOCKED_END {
+ op[i] = ip1[i] @OP@ ip2[i];
+ }
+}
-#endif /* NPY_HAVE_SSE2_INTRINSICS */
+static void
+simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+ op[i] = ip1[0] @OP@ ip2[i];
+ }
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
+ npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
+ npyv_store_@sfx@(&op[i], v3);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = ip1[0] @OP@ ip2[i];
+ }
+}
+static void
+simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+ op[i] = ip1[i] @OP@ ip2[0];
+ }
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
+ npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
+ npyv_store_@sfx@(&op[i], v3);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = ip1[i] @OP@ ip2[0];
+ }
+}
+/**end repeat1**/
+#endif /* NPY_SIMD@CHK@ */
+/**end repeat**/
+#endif
#endif
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index b35f377d7..1a035eb61 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -1536,7 +1536,14 @@ iterator_loop(PyUFuncObject *ufunc,
NPY_END_THREADS;
}
- return NpyIter_Deallocate(iter);
+ /*
+ * Currently `innerloop` may leave an error set, in this case
+ * NpyIter_Deallocate will always return an error as well.
+ */
+ if (NpyIter_Deallocate(iter) == NPY_FAIL) {
+ return -1;
+ }
+ return 0;
}
/*
@@ -2425,15 +2432,15 @@ _get_identity(PyUFuncObject *ufunc, npy_bool *reorderable) {
switch(ufunc->identity) {
case PyUFunc_One:
*reorderable = 1;
- return PyInt_FromLong(1);
+ return PyLong_FromLong(1);
case PyUFunc_Zero:
*reorderable = 1;
- return PyInt_FromLong(0);
+ return PyLong_FromLong(0);
case PyUFunc_MinusOne:
*reorderable = 1;
- return PyInt_FromLong(-1);
+ return PyLong_FromLong(-1);
case PyUFunc_ReorderableNone:
*reorderable = 1;
@@ -3233,9 +3240,13 @@ PyUFunc_GenericFunction_int(PyUFuncObject *ufunc,
goto fail;
}
- /* Check whether any errors occurred during the loop */
+ /*
+ * Check whether any errors occurred during the loop. The loops should
+ * indicate this in retval, but since the inner-loop currently does not
+ * report errors, this does not happen in all branches (at this time).
+ */
if (PyErr_Occurred() ||
- _check_ufunc_fperr(errormask, extobj, ufunc_name) < 0) {
+ _check_ufunc_fperr(errormask, extobj, ufunc_name) < 0) {
retval = -1;
goto fail;
}
@@ -3307,7 +3318,6 @@ get_binary_op_function(PyUFuncObject *ufunc, int *otype,
void **out_innerloopdata)
{
int i;
- PyUFunc_Loop1d *funcdata;
NPY_UF_DBG_PRINT1("Getting binary op function for type number %d\n",
*otype);
@@ -3315,7 +3325,7 @@ get_binary_op_function(PyUFuncObject *ufunc, int *otype,
/* If the type is custom and there are userloops, search for it here */
if (ufunc->userloops != NULL && PyTypeNum_ISUSERDEF(*otype)) {
PyObject *key, *obj;
- key = PyInt_FromLong(*otype);
+ key = PyLong_FromLong(*otype);
if (key == NULL) {
return -1;
}
@@ -3325,7 +3335,10 @@ get_binary_op_function(PyUFuncObject *ufunc, int *otype,
return -1;
}
else if (obj != NULL) {
- funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
+ PyUFunc_Loop1d *funcdata = PyCapsule_GetPointer(obj, NULL);
+ if (funcdata == NULL) {
+ return -1;
+ }
while (funcdata != NULL) {
int *types = funcdata->arg_types;
@@ -3997,8 +4010,17 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
finish:
Py_XDECREF(op_dtypes[0]);
- NpyIter_Deallocate(iter);
- NpyIter_Deallocate(iter_inner);
+ int res = 0;
+ if (!NpyIter_Deallocate(iter)) {
+ res = -1;
+ }
+ if (!NpyIter_Deallocate(iter_inner)) {
+ res = -1;
+ }
+ if (res < 0) {
+ Py_DECREF(out);
+ return NULL;
+ }
return (PyObject *)out;
@@ -4379,7 +4401,10 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
finish:
Py_XDECREF(op_dtypes[0]);
- NpyIter_Deallocate(iter);
+ if (!NpyIter_Deallocate(iter)) {
+ Py_DECREF(out);
+ return NULL;
+ }
return (PyObject *)out;
@@ -4388,7 +4413,6 @@ fail:
Py_XDECREF(op_dtypes[0]);
NpyIter_Deallocate(iter);
-
return NULL;
}
@@ -4812,8 +4836,8 @@ ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args)
if (res == NULL) {
return NULL;
}
- PyList_SET_ITEM(res, 0, PyInt_FromLong(NPY_BUFSIZE));
- PyList_SET_ITEM(res, 1, PyInt_FromLong(UFUNC_ERR_DEFAULT));
+ PyList_SET_ITEM(res, 0, PyLong_FromLong(NPY_BUFSIZE));
+ PyList_SET_ITEM(res, 1, PyLong_FromLong(UFUNC_ERR_DEFAULT));
PyList_SET_ITEM(res, 2, Py_None); Py_INCREF(Py_None);
return res;
}
@@ -5133,7 +5157,7 @@ PyUFunc_RegisterLoopForDescr(PyUFuncObject *ufunc,
return -1;
}
- key = PyInt_FromLong((long) user_dtype->type_num);
+ key = PyLong_FromLong((long) user_dtype->type_num);
if (key == NULL) {
return -1;
}
@@ -5168,9 +5192,12 @@ PyUFunc_RegisterLoopForDescr(PyUFuncObject *ufunc,
result = -1;
}
else {
- PyUFunc_Loop1d *current;
int cmp = 1;
- current = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(cobj);
+ PyUFunc_Loop1d *current = PyCapsule_GetPointer(cobj, NULL);
+ if (current == NULL) {
+ result = -1;
+ goto done;
+ }
while (current != NULL) {
cmp = cmp_arg_types(current->arg_types,
arg_typenums, ufunc->nargs);
@@ -5204,6 +5231,7 @@ PyUFunc_RegisterLoopForDescr(PyUFuncObject *ufunc,
}
}
+done:
PyArray_free(arg_typenums);
Py_DECREF(key);
@@ -5235,7 +5263,7 @@ PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc,
if (ufunc->userloops == NULL) {
ufunc->userloops = PyDict_New();
}
- key = PyInt_FromLong((long) usertype);
+ key = PyLong_FromLong((long) usertype);
if (key == NULL) {
return -1;
}
@@ -5272,7 +5300,7 @@ PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc,
}
/* If it's not there, then make one and return. */
else if (cobj == NULL) {
- cobj = NpyCapsule_FromVoidPtr((void *)funcdata, _loop1d_list_free);
+ cobj = PyCapsule_New((void *)funcdata, NULL, _loop1d_list_free);
if (cobj == NULL) {
goto fail;
}
@@ -5290,7 +5318,10 @@ PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc,
* is exactly like this one, then just replace.
* Otherwise insert.
*/
- current = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(cobj);
+ current = PyCapsule_GetPointer(cobj, NULL);
+ if (current == NULL) {
+ goto fail;
+ }
while (current != NULL) {
cmp = cmp_arg_types(current->arg_types, newtypes, ufunc->nargs);
if (cmp >= 0) {
@@ -5361,7 +5392,7 @@ ufunc_dealloc(PyUFuncObject *ufunc)
static PyObject *
ufunc_repr(PyUFuncObject *ufunc)
{
- return PyUString_FromFormat("<ufunc '%s'>", ufunc->name);
+ return PyUnicode_FromFormat("<ufunc '%s'>", ufunc->name);
}
static int
@@ -5388,13 +5419,11 @@ ufunc_traverse(PyUFuncObject *self, visitproc visit, void *arg)
static PyObject *
ufunc_outer(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
{
- int i;
int errval;
PyObject *override = NULL;
PyObject *ret;
PyArrayObject *ap1 = NULL, *ap2 = NULL, *ap_new = NULL;
PyObject *new_args, *tmp;
- PyObject *shape1, *shape2, *newshape;
static PyObject *_numpy_matrix;
@@ -5435,7 +5464,19 @@ ufunc_outer(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
"matrix",
&_numpy_matrix);
+ const char *matrix_deprecation_msg = (
+ "%s.outer() was passed a numpy matrix as %s argument. "
+ "Special handling of matrix is deprecated and will result in an "
+ "error in most cases. Please convert the matrix to a NumPy "
+ "array to retain the old behaviour. You can use `matrix.A` "
+ "to achieve this.");
+
if (PyObject_IsInstance(tmp, _numpy_matrix)) {
+ /* DEPRECATED 2020-05-13, NumPy 1.20 */
+ if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+ matrix_deprecation_msg, ufunc->name, "first") < 0) {
+ return NULL;
+ }
ap1 = (PyArrayObject *) PyArray_FromObject(tmp, NPY_NOTYPE, 0, 0);
}
else {
@@ -5450,6 +5491,12 @@ ufunc_outer(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
return NULL;
}
if (PyObject_IsInstance(tmp, _numpy_matrix)) {
+ /* DEPRECATED 2020-05-13, NumPy 1.20 */
+ if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+ matrix_deprecation_msg, ufunc->name, "second") < 0) {
+ Py_DECREF(ap1);
+ return NULL;
+ }
ap2 = (PyArrayObject *) PyArray_FromObject(tmp, NPY_NOTYPE, 0, 0);
}
else {
@@ -5460,34 +5507,45 @@ ufunc_outer(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
Py_DECREF(ap1);
return NULL;
}
- /* Construct new shape tuple */
- shape1 = PyTuple_New(PyArray_NDIM(ap1));
- if (shape1 == NULL) {
- goto fail;
- }
- for (i = 0; i < PyArray_NDIM(ap1); i++) {
- PyTuple_SET_ITEM(shape1, i,
- PyLong_FromLongLong((npy_longlong)PyArray_DIMS(ap1)[i]));
- }
- shape2 = PyTuple_New(PyArray_NDIM(ap2));
- for (i = 0; i < PyArray_NDIM(ap2); i++) {
- PyTuple_SET_ITEM(shape2, i, PyInt_FromLong((long) 1));
+ /* Construct new shape from ap1 and ap2 and then reshape */
+ PyArray_Dims newdims;
+ npy_intp newshape[NPY_MAXDIMS];
+ newdims.len = PyArray_NDIM(ap1) + PyArray_NDIM(ap2);
+ newdims.ptr = newshape;
+
+ if (newdims.len > NPY_MAXDIMS) {
+ PyErr_Format(PyExc_ValueError,
+ "maximum supported dimension for an ndarray is %d, but "
+ "`%s.outer()` result would have %d.",
+ NPY_MAXDIMS, ufunc->name, newdims.len);
+ return NPY_FAIL;
}
- if (shape2 == NULL) {
- Py_DECREF(shape1);
+ if (newdims.ptr == NULL) {
goto fail;
}
- newshape = PyNumber_Add(shape1, shape2);
- Py_DECREF(shape1);
- Py_DECREF(shape2);
- if (newshape == NULL) {
- goto fail;
+ memcpy(newshape, PyArray_DIMS(ap1), PyArray_NDIM(ap1) * sizeof(npy_intp));
+ for (int i = PyArray_NDIM(ap1); i < newdims.len; i++) {
+ newshape[i] = 1;
}
- ap_new = (PyArrayObject *)PyArray_Reshape(ap1, newshape);
- Py_DECREF(newshape);
+
+ ap_new = (PyArrayObject *)PyArray_Newshape(ap1, &newdims, NPY_CORDER);
if (ap_new == NULL) {
goto fail;
}
+ if (PyArray_NDIM(ap_new) != newdims.len ||
+ !PyArray_CompareLists(PyArray_DIMS(ap_new), newshape, newdims.len)) {
+ PyErr_Format(PyExc_TypeError,
+ "%s.outer() called with ndarray-subclass of type '%s' "
+ "which modified its shape after a reshape. `outer()` relies "
+ "on reshaping the inputs and is for example not supported for "
+ "the 'np.matrix' class (the usage of matrix is generally "
+ "discouraged). "
+ "To work around this issue, please convert the inputs to "
+ "numpy arrays.",
+ ufunc->name, Py_TYPE(ap_new)->tp_name);
+ goto fail;
+ }
+
new_args = Py_BuildValue("(OO)", ap_new, ap2);
Py_DECREF(ap1);
Py_DECREF(ap2);
@@ -5920,6 +5978,7 @@ _typecharfromnum(int num) {
return ret;
}
+
static PyObject *
ufunc_get_doc(PyUFuncObject *ufunc)
{
@@ -5940,40 +5999,40 @@ ufunc_get_doc(PyUFuncObject *ufunc)
* introspection on name and nin + nout to automate the first part
* of it the doc string shouldn't need the calling convention
*/
- doc = PyObject_CallFunctionObjArgs(
- _sig_formatter, (PyObject *)ufunc, NULL);
+ doc = PyObject_CallFunctionObjArgs(_sig_formatter,
+ (PyObject *)ufunc, NULL);
if (doc == NULL) {
return NULL;
}
if (ufunc->doc != NULL) {
- PyUString_ConcatAndDel(&doc,
- PyUString_FromFormat("\n\n%s", ufunc->doc));
+ Py_SETREF(doc, PyUnicode_FromFormat("%S\n\n%s", doc, ufunc->doc));
}
return doc;
}
+
static PyObject *
ufunc_get_nin(PyUFuncObject *ufunc)
{
- return PyInt_FromLong(ufunc->nin);
+ return PyLong_FromLong(ufunc->nin);
}
static PyObject *
ufunc_get_nout(PyUFuncObject *ufunc)
{
- return PyInt_FromLong(ufunc->nout);
+ return PyLong_FromLong(ufunc->nout);
}
static PyObject *
ufunc_get_nargs(PyUFuncObject *ufunc)
{
- return PyInt_FromLong(ufunc->nargs);
+ return PyLong_FromLong(ufunc->nargs);
}
static PyObject *
ufunc_get_ntypes(PyUFuncObject *ufunc)
{
- return PyInt_FromLong(ufunc->ntypes);
+ return PyLong_FromLong(ufunc->ntypes);
}
static PyObject *
@@ -6003,7 +6062,7 @@ ufunc_get_types(PyUFuncObject *ufunc)
t[ni + 2 + j] = _typecharfromnum(ufunc->types[n]);
n++;
}
- str = PyUString_FromStringAndSize(t, no + ni + 2);
+ str = PyUnicode_FromStringAndSize(t, no + ni + 2);
PyList_SET_ITEM(list, k, str);
}
PyArray_free(t);
@@ -6013,7 +6072,7 @@ ufunc_get_types(PyUFuncObject *ufunc)
static PyObject *
ufunc_get_name(PyUFuncObject *ufunc)
{
- return PyUString_FromString(ufunc->name);
+ return PyUnicode_FromString(ufunc->name);
}
static PyObject *
@@ -6029,7 +6088,7 @@ ufunc_get_signature(PyUFuncObject *ufunc)
if (!ufunc->core_enabled) {
Py_RETURN_NONE;
}
- return PyUString_FromString(ufunc->core_signature);
+ return PyUnicode_FromString(ufunc->core_signature);
}
#undef _typecharfromnum
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index ea20bb24f..3ce06322f 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -12,6 +12,11 @@
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+// printf debug tracing
+#ifndef NPY_UF_DBG_TRACING
+ #define NPY_UF_DBG_TRACING 0
+#endif
+
#include <stdbool.h>
#include "Python.h"
@@ -36,17 +41,17 @@ npy_casting_to_py_object(NPY_CASTING casting)
{
switch (casting) {
case NPY_NO_CASTING:
- return PyUString_FromString("no");
+ return PyUnicode_FromString("no");
case NPY_EQUIV_CASTING:
- return PyUString_FromString("equiv");
+ return PyUnicode_FromString("equiv");
case NPY_SAFE_CASTING:
- return PyUString_FromString("safe");
+ return PyUnicode_FromString("safe");
case NPY_SAME_KIND_CASTING:
- return PyUString_FromString("same_kind");
+ return PyUnicode_FromString("same_kind");
case NPY_UNSAFE_CASTING:
- return PyUString_FromString("unsafe");
+ return PyUnicode_FromString("unsafe");
default:
- return PyInt_FromLong(casting);
+ return PyLong_FromLong(casting);
}
}
@@ -236,21 +241,6 @@ PyUFunc_ValidateCasting(PyUFuncObject *ufunc,
return 0;
}
-/*
- * Returns a new reference to type if it is already NBO, otherwise
- * returns a copy converted to NBO.
- */
-static PyArray_Descr *
-ensure_dtype_nbo(PyArray_Descr *type)
-{
- if (PyArray_ISNBO(type->byteorder)) {
- Py_INCREF(type);
- return type;
- }
- else {
- return PyArray_DescrNewByteorder(type, NPY_NATIVE);
- }
-}
/*UFUNC_API
*
@@ -1336,7 +1326,6 @@ find_userloop(PyUFuncObject *ufunc,
void **out_innerloopdata)
{
npy_intp i, nin = ufunc->nin, j, nargs = nin + ufunc->nout;
- PyUFunc_Loop1d *funcdata;
/* Use this to try to avoid repeating the same userdef loop search */
int last_userdef = -1;
@@ -1356,7 +1345,7 @@ find_userloop(PyUFuncObject *ufunc,
last_userdef = type_num;
- key = PyInt_FromLong(type_num);
+ key = PyLong_FromLong(type_num);
if (key == NULL) {
return -1;
}
@@ -1368,9 +1357,11 @@ find_userloop(PyUFuncObject *ufunc,
else if (obj == NULL) {
continue;
}
- for (funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
- funcdata != NULL;
- funcdata = funcdata->next) {
+ PyUFunc_Loop1d *funcdata = PyCapsule_GetPointer(obj, NULL);
+ if (funcdata == NULL) {
+ return -1;
+ }
+ for (; funcdata != NULL; funcdata = funcdata->next) {
int *types = funcdata->arg_types;
for (j = 0; j < nargs; ++j) {
@@ -1744,7 +1735,6 @@ linear_search_userloop_type_resolver(PyUFuncObject *self,
char *out_err_dst_typecode)
{
npy_intp i, nop = self->nin + self->nout;
- PyUFunc_Loop1d *funcdata;
/* Use this to try to avoid repeating the same userdef loop search */
int last_userdef = -1;
@@ -1764,7 +1754,7 @@ linear_search_userloop_type_resolver(PyUFuncObject *self,
last_userdef = type_num;
- key = PyInt_FromLong(type_num);
+ key = PyLong_FromLong(type_num);
if (key == NULL) {
return -1;
}
@@ -1776,9 +1766,11 @@ linear_search_userloop_type_resolver(PyUFuncObject *self,
else if (obj == NULL) {
continue;
}
- for (funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
- funcdata != NULL;
- funcdata = funcdata->next) {
+ PyUFunc_Loop1d *funcdata = PyCapsule_GetPointer(obj, NULL);
+ if (funcdata == NULL) {
+ return -1;
+ }
+ for (; funcdata != NULL; funcdata = funcdata->next) {
int *types = funcdata->arg_types;
switch (ufunc_loop_matches(self, op,
input_casting, output_casting,
@@ -1816,7 +1808,6 @@ type_tuple_userloop_type_resolver(PyUFuncObject *self,
PyArray_Descr **out_dtype)
{
int i, j, nin = self->nin, nop = nin + self->nout;
- PyUFunc_Loop1d *funcdata;
/* Use this to try to avoid repeating the same userdef loop search */
int last_userdef = -1;
@@ -1831,7 +1822,7 @@ type_tuple_userloop_type_resolver(PyUFuncObject *self,
last_userdef = type_num;
- key = PyInt_FromLong(type_num);
+ key = PyLong_FromLong(type_num);
if (key == NULL) {
return -1;
}
@@ -1844,9 +1835,11 @@ type_tuple_userloop_type_resolver(PyUFuncObject *self,
continue;
}
- for (funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
- funcdata != NULL;
- funcdata = funcdata->next) {
+ PyUFunc_Loop1d *funcdata = PyCapsule_GetPointer(obj, NULL);
+ if (funcdata == NULL) {
+ return -1;
+ }
+ for (; funcdata != NULL; funcdata = funcdata->next) {
int *types = funcdata->arg_types;
int matched = 1;
diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c
index bad42d657..474db0245 100644
--- a/numpy/core/src/umath/umathmodule.c
+++ b/numpy/core/src/umath/umathmodule.c
@@ -75,7 +75,8 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) {
int nin, nout, i, nargs;
PyUFunc_PyFuncData *fdata;
PyUFuncObject *self;
- char *fname, *str, *types, *doc;
+ const char *fname = NULL;
+ char *str, *types, *doc;
Py_ssize_t fname_len = -1;
void * ptr, **data;
int offset[2];
@@ -95,12 +96,12 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) {
pyname = PyObject_GetAttrString(function, "__name__");
if (pyname) {
- (void) PyString_AsStringAndSize(pyname, &fname, &fname_len);
+ fname = PyUnicode_AsUTF8AndSize(pyname, &fname_len);
}
- if (PyErr_Occurred()) {
+ if (fname == NULL) {
+ PyErr_Clear();
fname = "?";
fname_len = 1;
- PyErr_Clear();
}
/*
@@ -173,25 +174,22 @@ PyObject *
add_newdoc_ufunc(PyObject *NPY_UNUSED(dummy), PyObject *args)
{
PyUFuncObject *ufunc;
- PyObject *str, *tmp;
- char *docstr, *newdocstr;
-
+ PyObject *str;
if (!PyArg_ParseTuple(args, "O!O!:_add_newdoc_ufunc", &PyUFunc_Type, &ufunc,
&PyUnicode_Type, &str)) {
return NULL;
}
- tmp = PyUnicode_AsUTF8String(str);
- if (tmp == NULL) {
+ if (ufunc->doc != NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "Cannot change docstring of ufunc with non-NULL docstring");
return NULL;
}
- docstr = PyBytes_AS_STRING(tmp);
- if (NULL != ufunc->doc) {
- PyErr_SetString(PyExc_ValueError,
- "Cannot change docstring of ufunc with non-NULL docstring");
- Py_DECREF(tmp);
+ PyObject *tmp = PyUnicode_AsUTF8String(str);
+ if (tmp == NULL) {
return NULL;
}
+ char *docstr = PyBytes_AS_STRING(tmp);
/*
* This introduces a memory leak, as the memory allocated for the doc
@@ -199,7 +197,11 @@ add_newdoc_ufunc(PyObject *NPY_UNUSED(dummy), PyObject *args)
* this should not be a problem since the user would have to
* repeatedly create, document, and throw away ufuncs.
*/
- newdocstr = malloc(strlen(docstr) + 1);
+ char *newdocstr = malloc(strlen(docstr) + 1);
+ if (!newdocstr) {
+ Py_DECREF(tmp);
+ return PyErr_NoMemory();
+ }
strcpy(newdocstr, docstr);
ufunc->doc = newdocstr;
@@ -232,30 +234,28 @@ NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_array_finalize = NULL;
NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_ufunc = NULL;
NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_pyvals_name = NULL;
-/* intern some strings used in ufuncs */
+/* intern some strings used in ufuncs, returns 0 on success */
static int
intern_strings(void)
{
- npy_um_str_out = PyUString_InternFromString("out");
- npy_um_str_where = PyUString_InternFromString("where");
- npy_um_str_axes = PyUString_InternFromString("axes");
- npy_um_str_axis = PyUString_InternFromString("axis");
- npy_um_str_keepdims = PyUString_InternFromString("keepdims");
- npy_um_str_casting = PyUString_InternFromString("casting");
- npy_um_str_order = PyUString_InternFromString("order");
- npy_um_str_dtype = PyUString_InternFromString("dtype");
- npy_um_str_subok = PyUString_InternFromString("subok");
- npy_um_str_signature = PyUString_InternFromString("signature");
- npy_um_str_sig = PyUString_InternFromString("sig");
- npy_um_str_extobj = PyUString_InternFromString("extobj");
- npy_um_str_array_prepare = PyUString_InternFromString("__array_prepare__");
- npy_um_str_array_wrap = PyUString_InternFromString("__array_wrap__");
- npy_um_str_array_finalize = PyUString_InternFromString("__array_finalize__");
- npy_um_str_ufunc = PyUString_InternFromString("__array_ufunc__");
- npy_um_str_pyvals_name = PyUString_InternFromString(UFUNC_PYVALS_NAME);
-
- return npy_um_str_out && npy_um_str_subok && npy_um_str_array_prepare &&
- npy_um_str_array_wrap && npy_um_str_array_finalize && npy_um_str_ufunc;
+ if (!(npy_um_str_out = PyUnicode_InternFromString("out"))) return -1;
+ if (!(npy_um_str_where = PyUnicode_InternFromString("where"))) return -1;
+ if (!(npy_um_str_axes = PyUnicode_InternFromString("axes"))) return -1;
+ if (!(npy_um_str_axis = PyUnicode_InternFromString("axis"))) return -1;
+ if (!(npy_um_str_keepdims = PyUnicode_InternFromString("keepdims"))) return -1;
+ if (!(npy_um_str_casting = PyUnicode_InternFromString("casting"))) return -1;
+ if (!(npy_um_str_order = PyUnicode_InternFromString("order"))) return -1;
+ if (!(npy_um_str_dtype = PyUnicode_InternFromString("dtype"))) return -1;
+ if (!(npy_um_str_subok = PyUnicode_InternFromString("subok"))) return -1;
+ if (!(npy_um_str_signature = PyUnicode_InternFromString("signature"))) return -1;
+ if (!(npy_um_str_sig = PyUnicode_InternFromString("sig"))) return -1;
+ if (!(npy_um_str_extobj = PyUnicode_InternFromString("extobj"))) return -1;
+ if (!(npy_um_str_array_prepare = PyUnicode_InternFromString("__array_prepare__"))) return -1;
+ if (!(npy_um_str_array_wrap = PyUnicode_InternFromString("__array_wrap__"))) return -1;
+ if (!(npy_um_str_array_finalize = PyUnicode_InternFromString("__array_finalize__"))) return -1;
+ if (!(npy_um_str_ufunc = PyUnicode_InternFromString("__array_ufunc__"))) return -1;
+ if (!(npy_um_str_pyvals_name = PyUnicode_InternFromString(UFUNC_PYVALS_NAME))) return -1;
+ return 0;
}
/* Setup the umath part of the module */
@@ -326,7 +326,7 @@ int initumath(PyObject *m)
PyDict_SetItemString(d, "conj", s);
PyDict_SetItemString(d, "mod", s2);
- if (!intern_strings()) {
+ if (intern_strings() < 0) {
PyErr_SetString(PyExc_RuntimeError,
"cannot intern umath strings while initializing _multiarray_umath.");
return -1;
diff --git a/numpy/core/tests/data/umath-validation-set-log b/numpy/core/tests/data/umath-validation-set-log
index a7bd98481..b8f6b0875 100644
--- a/numpy/core/tests/data/umath-validation-set-log
+++ b/numpy/core/tests/data/umath-validation-set-log
@@ -116,3 +116,156 @@ np.float32,0x3f494ab1,0xbe763131,4
np.float32,0x3f476b69,0xbe7fc2c6,4
np.float32,0x3f4884e8,0xbe7a214a,4
np.float32,0x3f486945,0xbe7aae76,4
+#float64
+## +ve denormal ##
+np.float64,0x0000000000000001,0xc0874385446d71c3,1
+np.float64,0x0001000000000000,0xc086395a2079b70c,1
+np.float64,0x000fffffffffffff,0xc086232bdd7abcd2,1
+np.float64,0x0007ad63e2168cb6,0xc086290bc0b2980f,1
+## -ve denormal ##
+np.float64,0x8000000000000001,0xfff8000000000001,1
+np.float64,0x8001000000000000,0xfff8000000000001,1
+np.float64,0x800fffffffffffff,0xfff8000000000001,1
+np.float64,0x8007ad63e2168cb6,0xfff8000000000001,1
+## +/-0.0f, MAX, MIN##
+np.float64,0x0000000000000000,0xfff0000000000000,1
+np.float64,0x8000000000000000,0xfff0000000000000,1
+np.float64,0x7fefffffffffffff,0x40862e42fefa39ef,1
+np.float64,0xffefffffffffffff,0xfff8000000000001,1
+## near 1.0f ##
+np.float64,0x3ff0000000000000,0x0000000000000000,1
+np.float64,0x3fe8000000000000,0xbfd269621134db92,1
+np.float64,0x3ff0000000000001,0x3cafffffffffffff,1
+np.float64,0x3ff0000020000000,0x3e7fffffe000002b,1
+np.float64,0x3ff0000000000001,0x3cafffffffffffff,1
+np.float64,0x3fefffffe0000000,0xbe70000008000005,1
+np.float64,0x3fefffffffffffff,0xbca0000000000000,1
+## random numbers ##
+np.float64,0x02500186f3d9da56,0xc0855b8abf135773,1
+np.float64,0x09200815a3951173,0xc082ff1ad7131bdc,1
+np.float64,0x0da029623b0243d4,0xc0816fc994695bb5,1
+np.float64,0x48703b8ac483a382,0x40579213a313490b,1
+np.float64,0x09207b74c87c9860,0xc082fee20ff349ef,1
+np.float64,0x62c077698e8df947,0x407821c996d110f0,1
+np.float64,0x2350b45e87c3cfb0,0xc073d6b16b51d072,1
+np.float64,0x3990a23f9ff2b623,0xc051aa60eadd8c61,1
+np.float64,0x0d011386a116c348,0xc081a6cc7ea3b8fb,1
+np.float64,0x1fe0f0303ebe273a,0xc0763870b78a81ca,1
+np.float64,0x0cd1260121d387da,0xc081b7668d61a9d1,1
+np.float64,0x1e6135a8f581d422,0xc077425ac10f08c2,1
+np.float64,0x622168db5fe52d30,0x4077b3c669b9fadb,1
+np.float64,0x69f188e1ec6d1718,0x407d1e2f18c63889,1
+np.float64,0x3aa1bf1d9c4dd1a3,0xc04d682e24bde479,1
+np.float64,0x6c81c4011ce4f683,0x407ee5190e8a8e6a,1
+np.float64,0x2191fa55aa5a5095,0xc0750c0c318b5e2d,1
+np.float64,0x32a1f602a32bf360,0xc06270caa493fc17,1
+np.float64,0x16023c90ba93249b,0xc07d0f88e0801638,1
+np.float64,0x1c525fe6d71fa9ff,0xc078af49c66a5d63,1
+np.float64,0x1a927675815d65b7,0xc079e5bdd7fe376e,1
+np.float64,0x41227b8fe70da028,0x402aa0c9f9a84c71,1
+np.float64,0x4962bb6e853fe87d,0x405a34aa04c83747,1
+np.float64,0x23d2cda00b26b5a4,0xc0737c13a06d00ea,1
+np.float64,0x2d13083fd62987fa,0xc06a25055aeb474e,1
+np.float64,0x10e31e4c9b4579a1,0xc0804e181929418e,1
+np.float64,0x26d3247d556a86a9,0xc0716774171da7e8,1
+np.float64,0x6603379398d0d4ac,0x407a64f51f8a887b,1
+np.float64,0x02d38af17d9442ba,0xc0852d955ac9dd68,1
+np.float64,0x6a2382b4818dd967,0x407d4129d688e5d4,1
+np.float64,0x2ee3c403c79b3934,0xc067a091fefaf8b6,1
+np.float64,0x6493a699acdbf1a4,0x4079663c8602bfc5,1
+np.float64,0x1c8413c4f0de3100,0xc0788c99697059b6,1
+np.float64,0x4573f1ed350d9622,0x404e9bd1e4c08920,1
+np.float64,0x2f34265c9200b69c,0xc067310cfea4e986,1
+np.float64,0x19b43e65fa22029b,0xc07a7f8877de22d6,1
+np.float64,0x0af48ab7925ed6bc,0xc0825c4fbc0e5ade,1
+np.float64,0x4fa49699cad82542,0x4065c76d2a318235,1
+np.float64,0x7204a15e56ade492,0x40815bb87484dffb,1
+np.float64,0x4734aa08a230982d,0x40542a4bf7a361a9,1
+np.float64,0x1ae4ed296c2fd749,0xc079ac4921f20abb,1
+np.float64,0x472514ea4370289c,0x4053ff372bd8f18f,1
+np.float64,0x53a54b3f73820430,0x406b5411fc5f2e33,1
+np.float64,0x64754de5a15684fa,0x407951592e99a5ab,1
+np.float64,0x69358e279868a7c3,0x407c9c671a882c31,1
+np.float64,0x284579ec61215945,0xc0706688e55f0927,1
+np.float64,0x68b5c58806447adc,0x407c43d6f4eff760,1
+np.float64,0x1945a83f98b0e65d,0xc07acc15eeb032cc,1
+np.float64,0x0fc5eb98a16578bf,0xc080b0d02eddca0e,1
+np.float64,0x6a75e208f5784250,0x407d7a7383bf8f05,1
+np.float64,0x0fe63a029c47645d,0xc080a59ca1e98866,1
+np.float64,0x37963ac53f065510,0xc057236281f7bdb6,1
+np.float64,0x135661bb07067ff7,0xc07ee924930c21e4,1
+np.float64,0x4b4699469d458422,0x405f73843756e887,1
+np.float64,0x1a66d73e4bf4881b,0xc07a039ba1c63adf,1
+np.float64,0x12a6b9b119a7da59,0xc07f62e49c6431f3,1
+np.float64,0x24c719aa8fd1bdb5,0xc072d26da4bf84d3,1
+np.float64,0x0fa6ff524ffef314,0xc080bb8514662e77,1
+np.float64,0x1db751d66fdd4a9a,0xc077b77cb50d7c92,1
+np.float64,0x4947374c516da82c,0x4059e9acfc7105bf,1
+np.float64,0x1b1771ab98f3afc8,0xc07989326b8e1f66,1
+np.float64,0x25e78805baac8070,0xc0720a818e6ef080,1
+np.float64,0x4bd7a148225d3687,0x406082d004ea3ee7,1
+np.float64,0x53d7d6b2bbbda00a,0x406b9a398967cbd5,1
+np.float64,0x6997fb9f4e1c685f,0x407ce0a703413eba,1
+np.float64,0x069802c2ff71b951,0xc083df39bf7acddc,1
+np.float64,0x4d683ac9890f66d8,0x4062ae21d8c2acf0,1
+np.float64,0x5a2825863ec14f4c,0x40722d718d549552,1
+np.float64,0x0398799a88f4db80,0xc084e93dab8e2158,1
+np.float64,0x5ed87a8b77e135a5,0x40756d7051777b33,1
+np.float64,0x5828cd6d79b9bede,0x4070cafb22fc6ca1,1
+np.float64,0x7b18ba2a5ec6f068,0x408481386b3ed6fe,1
+np.float64,0x4938fd60922198fe,0x4059c206b762ea7e,1
+np.float64,0x31b8f44fcdd1a46e,0xc063b2faa8b6434e,1
+np.float64,0x5729341c0d918464,0x407019cac0c4a7d7,1
+np.float64,0x13595e9228ee878e,0xc07ee7235a7d8088,1
+np.float64,0x17698b0dc9dd4135,0xc07c1627e3a5ad5f,1
+np.float64,0x63b977c283abb0cc,0x4078cf1ec6ed65be,1
+np.float64,0x7349cc0d4dc16943,0x4081cc697ce4cb53,1
+np.float64,0x4e49a80b732fb28d,0x4063e67e3c5cbe90,1
+np.float64,0x07ba14b848a8ae02,0xc0837ac032a094e0,1
+np.float64,0x3da9f17b691bfddc,0xc03929c25366acda,1
+np.float64,0x02ea39aa6c3ac007,0xc08525af6f21e1c4,1
+np.float64,0x3a6a42f04ed9563d,0xc04e98e825dca46b,1
+np.float64,0x1afa877cd7900be7,0xc0799d6648cb34a9,1
+np.float64,0x58ea986649e052c6,0x4071512e939ad790,1
+np.float64,0x691abbc04647f536,0x407c89aaae0fcb83,1
+np.float64,0x43aabc5063e6f284,0x4044b45d18106fd2,1
+np.float64,0x488b003c893e0bea,0x4057df012a2dafbe,1
+np.float64,0x77eb076ed67caee5,0x40836720de94769e,1
+np.float64,0x5c1b46974aba46f4,0x40738731ba256007,1
+np.float64,0x1a5b29ecb5d3c261,0xc07a0becc77040d6,1
+np.float64,0x5d8b6ccf868c6032,0x4074865c1865e2db,1
+np.float64,0x4cfb6690b4aaf5af,0x406216cd8c7e8ddb,1
+np.float64,0x76cbd8eb5c5fc39e,0x4083038dc66d682b,1
+np.float64,0x28bbd1fec5012814,0xc07014c2dd1b9711,1
+np.float64,0x33dc1b3a4fd6bf7a,0xc060bd0756e07d8a,1
+np.float64,0x52bbe89b37de99f3,0x406a10041aa7d343,1
+np.float64,0x07bc479d15eb2dd3,0xc0837a1a6e3a3b61,1
+np.float64,0x18fc5275711a901d,0xc07aff3e9d62bc93,1
+np.float64,0x114c9758e247dc71,0xc080299a7cf15b05,1
+np.float64,0x25ac8f6d60755148,0xc07233c4c0c511d4,1
+np.float64,0x260cae2bb9e9fd7e,0xc071f128c7e82eac,1
+np.float64,0x572ccdfe0241de82,0x40701bedc84bb504,1
+np.float64,0x0ddcef6c8d41f5ee,0xc0815a7e16d07084,1
+np.float64,0x6dad1d59c988af68,0x407fb4a0bc0142b1,1
+np.float64,0x025d200580d8b6d1,0xc08556c0bc32b1b2,1
+np.float64,0x7aad344b6aa74c18,0x40845bbc453f22be,1
+np.float64,0x5b5d9d6ad9d14429,0x4073036d2d21f382,1
+np.float64,0x49cd8d8dcdf19954,0x405b5c034f5c7353,1
+np.float64,0x63edb9483335c1e6,0x4078f2dd21378786,1
+np.float64,0x7b1dd64c9d2c26bd,0x408482b922017bc9,1
+np.float64,0x782e13e0b574be5f,0x40837e2a0090a5ad,1
+np.float64,0x592dfe18b9d6db2f,0x40717f777fbcb1ec,1
+np.float64,0x654e3232ac60d72c,0x4079e71a95a70446,1
+np.float64,0x7b8e42ad22091456,0x4084a9a6f1e61722,1
+np.float64,0x570e88dfd5860ae6,0x407006ae6c0d137a,1
+np.float64,0x294e98346cb98ef1,0xc06f5edaac12bd44,1
+np.float64,0x1adeaa4ab792e642,0xc079b1431d5e2633,1
+np.float64,0x7b6ead3377529ac8,0x40849eabc8c7683c,1
+np.float64,0x2b8eedae8a9b2928,0xc06c400054deef11,1
+np.float64,0x65defb45b2dcf660,0x407a4b53f181c05a,1
+np.float64,0x1baf582d475e7701,0xc07920bcad4a502c,1
+np.float64,0x461f39cf05a0f15a,0x405126368f984fa1,1
+np.float64,0x7e5f6f5dcfff005b,0x4085a37d610439b4,1
+np.float64,0x136f66e4d09bd662,0xc07ed8a2719f2511,1
+np.float64,0x65afd8983fb6ca1f,0x407a2a7f48bf7fc1,1
+np.float64,0x572fa7f95ed22319,0x40701d706cf82e6f,1
diff --git a/numpy/core/tests/examples/checks.pyx b/numpy/core/tests/examples/checks.pyx
index ecf0ad3fa..151979db7 100644
--- a/numpy/core/tests/examples/checks.pyx
+++ b/numpy/core/tests/examples/checks.pyx
@@ -24,3 +24,7 @@ def get_td64_value(obj):
def get_dt64_unit(obj):
return cnp.get_datetime64_unit(obj)
+
+
+def is_integer(obj):
+ return isinstance(obj, (cnp.integer, int))
diff --git a/numpy/core/tests/examples/setup.py b/numpy/core/tests/examples/setup.py
index 9860bf5f7..6e34aa778 100644
--- a/numpy/core/tests/examples/setup.py
+++ b/numpy/core/tests/examples/setup.py
@@ -9,12 +9,11 @@ from Cython.Build import cythonize
from setuptools.extension import Extension
import os
-here = os.path.dirname(__file__)
macros = [("NPY_NO_DEPRECATED_API", 0)]
checks = Extension(
"checks",
- sources=[os.path.join(here, "checks.pyx")],
+ sources=[os.path.join('.', "checks.pyx")],
include_dirs=[np.get_include()],
define_macros=macros,
)
diff --git a/numpy/core/tests/test__exceptions.py b/numpy/core/tests/test__exceptions.py
index 494b51f34..51c056936 100644
--- a/numpy/core/tests/test__exceptions.py
+++ b/numpy/core/tests/test__exceptions.py
@@ -1,11 +1,21 @@
"""
Tests of the ._exceptions module. Primarily for exercising the __str__ methods.
"""
+
+import pickle
+
import numpy as np
_ArrayMemoryError = np.core._exceptions._ArrayMemoryError
+_UFuncNoLoopError = np.core._exceptions._UFuncNoLoopError
class TestArrayMemoryError:
+ def test_pickling(self):
+ """ Test that _ArrayMemoryError can be pickled """
+ error = _ArrayMemoryError((1023,), np.dtype(np.uint8))
+ res = pickle.loads(pickle.dumps(error))
+ assert res._total_size == error._total_size
+
def test_str(self):
e = _ArrayMemoryError((1023,), np.dtype(np.uint8))
str(e) # not crashing is enough
@@ -40,3 +50,9 @@ class TestArrayMemoryError:
e = _ArrayMemoryError((2, 4), np.dtype((np.uint64, 16)))
assert e._total_size == 1024
+
+
+class TestUFuncNoLoopError:
+ def test_pickling(self):
+ """ Test that _UFuncNoLoopError can be pickled """
+ assert isinstance(pickle.dumps(_UFuncNoLoopError), bytes)
diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py
index 067cadf78..0f42f7076 100644
--- a/numpy/core/tests/test_api.py
+++ b/numpy/core/tests/test_api.py
@@ -281,6 +281,19 @@ def test_array_astype():
a = np.array(1000, dtype='i4')
assert_raises(TypeError, a.astype, 'U1', casting='safe')
+
+@pytest.mark.parametrize("dt", ["d", "f", "S13", "U32"])
+def test_array_astype_to_void(dt):
+ dt = np.dtype(dt)
+ arr = np.array([], dtype=dt)
+ assert arr.astype("V").dtype.itemsize == dt.itemsize
+
+def test_object_array_astype_to_void():
+ # This is different to `test_array_astype_to_void` as object arrays
+ # are inspected. The default void is "V8" (8 is the length of double)
+ arr = np.array([], dtype="O").astype("V")
+ assert arr.dtype == "V8"
+
@pytest.mark.parametrize("t",
np.sctypes['uint'] + np.sctypes['int'] + np.sctypes['float']
)
@@ -317,6 +330,29 @@ def test_string_to_boolean_cast_errors(dtype, out_dtype):
with assert_raises(ValueError):
arr.astype(out_dtype)
+@pytest.mark.parametrize("str_type", [str, bytes, np.str_, np.unicode_])
+@pytest.mark.parametrize("scalar_type",
+ [np.complex64, np.complex128, np.clongdouble])
+def test_string_to_complex_cast(str_type, scalar_type):
+ value = scalar_type(b"1+3j")
+ assert scalar_type(value) == 1+3j
+ assert np.array([value], dtype=object).astype(scalar_type)[()] == 1+3j
+ assert np.array(value).astype(scalar_type)[()] == 1+3j
+ arr = np.zeros(1, dtype=scalar_type)
+ arr[0] = value
+ assert arr[0] == 1+3j
+
+@pytest.mark.parametrize("dtype", np.typecodes["AllFloat"])
+def test_none_to_nan_cast(dtype):
+ # Note that at the time of writing this test, the scalar constructors
+ # reject None
+ arr = np.zeros(1, dtype=dtype)
+ arr[0] = None
+ assert np.isnan(arr)[0]
+ assert np.isnan(np.array(None, dtype=dtype))[()]
+ assert np.isnan(np.array([None], dtype=dtype))[0]
+ assert np.isnan(np.array(None).astype(dtype))[()]
+
def test_copyto_fromscalar():
a = np.arange(6, dtype='f4').reshape(2, 3)
diff --git a/numpy/core/tests/test_array_coercion.py b/numpy/core/tests/test_array_coercion.py
index 30019b253..78def9360 100644
--- a/numpy/core/tests/test_array_coercion.py
+++ b/numpy/core/tests/test_array_coercion.py
@@ -11,6 +11,7 @@ from itertools import product
import numpy as np
from numpy.core._rational_tests import rational
+from numpy.core._multiarray_umath import _discover_array_parameters
from numpy.testing import (
assert_array_equal, assert_warns, IS_PYPY)
@@ -308,6 +309,13 @@ class TestScalarDiscovery:
# coercion should also raise (error type may change)
with pytest.raises(Exception):
np.array(scalar, dtype=dtype)
+
+ if (isinstance(scalar, rational) and
+ np.issubdtype(dtype, np.signedinteger)):
+ return
+
+ with pytest.raises(Exception):
+ np.array([scalar], dtype=dtype)
# assignment should also raise
res = np.zeros((), dtype=dtype)
with pytest.raises(Exception):
@@ -323,6 +331,46 @@ class TestScalarDiscovery:
ass[()] = scalar
assert_array_equal(ass, cast)
+ @pytest.mark.parametrize("dtype_char", np.typecodes["All"])
+ def test_default_dtype_instance(self, dtype_char):
+ if dtype_char in "SU":
+ dtype = np.dtype(dtype_char + "1")
+ elif dtype_char == "V":
+ # Legacy behaviour was to use V8. The reason was float64 being the
+ # default dtype and that having 8 bytes.
+ dtype = np.dtype("V8")
+ else:
+ dtype = np.dtype(dtype_char)
+
+ discovered_dtype, _ = _discover_array_parameters([], type(dtype))
+
+ assert discovered_dtype == dtype
+ assert discovered_dtype.itemsize == dtype.itemsize
+
+ @pytest.mark.parametrize("dtype", np.typecodes["Integer"])
+ def test_scalar_to_int_coerce_does_not_cast(self, dtype):
+ """
+ Signed integers are currently different in that they do not cast other
+ NumPy scalars, but instead use scalar.__int__(). The hardcoded
+ exception to this rule is `np.array(scalar, dtype=integer)`.
+ """
+ dtype = np.dtype(dtype)
+ invalid_int = np.ulonglong(-1)
+
+ float_nan = np.float64(np.nan)
+
+ for scalar in [float_nan, invalid_int]:
+ # This is a special case using casting logic and thus not failing:
+ coerced = np.array(scalar, dtype=dtype)
+ cast = np.array(scalar).astype(dtype)
+ assert_array_equal(coerced, cast)
+
+ # However these fail:
+ with pytest.raises((ValueError, OverflowError)):
+ np.array([scalar], dtype=dtype)
+ with pytest.raises((ValueError, OverflowError)):
+ cast[()] = scalar
+
class TestTimeScalars:
@pytest.mark.parametrize("dtype", [np.int64, np.float32])
@@ -332,13 +380,21 @@ class TestTimeScalars:
param(np.datetime64("NaT", "generic"), id="datetime64[generic](NaT)"),
param(np.datetime64(1, "D"), id="datetime64[D]")],)
def test_coercion_basic(self, dtype, scalar):
+ # Note the `[scalar]` is there because np.array(scalar) uses stricter
+ # `scalar.__int__()` rules for backward compatibility right now.
arr = np.array(scalar, dtype=dtype)
cast = np.array(scalar).astype(dtype)
- ass = np.ones((), dtype=dtype)
- ass[()] = scalar # raises, as would np.array([scalar], dtype=dtype)
-
assert_array_equal(arr, cast)
- assert_array_equal(cast, cast)
+
+ ass = np.ones((), dtype=dtype)
+ if issubclass(dtype, np.integer):
+ with pytest.raises(TypeError):
+ # raises, as would np.array([scalar], dtype=dtype), this is
+ # conversion from times, but behaviour of integers.
+ ass[()] = scalar
+ else:
+ ass[()] = scalar
+ assert_array_equal(ass, cast)
@pytest.mark.parametrize("dtype", [np.int64, np.float32])
@pytest.mark.parametrize("scalar",
@@ -441,7 +497,7 @@ class TestNested:
for i in range(np.MAXDIMS - 1):
nested = [nested]
- with pytest.raises(ValueError):
+ with pytest.warns(DeprecationWarning):
# It will refuse to assign the array into
np.array(nested, dtype="float64")
@@ -478,6 +534,27 @@ class TestNested:
with pytest.raises(ValueError):
np.array([[], np.empty((0, 1))], dtype=object)
+ def test_array_of_different_depths(self):
+ # When multiple arrays (or array-likes) are included in a
+    # sequence and have different depths, we currently discover
+ # as many dimensions as they share. (see also gh-17224)
+ arr = np.zeros((3, 2))
+ mismatch_first_dim = np.zeros((1, 2))
+ mismatch_second_dim = np.zeros((3, 3))
+
+ dtype, shape = _discover_array_parameters(
+ [arr, mismatch_second_dim], dtype=np.dtype("O"))
+ assert shape == (2, 3)
+
+ dtype, shape = _discover_array_parameters(
+ [arr, mismatch_first_dim], dtype=np.dtype("O"))
+ assert shape == (2,)
+ # The second case is currently supported because the arrays
+ # can be stored as objects:
+ res = np.asarray([arr, mismatch_first_dim], dtype=np.dtype("O"))
+ assert res[0] is arr
+ assert res[1] is mismatch_first_dim
+
class TestBadSequences:
# These are tests for bad objects passed into `np.array`, in general
@@ -570,3 +647,45 @@ class TestArrayLikes:
with pytest.raises(ValueError):
# The error type does not matter much here.
np.array([obj])
+
+ def test_arraylike_classes(self):
+ # The classes of array-likes should generally be acceptable to be
+ # stored inside a numpy (object) array. This tests all of the
+ # special attributes (since all are checked during coercion).
+ arr = np.array(np.int64)
+ assert arr[()] is np.int64
+ arr = np.array([np.int64])
+ assert arr[0] is np.int64
+
+ # This also works for properties/unbound methods:
+ class ArrayLike:
+ @property
+ def __array_interface__(self):
+ pass
+
+ @property
+ def __array_struct__(self):
+ pass
+
+ def __array__(self):
+ pass
+
+ arr = np.array(ArrayLike)
+ assert arr[()] is ArrayLike
+ arr = np.array([ArrayLike])
+ assert arr[0] is ArrayLike
+
+ @pytest.mark.skipif(
+ np.dtype(np.intp).itemsize < 8, reason="Needs 64bit platform")
+ def test_too_large_array_error_paths(self):
+ """Test the error paths, including for memory leaks"""
+ arr = np.array(0, dtype="uint8")
+ # Guarantees that a contiguous copy won't work:
+ arr = np.broadcast_to(arr, 2**62)
+
+ for i in range(5):
+ # repeat, to ensure caching cannot have an effect:
+ with pytest.raises(MemoryError):
+ np.array(arr)
+ with pytest.raises(MemoryError):
+ np.array([arr])
diff --git a/numpy/core/tests/test_casting_unittests.py b/numpy/core/tests/test_casting_unittests.py
new file mode 100644
index 000000000..fec0ae7c7
--- /dev/null
+++ b/numpy/core/tests/test_casting_unittests.py
@@ -0,0 +1,301 @@
+"""
+The tests exercise the casting machinery in a more low-level manner.
+The reason is mostly to test a new implementation of the casting machinery.
+
+Unlike most tests in NumPy, these are closer to unit-tests rather
+than integration tests.
+"""
+
+import pytest
+import textwrap
+import enum
+
+import numpy as np
+
+from numpy.core._multiarray_umath import (
+ _get_castingimpl as get_castingimpl)
+from numpy.core._multiarray_tests import uses_new_casts
+
+
+# Simple skips object, parametric and long double (unsupported by struct)
+simple_dtypes = "?bhilqBHILQefdFD"
+if np.dtype("l").itemsize != np.dtype("q").itemsize:
+ # Remove l and L, the table was generated with 64bit linux in mind.
+    # TODO: Should have two tables or use a different solution.
+ simple_dtypes = simple_dtypes.replace("l", "").replace("L", "")
+simple_dtypes = [type(np.dtype(c)) for c in simple_dtypes]
+
+
+def simple_dtype_instances():
+ for dtype_class in simple_dtypes:
+ dt = dtype_class()
+ yield pytest.param(dt, id=str(dt))
+ if dt.byteorder != "|":
+ dt = dt.newbyteorder()
+ yield pytest.param(dt, id=str(dt))
+
+
+def get_expected_stringlength(dtype):
+ """Returns the string length when casting the basic dtypes to strings.
+ """
+ if dtype == np.bool_:
+ return 5
+ if dtype.kind in "iu":
+ if dtype.itemsize == 1:
+ length = 3
+ elif dtype.itemsize == 2:
+ length = 5
+ elif dtype.itemsize == 4:
+ length = 10
+ elif dtype.itemsize == 8:
+ length = 20
+ else:
+ raise AssertionError(f"did not find expected length for {dtype}")
+
+ if dtype.kind == "i":
+ length += 1 # adds one character for the sign
+
+ return length
+
+ # Note: Can't do dtype comparison for longdouble on windows
+ if dtype.char == "g":
+ return 48
+ elif dtype.char == "G":
+ return 48 * 2
+ elif dtype.kind == "f":
+ return 32 # also for half apparently.
+ elif dtype.kind == "c":
+ return 32 * 2
+
+ raise AssertionError(f"did not find expected length for {dtype}")
+
+
+class Casting(enum.IntEnum):
+ no = 0
+ equiv = 1
+ safe = 2
+ same_kind = 3
+ unsafe = 4
+ cast_is_view = 1 << 16
+
+
+def _get_cancast_table():
+ table = textwrap.dedent("""
+ X ? b h i l q B H I L Q e f d g F D G S U V O M m
+ ? # = = = = = = = = = = = = = = = = = = = = = . =
+ b . # = = = = . . . . . = = = = = = = = = = = . =
+ h . ~ # = = = . . . . . ~ = = = = = = = = = = . =
+ i . ~ ~ # = = . . . . . ~ ~ = = ~ = = = = = = . =
+ l . ~ ~ ~ # # . . . . . ~ ~ = = ~ = = = = = = . =
+ q . ~ ~ ~ # # . . . . . ~ ~ = = ~ = = = = = = . =
+ B . ~ = = = = # = = = = = = = = = = = = = = = . =
+ H . ~ ~ = = = ~ # = = = ~ = = = = = = = = = = . =
+ I . ~ ~ ~ = = ~ ~ # = = ~ ~ = = ~ = = = = = = . =
+ L . ~ ~ ~ ~ ~ ~ ~ ~ # # ~ ~ = = ~ = = = = = = . ~
+ Q . ~ ~ ~ ~ ~ ~ ~ ~ # # ~ ~ = = ~ = = = = = = . ~
+ e . . . . . . . . . . . # = = = = = = = = = = . .
+ f . . . . . . . . . . . ~ # = = = = = = = = = . .
+ d . . . . . . . . . . . ~ ~ # = ~ = = = = = = . .
+ g . . . . . . . . . . . ~ ~ ~ # ~ ~ = = = = = . .
+ F . . . . . . . . . . . . . . . # = = = = = = . .
+ D . . . . . . . . . . . . . . . ~ # = = = = = . .
+ G . . . . . . . . . . . . . . . ~ ~ # = = = = . .
+ S . . . . . . . . . . . . . . . . . . # = = = . .
+ U . . . . . . . . . . . . . . . . . . . # = = . .
+ V . . . . . . . . . . . . . . . . . . . . # = . .
+ O . . . . . . . . . . . . . . . . . . . . = # . .
+ M . . . . . . . . . . . . . . . . . . . . = = # .
+ m . . . . . . . . . . . . . . . . . . . . = = . #
+ """).strip().split("\n")
+ dtypes = [type(np.dtype(c)) for c in table[0][2::2]]
+
+ convert_cast = {".": Casting.unsafe, "~": Casting.same_kind,
+ "=": Casting.safe, "#": Casting.equiv,
+ " ": -1}
+
+ cancast = {}
+ for from_dt, row in zip(dtypes, table[1:]):
+ cancast[from_dt] = {}
+ for to_dt, c in zip(dtypes, row[2::2]):
+ cancast[from_dt][to_dt] = convert_cast[c]
+
+ return cancast
+
+CAST_TABLE = _get_cancast_table()
+
+
+class TestChanges:
+ """
+    These test cases exercise some behaviour changes
+ """
+ @pytest.mark.parametrize("string", ["S", "U"])
+ @pytest.mark.parametrize("floating", ["e", "f", "d", "g"])
+ def test_float_to_string(self, floating, string):
+ assert np.can_cast(floating, string)
+ # 100 is long enough to hold any formatted floating
+ if uses_new_casts():
+ assert np.can_cast(floating, f"{string}100")
+ else:
+ assert not np.can_cast(floating, f"{string}100")
+ assert np.can_cast(floating, f"{string}100", casting="same_kind")
+
+ def test_to_void(self):
+ # But in general, we do consider these safe:
+ assert np.can_cast("d", "V")
+ assert np.can_cast("S20", "V")
+
+    # Do not consider it a safe cast if the void is too small:
+ if uses_new_casts():
+ assert not np.can_cast("d", "V1")
+ assert not np.can_cast("S20", "V1")
+ assert not np.can_cast("U1", "V1")
+ # Structured to unstructured is just like any other:
+ assert np.can_cast("d,i", "V", casting="same_kind")
+ else:
+ assert np.can_cast("d", "V1")
+ assert np.can_cast("S20", "V1")
+ assert np.can_cast("U1", "V1")
+ assert not np.can_cast("d,i", "V", casting="same_kind")
+
+
+class TestCasting:
+ @pytest.mark.parametrize("from_Dt", simple_dtypes)
+ def test_simple_cancast(self, from_Dt):
+ for to_Dt in simple_dtypes:
+ cast = get_castingimpl(from_Dt, to_Dt)
+
+ for from_dt in [from_Dt(), from_Dt().newbyteorder()]:
+ default = cast._resolve_descriptors((from_dt, None))[1][1]
+ assert default == to_Dt()
+ del default
+
+ for to_dt in [to_Dt(), to_Dt().newbyteorder()]:
+ casting, (from_res, to_res) = cast._resolve_descriptors(
+ (from_dt, to_dt))
+ assert(type(from_res) == from_Dt)
+ assert(type(to_res) == to_Dt)
+ if casting & Casting.cast_is_view:
+ # If a view is acceptable, this is "no" casting
+ # and byte order must be matching.
+ assert casting == Casting.no | Casting.cast_is_view
+ # The above table lists this as "equivalent"
+ assert Casting.equiv == CAST_TABLE[from_Dt][to_Dt]
+ # Note that to_res may not be the same as from_dt
+ assert from_res.isnative == to_res.isnative
+ else:
+ if from_Dt == to_Dt:
+ # Note that to_res may not be the same as from_dt
+ assert from_res.isnative != to_res.isnative
+ assert casting == CAST_TABLE[from_Dt][to_Dt]
+
+ if from_Dt is to_Dt:
+ assert(from_dt is from_res)
+ assert(to_dt is to_res)
+
+
+ def string_with_modified_length(self, dtype, change_length):
+ fact = 1 if dtype.char == "S" else 4
+ length = dtype.itemsize // fact + change_length
+ return np.dtype(f"{dtype.byteorder}{dtype.char}{length}")
+
+ @pytest.mark.parametrize("other_DT", simple_dtypes)
+ @pytest.mark.parametrize("string_char", ["S", "U"])
+ def test_string_cancast(self, other_DT, string_char):
+ fact = 1 if string_char == "S" else 4
+
+ string_DT = type(np.dtype(string_char))
+ cast = get_castingimpl(other_DT, string_DT)
+
+ other_dt = other_DT()
+ expected_length = get_expected_stringlength(other_dt)
+ string_dt = np.dtype(f"{string_char}{expected_length}")
+
+ safety, (res_other_dt, res_dt) = cast._resolve_descriptors((other_dt, None))
+ assert res_dt.itemsize == expected_length * fact
+ assert safety == Casting.safe # we consider to string casts "safe"
+ assert isinstance(res_dt, string_DT)
+
+ # These casts currently implement changing the string length, so
+ # check the cast-safety for too long/fixed string lengths:
+ for change_length in [-1, 0, 1]:
+ if change_length >= 0:
+ expected_safety = Casting.safe
+ else:
+ expected_safety = Casting.same_kind
+
+ to_dt = self.string_with_modified_length(string_dt, change_length)
+ safety, (_, res_dt) = cast._resolve_descriptors((other_dt, to_dt))
+ assert res_dt is to_dt
+ assert safety == expected_safety
+
+ # The opposite direction is always considered unsafe:
+ cast = get_castingimpl(string_DT, other_DT)
+
+ safety, _ = cast._resolve_descriptors((string_dt, other_dt))
+ assert safety == Casting.unsafe
+
+ cast = get_castingimpl(string_DT, other_DT)
+ safety, (_, res_dt) = cast._resolve_descriptors((string_dt, None))
+ assert safety == Casting.unsafe
+ assert other_dt is res_dt # returns the singleton for simple dtypes
+
+ @pytest.mark.parametrize("other_dt", ["S8", "<U8", ">U8"])
+ @pytest.mark.parametrize("string_char", ["S", "U"])
+ def test_string_to_string_cancast(self, other_dt, string_char):
+ other_dt = np.dtype(other_dt)
+
+ fact = 1 if string_char == "S" else 4
+ div = 1 if other_dt.char == "S" else 4
+
+ string_DT = type(np.dtype(string_char))
+ cast = get_castingimpl(type(other_dt), string_DT)
+
+ expected_length = other_dt.itemsize // div
+ string_dt = np.dtype(f"{string_char}{expected_length}")
+
+ safety, (res_other_dt, res_dt) = cast._resolve_descriptors((other_dt, None))
+ assert res_dt.itemsize == expected_length * fact
+ assert isinstance(res_dt, string_DT)
+
+ if other_dt.char == string_char:
+ if other_dt.isnative:
+ expected_safety = Casting.no | Casting.cast_is_view
+ else:
+ expected_safety = Casting.equiv
+ elif string_char == "U":
+ expected_safety = Casting.safe
+ else:
+ expected_safety = Casting.unsafe
+
+ assert expected_safety == safety
+
+ for change_length in [-1, 0, 1]:
+ to_dt = self.string_with_modified_length(string_dt, change_length)
+ safety, (_, res_dt) = cast._resolve_descriptors((other_dt, to_dt))
+
+ assert res_dt is to_dt
+ if expected_safety == Casting.unsafe:
+ assert safety == expected_safety
+ elif change_length < 0:
+ assert safety == Casting.same_kind
+ elif change_length == 0:
+ assert safety == expected_safety
+ elif change_length > 0:
+ assert safety == Casting.safe
+
+ def test_void_to_string_special_case(self):
+ # Cover a small special case in void to string casting that could
+ # probably just as well be turned into an error (compare
+ # `test_object_to_parametric_internal_error` below).
+ assert np.array([], dtype="V5").astype("S").dtype.itemsize == 5
+ assert np.array([], dtype="V5").astype("U").dtype.itemsize == 4 * 5
+
+ def test_object_to_parametric_internal_error(self):
+ # We reject casting from object to a parametric type, without
+ # figuring out the correct instance first.
+ object_dtype = type(np.dtype(object))
+ other_dtype = type(np.dtype(str))
+ cast = get_castingimpl(object_dtype, other_dtype)
+ with pytest.raises(TypeError,
+ match="casting from object to the parametric DType"):
+ cast._resolve_descriptors((np.dtype("O"), None))
diff --git a/numpy/core/tests/test_cython.py b/numpy/core/tests/test_cython.py
index 63524b269..a1f09d0fe 100644
--- a/numpy/core/tests/test_cython.py
+++ b/numpy/core/tests/test_cython.py
@@ -34,21 +34,19 @@ def install_temp(request, tmp_path):
here = os.path.dirname(__file__)
ext_dir = os.path.join(here, "examples")
- tmp_path = tmp_path._str
- cytest = os.path.join(tmp_path, "cytest")
+ cytest = str(tmp_path / "cytest")
shutil.copytree(ext_dir, cytest)
# build the examples and "install" them into a temporary directory
- install_log = os.path.join(tmp_path, "tmp_install_log.txt")
+ install_log = str(tmp_path / "tmp_install_log.txt")
subprocess.check_call(
[
sys.executable,
"setup.py",
"build",
"install",
- "--prefix",
- os.path.join(tmp_path, "installdir"),
+ "--prefix", str(tmp_path / "installdir"),
"--single-version-externally-managed",
"--record",
install_log,
@@ -126,3 +124,11 @@ def test_get_datetime64_unit(install_temp):
result = checks.get_dt64_unit(td64)
expected = 5
assert result == expected
+
+
+def test_abstract_scalars(install_temp):
+ import checks
+
+ assert checks.is_integer(1)
+ assert checks.is_integer(np.int8(1))
+ assert checks.is_integer(np.uint64(1))
diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py
index 59a3954fd..62f6381d5 100644
--- a/numpy/core/tests/test_datetime.py
+++ b/numpy/core/tests/test_datetime.py
@@ -26,6 +26,7 @@ class TestDateTime:
def test_datetime_dtype_creation(self):
for unit in ['Y', 'M', 'W', 'D',
'h', 'm', 's', 'ms', 'us',
+ 'μs', # alias for us
'ns', 'ps', 'fs', 'as']:
dt1 = np.dtype('M8[750%s]' % unit)
assert_(dt1 == np.dtype('datetime64[750%s]' % unit))
@@ -429,6 +430,10 @@ class TestDateTime:
np.timedelta64)
assert_equal(actual, expected)
+ def test_timedelta_nat_format(self):
+ # gh-17552
+ assert_equal('NaT', '{0}'.format(np.timedelta64('nat')))
+
def test_timedelta_scalar_construction_units(self):
# String construction detecting units
assert_equal(np.datetime64('2010').dtype,
@@ -1653,8 +1658,9 @@ class TestDateTime:
'1959-10-13T12:34:56')
assert_equal(np.datetime_as_string(np.datetime64(datetime, 'ms')),
'1959-10-13T12:34:56.789')
- assert_equal(np.datetime_as_string(np.datetime64(datetime, 'us')),
- '1959-10-13T12:34:56.789012')
+ for us in ['us', 'μs', b'us']: # check non-ascii and bytes too
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, us)),
+ '1959-10-13T12:34:56.789012')
datetime = '1969-12-31T23:34:56.789012345678901234'
@@ -2389,3 +2395,19 @@ class TestDateTimeData:
def test_basic(self):
a = np.array(['1980-03-23'], dtype=np.datetime64)
assert_equal(np.datetime_data(a.dtype), ('D', 1))
+
+ def test_bytes(self):
+ # byte units are converted to unicode
+ dt = np.datetime64('2000', (b'ms', 5))
+ assert np.datetime_data(dt.dtype) == ('ms', 5)
+
+ dt = np.datetime64('2000', b'5ms')
+ assert np.datetime_data(dt.dtype) == ('ms', 5)
+
+ def test_non_ascii(self):
+ # μs is normalized to μ
+ dt = np.datetime64('2000', ('μs', 5))
+ assert np.datetime_data(dt.dtype) == ('us', 5)
+
+ dt = np.datetime64('2000', '5μs')
+ assert np.datetime_data(dt.dtype) == ('us', 5)
diff --git a/numpy/core/tests/test_defchararray.py b/numpy/core/tests/test_defchararray.py
index bbb94f7d3..59fc54722 100644
--- a/numpy/core/tests/test_defchararray.py
+++ b/numpy/core/tests/test_defchararray.py
@@ -179,6 +179,12 @@ class TestComparisons:
def test_less(self):
assert_array_equal((self.A < self.B), [[True, False], [False, False]])
+ def test_type(self):
+ out1 = np.char.equal(self.A, self.B)
+ out2 = np.char.equal('a', 'a')
+ assert_(isinstance(out1, np.ndarray))
+ assert_(isinstance(out2, np.ndarray))
+
class TestComparisonsMixed1(TestComparisons):
"""Ticket #1276"""
diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py
index 68502adda..a67fe62c3 100644
--- a/numpy/core/tests/test_deprecations.py
+++ b/numpy/core/tests/test_deprecations.py
@@ -81,6 +81,8 @@ class _DeprecationTestCase:
kwargs : dict
Keyword arguments for `function`
"""
+ __tracebackhide__ = True # Hide traceback for py.test
+
# reset the log
self.log[:] = []
@@ -615,7 +617,7 @@ class BuiltInRoundComplexDType(_DeprecationTestCase):
self.assert_deprecated(round, args=(scalar,))
self.assert_deprecated(round, args=(scalar, 0))
self.assert_deprecated(round, args=(scalar,), kwargs={'ndigits': 0})
-
+
def test_not_deprecated(self):
for scalar_type in self.not_deprecated_types:
scalar = scalar_type(0)
@@ -678,3 +680,108 @@ class TestDeprecatedGlobals(_DeprecationTestCase):
# from np.compat
self.assert_deprecated(lambda: np.long)
self.assert_deprecated(lambda: np.unicode)
+
+
+class TestMatrixInOuter(_DeprecationTestCase):
+ # 2020-05-13 NumPy 1.20.0
+ message = (r"add.outer\(\) was passed a numpy matrix as "
+ r"(first|second) argument.")
+
+ def test_deprecated(self):
+ arr = np.array([1, 2, 3])
+ m = np.array([1, 2, 3]).view(np.matrix)
+ self.assert_deprecated(np.add.outer, args=(m, m), num=2)
+ self.assert_deprecated(np.add.outer, args=(arr, m))
+ self.assert_deprecated(np.add.outer, args=(m, arr))
+ self.assert_not_deprecated(np.add.outer, args=(arr, arr))
+
+
+class TestRaggedArray(_DeprecationTestCase):
+ # 2020-07-24, NumPy 1.20.0
+ message = "setting an array element with a sequence"
+
+ def test_deprecated(self):
+ arr = np.ones((1, 1))
+        # Deprecated if the array is a leaf node:
+ self.assert_deprecated(lambda: np.array([arr, 0], dtype=np.float64))
+ self.assert_deprecated(lambda: np.array([0, arr], dtype=np.float64))
+ # And when it is an assignment into a lower dimensional subarray:
+ self.assert_deprecated(lambda: np.array([arr, [0]], dtype=np.float64))
+ self.assert_deprecated(lambda: np.array([[0], arr], dtype=np.float64))
+
+
+class FlatteningConcatenateUnsafeCast(_DeprecationTestCase):
+ # NumPy 1.20, 2020-09-03
+ message = "concatenate with `axis=None` will use same-kind casting"
+
+ def test_deprecated(self):
+ self.assert_deprecated(np.concatenate,
+ args=(([0.], [1.]),),
+ kwargs=dict(axis=None, out=np.empty(2, dtype=np.int64)))
+
+ def test_not_deprecated(self):
+ self.assert_not_deprecated(np.concatenate,
+ args=(([0.], [1.]),),
+ kwargs={'axis': None, 'out': np.empty(2, dtype=np.int64),
+ 'casting': "unsafe"})
+
+ with assert_raises(TypeError):
+ # Tests should notice if the deprecation warning is given first...
+ np.concatenate(([0.], [1.]), out=np.empty(2, dtype=np.int64),
+ casting="same_kind")
+
+
+class TestDeprecateSubarrayDTypeDuringArrayCoercion(_DeprecationTestCase):
+ warning_cls = FutureWarning
+ message = "(creating|casting) an array (with|to) a subarray dtype"
+
+ def test_deprecated_array(self):
+ # Arrays are more complex, since they "broadcast" on success:
+ arr = np.array([1, 2])
+
+ self.assert_deprecated(lambda: arr.astype("(2)i,"))
+ with pytest.warns(FutureWarning):
+ res = arr.astype("(2)i,")
+
+ assert_array_equal(res, [[1, 2], [1, 2]])
+
+ self.assert_deprecated(lambda: np.array(arr, dtype="(2)i,"))
+ with pytest.warns(FutureWarning):
+ res = np.array(arr, dtype="(2)i,")
+
+ assert_array_equal(res, [[1, 2], [1, 2]])
+
+ with pytest.warns(FutureWarning):
+ res = np.array([[(1,), (2,)], arr], dtype="(2)i,")
+
+ assert_array_equal(res, [[[1, 1], [2, 2]], [[1, 2], [1, 2]]])
+
+ def test_deprecated_and_error(self):
+ # These error paths do not give a warning, but will succeed in the
+ # future.
+ arr = np.arange(5 * 2).reshape(5, 2)
+ def check():
+ with pytest.raises(ValueError):
+ arr.astype("(2,2)f")
+
+ self.assert_deprecated(check)
+
+ def check():
+ with pytest.raises(ValueError):
+ np.array(arr, dtype="(2,2)f")
+
+ self.assert_deprecated(check)
+
+
+class TestDeprecatedUnpickleObjectScalar(_DeprecationTestCase):
+ # Deprecated 2020-11-24, NumPy 1.20
+ """
+ Technically, it should be impossible to create numpy object scalars,
+ but there was an unpickle path that would in theory allow it. That
+ path is invalid and must lead to the warning.
+ """
+ message = "Unpickling a scalar with object dtype is deprecated."
+
+ def test_deprecated(self):
+ ctor = np.core.multiarray.scalar
+ self.assert_deprecated(lambda: ctor(np.dtype("O"), 1))
diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py
index 2e2b0dbe2..0ebcc72da 100644
--- a/numpy/core/tests/test_dtype.py
+++ b/numpy/core/tests/test_dtype.py
@@ -6,6 +6,7 @@ import gc
import numpy as np
from numpy.core._rational_tests import rational
+from numpy.core._multiarray_tests import create_custom_field_dtype
from numpy.testing import (
assert_, assert_equal, assert_array_equal, assert_raises, HAS_REFCOUNT)
from numpy.compat import pickle
@@ -152,6 +153,9 @@ class TestBuiltin:
'formats': ['f4', 'i4'],
'offsets': [4, 0]})
assert_equal(x == y, False)
+ # But it is currently an equivalent cast:
+ assert np.can_cast(x, y, casting="equiv")
+
class TestRecord:
def test_equivalent_record(self):
@@ -313,6 +317,24 @@ class TestRecord:
'formats':['i1', 'O'],
'offsets':[np.dtype('intp').itemsize, 0]})
+ @pytest.mark.parametrize(["obj", "dtype", "expected"],
+ [([], ("(2)f4,"), np.empty((0, 2), dtype="f4")),
+ (3, "(3)f4,", [3, 3, 3]),
+ (np.float64(2), "(2)f4,", [2, 2]),
+ ([((0, 1), (1, 2)), ((2,),)], '(2,2)f4', None),
+ (["1", "2"], "(2)i,", None)])
+ def test_subarray_list(self, obj, dtype, expected):
+ dtype = np.dtype(dtype)
+ res = np.array(obj, dtype=dtype)
+
+ if expected is None:
+ # iterate the 1-d list to fill the array
+ expected = np.empty(len(obj), dtype=dtype)
+ for i in range(len(expected)):
+ expected[i] = obj[i]
+
+ assert_array_equal(res, expected)
+
def test_comma_datetime(self):
dt = np.dtype('M8[D],datetime64[Y],i8')
assert_equal(dt, np.dtype([('f0', 'M8[D]'),
@@ -766,6 +788,26 @@ class TestMonsterType:
('yi', np.dtype((a, (3, 2))))])
assert_dtype_equal(c, d)
+ def test_list_recursion(self):
+ l = list()
+ l.append(('f', l))
+ with pytest.raises(RecursionError):
+ np.dtype(l)
+
+ def test_tuple_recursion(self):
+ d = np.int32
+ for i in range(100000):
+ d = (d, (1,))
+ with pytest.raises(RecursionError):
+ np.dtype(d)
+
+ def test_dict_recursion(self):
+ d = dict(names=['self'], formats=[None], offsets=[0])
+ d['formats'][0] = d
+ with pytest.raises(RecursionError):
+ np.dtype(d)
+
+
class TestMetadata:
def test_no_metadata(self):
d = np.dtype(int)
@@ -1338,3 +1380,35 @@ class TestFromCTypes:
pair_type = np.dtype('{},{}'.format(*pair))
expected = np.dtype([('f0', pair[0]), ('f1', pair[1])])
assert_equal(pair_type, expected)
+
+
+class TestUserDType:
+ @pytest.mark.leaks_references(reason="dynamically creates custom dtype.")
+ def test_custom_structured_dtype(self):
+ class mytype:
+ pass
+
+ blueprint = np.dtype([("field", object)])
+ dt = create_custom_field_dtype(blueprint, mytype, 0)
+ assert dt.type == mytype
+ # We cannot (currently) *create* this dtype with `np.dtype` because
+ # mytype does not inherit from `np.generic`. This seems like an
+ # unnecessary restriction, but one that has been around forever:
+ assert np.dtype(mytype) == np.dtype("O")
+
+ def test_custom_structured_dtype_errors(self):
+ class mytype:
+ pass
+
+ blueprint = np.dtype([("field", object)])
+
+ with pytest.raises(ValueError):
+ # Tests what happens if fields are unset during creation
+ # which is currently rejected due to the containing object
+ # (see PyArray_RegisterDataType).
+ create_custom_field_dtype(blueprint, mytype, 1)
+
+ with pytest.raises(RuntimeError):
+ # Tests that a dtype must have its type field set up to np.dtype
+ # or in this case a builtin instance.
+ create_custom_field_dtype(blueprint, mytype, 2)
diff --git a/numpy/core/tests/test_function_base.py b/numpy/core/tests/test_function_base.py
index 62a9772c8..dad7a5883 100644
--- a/numpy/core/tests/test_function_base.py
+++ b/numpy/core/tests/test_function_base.py
@@ -402,3 +402,8 @@ class TestLinspace:
stop = array(2, dtype='O')
y = linspace(start, stop, 3)
assert_array_equal(y, array([1., 1.5, 2.]))
+
+ def test_round_negative(self):
+ y = linspace(-1, 3, num=8, dtype=int)
+ t = array([-1, -1, 0, 0, 1, 1, 2, 3], dtype=int)
+ assert_array_equal(y, t)
diff --git a/numpy/core/tests/test_indexing.py b/numpy/core/tests/test_indexing.py
index 1069cbe8d..667c49240 100644
--- a/numpy/core/tests/test_indexing.py
+++ b/numpy/core/tests/test_indexing.py
@@ -7,8 +7,8 @@ import numpy as np
from numpy.core._multiarray_tests import array_indexing
from itertools import product
from numpy.testing import (
- assert_, assert_equal, assert_raises, assert_array_equal, assert_warns,
- HAS_REFCOUNT,
+ assert_, assert_equal, assert_raises, assert_raises_regex,
+ assert_array_equal, assert_warns, HAS_REFCOUNT,
)
@@ -1239,6 +1239,41 @@ class TestBooleanIndexing:
assert_raises(IndexError, lambda: a[False, [0, 1], ...])
+ def test_boolean_indexing_fast_path(self):
+ # These used to either give the wrong error, or incorrectly give no
+ # error.
+ a = np.ones((3, 3))
+
+ # This used to incorrectly work (and give an array of shape (0,))
+ idx1 = np.array([[False]*9])
+ assert_raises_regex(IndexError,
+ "boolean index did not match indexed array along dimension 0; "
+ "dimension is 3 but corresponding boolean dimension is 1",
+ lambda: a[idx1])
+
+ # This used to incorrectly give a ValueError: operands could not be broadcast together
+ idx2 = np.array([[False]*8 + [True]])
+ assert_raises_regex(IndexError,
+ "boolean index did not match indexed array along dimension 0; "
+ "dimension is 3 but corresponding boolean dimension is 1",
+ lambda: a[idx2])
+
+ # This is the same as it used to be. The above two should work like this.
+ idx3 = np.array([[False]*10])
+ assert_raises_regex(IndexError,
+ "boolean index did not match indexed array along dimension 0; "
+ "dimension is 3 but corresponding boolean dimension is 1",
+ lambda: a[idx3])
+
+ # This used to give ValueError: non-broadcastable operand
+ a = np.ones((1, 1, 2))
+ idx = np.array([[[True], [False]]])
+ assert_raises_regex(IndexError,
+ "boolean index did not match indexed array along dimension 1; "
+ "dimension is 1 but corresponding boolean dimension is 2",
+ lambda: a[idx])
+
+
class TestArrayToIndexDeprecation:
"""Creating an an index from array not 0-D is an error.
diff --git a/numpy/core/tests/test_memmap.py b/numpy/core/tests/test_memmap.py
index feef80ce8..a1e0c8f8f 100644
--- a/numpy/core/tests/test_memmap.py
+++ b/numpy/core/tests/test_memmap.py
@@ -11,7 +11,8 @@ from numpy import (
from numpy import arange, allclose, asarray
from numpy.testing import (
- assert_, assert_equal, assert_array_equal, suppress_warnings
+ assert_, assert_equal, assert_array_equal, suppress_warnings, IS_PYPY,
+ break_cycles
)
class TestMemmap:
@@ -25,6 +26,10 @@ class TestMemmap:
def teardown(self):
self.tmpfp.close()
+ self.data = None
+ if IS_PYPY:
+ break_cycles()
+ break_cycles()
shutil.rmtree(self.tempdir)
def test_roundtrip(self):
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index b7d4a6a92..048b1688f 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -21,8 +21,8 @@ import builtins
from decimal import Decimal
import numpy as np
-from numpy.compat import strchar
import numpy.core._multiarray_tests as _multiarray_tests
+from numpy.core._rational_tests import rational
from numpy.testing import (
assert_, assert_raises, assert_warns, assert_equal, assert_almost_equal,
assert_array_equal, assert_raises_regex, assert_array_almost_equal,
@@ -207,6 +207,23 @@ class TestFlags:
a[2] = 10
# only warn once
assert_(len(w) == 1)
+
+ @pytest.mark.parametrize(["flag", "flag_value", "writeable"],
+ [("writeable", True, True),
+ # Delete _warn_on_write after deprecation and simplify
+ # the parameterization:
+ ("_warn_on_write", True, False),
+ ("writeable", False, False)])
+ def test_readonly_flag_protocols(self, flag, flag_value, writeable):
+ a = np.arange(10)
+ setattr(a.flags, flag, flag_value)
+
+ class MyArr():
+ __array_struct__ = a.__array_struct__
+
+ assert memoryview(a).readonly is not writeable
+ assert a.__array_interface__['data'][1] is not writeable
+ assert np.asarray(MyArr()).flags.writeable is writeable
def test_otherflags(self):
assert_equal(self.a.flags.carray, True)
@@ -828,7 +845,37 @@ class TestCreation:
def test_void(self):
arr = np.array([], dtype='V')
- assert_equal(arr.dtype.kind, 'V')
+ assert arr.dtype == 'V8' # current default
+ # Same length scalars (those that go to the same void) work:
+ arr = np.array([b"1234", b"1234"], dtype="V")
+ assert arr.dtype == "V4"
+
+ # Promoting different lengths will fail (pre 1.20 this worked)
+ # by going via S5 and casting to V5.
+ with pytest.raises(TypeError):
+ np.array([b"1234", b"12345"], dtype="V")
+ with pytest.raises(TypeError):
+ np.array([b"12345", b"1234"], dtype="V")
+
+ # Check the same for the casting path:
+ arr = np.array([b"1234", b"1234"], dtype="O").astype("V")
+ assert arr.dtype == "V4"
+ with pytest.raises(TypeError):
+ np.array([b"1234", b"12345"], dtype="O").astype("V")
+
+ @pytest.mark.parametrize("idx",
+ [pytest.param(Ellipsis, id="arr"), pytest.param((), id="scalar")])
+ def test_structured_void_promotion(self, idx):
+ arr = np.array(
+ [np.array(1, dtype="i,i")[idx], np.array(2, dtype='i,i')[idx]],
+ dtype="V")
+ assert_array_equal(arr, np.array([(1, 1), (2, 2)], dtype="i,i"))
+ # The following fails to promote the two dtypes, resulting in an error
+ with pytest.raises(TypeError):
+ np.array(
+ [np.array(1, dtype="i,i")[idx], np.array(2, dtype='i,i,i')[idx]],
+ dtype="V")
+
def test_too_big_error(self):
# 45341 is the smallest integer greater than sqrt(2**31 - 1).
@@ -1587,6 +1634,47 @@ class TestMethods:
sort_kinds = ['quicksort', 'heapsort', 'stable']
+ def test_all_where(self):
+ a = np.array([[True, False, True],
+ [False, False, False],
+ [True, True, True]])
+ wh_full = np.array([[True, False, True],
+ [False, False, False],
+ [True, False, True]])
+ wh_lower = np.array([[False],
+ [False],
+ [True]])
+ for _ax in [0, None]:
+ assert_equal(a.all(axis=_ax, where=wh_lower),
+ np.all(a[wh_lower[:,0],:], axis=_ax))
+ assert_equal(np.all(a, axis=_ax, where=wh_lower),
+ a[wh_lower[:,0],:].all(axis=_ax))
+
+ assert_equal(a.all(where=wh_full), True)
+ assert_equal(np.all(a, where=wh_full), True)
+ assert_equal(a.all(where=False), True)
+ assert_equal(np.all(a, where=False), True)
+
+ def test_any_where(self):
+ a = np.array([[True, False, True],
+ [False, False, False],
+ [True, True, True]])
+ wh_full = np.array([[False, True, False],
+ [True, True, True],
+ [False, False, False]])
+ wh_middle = np.array([[False],
+ [True],
+ [False]])
+ for _ax in [0, None]:
+ assert_equal(a.any(axis=_ax, where=wh_middle),
+ np.any(a[wh_middle[:,0],:], axis=_ax))
+ assert_equal(np.any(a, axis=_ax, where=wh_middle),
+ a[wh_middle[:,0],:].any(axis=_ax))
+ assert_equal(a.any(where=wh_full), False)
+ assert_equal(np.any(a, where=wh_full), False)
+ assert_equal(a.any(where=False), False)
+ assert_equal(np.any(a, where=False), False)
+
def test_compress(self):
tgt = [[5, 6, 7, 8, 9]]
arr = np.arange(10).reshape(2, 5)
@@ -2014,7 +2102,7 @@ class TestMethods:
strtype = '>i2'
else:
strtype = '<i2'
- mydtype = [('name', strchar + '5'), ('col2', strtype)]
+ mydtype = [('name', 'U5'), ('col2', strtype)]
r = np.array([('a', 1), ('b', 255), ('c', 3), ('d', 258)],
dtype=mydtype)
r.sort(order='col2')
@@ -3851,13 +3939,6 @@ class TestPickling:
with pytest.raises(ImportError):
array.__reduce_ex__(5)
- elif sys.version_info[:2] < (3, 6):
- # when calling __reduce_ex__ explicitly with protocol=5 on python
- # raise a ValueError saying that protocol 5 is not available for
- # this python version
- with pytest.raises(ValueError):
- array.__reduce_ex__(5)
-
def test_record_array_with_object_dtype(self):
my_object = object()
@@ -4562,6 +4643,13 @@ class TestPutmask:
np.putmask(x[1:4], x[:3], [True, False, True])
assert_equal(x, np.array([True, True, True, True]))
+ def test_writeable(self):
+ a = np.arange(5)
+ a.flags.writeable = False
+
+ with pytest.raises(ValueError):
+ np.putmask(a, a >= 2, 3)
+
class TestTake:
def tst_basic(self, x):
@@ -5039,6 +5127,17 @@ class TestIO:
s = f.read()
assert_equal(s, '1.51,2.00,3.51,4.00')
+ def test_tofile_cleanup(self):
+ x = np.zeros((10), dtype=object)
+ with open(self.filename, 'wb') as f:
+ assert_raises(IOError, lambda: x.tofile(f, sep=''))
+ # Dup-ed file handle should be closed or remove will fail on Windows OS
+ os.remove(self.filename)
+
+ # Also make sure that we close the Python handle
+ assert_raises(IOError, lambda: x.tofile(self.filename))
+ os.remove(self.filename)
+
def test_locale(self):
with CommaDecimalPointLocale():
self.test_numbers()
@@ -5063,6 +5162,33 @@ class TestIO:
res = np.fromstring(x_str, dtype="(3,4)i4")
assert_array_equal(x, res)
+ def test_parsing_subarray_unsupported(self):
+ # We currently do not support parsing subarray dtypes
+ data = "12,42,13," * 50
+ with pytest.raises(ValueError):
+ expected = np.fromstring(data, dtype="(3,)i", sep=",")
+
+ with open(self.filename, "w") as f:
+ f.write(data)
+
+ with pytest.raises(ValueError):
+ np.fromfile(self.filename, dtype="(3,)i", sep=",")
+
+    def test_read_shorter_than_count_subarray(self):
+        # Test that requesting more values does not cause any problems
+        # in conjunction with subarray dimensions being absorbed into the
+        # array dimension.
+ expected = np.arange(511 * 10, dtype="i").reshape(-1, 10)
+
+ binary = expected.tobytes()
+ with pytest.raises(ValueError):
+ with pytest.warns(DeprecationWarning):
+ np.fromstring(binary, dtype="(10,)i", count=10000)
+
+ expected.tofile(self.filename)
+ res = np.fromfile(self.filename, dtype="(10,)i", count=10000)
+ assert_array_equal(res, expected)
+
class TestFromBuffer:
@pytest.mark.parametrize('byteorder', ['<', '>'])
@@ -5575,6 +5701,33 @@ class TestStats:
with assert_raises(np.core._exceptions.AxisError):
np.arange(10).mean(axis=2)
+ def test_mean_where(self):
+ a = np.arange(16).reshape((4, 4))
+ wh_full = np.array([[False, True, False, True],
+ [True, False, True, False],
+ [True, True, False, False],
+ [False, False, True, True]])
+ wh_partial = np.array([[False],
+ [True],
+ [True],
+ [False]])
+ _cases = [(1, True, [1.5, 5.5, 9.5, 13.5]),
+ (0, wh_full, [6., 5., 10., 9.]),
+ (1, wh_full, [2., 5., 8.5, 14.5]),
+ (0, wh_partial, [6., 7., 8., 9.])]
+ for _ax, _wh, _res in _cases:
+ assert_allclose(a.mean(axis=_ax, where=_wh),
+ np.array(_res))
+ assert_allclose(np.mean(a, axis=_ax, where=_wh),
+ np.array(_res))
+ with pytest.warns(RuntimeWarning) as w:
+ assert_allclose(a.mean(axis=1, where=wh_partial),
+ np.array([np.nan, 5.5, 9.5, np.nan]))
+ with pytest.warns(RuntimeWarning) as w:
+ assert_equal(a.mean(where=False), np.nan)
+ with pytest.warns(RuntimeWarning) as w:
+ assert_equal(np.mean(a, where=False), np.nan)
+
def test_var_values(self):
for mat in [self.rmat, self.cmat, self.omat]:
for axis in [0, 1, None]:
@@ -5623,6 +5776,34 @@ class TestStats:
with assert_raises(np.core._exceptions.AxisError):
np.arange(10).var(axis=2)
+ def test_var_where(self):
+ a = np.arange(25).reshape((5, 5))
+ wh_full = np.array([[False, True, False, True, True],
+ [True, False, True, True, False],
+ [True, True, False, False, True],
+ [False, True, True, False, True],
+ [True, False, True, True, False]])
+ wh_partial = np.array([[False],
+ [True],
+ [True],
+ [False],
+ [True]])
+ _cases = [(0, True, [50., 50., 50., 50., 50.]),
+ (1, True, [2., 2., 2., 2., 2.])]
+ for _ax, _wh, _res in _cases:
+ assert_allclose(a.var(axis=_ax, where=_wh),
+ np.array(_res))
+ assert_allclose(np.var(a, axis=_ax, where=_wh),
+ np.array(_res))
+ assert_allclose(np.var(a, axis=1, where=wh_full),
+ np.var(a[wh_full].reshape((5, 3)), axis=1))
+ assert_allclose(np.var(a, axis=0, where=wh_partial),
+ np.var(a[wh_partial[:,0]], axis=0))
+ with pytest.warns(RuntimeWarning) as w:
+ assert_equal(a.var(where=False), np.nan)
+ with pytest.warns(RuntimeWarning) as w:
+ assert_equal(np.var(a, where=False), np.nan)
+
def test_std_values(self):
for mat in [self.rmat, self.cmat, self.omat]:
for axis in [0, 1, None]:
@@ -5630,6 +5811,42 @@ class TestStats:
res = _std(mat, axis=axis)
assert_almost_equal(res, tgt)
+ def test_std_where(self):
+ a = np.arange(25).reshape((5,5))[::-1]
+ whf = np.array([[False, True, False, True, True],
+ [True, False, True, False, True],
+ [True, True, False, True, False],
+ [True, False, True, True, False],
+ [False, True, False, True, True]])
+ whp = np.array([[False],
+ [False],
+ [True],
+ [True],
+ [False]])
+ _cases = [
+ (0, True, 7.07106781*np.ones((5))),
+ (1, True, 1.41421356*np.ones((5))),
+ (0, whf,
+ np.array([4.0824829 , 8.16496581, 5., 7.39509973, 8.49836586])),
+ (0, whp, 2.5*np.ones((5)))
+ ]
+ for _ax, _wh, _res in _cases:
+ assert_allclose(a.std(axis=_ax, where=_wh), _res)
+ assert_allclose(np.std(a, axis=_ax, where=_wh), _res)
+
+ assert_allclose(a.std(axis=1, where=whf),
+ np.std(a[whf].reshape((5,3)), axis=1))
+ assert_allclose(np.std(a, axis=1, where=whf),
+ (a[whf].reshape((5,3))).std(axis=1))
+ assert_allclose(a.std(axis=0, where=whp),
+ np.std(a[whp[:,0]], axis=0))
+ assert_allclose(np.std(a, axis=0, where=whp),
+ (a[whp[:,0]]).std(axis=0))
+ with pytest.warns(RuntimeWarning) as w:
+ assert_equal(a.std(where=False), np.nan)
+ with pytest.warns(RuntimeWarning) as w:
+ assert_equal(np.std(a, where=False), np.nan)
+
def test_subclass(self):
class TestArray(np.ndarray):
def __new__(cls, data, info):
@@ -7134,6 +7351,21 @@ class TestNewBufferProtocol:
_multiarray_tests.get_buffer_info,
np.arange(5)[::2], ('SIMPLE',))
+ @pytest.mark.parametrize(["obj", "error"], [
+ pytest.param(np.array([1, 2], dtype=rational), ValueError, id="array"),
+ pytest.param(rational(1, 2), TypeError, id="scalar")])
+ def test_export_and_pickle_user_dtype(self, obj, error):
+ # User dtypes should export successfully when FORMAT was not requested.
+ with pytest.raises(error):
+ _multiarray_tests.get_buffer_info(obj, ("STRIDED_RO", "FORMAT"))
+
+ _multiarray_tests.get_buffer_info(obj, ("STRIDED_RO",))
+
+ # This is currently also necessary to implement pickling:
+ pickle_obj = pickle.dumps(obj)
+ res = pickle.loads(pickle_obj)
+ assert_array_equal(res, obj)
+
def test_padding(self):
for j in range(8):
x = np.array([(1,), (2,)], dtype={'f0': (int, j)})
@@ -7169,9 +7401,10 @@ class TestNewBufferProtocol:
x3 = np.arange(dt3.itemsize, dtype=np.int8).view(dt3)
self._check_roundtrip(x3)
- def test_relaxed_strides(self):
- # Test that relaxed strides are converted to non-relaxed
- c = np.ones((1, 10, 10), dtype='i8')
+ @pytest.mark.valgrind_error(reason="leaks buffer info cache temporarily.")
+ def test_relaxed_strides(self, c=np.ones((1, 10, 10), dtype='i8')):
+ # Note: c defined as parameter so that it is persistent and leak
+ # checks will notice gh-16934 (buffer info cache leak).
# Check for NPY_RELAXED_STRIDES_CHECKING:
if np.ones((10, 1), order="C").flags.f_contiguous:
@@ -7196,6 +7429,23 @@ class TestNewBufferProtocol:
arr, ['C_CONTIGUOUS'])
assert_(strides[-1] == 8)
+ @pytest.mark.valgrind_error(reason="leaks buffer info cache temporarily.")
+ @pytest.mark.skipif(not np.ones((10, 1), order="C").flags.f_contiguous,
+ reason="Test is unnecessary (but fails) without relaxed strides.")
+ def test_relaxed_strides_buffer_info_leak(self, arr=np.ones((1, 10))):
+ """Test that alternating export of C- and F-order buffers from
+ an array which is both C- and F-order when relaxed strides is
+ active works.
+ This test defines array in the signature to ensure leaking more
+ references every time the test is run (catching the leak with
+ pytest-leaks).
+ """
+ for i in range(10):
+ _, s = _multiarray_tests.get_buffer_info(arr, ['F_CONTIGUOUS'])
+ assert s == (8, 8)
+ _, s = _multiarray_tests.get_buffer_info(arr, ['C_CONTIGUOUS'])
+ assert s == (80, 8)
+
def test_out_of_order_fields(self):
dt = np.dtype(dict(
formats=['<i4', '<i4'],
@@ -7283,6 +7533,25 @@ class TestNewBufferProtocol:
f.a = 3
assert_equal(arr['a'], 3)
+ @pytest.mark.parametrize("obj", [np.ones(3), np.ones(1, dtype="i,i")[()]])
+ def test_error_if_stored_buffer_info_is_corrupted(self, obj):
+ """
+ If a user extends a NumPy array before 1.20 and then runs it
+ on NumPy 1.20+. A C-subclassed array might in theory modify
+ the new buffer-info field. This checks that an error is raised
+ if this happens (for buffer export), an error is written on delete.
+ This is a sanity check to help users transition to safe code, it
+ may be deleted at any point.
+ """
+ # corrupt buffer info:
+ _multiarray_tests.corrupt_or_fix_bufferinfo(obj)
+ name = type(obj)
+ with pytest.raises(RuntimeError,
+ match=f".*{name} appears to be C subclassed"):
+ memoryview(obj)
+ # Fix buffer info again before we delete (or we lose the memory)
+ _multiarray_tests.corrupt_or_fix_bufferinfo(obj)
+
class TestArrayAttributeDeletion:
@@ -7419,6 +7688,18 @@ def test_array_interface_offset():
arr1 = np.asarray(DummyArray())
assert_equal(arr1, arr[1:])
+def test_array_interface_unicode_typestr():
+ arr = np.array([1, 2, 3], dtype='int32')
+ interface = dict(arr.__array_interface__)
+ interface['typestr'] = '\N{check mark}'
+
+ class DummyArray:
+ __array_interface__ = interface
+
+ # should not be UnicodeEncodeError
+ with pytest.raises(TypeError):
+ np.asarray(DummyArray())
+
def test_flat_element_deletion():
it = np.ones(3).flat
try:
@@ -8199,6 +8480,22 @@ class TestArange:
assert_raises(ZeroDivisionError, np.arange, 0, 0, 0)
assert_raises(ZeroDivisionError, np.arange, 0.0, 0.0, 0.0)
+ def test_require_range(self):
+ assert_raises(TypeError, np.arange)
+ assert_raises(TypeError, np.arange, step=3)
+ assert_raises(TypeError, np.arange, dtype='int64')
+ assert_raises(TypeError, np.arange, start=4)
+
+ def test_start_stop_kwarg(self):
+ keyword_stop = np.arange(stop=3)
+ keyword_zerotostop = np.arange(start=0, stop=3)
+ keyword_start_stop = np.arange(start=3, stop=9)
+
+ assert len(keyword_stop) == 3
+ assert len(keyword_zerotostop) == 3
+ assert len(keyword_start_stop) == 6
+ assert_array_equal(keyword_stop, keyword_zerotostop)
+
class TestArrayFinalize:
""" Tests __array_finalize__ """
diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py
index 7b3c3a40d..e10c7ad92 100644
--- a/numpy/core/tests/test_nditer.py
+++ b/numpy/core/tests/test_nditer.py
@@ -2880,3 +2880,68 @@ def test_warn_noclose():
casting='equiv', op_dtypes=[np.dtype('f4')])
del it
assert len(sup.log) == 1
+
+
+@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+@pytest.mark.parametrize(["in_dtype", "buf_dtype"],
+ [("i", "O"), ("O", "i"), # most simple cases
+ ("i,O", "O,O"), # structured partially only copying O
+ ("O,i", "i,O"), # structured casting to and from O
+ ])
+@pytest.mark.parametrize("steps", [1, 2, 3])
+def test_partial_iteration_cleanup(in_dtype, buf_dtype, steps):
+ value = 123 # relies on python cache (leak-check will still find it)
+ arr = np.full(int(np.BUFSIZE * 2.5), value).astype(in_dtype)
+ count = sys.getrefcount(value)
+
+ it = np.nditer(arr, op_dtypes=[np.dtype(buf_dtype)],
+ flags=["buffered", "external_loop", "refs_ok"], casting="unsafe")
+ for step in range(steps):
+ # The iteration finishes in 3 steps, the first two are partial
+ next(it)
+
+ # Note that resetting does not free references
+ del it
+ assert count == sys.getrefcount(value)
+
+ # Repeat the test with `iternext`
+ it = np.nditer(arr, op_dtypes=[np.dtype(buf_dtype)],
+ flags=["buffered", "external_loop", "refs_ok"], casting="unsafe")
+ for step in range(steps):
+ it.iternext()
+
+ del it # should ensure cleanup
+ assert count == sys.getrefcount(value)
+
+
+@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+@pytest.mark.parametrize(["in_dtype", "buf_dtype"],
+ [("O", "i"), # most simple cases
+ ("O,i", "i,O"), # structured casting to and from O
+ ])
+def test_partial_iteration_error(in_dtype, buf_dtype):
+ value = 123 # relies on python cache (leak-check will still find it)
+ arr = np.full(int(np.BUFSIZE * 2.5), value).astype(in_dtype)
+ if in_dtype == "O":
+ arr[int(np.BUFSIZE * 1.5)] = None
+ else:
+ arr[int(np.BUFSIZE * 1.5)]["f0"] = None
+
+ count = sys.getrefcount(value)
+
+ it = np.nditer(arr, op_dtypes=[np.dtype(buf_dtype)],
+ flags=["buffered", "external_loop", "refs_ok"], casting="unsafe")
+ with pytest.raises(TypeError):
+ # pytest.raises seems to have issues with the error originating
+ # in the for loop, so manually unravel:
+ next(it)
+ next(it) # raises TypeError
+
+ # Repeat the test with `iternext` after resetting, the buffers should
+ # already be cleared from any references, so resetting is sufficient.
+ it.reset()
+ with pytest.raises(TypeError):
+ it.iternext()
+ it.iternext()
+
+ assert count == sys.getrefcount(value)
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 2a87ffaf8..866a96e31 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -14,6 +14,7 @@ from numpy.testing import (
assert_array_equal, assert_almost_equal, assert_array_almost_equal,
assert_warns, assert_array_max_ulp, HAS_REFCOUNT
)
+from numpy.core._rational_tests import rational
from hypothesis import assume, given, strategies as st
from hypothesis.extra import numpy as hynp
@@ -863,6 +864,30 @@ class TestTypes:
assert_equal(np.promote_types('<m8', '<m8'), np.dtype('m8'))
assert_equal(np.promote_types('>m8', '>m8'), np.dtype('m8'))
+ def test_can_cast_and_promote_usertypes(self):
+ # The rational type defines safe casting for signed integers,
+ # boolean. Rational itself *does* cast safely to double.
+ # (rational does not actually cast to all signed integers, e.g.
+ # int64 can be both long and longlong and it registers only the first)
+ valid_types = ["int8", "int16", "int32", "int64", "bool"]
+ invalid_types = "BHILQP" + "FDG" + "mM" + "f" + "V"
+
+ rational_dt = np.dtype(rational)
+ for numpy_dtype in valid_types:
+ numpy_dtype = np.dtype(numpy_dtype)
+ assert np.can_cast(numpy_dtype, rational_dt)
+ assert np.promote_types(numpy_dtype, rational_dt) is rational_dt
+
+ for numpy_dtype in invalid_types:
+ numpy_dtype = np.dtype(numpy_dtype)
+ assert not np.can_cast(numpy_dtype, rational_dt)
+ with pytest.raises(TypeError):
+ np.promote_types(numpy_dtype, rational_dt)
+
+ double_dt = np.dtype("double")
+ assert np.can_cast(rational_dt, double_dt)
+ assert np.promote_types(double_dt, rational_dt) is double_dt
+
def test_promote_types_strings(self):
assert_equal(np.promote_types('bool', 'S'), np.dtype('S5'))
assert_equal(np.promote_types('b', 'S'), np.dtype('S4'))
@@ -897,6 +922,126 @@ class TestTypes:
assert_equal(np.promote_types('u8', 'S1'), np.dtype('S20'))
assert_equal(np.promote_types('u8', 'S30'), np.dtype('S30'))
+ @pytest.mark.parametrize(["dtype1", "dtype2"],
+ [[np.dtype("V6"), np.dtype("V10")],
+ [np.dtype([("name1", "i8")]), np.dtype([("name2", "i8")])],
+ [np.dtype("i8,i8"), np.dtype("i4,i4")],
+ ])
+ def test_invalid_void_promotion(self, dtype1, dtype2):
+ # Mainly test structured void promotion, which currently allows
+ # byte-swapping, but nothing else:
+ with pytest.raises(TypeError):
+ np.promote_types(dtype1, dtype2)
+
+ @pytest.mark.parametrize(["dtype1", "dtype2"],
+ [[np.dtype("V10"), np.dtype("V10")],
+ [np.dtype([("name1", "<i8")]), np.dtype([("name1", ">i8")])],
+ [np.dtype("i8,i8"), np.dtype("i8,>i8")],
+ ])
+ def test_valid_void_promotion(self, dtype1, dtype2):
+ assert np.promote_types(dtype1, dtype2) is dtype1
+
+ @pytest.mark.parametrize("dtype",
+ list(np.typecodes["All"]) +
+ ["i,i", "S3", "S100", "U3", "U100", rational])
+ def test_promote_identical_types_metadata(self, dtype):
+ # The same type passed in twice to promote types always
+ # preserves metadata
+ metadata = {1: 1}
+ dtype = np.dtype(dtype, metadata=metadata)
+
+ res = np.promote_types(dtype, dtype)
+ assert res.metadata == dtype.metadata
+
+ # byte-swapping preserves and makes the dtype native:
+ dtype = dtype.newbyteorder()
+ if dtype.isnative:
+ # The type does not have byte swapping
+ return
+
+ res = np.promote_types(dtype, dtype)
+ if res.char in "?bhilqpBHILQPefdgFDGOmM" or dtype.type is rational:
+ # Metadata is lost for simple promotions (they create a new dtype)
+ assert res.metadata is None
+ else:
+ assert res.metadata == metadata
+ if dtype.kind != "V":
+ # the result is native (except for structured void)
+ assert res.isnative
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize(["dtype1", "dtype2"],
+ itertools.product(
+ list(np.typecodes["All"]) +
+ ["i,i", "S3", "S100", "U3", "U100", rational],
+ repeat=2))
+ def test_promote_types_metadata(self, dtype1, dtype2):
+ """Metadata handling in promotion does not appear formalized
+ right now in NumPy. This test should thus be considered to
+ document behaviour, rather than test the correct definition of it.
+
+ This test is very ugly, it was useful for rewriting part of the
+ promotion, but probably should eventually be replaced/deleted
+ (i.e. when metadata handling in promotion is better defined).
+ """
+ metadata1 = {1: 1}
+ metadata2 = {2: 2}
+ dtype1 = np.dtype(dtype1, metadata=metadata1)
+ dtype2 = np.dtype(dtype2, metadata=metadata2)
+
+ try:
+ res = np.promote_types(dtype1, dtype2)
+ except TypeError:
+ # Promotion failed, this test only checks metadata
+ return
+
+ if res.char in "?bhilqpBHILQPefdgFDGOmM" or res.type is rational:
+ # All simple types lose metadata (due to using promotion table):
+ assert res.metadata is None
+ elif res == dtype1:
+ # If one result is the result, it is usually returned unchanged:
+ assert res is dtype1
+ elif res == dtype2:
+ # dtype1 may have been cast to the same type/kind as dtype2.
+ # If the resulting dtype is identical we currently pick the cast
+ # version of dtype1, which lost the metadata:
+ if np.promote_types(dtype1, dtype2.kind) == dtype2:
+ res.metadata is None
+ else:
+ res.metadata == metadata2
+ else:
+ assert res.metadata is None
+
+ # Try again for byteswapped version
+ dtype1 = dtype1.newbyteorder()
+ assert dtype1.metadata == metadata1
+ res_bs = np.promote_types(dtype1, dtype2)
+ if res_bs.names is not None:
+ # Structured promotion doesn't remove byteswap:
+ assert res_bs.newbyteorder() == res
+ else:
+ assert res_bs == res
+ assert res_bs.metadata == res.metadata
+
+ @pytest.mark.parametrize(["dtype1", "dtype2"],
+ [[np.dtype("V6"), np.dtype("V10")],
+ [np.dtype([("name1", "i8")]), np.dtype([("name2", "i8")])],
+ [np.dtype("i8,i8"), np.dtype("i4,i4")],
+ ])
+ def test_invalid_void_promotion(self, dtype1, dtype2):
+ # Mainly test structured void promotion, which currently allows
+ # byte-swapping, but nothing else:
+ with pytest.raises(TypeError):
+ np.promote_types(dtype1, dtype2)
+
+ @pytest.mark.parametrize(["dtype1", "dtype2"],
+ [[np.dtype("V10"), np.dtype("V10")],
+ [np.dtype([("name1", "<i8")]), np.dtype([("name1", ">i8")])],
+ [np.dtype("i8,i8"), np.dtype("i8,>i8")],
+ ])
+ def test_valid_void_promotion(self, dtype1, dtype2):
+ assert np.promote_types(dtype1, dtype2) is dtype1
+
def test_can_cast(self):
assert_(np.can_cast(np.int32, np.int64))
assert_(np.can_cast(np.float64, complex))
@@ -2521,7 +2666,7 @@ class TestCreationFuncs:
self.check_function(np.zeros)
def test_ones(self):
- self.check_function(np.zeros)
+ self.check_function(np.ones)
def test_empty(self):
self.check_function(np.empty)
diff --git a/numpy/core/tests/test_overrides.py b/numpy/core/tests/test_overrides.py
index 7e73d8c03..6862fca03 100644
--- a/numpy/core/tests/test_overrides.py
+++ b/numpy/core/tests/test_overrides.py
@@ -1,5 +1,7 @@
import inspect
import sys
+import tempfile
+from io import StringIO
from unittest import mock
import numpy as np
@@ -425,3 +427,155 @@ class TestNumPyFunctions:
# note: the internal implementation of np.sum() calls the .sum() method
array = np.array(1).view(MyArray)
assert_equal(np.sum(array), 'summed')
+
+
+class TestArrayLike:
+ def setup(self):
+ class MyArray():
+ def __init__(self, function=None):
+ self.function = function
+
+ def __array_function__(self, func, types, args, kwargs):
+ try:
+ my_func = getattr(self, func.__name__)
+ except AttributeError:
+ return NotImplemented
+ return my_func(*args, **kwargs)
+
+ self.MyArray = MyArray
+
+ class MyNoArrayFunctionArray():
+ def __init__(self, function=None):
+ self.function = function
+
+ self.MyNoArrayFunctionArray = MyNoArrayFunctionArray
+
+ def add_method(self, name, arr_class, enable_value_error=False):
+ def _definition(*args, **kwargs):
+ # Check that `like=` isn't propagated downstream
+ assert 'like' not in kwargs
+
+ if enable_value_error and 'value_error' in kwargs:
+ raise ValueError
+
+ return arr_class(getattr(arr_class, name))
+ setattr(arr_class, name, _definition)
+
+ def func_args(*args, **kwargs):
+ return args, kwargs
+
+ @requires_array_function
+ def test_array_like_not_implemented(self):
+ self.add_method('array', self.MyArray)
+
+ ref = self.MyArray.array()
+
+ with assert_raises_regex(TypeError, 'no implementation found'):
+ array_like = np.asarray(1, like=ref)
+
+ _array_tests = [
+ ('array', *func_args((1,))),
+ ('asarray', *func_args((1,))),
+ ('asanyarray', *func_args((1,))),
+ ('ascontiguousarray', *func_args((2, 3))),
+ ('asfortranarray', *func_args((2, 3))),
+ ('require', *func_args((np.arange(6).reshape(2, 3),),
+ requirements=['A', 'F'])),
+ ('empty', *func_args((1,))),
+ ('full', *func_args((1,), 2)),
+ ('ones', *func_args((1,))),
+ ('zeros', *func_args((1,))),
+ ('arange', *func_args(3)),
+ ('frombuffer', *func_args(b'\x00' * 8, dtype=int)),
+ ('fromiter', *func_args(range(3), dtype=int)),
+ ('fromstring', *func_args('1,2', dtype=int, sep=',')),
+ ('loadtxt', *func_args(lambda: StringIO('0 1\n2 3'))),
+ ('genfromtxt', *func_args(lambda: StringIO(u'1,2.1'),
+ dtype=[('int', 'i8'), ('float', 'f8')],
+ delimiter=',')),
+ ]
+
+ @pytest.mark.parametrize('function, args, kwargs', _array_tests)
+ @pytest.mark.parametrize('numpy_ref', [True, False])
+ @requires_array_function
+ def test_array_like(self, function, args, kwargs, numpy_ref):
+ self.add_method('array', self.MyArray)
+ self.add_method(function, self.MyArray)
+ np_func = getattr(np, function)
+ my_func = getattr(self.MyArray, function)
+
+ if numpy_ref is True:
+ ref = np.array(1)
+ else:
+ ref = self.MyArray.array()
+
+ like_args = tuple(a() if callable(a) else a for a in args)
+ array_like = np_func(*like_args, **kwargs, like=ref)
+
+ if numpy_ref is True:
+ assert type(array_like) is np.ndarray
+
+ np_args = tuple(a() if callable(a) else a for a in args)
+ np_arr = np_func(*np_args, **kwargs)
+
+ # Special-case np.empty to ensure values match
+ if function == "empty":
+ np_arr.fill(1)
+ array_like.fill(1)
+
+ assert_equal(array_like, np_arr)
+ else:
+ assert type(array_like) is self.MyArray
+ assert array_like.function is my_func
+
+ @pytest.mark.parametrize('function, args, kwargs', _array_tests)
+ @pytest.mark.parametrize('ref', [1, [1], "MyNoArrayFunctionArray"])
+ @requires_array_function
+ def test_no_array_function_like(self, function, args, kwargs, ref):
+ self.add_method('array', self.MyNoArrayFunctionArray)
+ self.add_method(function, self.MyNoArrayFunctionArray)
+ np_func = getattr(np, function)
+
+ # Instantiate ref if it's the MyNoArrayFunctionArray class
+ if ref == "MyNoArrayFunctionArray":
+ ref = self.MyNoArrayFunctionArray.array()
+
+ like_args = tuple(a() if callable(a) else a for a in args)
+
+ with assert_raises_regex(TypeError,
+ 'The `like` argument must be an array-like that implements'):
+ np_func(*like_args, **kwargs, like=ref)
+
+ @pytest.mark.parametrize('numpy_ref', [True, False])
+ def test_array_like_fromfile(self, numpy_ref):
+ self.add_method('array', self.MyArray)
+ self.add_method("fromfile", self.MyArray)
+
+ if numpy_ref is True:
+ ref = np.array(1)
+ else:
+ ref = self.MyArray.array()
+
+ data = np.random.random(5)
+
+ fname = tempfile.mkstemp()[1]
+ data.tofile(fname)
+
+ array_like = np.fromfile(fname, like=ref)
+ if numpy_ref is True:
+ assert type(array_like) is np.ndarray
+ np_res = np.fromfile(fname, like=ref)
+ assert_equal(np_res, data)
+ assert_equal(array_like, np_res)
+ else:
+ assert type(array_like) is self.MyArray
+ assert array_like.function is self.MyArray.fromfile
+
+ @requires_array_function
+ def test_exception_handling(self):
+ self.add_method('array', self.MyArray, enable_value_error=True)
+
+ ref = self.MyArray.array()
+
+ with assert_raises(ValueError):
+ np.array(1, value_error=True, like=ref)
diff --git a/numpy/core/tests/test_records.py b/numpy/core/tests/test_records.py
index 4350a3407..4d4b4b515 100644
--- a/numpy/core/tests/test_records.py
+++ b/numpy/core/tests/test_records.py
@@ -1,5 +1,6 @@
import collections.abc
import textwrap
+from io import BytesIO
from os import path
from pathlib import Path
import pytest
@@ -79,8 +80,14 @@ class TestFromrecords:
r1 = np.rec.fromfile(fd, formats='f8,i4,a5', shape=3, byteorder='big')
fd.seek(2880 * 2)
r2 = np.rec.array(fd, formats='f8,i4,a5', shape=3, byteorder='big')
+ fd.seek(2880 * 2)
+ bytes_array = BytesIO()
+ bytes_array.write(fd.read())
+ bytes_array.seek(0)
+ r3 = np.rec.fromfile(bytes_array, formats='f8,i4,a5', shape=3, byteorder='big')
fd.close()
assert_equal(r1, r2)
+ assert_equal(r2, r3)
def test_recarray_from_obj(self):
count = 10
@@ -417,7 +424,16 @@ class TestRecord:
# make sure we did not pickle the address
assert not isinstance(obj, bytes)
- assert_raises(TypeError, ctor, dtype, 13)
+ assert_raises(RuntimeError, ctor, dtype, 13)
+
+ # Test roundtrip:
+ dump = pickle.dumps(a[0])
+ unpickled = pickle.loads(dump)
+ assert a[0] == unpickled
+
+ # Also check the similar (impossible) "object scalar" path:
+ with pytest.warns(DeprecationWarning):
+ assert ctor(np.dtype("O"), data) is data
def test_objview_record(self):
# https://github.com/numpy/numpy/issues/2599
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index 170d20157..831e48e8b 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -14,7 +14,7 @@ from numpy.testing import (
assert_raises_regex, assert_warns, suppress_warnings,
_assert_valid_refcount, HAS_REFCOUNT,
)
-from numpy.testing._private.utils import _no_tracing
+from numpy.testing._private.utils import _no_tracing, requires_memory
from numpy.compat import asbytes, asunicode, pickle
try:
@@ -1506,10 +1506,10 @@ class TestRegression:
test_type(t)
def test_buffer_hashlib(self):
- from hashlib import md5
+ from hashlib import sha256
x = np.array([1, 2, 3], dtype=np.dtype('<i4'))
- assert_equal(md5(x).hexdigest(), '2a1dd1e1e59d0a384c26951e316cd7e6')
+ assert_equal(sha256(x).hexdigest(), '4636993d3e1da4e9d6b8f87b79e8f7c6d018580d52661950eabc3845c5897a4d')
def test_0d_string_scalar(self):
# Bug #1436; the following should succeed
@@ -2427,9 +2427,10 @@ class TestRegression:
assert b'numpy.core.multiarray' in s
def test_object_casting_errors(self):
- # gh-11993
+ # gh-11993 update to ValueError (see gh-16909), since strings can in
+ # principle be converted to complex, but this string cannot.
arr = np.array(['AAAAA', 18465886.0, 18465886.0], dtype=object)
- assert_raises(TypeError, arr.astype, 'c8')
+ assert_raises(ValueError, arr.astype, 'c8')
def test_eff1d_casting(self):
# gh-12711
@@ -2487,3 +2488,39 @@ class TestRegression:
assert arr.size * arr.itemsize > 2 ** 31
c_arr = np.ctypeslib.as_ctypes(arr)
assert_equal(c_arr._length_, arr.size)
+
+ def test_complex_conversion_error(self):
+ # gh-17068
+ with pytest.raises(TypeError, match=r"Unable to convert dtype.*"):
+ complex(np.array("now", np.datetime64))
+
+ def test__array_interface__descr(self):
+ # gh-17068
+ dt = np.dtype(dict(names=['a', 'b'],
+ offsets=[0, 0],
+ formats=[np.int64, np.int64]))
+ descr = np.array((1, 1), dtype=dt).__array_interface__['descr']
+ assert descr == [('', '|V8')] # instead of [(b'', '|V8')]
+
+ @pytest.mark.skipif(sys.maxsize < 2 ** 31 + 1, reason='overflows 32-bit python')
+ @requires_memory(free_bytes=9e9)
+ def test_dot_big_stride(self):
+ # gh-17111
+ # blas stride = stride//itemsize > int32 max
+ int32_max = np.iinfo(np.int32).max
+ n = int32_max + 3
+ a = np.empty([n], dtype=np.float32)
+ b = a[::n-1]
+ b[...] = 1
+ assert b.strides[0] > int32_max * b.dtype.itemsize
+ assert np.dot(b, b) == 2.0
+
+ def test_frompyfunc_name(self):
+ # name conversion was failing for python 3 strings
+ # resulting in the default '?' name. Also test utf-8
+ # encoding using non-ascii name.
+ def cassé(x):
+ return x
+
+ f = np.frompyfunc(cassé, 1, 1)
+ assert str(f) == "<ufunc 'cassé (vectorized)'>"
diff --git a/numpy/core/tests/test_scalarbuffer.py b/numpy/core/tests/test_scalarbuffer.py
index 574c56864..851cd3081 100644
--- a/numpy/core/tests/test_scalarbuffer.py
+++ b/numpy/core/tests/test_scalarbuffer.py
@@ -3,6 +3,7 @@ Test scalar buffer interface adheres to PEP 3118
"""
import numpy as np
from numpy.core._rational_tests import rational
+from numpy.core._multiarray_tests import get_buffer_info
import pytest
from numpy.testing import assert_, assert_equal, assert_raises
@@ -52,10 +53,20 @@ class TestScalarPEP3118:
assert_equal(mv_x.suboffsets, ())
@pytest.mark.parametrize('scalar, code', scalars_and_codes, ids=codes_only)
- def test_scalar_known_code(self, scalar, code):
+ def test_scalar_code_and_properties(self, scalar, code):
x = scalar()
+ expected = dict(strides=(), itemsize=x.dtype.itemsize, ndim=0,
+ shape=(), format=code, readonly=True)
+
mv_x = memoryview(x)
- assert_equal(mv_x.format, code)
+ print(mv_x.readonly, self._as_dict(mv_x))
+ assert self._as_dict(mv_x) == expected
+
+ @pytest.mark.parametrize('scalar', scalars_only, ids=codes_only)
+ def test_scalar_buffers_readonly(self, scalar):
+ x = scalar()
+ with pytest.raises(BufferError, match="scalar buffer is readonly"):
+ get_buffer_info(x, ["WRITABLE"])
def test_void_scalar_structured_data(self):
dt = np.dtype([('name', np.unicode_, 16), ('grades', np.float64, (2,))])
@@ -77,9 +88,14 @@ class TestScalarPEP3118:
assert_equal(mv_x.itemsize, mv_a.itemsize)
assert_equal(mv_x.format, mv_a.format)
+ # Check that we do not allow writeable buffer export (technically
+ # we could allow it sometimes here...)
+ with pytest.raises(BufferError, match="scalar buffer is readonly"):
+ get_buffer_info(x, ["WRITABLE"])
+
def _as_dict(self, m):
return dict(strides=m.strides, shape=m.shape, itemsize=m.itemsize,
- ndim=m.ndim, format=m.format)
+ ndim=m.ndim, format=m.format, readonly=m.readonly)
def test_datetime_memoryview(self):
# gh-11656
@@ -88,7 +104,7 @@ class TestScalarPEP3118:
dt1 = np.datetime64('2016-01-01')
dt2 = np.datetime64('2017-01-01')
expected = dict(strides=(1,), itemsize=1, ndim=1, shape=(8,),
- format='B')
+ format='B', readonly=True)
v = memoryview(dt1)
assert self._as_dict(v) == expected
@@ -100,6 +116,10 @@ class TestScalarPEP3118:
# Fails to create a PEP 3118 valid buffer
assert_raises((ValueError, BufferError), memoryview, a[0])
+ # Check that we do not allow writeable buffer export
+ with pytest.raises(BufferError, match="scalar buffer is readonly"):
+ get_buffer_info(dt1, ["WRITABLE"])
+
@pytest.mark.parametrize('s', [
pytest.param("\x32\x32", id="ascii"),
pytest.param("\uFE0F\uFE0F", id="basic multilingual"),
@@ -109,7 +129,8 @@ class TestScalarPEP3118:
s = np.str_(s) # only our subclass implements the buffer protocol
# all the same, characters always encode as ucs4
- expected = dict(strides=(), itemsize=8, ndim=0, shape=(), format='2w')
+ expected = dict(strides=(), itemsize=8, ndim=0, shape=(), format='2w',
+ readonly=True)
v = memoryview(s)
assert self._as_dict(v) == expected
@@ -119,7 +140,15 @@ class TestScalarPEP3118:
assert_equal(code_points, [ord(c) for c in s])
+ # Check that we do not allow writeable buffer export
+ with pytest.raises(BufferError, match="scalar buffer is readonly"):
+ get_buffer_info(s, ["WRITABLE"])
+
def test_user_scalar_fails_buffer(self):
r = rational(1)
with assert_raises(TypeError):
memoryview(r)
+
+ # Check that we do not allow writeable buffer export
+ with pytest.raises(BufferError, match="scalar buffer is readonly"):
+ get_buffer_info(r, ["WRITABLE"])
\ No newline at end of file
diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py
index c7f44cf50..d8529418e 100644
--- a/numpy/core/tests/test_scalarmath.py
+++ b/numpy/core/tests/test_scalarmath.py
@@ -276,6 +276,10 @@ class TestModulus:
# Check nans, inf
with suppress_warnings() as sup:
sup.filter(RuntimeWarning, "invalid value encountered in remainder")
+ sup.filter(RuntimeWarning, "divide by zero encountered in remainder")
+ sup.filter(RuntimeWarning, "divide by zero encountered in floor_divide")
+ sup.filter(RuntimeWarning, "divide by zero encountered in divmod")
+ sup.filter(RuntimeWarning, "invalid value encountered in divmod")
for dt in np.typecodes['Float']:
fone = np.array(1.0, dtype=dt)
fzer = np.array(0.0, dtype=dt)
@@ -290,6 +294,9 @@ class TestModulus:
assert_(np.isnan(rem), 'dt: %s' % dt)
rem = operator.mod(finf, fone)
assert_(np.isnan(rem), 'dt: %s' % dt)
+ for op in [floordiv_and_mod, divmod]:
+ div, mod = op(fone, fzer)
+ assert_(np.isinf(div)); assert_(np.isnan(mod))
def test_inplace_floordiv_handling(self):
# issue gh-12927
diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py
index 94a916193..4e56ace90 100644
--- a/numpy/core/tests/test_shape_base.py
+++ b/numpy/core/tests/test_shape_base.py
@@ -342,19 +342,32 @@ class TestConcatenate:
assert_raises(ValueError, concatenate, (a, b), out=np.empty((1,4)))
concatenate((a, b), out=np.empty(4))
- def test_out_dtype(self):
- out = np.empty(4, np.float32)
- res = concatenate((array([1, 2]), array([3, 4])), out=out)
- assert_(out is res)
-
- out = np.empty(4, np.complex64)
- res = concatenate((array([0.1, 0.2]), array([0.3, 0.4])), out=out)
- assert_(out is res)
-
- # invalid cast
- out = np.empty(4, np.int32)
- assert_raises(TypeError, concatenate,
- (array([0.1, 0.2]), array([0.3, 0.4])), out=out)
+ @pytest.mark.parametrize("axis", [None, 0])
+ @pytest.mark.parametrize("out_dtype", ["c8", "f4", "f8", ">f8", "i8"])
+ @pytest.mark.parametrize("casting",
+ ['no', 'equiv', 'safe', 'same_kind', 'unsafe'])
+ def test_out_and_dtype(self, axis, out_dtype, casting):
+ # Compare usage of `out=out` with `dtype=out.dtype`
+ out = np.empty(4, dtype=out_dtype)
+ to_concat = (array([1.1, 2.2]), array([3.3, 4.4]))
+
+ if not np.can_cast(to_concat[0], out_dtype, casting=casting):
+ with assert_raises(TypeError):
+ concatenate(to_concat, out=out, axis=axis, casting=casting)
+ with assert_raises(TypeError):
+ concatenate(to_concat, dtype=out.dtype,
+ axis=axis, casting=casting)
+ else:
+ res_out = concatenate(to_concat, out=out,
+ axis=axis, casting=casting)
+ res_dtype = concatenate(to_concat, dtype=out.dtype,
+ axis=axis, casting=casting)
+ assert res_out is out
+ assert_array_equal(out, res_dtype)
+ assert res_dtype.dtype == out_dtype
+
+ with assert_raises(TypeError):
+ concatenate(to_concat, out=out, dtype=out_dtype, axis=axis)
def test_stack():
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
new file mode 100644
index 000000000..196003cdd
--- /dev/null
+++ b/numpy/core/tests/test_simd.py
@@ -0,0 +1,671 @@
+# NOTE: Please avoid the use of numpy.testing since NPYV intrinsics
+# may be involved in their functionality.
+import pytest, math
+from numpy.core._simd import targets
+
+class _Test_Utility:
+ # submodule of the desired SIMD extension, e.g. targets["AVX512F"]
+ npyv = None
+ # the current data type suffix e.g. 's8'
+ sfx = None
+
+ def __getattr__(self, attr):
+ """
+ To call NPYV intrinsics without the attribute 'npyv' and
+ auto suffixing intrinsics according to class attribute 'sfx'
+ """
+ return getattr(self.npyv, attr + "_" + self.sfx)
+
+ def _data(self, start=None, count=None, reverse=False):
+ """
+ Create list of consecutive numbers according to number of vector's lanes.
+ """
+ if start is None:
+ start = 1
+ if count is None:
+ count = self.nlanes
+ rng = range(start, start + count)
+ if reverse:
+ rng = reversed(rng)
+ if self._is_fp():
+ return [x / 1.0 for x in rng]
+ return list(rng)
+
+ def _is_unsigned(self):
+ return self.sfx[0] == 'u'
+
+ def _is_signed(self):
+ return self.sfx[0] == 's'
+
+ def _is_fp(self):
+ return self.sfx[0] == 'f'
+
+ def _scalar_size(self):
+ return int(self.sfx[1:])
+
+ def _int_clip(self, seq):
+ if self._is_fp():
+ return seq
+ max_int = self._int_max()
+ min_int = self._int_min()
+ return [min(max(v, min_int), max_int) for v in seq]
+
+ def _int_max(self):
+ if self._is_fp():
+ return None
+ max_u = self._to_unsigned(self.setall(-1))[0]
+ if self._is_signed():
+ return max_u // 2
+ return max_u
+
+ def _int_min(self):
+ if self._is_fp():
+ return None
+ if self._is_unsigned():
+ return 0
+ return -(self._int_max() + 1)
+
+ def _true_mask(self):
+ max_unsig = getattr(self.npyv, "setall_u" + self.sfx[1:])(-1)
+ return max_unsig[0]
+
+ def _to_unsigned(self, vector):
+ if isinstance(vector, (list, tuple)):
+ return getattr(self.npyv, "load_u" + self.sfx[1:])(vector)
+ else:
+ sfx = vector.__name__.replace("npyv_", "")
+ if sfx[0] == "b":
+ cvt_intrin = "cvt_u{0}_b{0}"
+ else:
+ cvt_intrin = "reinterpret_u{0}_{1}"
+ return getattr(self.npyv, cvt_intrin.format(sfx[1:], sfx))(vector)
+
+ def _pinfinity(self):
+ v = self.npyv.setall_u32(0x7f800000)
+ return self.npyv.reinterpret_f32_u32(v)[0]
+
+ def _ninfinity(self):
+ v = self.npyv.setall_u32(0xff800000)
+ return self.npyv.reinterpret_f32_u32(v)[0]
+
+ def _nan(self):
+ v = self.npyv.setall_u32(0x7fc00000)
+ return self.npyv.reinterpret_f32_u32(v)[0]
+
+class _SIMD_BOOL(_Test_Utility):
+ """
+ To test all boolean vector types at once
+ """
+ def _data(self, start=None, count=None, reverse=False):
+ nlanes = getattr(self.npyv, "nlanes_u" + self.sfx[1:])
+ true_mask = self._true_mask()
+ rng = range(nlanes)
+ if reverse:
+ rng = reversed(rng)
+ return [true_mask if x % 2 else 0 for x in rng]
+
+ def _load_b(self, data):
+ len_str = self.sfx[1:]
+ load = getattr(self.npyv, "load_u" + len_str)
+ cvt = getattr(self.npyv, f"cvt_b{len_str}_u{len_str}")
+ return cvt(load(data))
+
+ def test_tobits(self):
+ data2bits = lambda data: sum([int(x != 0) << i for i, x in enumerate(data, 0)])
+ for data in (self._data(), self._data(reverse=True)):
+ vdata = self._load_b(data)
+ data_bits = data2bits(data)
+ tobits = bin(self.tobits(vdata))
+ assert tobits == bin(data_bits)
+
+class _SIMD_INT(_Test_Utility):
+ """
+ To test all integer vector types at once
+ """
+ def test_operators_shift(self):
+ if self.sfx in ("u8", "s8"):
+ return
+
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ for count in range(self._scalar_size()):
+ # load to cast
+ data_shl_a = self.load([a << count for a in data_a])
+ # left shift
+ shl = self.shl(vdata_a, count)
+ assert shl == data_shl_a
+ # left shift by an immediate constant
+ shli = self.shli(vdata_a, count)
+ assert shli == data_shl_a
+ # load to cast
+ data_shr_a = self.load([a >> count for a in data_a])
+ # right shift
+ shr = self.shr(vdata_a, count)
+ assert shr == data_shr_a
+ # right shift by an immediate constant
+ shri = self.shri(vdata_a, count)
+ assert shri == data_shr_a
+
+ def test_arithmetic_subadd_saturated(self):
+ if self.sfx in ("u32", "s32", "u64", "s64"):
+ return
+
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ data_adds = self._int_clip([a + b for a, b in zip(data_a, data_b)])
+ adds = self.adds(vdata_a, vdata_b)
+ assert adds == data_adds
+
+ data_subs = self._int_clip([a - b for a, b in zip(data_a, data_b)])
+ subs = self.subs(vdata_a, vdata_b)
+ assert subs == data_subs
+
+class _SIMD_FP(_Test_Utility):
+ """
+ To test all float vector types at once
+ """
+ def test_arithmetic_fused(self):
+ vdata_a, vdata_b, vdata_c = [self.load(self._data())]*3
+ vdata_cx2 = self.add(vdata_c, vdata_c)
+ # multiply and add, a*b + c
+ data_fma = self.load([a * b + c for a, b, c in zip(vdata_a, vdata_b, vdata_c)])
+ fma = self.muladd(vdata_a, vdata_b, vdata_c)
+ assert fma == data_fma
+ # multiply and subtract, a*b - c
+ fms = self.mulsub(vdata_a, vdata_b, vdata_c)
+ data_fms = self.sub(data_fma, vdata_cx2)
+ assert fms == data_fms
+ # negate multiply and add, -(a*b) + c
+ nfma = self.nmuladd(vdata_a, vdata_b, vdata_c)
+ data_nfma = self.sub(vdata_cx2, data_fma)
+ assert nfma == data_nfma
+ # negate multiply and subtract, -(a*b) - c
+ nfms = self.nmulsub(vdata_a, vdata_b, vdata_c)
+ data_nfms = self.mul(data_fma, self.setall(-1))
+ assert nfms == data_nfms
+
+ def test_abs(self):
+ pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+ data = self._data()
+ vdata = self.load(self._data())
+
+ abs_cases = ((-0, 0), (ninf, pinf), (pinf, pinf), (nan, nan))
+ for case, desired in abs_cases:
+ data_abs = [desired]*self.nlanes
+ vabs = self.abs(self.setall(case))
+ assert vabs == pytest.approx(data_abs, nan_ok=True)
+
+ vabs = self.abs(self.mul(vdata, self.setall(-1)))
+ assert vabs == data
+
+ def test_sqrt(self):
+ pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+ data = self._data()
+ vdata = self.load(self._data())
+
+ sqrt_cases = ((-0.0, -0.0), (0.0, 0.0), (-1.0, nan), (ninf, nan), (pinf, pinf))
+ for case, desired in sqrt_cases:
+ data_sqrt = [desired]*self.nlanes
+ sqrt = self.sqrt(self.setall(case))
+ assert sqrt == pytest.approx(data_sqrt, nan_ok=True)
+
+ data_sqrt = self.load([math.sqrt(x) for x in data]) # load to truncate precision
+ sqrt = self.sqrt(vdata)
+ assert sqrt == data_sqrt
+
+ def test_square(self):
+ pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+ data = self._data()
+ vdata = self.load(self._data())
+ # square
+ square_cases = ((nan, nan), (pinf, pinf), (ninf, pinf))
+ for case, desired in square_cases:
+ data_square = [desired]*self.nlanes
+ square = self.square(self.setall(case))
+ assert square == pytest.approx(data_square, nan_ok=True)
+
+ data_square = [x*x for x in data]
+ square = self.square(vdata)
+ assert square == data_square
+
+ def test_reciprocal(self):
+ pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+ data = self._data()
+ vdata = self.load(self._data())
+
+ recip_cases = ((nan, nan), (pinf, 0.0), (ninf, -0.0), (0.0, pinf), (-0.0, ninf))
+ for case, desired in recip_cases:
+ data_recip = [desired]*self.nlanes
+ recip = self.recip(self.setall(case))
+ assert recip == pytest.approx(data_recip, nan_ok=True)
+
+ data_recip = self.load([1/x for x in data]) # load to truncate precision
+ recip = self.recip(vdata)
+ assert recip == data_recip
+
+class _SIMD_ALL(_Test_Utility):
+ """
+ To test all vector types at once
+ """
+ def test_memory_load(self):
+ data = self._data()
+ # unaligned load
+ load_data = self.load(data)
+ assert load_data == data
+ # aligned load
+ loada_data = self.loada(data)
+ assert loada_data == data
+ # stream load
+ loads_data = self.loads(data)
+ assert loads_data == data
+ # load lower part
+ loadl = self.loadl(data)
+ loadl_half = list(loadl)[:self.nlanes//2]
+ data_half = data[:self.nlanes//2]
+ assert loadl_half == data_half
+ assert loadl != data # detect overflow
+
+ def test_memory_store(self):
+ data = self._data()
+ vdata = self.load(data)
+ # unaligned store
+ store = [0] * self.nlanes
+ self.store(store, vdata)
+ assert store == data
+ # aligned store
+ store_a = [0] * self.nlanes
+ self.storea(store_a, vdata)
+ assert store_a == data
+ # stream store
+ store_s = [0] * self.nlanes
+ self.stores(store_s, vdata)
+ assert store_s == data
+ # store lower part
+ store_l = [0] * self.nlanes
+ self.storel(store_l, vdata)
+ assert store_l[:self.nlanes//2] == data[:self.nlanes//2]
+ assert store_l != vdata # detect overflow
+ # store higher part
+ store_h = [0] * self.nlanes
+ self.storeh(store_h, vdata)
+ assert store_h[:self.nlanes//2] == data[self.nlanes//2:]
+ assert store_h != vdata # detect overflow
+
+ def test_memory_partial_load(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ data = self._data()
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4] # test out of range
+ for n in lanes:
+ load_till = self.load_till(data, n, 15)
+ data_till = data[:n] + [15] * (self.nlanes-n)
+ assert load_till == data_till
+ load_tillz = self.load_tillz(data, n)
+ data_tillz = data[:n] + [0] * (self.nlanes-n)
+ assert load_tillz == data_tillz
+
+ def test_memory_partial_store(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ data = self._data()
+ data_rev = self._data(reverse=True)
+ vdata = self.load(data)
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4]
+ for n in lanes:
+ data_till = data_rev.copy()
+ data_till[:n] = data[:n]
+ store_till = self._data(reverse=True)
+ self.store_till(store_till, n, vdata)
+ assert store_till == data_till
+
+ def test_memory_noncont_load(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ for stride in range(1, 64):
+ data = self._data(count=stride*self.nlanes)
+ data_stride = data[::stride]
+ loadn = self.loadn(data, stride)
+ assert loadn == data_stride
+
+ for stride in range(-64, 0):
+ data = self._data(stride, -stride*self.nlanes)
+ data_stride = self.load(data[::stride]) # cast unsigned
+ loadn = self.loadn(data, stride)
+ assert loadn == data_stride
+
+ def test_memory_noncont_partial_load(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4]
+ for stride in range(1, 64):
+ data = self._data(count=stride*self.nlanes)
+ data_stride = data[::stride]
+ for n in lanes:
+ data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
+ loadn_till = self.loadn_till(data, stride, n, 15)
+ assert loadn_till == data_stride_till
+ data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
+ loadn_tillz = self.loadn_tillz(data, stride, n)
+ assert loadn_tillz == data_stride_tillz
+
+ for stride in range(-64, 0):
+ data = self._data(stride, -stride*self.nlanes)
+ data_stride = list(self.load(data[::stride])) # cast unsigned
+ for n in lanes:
+ data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
+ loadn_till = self.loadn_till(data, stride, n, 15)
+ assert loadn_till == data_stride_till
+ data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
+ loadn_tillz = self.loadn_tillz(data, stride, n)
+ assert loadn_tillz == data_stride_tillz
+
+ def test_memory_noncont_store(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ vdata = self.load(self._data())
+ for stride in range(1, 64):
+ data = [15] * stride * self.nlanes
+ data[::stride] = vdata
+ storen = [15] * stride * self.nlanes
+ storen += [127]*64
+ self.storen(storen, stride, vdata)
+ assert storen[:-64] == data
+ assert storen[-64:] == [127]*64 # detect overflow
+
+ for stride in range(-64, 0):
+ data = [15] * -stride * self.nlanes
+ data[::stride] = vdata
+ storen = [127]*64
+ storen += [15] * -stride * self.nlanes
+ self.storen(storen, stride, vdata)
+ assert storen[64:] == data
+ assert storen[:64] == [127]*64 # detect overflow
+
+ def test_memory_noncont_partial_store(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ data = self._data()
+ vdata = self.load(data)
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4]
+ for stride in range(1, 64):
+ for n in lanes:
+ data_till = [15] * stride * self.nlanes
+ data_till[::stride] = data[:n] + [15] * (self.nlanes-n)
+ storen_till = [15] * stride * self.nlanes
+ storen_till += [127]*64
+ self.storen_till(storen_till, stride, n, vdata)
+ assert storen_till[:-64] == data_till
+ assert storen_till[-64:] == [127]*64 # detect overflow
+
+ for stride in range(-64, 0):
+ for n in lanes:
+ data_till = [15] * -stride * self.nlanes
+ data_till[::stride] = data[:n] + [15] * (self.nlanes-n)
+ storen_till = [127]*64
+ storen_till += [15] * -stride * self.nlanes
+ self.storen_till(storen_till, stride, n, vdata)
+ assert storen_till[64:] == data_till
+ assert storen_till[:64] == [127]*64 # detect overflow
+
+ def test_misc(self):
+ broadcast_zero = self.zero()
+ assert broadcast_zero == [0] * self.nlanes
+ for i in range(1, 10):
+ broadcasti = self.setall(i)
+ assert broadcasti == [i] * self.nlanes
+
+ data_a, data_b = self._data(), self._data(reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ # the Python wrapper of npyv_set_* doesn't support ignoring extra
+ # specified lanes, nor does it fill non-specified lanes with zero.
+ vset = self.set(*data_a)
+ assert vset == data_a
+ # the Python wrapper of npyv_setf_* doesn't support ignoring extra
+ # specified lanes, nor filling non-specified lanes with the given scalar.
+ vsetf = self.setf(10, *data_a)
+ assert vsetf == data_a
+
+ # We're testing the sanity of _simd's type-vector,
+ # reinterpret* intrinsics itself are tested via compiler
+ # during the build of _simd module
+ sfxes = ["u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64", "f32"]
+ if self.npyv.simd_f64:
+ sfxes.append("f64")
+ for sfx in sfxes:
+ vec_name = getattr(self, "reinterpret_" + sfx)(vdata_a).__name__
+ assert vec_name == "npyv_" + sfx
+
+ # select & mask operations
+ select_a = self.select(self.cmpeq(self.zero(), self.zero()), vdata_a, vdata_b)
+ assert select_a == data_a
+ select_b = self.select(self.cmpneq(self.zero(), self.zero()), vdata_a, vdata_b)
+ assert select_b == data_b
+
+ # cleanup intrinsic is only used with AVX for
+ # zeroing registers to avoid the AVX-SSE transition penalty,
+ # so nothing to test here
+ self.npyv.cleanup()
+
+ def test_reorder(self):
+ data_a, data_b = self._data(), self._data(reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+ # lower half part
+ data_a_lo = data_a[:self.nlanes//2]
+ data_b_lo = data_b[:self.nlanes//2]
+ # higher half part
+ data_a_hi = data_a[self.nlanes//2:]
+ data_b_hi = data_b[self.nlanes//2:]
+ # combine two lower parts
+ combinel = self.combinel(vdata_a, vdata_b)
+ assert combinel == data_a_lo + data_b_lo
+ # combine two higher parts
+ combineh = self.combineh(vdata_a, vdata_b)
+ assert combineh == data_a_hi + data_b_hi
+ # combine x2
+ combine = self.combine(vdata_a, vdata_b)
+ assert combine == (data_a_lo + data_b_lo, data_a_hi + data_b_hi)
+ # zip(interleave)
+ data_zipl = [v for p in zip(data_a_lo, data_b_lo) for v in p]
+ data_ziph = [v for p in zip(data_a_hi, data_b_hi) for v in p]
+ vzip = self.zip(vdata_a, vdata_b)
+ assert vzip == (data_zipl, data_ziph)
+
+ def test_reorder_rev64(self):
+ # Reverse elements of each 64-bit lane
+ ssize = self._scalar_size()
+ if ssize == 64:
+ return
+ data_rev64 = [
+ y for x in range(0, self.nlanes, 64//ssize)
+ for y in reversed(range(x, x + 64//ssize))
+ ]
+ rev64 = self.rev64(self.load(range(self.nlanes)))
+ assert rev64 == data_rev64
+
+ def test_operators_comparison(self):
+ if self._is_fp():
+ data_a = self._data()
+ else:
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ mask_true = self._true_mask()
+ def to_bool(vector):
+ return [lane == mask_true for lane in vector]
+ # equal
+ data_eq = [a == b for a, b in zip(data_a, data_b)]
+ cmpeq = to_bool(self.cmpeq(vdata_a, vdata_b))
+ assert cmpeq == data_eq
+ # not equal
+ data_neq = [a != b for a, b in zip(data_a, data_b)]
+ cmpneq = to_bool(self.cmpneq(vdata_a, vdata_b))
+ assert cmpneq == data_neq
+ # greater than
+ data_gt = [a > b for a, b in zip(data_a, data_b)]
+ cmpgt = to_bool(self.cmpgt(vdata_a, vdata_b))
+ assert cmpgt == data_gt
+ # greater than and equal
+ data_ge = [a >= b for a, b in zip(data_a, data_b)]
+ cmpge = to_bool(self.cmpge(vdata_a, vdata_b))
+ assert cmpge == data_ge
+ # less than
+ data_lt = [a < b for a, b in zip(data_a, data_b)]
+ cmplt = to_bool(self.cmplt(vdata_a, vdata_b))
+ assert cmplt == data_lt
+ # less than and equal
+ data_le = [a <= b for a, b in zip(data_a, data_b)]
+ cmple = to_bool(self.cmple(vdata_a, vdata_b))
+ assert cmple == data_le
+
+ def test_operators_logical(self):
+ if self._is_fp():
+ data_a = self._data()
+ else:
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ if self._is_fp():
+ data_cast_a = self._to_unsigned(vdata_a)
+ data_cast_b = self._to_unsigned(vdata_b)
+ cast, cast_data = self._to_unsigned, self._to_unsigned
+ else:
+ data_cast_a, data_cast_b = data_a, data_b
+ cast, cast_data = lambda a: a, self.load
+
+ data_xor = cast_data([a ^ b for a, b in zip(data_cast_a, data_cast_b)])
+ vxor = cast(self.xor(vdata_a, vdata_b))
+ assert vxor == data_xor
+
+ data_or = cast_data([a | b for a, b in zip(data_cast_a, data_cast_b)])
+ vor = cast(getattr(self, "or")(vdata_a, vdata_b))
+ assert vor == data_or
+
+ data_and = cast_data([a & b for a, b in zip(data_cast_a, data_cast_b)])
+ vand = cast(getattr(self, "and")(vdata_a, vdata_b))
+ assert vand == data_and
+
+ data_not = cast_data([~a for a in data_cast_a])
+ vnot = cast(getattr(self, "not")(vdata_a))
+ assert vnot == data_not
+
+ def test_conversion_boolean(self):
+ bsfx = "b" + self.sfx[1:]
+ to_boolean = getattr(self.npyv, "cvt_%s_%s" % (bsfx, self.sfx))
+ from_boolean = getattr(self.npyv, "cvt_%s_%s" % (self.sfx, bsfx))
+
+ false_vb = to_boolean(self.setall(0))
+ true_vb = self.cmpeq(self.setall(0), self.setall(0))
+ assert false_vb != true_vb
+
+ false_vsfx = from_boolean(false_vb)
+ true_vsfx = from_boolean(true_vb)
+ assert false_vsfx != true_vsfx
+
+ def test_arithmetic_subadd(self):
+ if self._is_fp():
+ data_a = self._data()
+ else:
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ # non-saturated
+ data_add = self.load([a + b for a, b in zip(data_a, data_b)]) # load to cast
+ add = self.add(vdata_a, vdata_b)
+ assert add == data_add
+ data_sub = self.load([a - b for a, b in zip(data_a, data_b)])
+ sub = self.sub(vdata_a, vdata_b)
+ assert sub == data_sub
+
+ def test_arithmetic_mul(self):
+ if self.sfx in ("u64", "s64"):
+ return
+
+ if self._is_fp():
+ data_a = self._data()
+ else:
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ data_mul = self.load([a * b for a, b in zip(data_a, data_b)])
+ mul = self.mul(vdata_a, vdata_b)
+ assert mul == data_mul
+
+ def test_arithmetic_div(self):
+ if not self._is_fp():
+ return
+
+ data_a, data_b = self._data(), self._data(reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ # load to truncate f64 to precision of f32
+ data_div = self.load([a / b for a, b in zip(data_a, data_b)])
+ div = self.div(vdata_a, vdata_b)
+ assert div == data_div
+
+ def test_arithmetic_reduce_sum(self):
+ if not self._is_fp():
+ return
+ # reduce sum
+ data = self._data()
+ vdata = self.load(data)
+
+ data_sum = sum(data)
+ vsum = self.sum(vdata)
+ assert vsum == data_sum
+
+bool_sfx = ("b8", "b16", "b32", "b64")
+int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64")
+fp_sfx = ("f32", "f64")
+all_sfx = int_sfx + fp_sfx
+tests_registry = {
+ bool_sfx: _SIMD_BOOL,
+ int_sfx : _SIMD_INT,
+ fp_sfx : _SIMD_FP,
+ all_sfx : _SIMD_ALL
+}
+for target_name, npyv in targets.items():
+ simd_width = npyv.simd if npyv else ''
+ pretty_name = target_name.split('__') # multi-target separator
+ if len(pretty_name) > 1:
+ # multi-target
+ pretty_name = f"({' '.join(pretty_name)})"
+ else:
+ pretty_name = pretty_name[0]
+
+ skip = ""
+ skip_sfx = dict()
+ if not npyv:
+ skip = f"target '{pretty_name}' isn't supported by current machine"
+ elif not npyv.simd:
+ skip = f"target '{pretty_name}' isn't supported by NPYV"
+ elif not npyv.simd_f64:
+ skip_sfx["f64"] = f"target '{pretty_name}' doesn't support double-precision"
+
+ for sfxes, cls in tests_registry.items():
+ for sfx in sfxes:
+ skip_m = skip_sfx.get(sfx, skip)
+ inhr = (cls,)
+ attr = dict(npyv=targets[target_name], sfx=sfx)
+ tcls = type(f"Test{cls.__name__}_{simd_width}_{target_name}_{sfx}", inhr, attr)
+ if skip_m:
+ pytest.mark.skip(reason=skip_m)(tcls)
+ globals()[tcls.__name__] = tcls
diff --git a/numpy/core/tests/test_simd_module.py b/numpy/core/tests/test_simd_module.py
new file mode 100644
index 000000000..3d710884a
--- /dev/null
+++ b/numpy/core/tests/test_simd_module.py
@@ -0,0 +1,97 @@
+import pytest
+from numpy.core._simd import targets
+"""
+This testing unit is only for checking the sanity of common functionality;
+therefore all we need is to take one submodule that represents any of the
+enabled SIMD extensions and run the tests on it. A second submodule is
+required only for the single check related to the possibility of mixing
+data types between submodules.
+"""
+npyvs = [npyv_mod for npyv_mod in targets.values() if npyv_mod and npyv_mod.simd]
+npyv, npyv2 = (npyvs + [None, None])[:2]
+
+unsigned_sfx = ["u8", "u16", "u32", "u64"]
+signed_sfx = ["s8", "s16", "s32", "s64"]
+fp_sfx = ["f32"]
+if npyv and npyv.simd_f64:
+ fp_sfx.append("f64")
+
+int_sfx = unsigned_sfx + signed_sfx
+all_sfx = int_sfx + fp_sfx
+
+@pytest.mark.skipif(not npyv, reason="could not find any SIMD extension with NPYV support")
+class Test_SIMD_MODULE:
+
+ @pytest.mark.parametrize('sfx', all_sfx)
+ def test_num_lanes(self, sfx):
+ nlanes = getattr(npyv, "nlanes_" + sfx)
+ vector = getattr(npyv, "setall_" + sfx)(1)
+ assert len(vector) == nlanes
+
+ @pytest.mark.parametrize('sfx', all_sfx)
+ def test_type_name(self, sfx):
+ vector = getattr(npyv, "setall_" + sfx)(1)
+ assert vector.__name__ == "npyv_" + sfx
+
+ def test_raises(self):
+ a, b = [npyv.setall_u32(1)]*2
+ for sfx in all_sfx:
+ vcb = lambda intrin: getattr(npyv, f"{intrin}_{sfx}")
+ pytest.raises(TypeError, vcb("add"), a)
+ pytest.raises(TypeError, vcb("add"), a, b, a)
+ pytest.raises(TypeError, vcb("setall"))
+ pytest.raises(TypeError, vcb("setall"), [1])
+ pytest.raises(TypeError, vcb("load"), 1)
+ pytest.raises(ValueError, vcb("load"), [1])
+ pytest.raises(ValueError, vcb("store"), [1], getattr(npyv, f"reinterpret_{sfx}_u32")(a))
+
+ @pytest.mark.skipif(not npyv2, reason=(
+ "could not find a second SIMD extension with NPYV support"
+ ))
+ def test_nomix(self):
+ # mix among submodules isn't allowed
+ a = npyv.setall_u32(1)
+ a2 = npyv2.setall_u32(1)
+ pytest.raises(TypeError, npyv.add_u32, a2, a2)
+ pytest.raises(TypeError, npyv2.add_u32, a, a)
+
+ @pytest.mark.parametrize('sfx', unsigned_sfx)
+ def test_unsigned_overflow(self, sfx):
+ nlanes = getattr(npyv, "nlanes_" + sfx)
+ maxu = (1 << int(sfx[1:])) - 1
+ maxu_72 = (1 << 72) - 1
+ lane = getattr(npyv, "setall_" + sfx)(maxu_72)[0]
+ assert lane == maxu
+ lanes = getattr(npyv, "load_" + sfx)([maxu_72] * nlanes)
+ assert lanes == [maxu] * nlanes
+ lane = getattr(npyv, "setall_" + sfx)(-1)[0]
+ assert lane == maxu
+ lanes = getattr(npyv, "load_" + sfx)([-1] * nlanes)
+ assert lanes == [maxu] * nlanes
+
+ @pytest.mark.parametrize('sfx', signed_sfx)
+ def test_signed_overflow(self, sfx):
+ nlanes = getattr(npyv, "nlanes_" + sfx)
+ maxs_72 = (1 << 71) - 1
+ lane = getattr(npyv, "setall_" + sfx)(maxs_72)[0]
+ assert lane == -1
+ lanes = getattr(npyv, "load_" + sfx)([maxs_72] * nlanes)
+ assert lanes == [-1] * nlanes
+ mins_72 = -1 << 71
+ lane = getattr(npyv, "setall_" + sfx)(mins_72)[0]
+ assert lane == 0
+ lanes = getattr(npyv, "load_" + sfx)([mins_72] * nlanes)
+ assert lanes == [0] * nlanes
+
+ def test_truncate_f32(self):
+ f32 = npyv.setall_f32(0.1)[0]
+ assert f32 != 0.1
+ assert round(f32, 1) == 0.1
+
+ def test_compare(self):
+ data_range = range(0, npyv.nlanes_u32)
+ vdata = npyv.load_u32(data_range)
+ assert vdata == list(data_range)
+ assert vdata == tuple(data_range)
+ for i in data_range:
+ assert vdata[i] == data_range[i]
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index 1305f4877..aa17d6b08 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -1,5 +1,6 @@
import warnings
import itertools
+import sys
import pytest
@@ -11,7 +12,7 @@ import numpy.core._rational_tests as _rational_tests
from numpy.testing import (
assert_, assert_equal, assert_raises, assert_array_equal,
assert_almost_equal, assert_array_almost_equal, assert_no_warnings,
- assert_allclose,
+ assert_allclose, HAS_REFCOUNT,
)
from numpy.compat import pickle
@@ -177,6 +178,10 @@ class TestUfuncGenericLoops:
assert_array_equal(res_num.astype("O"), res_obj)
+def _pickleable_module_global():
+ pass
+
+
class TestUfunc:
def test_pickle(self):
for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
@@ -194,6 +199,15 @@ class TestUfunc:
b"(S'numpy.core.umath'\np1\nS'cos'\np2\ntp3\nRp4\n.")
assert_(pickle.loads(astring) is np.cos)
+ def test_pickle_name_is_qualname(self):
+ # This tests that a simplification of our ufunc pickle code will
+ # lead to allowing qualnames as names. Future ufuncs should
+ # possible add a specific qualname, or a hook into pickling instead
+ # (dask+numba may benefit).
+ _pickleable_module_global.ufunc = umt._pickleable_module_global_ufunc
+ obj = pickle.loads(pickle.dumps(_pickleable_module_global.ufunc))
+ assert obj is umt._pickleable_module_global_ufunc
+
def test_reduceat_shifting_sum(self):
L = 6
x = np.arange(L)
@@ -1319,6 +1333,18 @@ class TestUfunc:
m = np.array([True], dtype=bool)
assert_equal(np.sqrt(a, where=m), [1])
+ def test_where_with_broadcasting(self):
+ # See gh-17198
+ a = np.random.random((5000, 4))
+ b = np.random.random((5000, 1))
+
+ where = a > 0.3
+ out = np.full_like(a, 0)
+ np.less(a, b, where=where, out=out)
+ b_where = np.broadcast_to(b, a.shape)[where]
+ assert_array_equal((a[where] < b_where), out[where].astype(bool))
+ assert not out[~where].any() # outside mask, out remains all 0
+
def check_identityless_reduction(self, a):
# np.minimum.reduce is an identityless reduction
@@ -2074,3 +2100,60 @@ def test_ufunc_warn_with_nan(ufunc):
else:
raise ValueError('ufunc with more than 2 inputs')
+
+@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+def test_ufunc_casterrors():
+ # Tests that casting errors are correctly reported and buffers are
+ # cleared.
+ # The following array can be added to itself as an object array, but
+ # the result cannot be cast to an integer output:
+ value = 123 # relies on python cache (leak-check will still find it)
+ arr = np.array([value] * int(np.BUFSIZE * 1.5) +
+ ["string"] +
+ [value] * int(1.5 * np.BUFSIZE), dtype=object)
+ out = np.ones(len(arr), dtype=np.intp)
+
+ count = sys.getrefcount(value)
+ with pytest.raises(ValueError):
+ # Output casting failure:
+ np.add(arr, arr, out=out, casting="unsafe")
+
+ assert count == sys.getrefcount(value)
+ # output is unchanged after the error, this shows that the iteration
+ # was aborted (this is not necessarily defined behaviour)
+ assert out[-1] == 1
+
+ with pytest.raises(ValueError):
+ # Input casting failure:
+ np.add(arr, arr, out=out, dtype=np.intp, casting="unsafe")
+
+ assert count == sys.getrefcount(value)
+ # output is unchanged after the error, this shows that the iteration
+ # was aborted (this is not necessarily defined behaviour)
+ assert out[-1] == 1
+
+
+@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+@pytest.mark.parametrize("offset",
+ [0, np.BUFSIZE//2, int(1.5*np.BUFSIZE)])
+def test_reduce_casterrors(offset):
+ # Test reporting of casting errors in reductions, we test various
+ # offsets to where the casting error will occur, since these may occur
+ # at different places during the reduction procedure. For example
+ # the first item may be special.
+ value = 123 # relies on python cache (leak-check will still find it)
+ arr = np.array([value] * offset +
+ ["string"] +
+ [value] * int(1.5 * np.BUFSIZE), dtype=object)
+ out = np.array(-1, dtype=np.intp)
+
+ count = sys.getrefcount(value)
+ with pytest.raises(ValueError):
+ # This is an unsafe cast, but we currently always allow that:
+ np.add.reduce(arr, dtype=np.intp, out=out)
+ assert count == sys.getrefcount(value)
+ # If an error occurred during casting, the operation is done at most until
+ # the error occurs (the result of which would be `value * offset`) and -1
+ # if the error happened immediately.
+ # This does not define behaviour, the output is invalid and thus undefined
+ assert out[()] < value * offset
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 941d99521..8162e52bd 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -13,7 +13,7 @@ from numpy.testing import (
assert_, assert_equal, assert_raises, assert_raises_regex,
assert_array_equal, assert_almost_equal, assert_array_almost_equal,
assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings,
- _gen_alignment_data, assert_array_almost_equal_nulp
+ _gen_alignment_data, assert_array_almost_equal_nulp, assert_warns
)
def on_powerpc():
@@ -249,6 +249,67 @@ class TestDivision:
assert_equal(x // 100, [0, 0, 0, 1, -1, -1, -1, -1, -2])
assert_equal(x % 100, [5, 10, 90, 0, 95, 90, 10, 0, 80])
+ @pytest.mark.parametrize("input_dtype",
+ [np.int8, np.int16, np.int32, np.int64])
+ def test_division_int_boundary(self, input_dtype):
+ iinfo = np.iinfo(input_dtype)
+
+ # Create list with min, 25th percentile, 0, 75th percentile, max
+ lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max]
+ divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max]
+ a = np.array(lst, dtype=input_dtype)
+
+ for divisor in divisors:
+ div_a = a // divisor
+ b = a.copy(); b //= divisor
+ div_lst = [i // divisor for i in lst]
+
+ msg = "Integer arrays floor division check (//)"
+ assert all(div_a == div_lst), msg
+
+ msg = "Integer arrays floor division check (//=)"
+ assert all(div_a == b), msg
+
+ with np.errstate(divide='raise'):
+ with pytest.raises(FloatingPointError):
+ a // 0
+ with pytest.raises(FloatingPointError):
+ a //= 0
+
+ np.array([], dtype=input_dtype) // 0
+
+ @pytest.mark.parametrize(
+ "dividend,divisor,quotient",
+ [(np.timedelta64(2,'Y'), np.timedelta64(2,'M'), 12),
+ (np.timedelta64(2,'Y'), np.timedelta64(-2,'M'), -12),
+ (np.timedelta64(-2,'Y'), np.timedelta64(2,'M'), -12),
+ (np.timedelta64(-2,'Y'), np.timedelta64(-2,'M'), 12),
+ (np.timedelta64(2,'M'), np.timedelta64(-2,'Y'), -1),
+ (np.timedelta64(2,'Y'), np.timedelta64(0,'M'), 0),
+ (np.timedelta64(2,'Y'), 2, np.timedelta64(1,'Y')),
+ (np.timedelta64(2,'Y'), -2, np.timedelta64(-1,'Y')),
+ (np.timedelta64(-2,'Y'), 2, np.timedelta64(-1,'Y')),
+ (np.timedelta64(-2,'Y'), -2, np.timedelta64(1,'Y')),
+ (np.timedelta64(2,'Y'), -3, np.timedelta64(-1,'Y')),
+ (np.timedelta64(-2,'Y'), -3, np.timedelta64(0,'Y')),
+ (np.timedelta64(-2,'Y'), 0, np.timedelta64('Nat','Y')),
+ ])
+ def test_division_int_timedelta(self, dividend, divisor, quotient):
+ # If either divisor is 0 or quotient is Nat, check for division by 0
+ if divisor and (isinstance(quotient, int) or not np.isnat(quotient)):
+ msg = "Timedelta floor division check"
+ assert dividend // divisor == quotient, msg
+
+ # Test for arrays as well
+ msg = "Timedelta arrays floor division check"
+ dividend_array = np.array([dividend]*5)
+ quotient_array = np.array([quotient]*5)
+ assert all(dividend_array // divisor == quotient_array), msg
+ else:
+ with np.errstate(divide='raise', invalid='raise'):
+ with pytest.raises(FloatingPointError):
+ dividend // divisor
+
def test_division_complex(self):
# check that implementation is correct
msg = "Complex division implementation check"
@@ -293,6 +354,42 @@ class TestDivision:
assert_equal(np.signbit(x//1), 0)
assert_equal(np.signbit((-x)//1), 1)
+ @pytest.mark.parametrize('dtype', np.typecodes['Float'])
+ def test_floor_division_errors(self, dtype):
+ fnan = np.array(np.nan, dtype=dtype)
+ fone = np.array(1.0, dtype=dtype)
+ fzer = np.array(0.0, dtype=dtype)
+ finf = np.array(np.inf, dtype=dtype)
+ # divide by zero error check
+ with np.errstate(divide='raise', invalid='ignore'):
+ assert_raises(FloatingPointError, np.floor_divide, fone, fzer)
+ with np.errstate(invalid='raise'):
+ assert_raises(FloatingPointError, np.floor_divide, fnan, fone)
+ assert_raises(FloatingPointError, np.floor_divide, fone, fnan)
+ assert_raises(FloatingPointError, np.floor_divide, fnan, fzer)
+
+ @pytest.mark.parametrize('dtype', np.typecodes['Float'])
+ def test_floor_division_corner_cases(self, dtype):
+ # test corner cases like 1.0//0.0 for errors and return vals
+ x = np.zeros(10, dtype=dtype)
+ y = np.ones(10, dtype=dtype)
+ fnan = np.array(np.nan, dtype=dtype)
+ fone = np.array(1.0, dtype=dtype)
+ fzer = np.array(0.0, dtype=dtype)
+ finf = np.array(np.inf, dtype=dtype)
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning, "invalid value encountered in floor_divide")
+ div = np.floor_divide(fnan, fone)
+ assert(np.isnan(div)), "dt: %s, div: %s" % (dtype, div)
+ div = np.floor_divide(fone, fnan)
+ assert(np.isnan(div)), "dt: %s, div: %s" % (dtype, div)
+ div = np.floor_divide(fnan, fzer)
+ assert(np.isnan(div)), "dt: %s, div: %s" % (dtype, div)
+ # verify 1.0//0.0 computations return inf
+ with np.errstate(divide='ignore'):
+ z = np.floor_divide(y, x)
+ assert_(np.isinf(z).all())
+
def floor_divide_and_remainder(x, y):
return (np.floor_divide(x, y), np.remainder(x, y))
@@ -366,9 +463,90 @@ class TestRemainder:
else:
assert_(b > rem >= 0, msg)
+ @pytest.mark.parametrize('dtype', np.typecodes['Float'])
+ def test_float_divmod_errors(self, dtype):
+ # Check valid errors raised for divmod and remainder
+ fzero = np.array(0.0, dtype=dtype)
+ fone = np.array(1.0, dtype=dtype)
+ finf = np.array(np.inf, dtype=dtype)
+ fnan = np.array(np.nan, dtype=dtype)
+ # since divmod is combination of both remainder and divide
+ # ops it will set both dividebyzero and invalid flags
+ with np.errstate(divide='raise', invalid='ignore'):
+ assert_raises(FloatingPointError, np.divmod, fone, fzero)
+ with np.errstate(divide='ignore', invalid='raise'):
+ assert_raises(FloatingPointError, np.divmod, fone, fzero)
+ with np.errstate(invalid='raise'):
+ assert_raises(FloatingPointError, np.divmod, fzero, fzero)
+ with np.errstate(invalid='raise'):
+ assert_raises(FloatingPointError, np.divmod, finf, finf)
+ with np.errstate(divide='ignore', invalid='raise'):
+ assert_raises(FloatingPointError, np.divmod, finf, fzero)
+ with np.errstate(divide='raise', invalid='ignore'):
+ assert_raises(FloatingPointError, np.divmod, finf, fzero)
+
+ @pytest.mark.parametrize('dtype', np.typecodes['Float'])
+ @pytest.mark.parametrize('fn', [np.fmod, np.remainder])
+ def test_float_remainder_errors(self, dtype, fn):
+ fzero = np.array(0.0, dtype=dtype)
+ fone = np.array(1.0, dtype=dtype)
+ finf = np.array(np.inf, dtype=dtype)
+ fnan = np.array(np.nan, dtype=dtype)
+ with np.errstate(invalid='raise'):
+ assert_raises(FloatingPointError, fn, fone, fzero)
+ assert_raises(FloatingPointError, fn, fnan, fzero)
+ assert_raises(FloatingPointError, fn, fone, fnan)
+ assert_raises(FloatingPointError, fn, fnan, fone)
+
+ def test_float_remainder_overflow(self):
+ a = np.finfo(np.float64).tiny
+ with np.errstate(over='ignore', invalid='ignore'):
+ div, mod = np.divmod(4, a)
+ np.isinf(div)
+ assert_(mod == 0)
+ with np.errstate(over='raise', invalid='ignore'):
+ assert_raises(FloatingPointError, np.divmod, 4, a)
+ with np.errstate(invalid='raise', over='ignore'):
+ assert_raises(FloatingPointError, np.divmod, 4, a)
+
+ def test_float_divmod_corner_cases(self):
+ # check nan cases
+ for dt in np.typecodes['Float']:
+ fnan = np.array(np.nan, dtype=dt)
+ fone = np.array(1.0, dtype=dt)
+ fzer = np.array(0.0, dtype=dt)
+ finf = np.array(np.inf, dtype=dt)
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning, "invalid value encountered in divmod")
+ sup.filter(RuntimeWarning, "divide by zero encountered in divmod")
+ div, rem = np.divmod(fone, fzer)
+ assert(np.isinf(div)), 'dt: %s, div: %s' % (dt, div)
+ assert(np.isnan(rem)), 'dt: %s, rem: %s' % (dt, rem)
+ div, rem = np.divmod(fzer, fzer)
+ assert(np.isnan(rem)), 'dt: %s, rem: %s' % (dt, rem)
+ assert(np.isnan(div)), 'dt: %s, div: %s' % (dt, div)
+ div, rem = np.divmod(finf, finf)
+ assert(np.isnan(div)), 'dt: %s, rem: %s' % (dt, rem)
+ assert(np.isnan(rem)), 'dt: %s, rem: %s' % (dt, rem)
+ div, rem = np.divmod(finf, fzer)
+ assert(np.isinf(div)), 'dt: %s, rem: %s' % (dt, rem)
+ assert(np.isnan(rem)), 'dt: %s, rem: %s' % (dt, rem)
+ div, rem = np.divmod(fnan, fone)
+ assert(np.isnan(rem)), "dt: %s, rem: %s" % (dt, rem)
+ assert(np.isnan(div)), "dt: %s, rem: %s" % (dt, rem)
+ div, rem = np.divmod(fone, fnan)
+ assert(np.isnan(rem)), "dt: %s, rem: %s" % (dt, rem)
+ assert(np.isnan(div)), "dt: %s, rem: %s" % (dt, rem)
+ div, rem = np.divmod(fnan, fzer)
+ assert(np.isnan(rem)), "dt: %s, rem: %s" % (dt, rem)
+ assert(np.isnan(div)), "dt: %s, rem: %s" % (dt, rem)
+
def test_float_remainder_corner_cases(self):
# Check remainder magnitude.
for dt in np.typecodes['Float']:
+ fone = np.array(1.0, dtype=dt)
+ fzer = np.array(0.0, dtype=dt)
+ fnan = np.array(np.nan, dtype=dt)
b = np.array(1.0, dtype=dt)
a = np.nextafter(np.array(0.0, dtype=dt), -b)
rem = np.remainder(a, b)
@@ -379,6 +557,7 @@ class TestRemainder:
# Check nans, inf
with suppress_warnings() as sup:
sup.filter(RuntimeWarning, "invalid value encountered in remainder")
+ sup.filter(RuntimeWarning, "invalid value encountered in fmod")
for dt in np.typecodes['Float']:
fone = np.array(1.0, dtype=dt)
fzer = np.array(0.0, dtype=dt)
@@ -389,10 +568,30 @@ class TestRemainder:
# MSVC 2008 returns NaN here, so disable the check.
#rem = np.remainder(fone, finf)
#assert_(rem == fone, 'dt: %s, rem: %s' % (dt, rem))
+ rem = np.remainder(finf, fone)
+ fmod = np.fmod(finf, fone)
+ assert_(np.isnan(fmod), 'dt: %s, fmod: %s' % (dt, fmod))
+ assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
+ rem = np.remainder(finf, finf)
+ fmod = np.fmod(finf, finf)
+ assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
+ assert_(np.isnan(fmod), 'dt: %s, fmod: %s' % (dt, fmod))
+ rem = np.remainder(finf, fzer)
+ fmod = np.fmod(finf, fzer)
+ assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
+ assert_(np.isnan(fmod), 'dt: %s, fmod: %s' % (dt, fmod))
rem = np.remainder(fone, fnan)
+ fmod = np.fmod(fone, fnan)
assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
- rem = np.remainder(finf, fone)
+ assert_(np.isnan(fmod), 'dt: %s, fmod: %s' % (dt, fmod))
+ rem = np.remainder(fnan, fzer)
+ fmod = np.fmod(fnan, fzer)
assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
+ assert_(np.isnan(fmod), 'dt: %s, fmod: %s' % (dt, fmod))
+ rem = np.remainder(fnan, fone)
+ fmod = np.fmod(fnan, fone)
+ assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
+ assert_(np.isnan(fmod), 'dt: %s, fmod: %s' % (dt, fmod))
class TestCbrt:
@@ -657,6 +856,24 @@ class TestLog:
yf = np.array(y, dtype=dt)*log2_
assert_almost_equal(np.log(xf), yf)
+ # test aliasing(issue #17761)
+ x = np.array([2, 0.937500, 3, 0.947500, 1.054697])
+ xf = np.log(x)
+ assert_almost_equal(np.log(x, out=x), xf)
+
+ def test_log_strides(self):
+ np.random.seed(42)
+ strides = np.array([-4,-3,-2,-1,1,2,3,4])
+ sizes = np.arange(2,100)
+ for ii in sizes:
+ x_f64 = np.float64(np.random.uniform(low=0.01, high=100.0,size=ii))
+ x_special = x_f64.copy()
+ x_special[3:-1:4] = 1.0
+ y_true = np.log(x_f64)
+ y_special = np.log(x_special)
+ for jj in strides:
+ assert_array_almost_equal_nulp(np.log(x_f64[::jj]), y_true[::jj], nulp=2)
+ assert_array_almost_equal_nulp(np.log(x_special[::jj]), y_special[::jj], nulp=2)
class TestExp:
def test_exp_values(self):
@@ -883,6 +1100,10 @@ class TestAVXFloat32Transcendental:
x_f64 = np.float64(x_f32)
assert_array_max_ulp(np.sin(x_f32), np.float32(np.sin(x_f64)), maxulp=2)
assert_array_max_ulp(np.cos(x_f32), np.float32(np.cos(x_f64)), maxulp=2)
+ # test aliasing(issue #17761)
+ tx_f32 = x_f32.copy()
+ assert_array_max_ulp(np.sin(x_f32, out=x_f32), np.float32(np.sin(x_f64)), maxulp=2)
+ assert_array_max_ulp(np.cos(tx_f32, out=tx_f32), np.float32(np.cos(x_f64)), maxulp=2)
def test_strided_float32(self):
np.random.seed(42)
@@ -2444,7 +2665,7 @@ class TestSpecialMethods:
assert_raises(ValueError, inner1d, a, a, out=())
def test_ufunc_override_with_super(self):
- # NOTE: this class is given as an example in doc/subclassing.py;
+ # NOTE: this class is used in doc/source/user/basics.subclassing.rst
# if you make any changes here, do update it there too.
class A(np.ndarray):
def __array_ufunc__(self, ufunc, method, *inputs, out=None, **kwargs):
@@ -3270,3 +3491,39 @@ def test_outer_subclass_preserve(arr):
class foo(np.ndarray): pass
actual = np.multiply.outer(arr.view(foo), arr.view(foo))
assert actual.__class__.__name__ == 'foo'
+
+def test_outer_bad_subclass():
+ class BadArr1(np.ndarray):
+ def __array_finalize__(self, obj):
+ # The outer call reshapes to 3 dims, try to do a bad reshape.
+ if self.ndim == 3:
+ self.shape = self.shape + (1,)
+
+ def __array_prepare__(self, obj, context=None):
+ return obj
+
+ class BadArr2(np.ndarray):
+ def __array_finalize__(self, obj):
+ if isinstance(obj, BadArr2):
+ # outer inserts 1-sized dims. In that case disturb them.
+ if self.shape[-1] == 1:
+ self.shape = self.shape[::-1]
+
+ def __array_prepare__(self, obj, context=None):
+ return obj
+
+ for cls in [BadArr1, BadArr2]:
+ arr = np.ones((2, 3)).view(cls)
+ with assert_raises(TypeError) as a:
+ # The first array gets reshaped (not the second one)
+ np.add.outer(arr, [1, 2])
+
+ # This actually works, since we only see the reshaping error:
+ arr = np.ones((2, 3)).view(cls)
+ assert type(np.add.outer([1, 2], arr)) is cls
+
+def test_outer_exceeds_maxdims():
+ deep = np.ones((1,) * 17)
+ with assert_raises(ValueError):
+ np.add.outer(deep, deep)
+
diff --git a/numpy/ctypeslib.py b/numpy/ctypeslib.py
index 76ba838b7..e8f7750fe 100644
--- a/numpy/ctypeslib.py
+++ b/numpy/ctypeslib.py
@@ -49,12 +49,11 @@ Then, we're ready to call ``foo_func``:
>>> _lib.foo_func(out, len(out)) #doctest: +SKIP
"""
-__all__ = ['load_library', 'ndpointer', 'ctypes_load_library',
- 'c_intp', 'as_ctypes', 'as_array']
+__all__ = ['load_library', 'ndpointer', 'c_intp', 'as_ctypes', 'as_array']
import os
from numpy import (
- integer, ndarray, dtype as _dtype, deprecate, array, frombuffer
+ integer, ndarray, dtype as _dtype, array, frombuffer
)
from numpy.core.multiarray import _flagdict, flagsobj
@@ -75,7 +74,6 @@ if ctypes is None:
"""
raise ImportError("ctypes is not available.")
- ctypes_load_library = _dummy
load_library = _dummy
as_ctypes = _dummy
as_array = _dummy
@@ -154,8 +152,6 @@ else:
## if no successful return in the libname_ext loop:
raise OSError("no file with expected extension")
- ctypes_load_library = deprecate(load_library, 'ctypes_load_library',
- 'load_library')
def _num_fromflags(flaglist):
num = 0
diff --git a/numpy/ctypeslib.pyi b/numpy/ctypeslib.pyi
new file mode 100644
index 000000000..cacc97d68
--- /dev/null
+++ b/numpy/ctypeslib.pyi
@@ -0,0 +1,7 @@
+from typing import Any
+
+load_library: Any
+ndpointer: Any
+c_intp: Any
+as_ctypes: Any
+as_array: Any
diff --git a/numpy/distutils/__init__.py b/numpy/distutils/__init__.py
index 528b76eb5..79974d1c2 100644
--- a/numpy/distutils/__init__.py
+++ b/numpy/distutils/__init__.py
@@ -18,9 +18,7 @@ LAPACK, and for setting include paths and similar build options, please see
``site.cfg.example`` in the root of the NumPy repository or sdist.
"""
-# from setuptools v49.2.0, setuptools warns if distutils is imported first,
-# so pre-emptively import setuptools
-import setuptools
+
# Must import local ccompiler ASAP in order to get
# customized CCompiler.spawn effective.
from . import ccompiler
diff --git a/numpy/distutils/__init__.pyi b/numpy/distutils/__init__.pyi
new file mode 100644
index 000000000..3938d68de
--- /dev/null
+++ b/numpy/distutils/__init__.pyi
@@ -0,0 +1,4 @@
+from typing import Any
+
+# TODO: remove when the full numpy namespace is defined
+def __getattr__(name: str) -> Any: ...
diff --git a/numpy/distutils/ccompiler_opt.py b/numpy/distutils/ccompiler_opt.py
index b6b7939a2..20dbb5c00 100644
--- a/numpy/distutils/ccompiler_opt.py
+++ b/numpy/distutils/ccompiler_opt.py
@@ -152,6 +152,18 @@ class _Config:
By default(None), treated as True if the feature contains at
least one applicable flag. see `feature_can_autovec()`
+ "extra_checks": str or list, optional
+ Extra test case names for the CPU feature that need to be tested
+ against the compiler.
+
+ Each test case must have a C file named ``extra_xxxx.c``, where
+ ``xxxx`` is the case name in lower case, under 'conf_check_path'.
+ It should contain at least one intrinsic or function related to the test case.
+
+ If the compiler is able to successfully compile the C file then `CCompilerOpt`
+ will add a C ``#define`` for it into the main dispatch header, e.g.
+ ``#define {conf_c_prefix}_XXXX``, where ``XXXX`` is the case name in upper case.
+
**NOTES**:
* space can be used as separator with options that supports "str or list"
* case-sensitive for all values and feature name must be in upper-case.
@@ -230,7 +242,10 @@ class _Config:
F16C = dict(interest=11, implies="AVX"),
FMA3 = dict(interest=12, implies="F16C"),
AVX2 = dict(interest=13, implies="F16C"),
- AVX512F = dict(interest=20, implies="FMA3 AVX2", implies_detect=False),
+ AVX512F = dict(
+ interest=20, implies="FMA3 AVX2", implies_detect=False,
+ extra_checks="AVX512F_REDUCE"
+ ),
AVX512CD = dict(interest=21, implies="AVX512F"),
AVX512_KNL = dict(
interest=40, implies="AVX512CD", group="AVX512ER AVX512PF",
@@ -243,7 +258,8 @@ class _Config:
),
AVX512_SKX = dict(
interest=42, implies="AVX512CD", group="AVX512VL AVX512BW AVX512DQ",
- detect="AVX512_SKX", implies_detect=False
+ detect="AVX512_SKX", implies_detect=False,
+ extra_checks="AVX512BW_MASK"
),
AVX512_CLX = dict(
interest=43, implies="AVX512_SKX", group="AVX512VNNI",
@@ -260,7 +276,7 @@ class _Config:
),
# IBM/Power
## Power7/ISA 2.06
- VSX = dict(interest=1, headers="altivec.h"),
+ VSX = dict(interest=1, headers="altivec.h", extra_checks="VSX_ASM"),
## Power8/ISA 2.07
VSX2 = dict(interest=2, implies="VSX", implies_detect=False),
## Power9/ISA 3.00
@@ -673,7 +689,7 @@ class _Distutils:
# intel and msvc compilers don't raise
# fatal errors when flags are wrong or unsupported
".*("
- "warning D9002|" # msvc, it should be work with any language.
+ "warning D9002|" # msvc, it should work with any language.
"invalid argument for option" # intel
").*"
)
@@ -1137,7 +1153,7 @@ class _Feature:
continue
# list is used internally for these options
for option in (
- "implies", "group", "detect", "headers", "flags"
+ "implies", "group", "detect", "headers", "flags", "extra_checks"
) :
oval = feature.get(option)
if isinstance(oval, str):
@@ -1439,7 +1455,7 @@ class _Feature:
self.conf_check_path, "cpu_%s.c" % name.lower()
)
if not os.path.exists(test_path):
- self.dist_fatal("feature test file is not exist", path)
+ self.dist_fatal("feature test file does not exist", test_path)
test = self.dist_test(test_path, force_flags + self.cc_flags["werror"])
if not test:
@@ -1487,6 +1503,45 @@ class _Feature:
can = valid_flags and any(valid_flags)
return can
+ @_Cache.me
+ def feature_extra_checks(self, name):
+ """
+ Return a list of supported extra checks after testing them against
+ the compiler.
+
+ Parameters
+ ----------
+ name: str
+ CPU feature name in uppercase.
+ """
+ assert isinstance(name, str)
+ d = self.feature_supported[name]
+ extra_checks = d.get("extra_checks", [])
+ if not extra_checks:
+ return []
+
+ self.dist_log("Testing extra checks for feature '%s'" % name, extra_checks)
+ flags = self.feature_flags(name)
+ available = []
+ not_available = []
+ for chk in extra_checks:
+ test_path = os.path.join(
+ self.conf_check_path, "extra_%s.c" % chk.lower()
+ )
+ if not os.path.exists(test_path):
+ self.dist_fatal("extra check file does not exist", test_path)
+
+ is_supported = self.dist_test(test_path, flags + self.cc_flags["werror"])
+ if is_supported:
+ available.append(chk)
+ else:
+ not_available.append(chk)
+
+ if not_available:
+ self.dist_log("testing failed for checks", not_available, stderr=True)
+ return available
+
+
def feature_c_preprocessor(self, feature_name, tabs=0):
"""
Generate C preprocessor definitions and include headers of a CPU feature.
@@ -1520,14 +1575,18 @@ class _Feature:
prepr += [
"#include <%s>" % h for h in feature.get("headers", [])
]
- group = feature.get("group", [])
- for f in group:
- # Guard features in case of duplicate definitions
+
+ extra_defs = feature.get("group", [])
+ extra_defs += self.feature_extra_checks(feature_name)
+ for edef in extra_defs:
+ # Guard extra definitions in case of duplicate with
+ # another feature
prepr += [
- "#ifndef %sHAVE_%s" % (self.conf_c_prefix, f),
- "\t#define %sHAVE_%s 1" % (self.conf_c_prefix, f),
+ "#ifndef %sHAVE_%s" % (self.conf_c_prefix, edef),
+ "\t#define %sHAVE_%s 1" % (self.conf_c_prefix, edef),
"#endif",
]
+
if tabs > 0:
prepr = [('\t'*tabs) + l for l in prepr]
return '\n'.join(prepr)
@@ -2127,7 +2186,7 @@ class CCompilerOpt(_Config, _Distutils, _Cache, _CCompiler, _Feature, _Parse):
See Also
--------
- parse_targets() :
+ parse_targets :
Parsing the configuration statements of dispatch-able sources.
"""
to_compile = {}
@@ -2136,9 +2195,12 @@ class CCompilerOpt(_Config, _Distutils, _Cache, _CCompiler, _Feature, _Parse):
for src in sources:
output_dir = os.path.dirname(src)
- if src_dir and not output_dir.startswith(src_dir):
- output_dir = os.path.join(src_dir, output_dir)
+ if src_dir:
+ if not output_dir.startswith(src_dir):
+ output_dir = os.path.join(src_dir, output_dir)
if output_dir not in include_dirs:
+ # To allow including the generated config header(*.dispatch.h)
+ # by the dispatch-able sources
include_dirs.append(output_dir)
has_baseline, targets, extra_flags = self.parse_targets(src)
@@ -2266,6 +2328,12 @@ class CCompilerOpt(_Config, _Distutils, _Cache, _CCompiler, _Feature, _Parse):
baseline_rows.append((
"Flags", (' '.join(baseline_flags) if baseline_flags else "none")
))
+ extra_checks = []
+ for name in baseline_names:
+ extra_checks += self.feature_extra_checks(name)
+ baseline_rows.append((
+ "Extra checks", (' '.join(extra_checks) if extra_checks else "none")
+ ))
########## dispatch ##########
if self.cc_noopt:
@@ -2305,13 +2373,19 @@ class CCompilerOpt(_Config, _Distutils, _Cache, _CCompiler, _Feature, _Parse):
dispatch_rows.append(("Generated", ''))
for tar in self.feature_sorted(target_sources):
sources = target_sources[tar]
- name = tar if isinstance(tar, str) else '(%s)' % ' '.join(tar)
+ pretty_name = tar if isinstance(tar, str) else '(%s)' % ' '.join(tar)
flags = ' '.join(self.feature_flags(tar))
implies = ' '.join(self.feature_sorted(self.feature_implies(tar)))
detect = ' '.join(self.feature_detect(tar))
+ extra_checks = []
+ for name in ((tar,) if isinstance(tar, str) else tar):
+ extra_checks += self.feature_extra_checks(name)
+ extra_checks = (' '.join(extra_checks) if extra_checks else "none")
+
dispatch_rows.append(('', ''))
- dispatch_rows.append((name, implies))
+ dispatch_rows.append((pretty_name, implies))
dispatch_rows.append(("Flags", flags))
+ dispatch_rows.append(("Extra checks", extra_checks))
dispatch_rows.append(("Detect", detect))
for src in sources:
dispatch_rows.append(("", src))
diff --git a/numpy/distutils/checks/extra_avx512bw_mask.c b/numpy/distutils/checks/extra_avx512bw_mask.c
new file mode 100644
index 000000000..9cfd0c2a5
--- /dev/null
+++ b/numpy/distutils/checks/extra_avx512bw_mask.c
@@ -0,0 +1,18 @@
+#include <immintrin.h>
+/**
+ * Test BW mask operations due to:
+ * - MSVC has supported it since vs2019 see,
+ * https://developercommunity.visualstudio.com/content/problem/518298/missing-avx512bw-mask-intrinsics.html
+ * - Clang >= v8.0
+ * - GCC >= v7.1
+ */
+int main(void)
+{
+ __mmask64 m64 = _mm512_cmpeq_epi8_mask(_mm512_set1_epi8((char)1), _mm512_set1_epi8((char)1));
+ m64 = _kor_mask64(m64, m64);
+ m64 = _kxor_mask64(m64, m64);
+ m64 = _cvtu64_mask64(_cvtmask64_u64(m64));
+ m64 = _mm512_kunpackd(m64, m64);
+ m64 = (__mmask64)_mm512_kunpackw((__mmask32)m64, (__mmask32)m64);
+ return (int)_cvtmask64_u64(m64);
+}
diff --git a/numpy/distutils/checks/extra_avx512f_reduce.c b/numpy/distutils/checks/extra_avx512f_reduce.c
new file mode 100644
index 000000000..f979d504e
--- /dev/null
+++ b/numpy/distutils/checks/extra_avx512f_reduce.c
@@ -0,0 +1,41 @@
+#include <immintrin.h>
+/**
+ * The following intrinsics don't have direct native support but compilers
+ * tend to emulate them.
+ * They're usually supported by gcc >= 7.1, clang >= 4 and icc >= 19
+ */
+int main(void)
+{
+ __m512 one_ps = _mm512_set1_ps(1.0f);
+ __m512d one_pd = _mm512_set1_pd(1.0);
+ __m512i one_i64 = _mm512_set1_epi64(1);
+ // add
+ float sum_ps = _mm512_reduce_add_ps(one_ps);
+ double sum_pd = _mm512_reduce_add_pd(one_pd);
+ int sum_int = (int)_mm512_reduce_add_epi64(one_i64);
+ sum_int += (int)_mm512_reduce_add_epi32(one_i64);
+ // mul
+ sum_ps += _mm512_reduce_mul_ps(one_ps);
+ sum_pd += _mm512_reduce_mul_pd(one_pd);
+ sum_int += (int)_mm512_reduce_mul_epi64(one_i64);
+ sum_int += (int)_mm512_reduce_mul_epi32(one_i64);
+ // min
+ sum_ps += _mm512_reduce_min_ps(one_ps);
+ sum_pd += _mm512_reduce_min_pd(one_pd);
+ sum_int += (int)_mm512_reduce_min_epi32(one_i64);
+ sum_int += (int)_mm512_reduce_min_epu32(one_i64);
+ sum_int += (int)_mm512_reduce_min_epi64(one_i64);
+ // max
+ sum_ps += _mm512_reduce_max_ps(one_ps);
+ sum_pd += _mm512_reduce_max_pd(one_pd);
+ sum_int += (int)_mm512_reduce_max_epi32(one_i64);
+ sum_int += (int)_mm512_reduce_max_epu32(one_i64);
+ sum_int += (int)_mm512_reduce_max_epi64(one_i64);
+ // and
+ sum_int += (int)_mm512_reduce_and_epi32(one_i64);
+ sum_int += (int)_mm512_reduce_and_epi64(one_i64);
+ // or
+ sum_int += (int)_mm512_reduce_or_epi32(one_i64);
+ sum_int += (int)_mm512_reduce_or_epi64(one_i64);
+ return (int)sum_ps + (int)sum_pd + sum_int;
+}
diff --git a/numpy/distutils/checks/extra_vsx_asm.c b/numpy/distutils/checks/extra_vsx_asm.c
new file mode 100644
index 000000000..b73a6f438
--- /dev/null
+++ b/numpy/distutils/checks/extra_vsx_asm.c
@@ -0,0 +1,36 @@
+/**
+ * Testing ASM VSX register number fixer '%x<n>'
+ *
+ * old versions of CLANG doesn't support %x<n> in the inline asm template
+ * which fixes register number when using any of the register constraints wa, wd, wf.
+ *
+ * xref:
+ * - https://bugs.llvm.org/show_bug.cgi?id=31837
+ * - https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
+ */
+#ifndef __VSX__
+ #error "VSX is not supported"
+#endif
+#include <altivec.h>
+
+#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
+ #define vsx_ld vec_vsx_ld
+ #define vsx_st vec_vsx_st
+#else
+ #define vsx_ld vec_xl
+ #define vsx_st vec_xst
+#endif
+
+int main(void)
+{
+ float z4[] = {0, 0, 0, 0};
+ signed int zout[] = {0, 0, 0, 0};
+
+ __vector float vz4 = vsx_ld(0, z4);
+ __vector signed int asm_ret = vsx_ld(0, zout);
+
+ __asm__ ("xvcvspsxws %x0,%x1" : "=wa" (vz4) : "wa" (asm_ret));
+
+ vsx_st(asm_ret, 0, zout);
+ return zout[0];
+}
diff --git a/numpy/distutils/command/autodist.py b/numpy/distutils/command/autodist.py
index 8f6436004..b72d0cab1 100644
--- a/numpy/distutils/command/autodist.py
+++ b/numpy/distutils/command/autodist.py
@@ -46,15 +46,16 @@ def check_restrict(cmd):
return ''
-def check_compiler_gcc4(cmd):
- """Return True if the C compiler is GCC 4.x."""
+def check_compiler_gcc(cmd):
+ """Check if the compiler is GCC."""
+
cmd._check_compiler()
body = textwrap.dedent("""
int
main()
{
- #if (! defined __GNUC__) || (__GNUC__ < 4)
- #error gcc >= 4 required
+ #if (! defined __GNUC__)
+ #error gcc required
#endif
return 0;
}
@@ -62,6 +63,30 @@ def check_compiler_gcc4(cmd):
return cmd.try_compile(body, None, None)
+def check_gcc_version_at_least(cmd, major, minor=0, patchlevel=0):
+ """
+ Check that the gcc version is at least the specified version."""
+
+ cmd._check_compiler()
+ version = '.'.join([str(major), str(minor), str(patchlevel)])
+ body = textwrap.dedent("""
+ int
+ main()
+ {
+ #if (! defined __GNUC__) || (__GNUC__ < %(major)d) || \\
+ (__GNUC__ == %(major)d && __GNUC_MINOR__ < %(minor)d) || \\
+ (__GNUC__ == %(major)d && __GNUC_MINOR__ == %(minor)d && __GNUC_PATCHLEVEL__ < %(patchlevel)d)
+ #error gcc >= %(version)s required
+ #endif
+ return 0;
+ }
+ """)
+ kw = {'version': version, 'major': major, 'minor': minor,
+ 'patchlevel': patchlevel}
+
+ return cmd.try_compile(body % kw, None, None)
+
+
def check_gcc_function_attribute(cmd, attribute, name):
"""Return True if the given function attribute is supported."""
cmd._check_compiler()
diff --git a/numpy/distutils/command/build.py b/numpy/distutils/command/build.py
index 60ba4c917..a4fda537d 100644
--- a/numpy/distutils/command/build.py
+++ b/numpy/distutils/command/build.py
@@ -22,6 +22,8 @@ class build(old_build):
"specify a list of dispatched CPU optimizations"),
('disable-optimization', None,
"disable CPU optimized code(dispatch,simd,fast...)"),
+ ('simd-test=', None,
+ "specify a list of CPU optimizations to be tested against NumPy SIMD interface"),
]
help_options = old_build.help_options + [
@@ -36,6 +38,16 @@ class build(old_build):
self.cpu_baseline = "min"
self.cpu_dispatch = "max -xop -fma4" # drop AMD legacy features by default
self.disable_optimization = False
+ """
+ the '_simd' module is very large. Adding more dispatched features
+ will increase binary size and compile time. By default we minimize
+ the targeted features to those most commonly used by the NumPy SIMD interface(NPYV),
+ NOTE: any specified features will be ignored if they're:
+ - part of the baseline(--cpu-baseline)
+ - not part of dispatch-able features(--cpu-dispatch)
+ - not supported by compiler or platform
+ """
+ self.simd_test = "BASELINE SSE2 SSE42 XOP FMA4 (FMA3 AVX2) AVX512F AVX512_SKX VSX VSX2 VSX3 NEON ASIMD"
def finalize_options(self):
build_scripts = self.build_scripts
diff --git a/numpy/distutils/command/build_clib.py b/numpy/distutils/command/build_clib.py
index 87345adbc..a0db6f31f 100644
--- a/numpy/distutils/command/build_clib.py
+++ b/numpy/distutils/command/build_clib.py
@@ -259,57 +259,56 @@ class build_clib(old_build_clib):
if requiref90:
self.mkpath(module_build_dir)
- dispatch_objects = []
- if not self.disable_optimization:
- dispatch_sources = [
- c_sources.pop(c_sources.index(src))
- for src in c_sources[:] if src.endswith(".dispatch.c")
- ]
- if dispatch_sources:
- if not self.inplace:
- build_src = self.get_finalized_command("build_src").build_src
- else:
- build_src = None
- dispatch_objects = self.compiler_opt.try_dispatch(
- dispatch_sources,
- output_dir=self.build_temp,
- src_dir=build_src,
- macros=macros,
- include_dirs=include_dirs,
- debug=self.debug,
- extra_postargs=extra_postargs
- )
- extra_args_baseopt = extra_postargs + self.compiler_opt.cpu_baseline_flags()
- else:
- extra_args_baseopt = extra_postargs
- macros.append(("NPY_DISABLE_OPTIMIZATION", 1))
-
if compiler.compiler_type == 'msvc':
# this hack works around the msvc compiler attributes
# problem, msvc uses its own convention :(
c_sources += cxx_sources
cxx_sources = []
+ # filtering C dispatch-table sources when optimization is not disabled,
+ # otherwise treated as normal sources.
+ copt_c_sources = []
+ copt_baseline_flags = []
+ copt_macros = []
+ if not self.disable_optimization:
+ copt_build_src = None if self.inplace else self.get_finalized_command("build_src").build_src
+ copt_c_sources = [
+ c_sources.pop(c_sources.index(src))
+ for src in c_sources[:] if src.endswith(".dispatch.c")
+ ]
+ copt_baseline_flags = self.compiler_opt.cpu_baseline_flags()
+ else:
+ copt_macros.append(("NPY_DISABLE_OPTIMIZATION", 1))
+
objects = []
+ if copt_c_sources:
+ log.info("compiling C dispatch-able sources")
+ objects += self.compiler_opt.try_dispatch(copt_c_sources,
+ output_dir=self.build_temp,
+ src_dir=copt_build_src,
+ macros=macros + copt_macros,
+ include_dirs=include_dirs,
+ debug=self.debug,
+ extra_postargs=extra_postargs)
+
if c_sources:
log.info("compiling C sources")
- objects = compiler.compile(c_sources,
- output_dir=self.build_temp,
- macros=macros,
- include_dirs=include_dirs,
- debug=self.debug,
- extra_postargs=extra_args_baseopt)
- objects.extend(dispatch_objects)
+ objects += compiler.compile(c_sources,
+ output_dir=self.build_temp,
+ macros=macros + copt_macros,
+ include_dirs=include_dirs,
+ debug=self.debug,
+ extra_postargs=extra_postargs + copt_baseline_flags)
if cxx_sources:
log.info("compiling C++ sources")
cxx_compiler = compiler.cxx_compiler()
cxx_objects = cxx_compiler.compile(cxx_sources,
output_dir=self.build_temp,
- macros=macros,
+ macros=macros + copt_macros,
include_dirs=include_dirs,
debug=self.debug,
- extra_postargs=extra_postargs)
+ extra_postargs=extra_postargs + copt_baseline_flags)
objects.extend(cxx_objects)
if f_sources or fmodule_sources:
diff --git a/numpy/distutils/command/build_ext.py b/numpy/distutils/command/build_ext.py
index b6557fcf6..ca6f8bcd2 100644
--- a/numpy/distutils/command/build_ext.py
+++ b/numpy/distutils/command/build_ext.py
@@ -19,8 +19,7 @@ from numpy.distutils.misc_util import (
has_cxx_sources, has_f_sources, is_sequence
)
from numpy.distutils.command.config_compiler import show_fortran_compilers
-from numpy.distutils.ccompiler_opt import new_ccompiler_opt
-
+from numpy.distutils.ccompiler_opt import new_ccompiler_opt, CCompilerOpt
class build_ext (old_build_ext):
@@ -39,6 +38,8 @@ class build_ext (old_build_ext):
"specify a list of dispatched CPU optimizations"),
('disable-optimization', None,
"disable CPU optimized code(dispatch,simd,fast...)"),
+ ('simd-test=', None,
+ "specify a list of CPU optimizations to be tested against NumPy SIMD interface"),
]
help_options = old_build_ext.help_options + [
@@ -56,6 +57,7 @@ class build_ext (old_build_ext):
self.cpu_baseline = None
self.cpu_dispatch = None
self.disable_optimization = None
+ self.simd_test = None
def finalize_options(self):
if self.parallel:
@@ -87,7 +89,9 @@ class build_ext (old_build_ext):
('cpu_baseline', 'cpu_baseline'),
('cpu_dispatch', 'cpu_dispatch'),
('disable_optimization', 'disable_optimization'),
+ ('simd_test', 'simd_test')
)
+ CCompilerOpt.conf_target_groups["simd_test"] = self.simd_test
def run(self):
if not self.extensions:
@@ -406,52 +410,49 @@ class build_ext (old_build_ext):
include_dirs = ext.include_dirs + get_numpy_include_dirs()
- dispatch_objects = []
+ # filtering C dispatch-table sources when optimization is not disabled,
+ # otherwise treated as normal sources.
+ copt_c_sources = []
+ copt_baseline_flags = []
+ copt_macros = []
if not self.disable_optimization:
- dispatch_sources = [
+ copt_build_src = None if self.inplace else self.get_finalized_command("build_src").build_src
+ copt_c_sources = [
c_sources.pop(c_sources.index(src))
for src in c_sources[:] if src.endswith(".dispatch.c")
]
- if dispatch_sources:
- if not self.inplace:
- build_src = self.get_finalized_command("build_src").build_src
- else:
- build_src = None
- dispatch_objects = self.compiler_opt.try_dispatch(
- dispatch_sources,
- output_dir=output_dir,
- src_dir=build_src,
- macros=macros,
- include_dirs=include_dirs,
- debug=self.debug,
- extra_postargs=extra_args,
- **kws
- )
- extra_args_baseopt = extra_args + self.compiler_opt.cpu_baseline_flags()
+ copt_baseline_flags = self.compiler_opt.cpu_baseline_flags()
else:
- extra_args_baseopt = extra_args
- macros.append(("NPY_DISABLE_OPTIMIZATION", 1))
+ copt_macros.append(("NPY_DISABLE_OPTIMIZATION", 1))
c_objects = []
+ if copt_c_sources:
+ log.info("compiling C dispatch-able sources")
+ c_objects += self.compiler_opt.try_dispatch(copt_c_sources,
+ output_dir=output_dir,
+ src_dir=copt_build_src,
+ macros=macros + copt_macros,
+ include_dirs=include_dirs,
+ debug=self.debug,
+ extra_postargs=extra_args,
+ **kws)
if c_sources:
log.info("compiling C sources")
- c_objects = self.compiler.compile(c_sources,
- output_dir=output_dir,
- macros=macros,
- include_dirs=include_dirs,
- debug=self.debug,
- extra_postargs=extra_args_baseopt,
- **kws)
- c_objects.extend(dispatch_objects)
-
+ c_objects += self.compiler.compile(c_sources,
+ output_dir=output_dir,
+ macros=macros + copt_macros,
+ include_dirs=include_dirs,
+ debug=self.debug,
+ extra_postargs=extra_args + copt_baseline_flags,
+ **kws)
if cxx_sources:
log.info("compiling C++ sources")
c_objects += cxx_compiler.compile(cxx_sources,
output_dir=output_dir,
- macros=macros,
+ macros=macros + copt_macros,
include_dirs=include_dirs,
debug=self.debug,
- extra_postargs=extra_args,
+ extra_postargs=extra_args + copt_baseline_flags,
**kws)
extra_postargs = []
@@ -559,7 +560,7 @@ class build_ext (old_build_ext):
unlinkable_fobjects = list(unlinkable_fobjects)
# Expand possible fake static libraries to objects
- for lib in list(libraries):
+ for lib in libraries:
for libdir in library_dirs:
fake_lib = os.path.join(libdir, lib + '.fobjects')
if os.path.isfile(fake_lib):
diff --git a/numpy/distutils/command/config.py b/numpy/distutils/command/config.py
index e54a54449..60881f4a3 100644
--- a/numpy/distutils/command/config.py
+++ b/numpy/distutils/command/config.py
@@ -20,9 +20,10 @@ from numpy.distutils.mingw32ccompiler import generate_manifest
from numpy.distutils.command.autodist import (check_gcc_function_attribute,
check_gcc_function_attribute_with_intrinsics,
check_gcc_variable_attribute,
+ check_gcc_version_at_least,
check_inline,
check_restrict,
- check_compiler_gcc4)
+ check_compiler_gcc)
LANG_EXT['f77'] = '.f'
LANG_EXT['f90'] = '.f90'
@@ -416,9 +417,9 @@ class config(old_config):
otherwise."""
return check_restrict(self)
- def check_compiler_gcc4(self):
- """Return True if the C compiler is gcc >= 4."""
- return check_compiler_gcc4(self)
+ def check_compiler_gcc(self):
+ """Return True if the C compiler is gcc"""
+ return check_compiler_gcc(self)
def check_gcc_function_attribute(self, attribute, name):
return check_gcc_function_attribute(self, attribute, name)
@@ -431,6 +432,11 @@ class config(old_config):
def check_gcc_variable_attribute(self, attribute):
return check_gcc_variable_attribute(self, attribute)
+ def check_gcc_version_at_least(self, major, minor=0, patchlevel=0):
+ """Return True if the GCC version is greater than or equal to the
+ specified version."""
+ return check_gcc_version_at_least(self, major, minor, patchlevel)
+
def get_output(self, body, headers=None, include_dirs=None,
libraries=None, library_dirs=None,
lang="c", use_tee=None):
diff --git a/numpy/distutils/fcompiler/__init__.py b/numpy/distutils/fcompiler/__init__.py
index 1c3069363..76f00ee91 100644
--- a/numpy/distutils/fcompiler/__init__.py
+++ b/numpy/distutils/fcompiler/__init__.py
@@ -20,8 +20,6 @@ import os
import sys
import re
-from numpy.compat import open_latin1
-
from distutils.sysconfig import get_python_lib
from distutils.fancy_getopt import FancyGetopt
from distutils.errors import DistutilsModuleError, \
@@ -745,8 +743,8 @@ _default_compilers = (
('win32', ('gnu', 'intelv', 'absoft', 'compaqv', 'intelev', 'gnu95', 'g95',
'intelvem', 'intelem', 'flang')),
('cygwin.*', ('gnu', 'intelv', 'absoft', 'compaqv', 'intelev', 'gnu95', 'g95')),
- ('linux.*', ('gnu95', 'intel', 'lahey', 'pg', 'absoft', 'nag', 'vast', 'compaq',
- 'intele', 'intelem', 'gnu', 'g95', 'pathf95', 'nagfor')),
+ ('linux.*', ('gnu95', 'intel', 'lahey', 'pg', 'nv', 'absoft', 'nag', 'vast', 'compaq',
+ 'intele', 'intelem', 'gnu', 'g95', 'pathf95', 'nagfor', 'fujitsu')),
('darwin.*', ('gnu95', 'nag', 'absoft', 'ibm', 'intel', 'gnu', 'g95', 'pg')),
('sunos.*', ('sun', 'gnu', 'gnu95', 'g95')),
('irix.*', ('mips', 'gnu', 'gnu95',)),
@@ -975,29 +973,27 @@ def is_free_format(file):
# f90 allows both fixed and free format, assuming fixed unless
# signs of free format are detected.
result = 0
- f = open_latin1(file, 'r')
- line = f.readline()
- n = 10000 # the number of non-comment lines to scan for hints
- if _has_f_header(line):
- n = 0
- elif _has_f90_header(line):
- n = 0
- result = 1
- while n>0 and line:
- line = line.rstrip()
- if line and line[0]!='!':
- n -= 1
- if (line[0]!='\t' and _free_f90_start(line[:5])) or line[-1:]=='&':
- result = 1
- break
+ with open(file, encoding='latin1') as f:
line = f.readline()
- f.close()
+ n = 10000 # the number of non-comment lines to scan for hints
+ if _has_f_header(line):
+ n = 0
+ elif _has_f90_header(line):
+ n = 0
+ result = 1
+ while n>0 and line:
+ line = line.rstrip()
+ if line and line[0]!='!':
+ n -= 1
+ if (line[0]!='\t' and _free_f90_start(line[:5])) or line[-1:]=='&':
+ result = 1
+ break
+ line = f.readline()
return result
def has_f90_header(src):
- f = open_latin1(src, 'r')
- line = f.readline()
- f.close()
+ with open(src, encoding='latin1') as f:
+ line = f.readline()
return _has_f90_header(line) or _has_fix_header(line)
_f77flags_re = re.compile(r'(c|)f77flags\s*\(\s*(?P<fcname>\w+)\s*\)\s*=\s*(?P<fflags>.*)', re.I)
@@ -1008,17 +1004,16 @@ def get_f77flags(src):
Return a dictionary {<fcompiler type>:<f77 flags>}.
"""
flags = {}
- f = open_latin1(src, 'r')
- i = 0
- for line in f:
- i += 1
- if i>20: break
- m = _f77flags_re.match(line)
- if not m: continue
- fcname = m.group('fcname').strip()
- fflags = m.group('fflags').strip()
- flags[fcname] = split_quoted(fflags)
- f.close()
+ with open(src, encoding='latin1') as f:
+ i = 0
+ for line in f:
+ i += 1
+ if i>20: break
+ m = _f77flags_re.match(line)
+ if not m: continue
+ fcname = m.group('fcname').strip()
+ fflags = m.group('fflags').strip()
+ flags[fcname] = split_quoted(fflags)
return flags
# TODO: implement get_f90flags and use it in _compile similarly to get_f77flags
diff --git a/numpy/distutils/fcompiler/environment.py b/numpy/distutils/fcompiler/environment.py
index 21a5be003..ecd4d9989 100644
--- a/numpy/distutils/fcompiler/environment.py
+++ b/numpy/distutils/fcompiler/environment.py
@@ -33,7 +33,10 @@ class EnvironmentConfig:
try:
conf_desc = self._conf_keys[name]
except KeyError:
- raise AttributeError(name)
+ raise AttributeError(
+ f"'EnvironmentConfig' object has no attribute '{name}'"
+ ) from None
+
return self._get_var(name, conf_desc)
def get(self, name, default=None):
diff --git a/numpy/distutils/fcompiler/fujitsu.py b/numpy/distutils/fcompiler/fujitsu.py
new file mode 100644
index 000000000..ddce67456
--- /dev/null
+++ b/numpy/distutils/fcompiler/fujitsu.py
@@ -0,0 +1,46 @@
+"""
+fujitsu
+
+Supports the Fujitsu Fortran compiler.
+This compiler is developed by Fujitsu and is used in A64FX on Fugaku.
+"""
+from numpy.distutils.fcompiler import FCompiler
+
+compilers = ['FujitsuFCompiler']
+
+class FujitsuFCompiler(FCompiler):
+ compiler_type = 'fujitsu'
+ description = 'Fujitsu Fortran Compiler'
+
+ possible_executables = ['frt']
+ version_pattern = r'frt \(FRT\) (?P<version>[a-z\d.]+)'
+ # $ frt --version
+ # frt (FRT) x.x.x yyyymmdd
+
+ executables = {
+ 'version_cmd' : ["<F77>", "--version"],
+ 'compiler_f77' : ["frt", "-Fixed"],
+ 'compiler_fix' : ["frt", "-Fixed"],
+ 'compiler_f90' : ["frt"],
+ 'linker_so' : ["frt", "-shared"],
+ 'archiver' : ["ar", "-cr"],
+ 'ranlib' : ["ranlib"]
+ }
+ pic_flags = ['-KPIC']
+ module_dir_switch = '-M'
+ module_include_switch = '-I'
+
+ def get_flags_opt(self):
+ return ['-O3']
+ def get_flags_debug(self):
+ return ['-g']
+ def runtime_library_dir_option(self, dir):
+ return f'-Wl,-rpath={dir}'
+ def get_libraries(self):
+ return ['fj90f', 'fj90i', 'fjsrcinfo']
+
+if __name__ == '__main__':
+ from distutils import log
+ from numpy.distutils import customized_fcompiler
+ log.set_verbosity(2)
+ print(customized_fcompiler('fujitsu').get_version())
diff --git a/numpy/distutils/fcompiler/gnu.py b/numpy/distutils/fcompiler/gnu.py
index caa08549e..68d1501ee 100644
--- a/numpy/distutils/fcompiler/gnu.py
+++ b/numpy/distutils/fcompiler/gnu.py
@@ -23,13 +23,6 @@ def is_win64():
return sys.platform == "win32" and platform.architecture()[0] == "64bit"
-if is_win64():
- #_EXTRAFLAGS = ["-fno-leading-underscore"]
- _EXTRAFLAGS = []
-else:
- _EXTRAFLAGS = []
-
-
class GnuFCompiler(FCompiler):
compiler_type = 'gnu'
compiler_aliases = ('g77', )
@@ -133,7 +126,7 @@ class GnuFCompiler(FCompiler):
target = '10.9'
s = f'Env. variable MACOSX_DEPLOYMENT_TARGET set to {target}'
warnings.warn(s, stacklevel=2)
- os.environ['MACOSX_DEPLOYMENT_TARGET'] = target
+ os.environ['MACOSX_DEPLOYMENT_TARGET'] = str(target)
opt.extend(['-undefined', 'dynamic_lookup', '-bundle'])
else:
opt.append("-shared")
@@ -238,7 +231,7 @@ class GnuFCompiler(FCompiler):
def _c_arch_flags(self):
""" Return detected arch flags from CFLAGS """
- from distutils import sysconfig
+ import sysconfig
try:
cflags = sysconfig.get_config_vars()['CFLAGS']
except KeyError:
@@ -297,11 +290,11 @@ class Gnu95FCompiler(GnuFCompiler):
executables = {
'version_cmd' : ["<F90>", "-dumpversion"],
'compiler_f77' : [None, "-Wall", "-g", "-ffixed-form",
- "-fno-second-underscore"] + _EXTRAFLAGS,
+ "-fno-second-underscore"],
'compiler_f90' : [None, "-Wall", "-g",
- "-fno-second-underscore"] + _EXTRAFLAGS,
+ "-fno-second-underscore"],
'compiler_fix' : [None, "-Wall", "-g","-ffixed-form",
- "-fno-second-underscore"] + _EXTRAFLAGS,
+ "-fno-second-underscore"],
'linker_so' : ["<F90>", "-Wall", "-g"],
'archiver' : ["ar", "-cr"],
'ranlib' : ["ranlib"],
diff --git a/numpy/distutils/fcompiler/nag.py b/numpy/distutils/fcompiler/nag.py
index 908e724e6..7df8ffe2c 100644
--- a/numpy/distutils/fcompiler/nag.py
+++ b/numpy/distutils/fcompiler/nag.py
@@ -19,7 +19,7 @@ class BaseNAGFCompiler(FCompiler):
def get_flags_opt(self):
return ['-O4']
def get_flags_arch(self):
- return ['']
+ return []
class NAGFCompiler(BaseNAGFCompiler):
diff --git a/numpy/distutils/fcompiler/nv.py b/numpy/distutils/fcompiler/nv.py
new file mode 100644
index 000000000..8e9f16835
--- /dev/null
+++ b/numpy/distutils/fcompiler/nv.py
@@ -0,0 +1,55 @@
+import sys
+
+from numpy.distutils.fcompiler import FCompiler
+
+compilers = ['NVHPCFCompiler']
+
+class NVHPCFCompiler(FCompiler):
+ """ NVIDIA High Performance Computing (HPC) SDK Fortran Compiler
+
+ https://developer.nvidia.com/hpc-sdk
+
+ Since August 2020 the NVIDIA HPC SDK includes the compilers formerly known as The Portland Group compilers,
+ https://www.pgroup.com/index.htm.
+ See also `numpy.distutils.fcompiler.pg`.
+ """
+
+ compiler_type = 'nv'
+ description = 'NVIDIA HPC SDK'
+ version_pattern = r'\s*(nvfortran|(pg(f77|f90|fortran)) \(aka nvfortran\)) (?P<version>[\d.-]+).*'
+
+ executables = {
+ 'version_cmd': ["<F90>", "-V"],
+ 'compiler_f77': ["nvfortran"],
+ 'compiler_fix': ["nvfortran", "-Mfixed"],
+ 'compiler_f90': ["nvfortran"],
+ 'linker_so': ["<F90>"],
+ 'archiver': ["ar", "-cr"],
+ 'ranlib': ["ranlib"]
+ }
+ pic_flags = ['-fpic']
+
+ module_dir_switch = '-module '
+ module_include_switch = '-I'
+
+ def get_flags(self):
+ opt = ['-Minform=inform', '-Mnosecond_underscore']
+ return self.pic_flags + opt
+
+ def get_flags_opt(self):
+ return ['-fast']
+
+ def get_flags_debug(self):
+ return ['-g']
+
+ def get_flags_linker_so(self):
+ return ["-shared", '-fpic']
+
+ def runtime_library_dir_option(self, dir):
+ return '-R%s' % dir
+
+if __name__ == '__main__':
+ from distutils import log
+ log.set_verbosity(2)
+ from numpy.distutils import customized_fcompiler
+ print(customized_fcompiler(compiler='nv').get_version())
diff --git a/numpy/distutils/fcompiler/pg.py b/numpy/distutils/fcompiler/pg.py
index eb628cb63..72442c4fe 100644
--- a/numpy/distutils/fcompiler/pg.py
+++ b/numpy/distutils/fcompiler/pg.py
@@ -31,7 +31,7 @@ class PGroupFCompiler(FCompiler):
'compiler_f77': ["pgfortran"],
'compiler_fix': ["pgfortran", "-Mfixed"],
'compiler_f90': ["pgfortran"],
- 'linker_so': ["pgfortran"],
+ 'linker_so': ["<F90>"],
'archiver': ["ar", "-cr"],
'ranlib': ["ranlib"]
}
diff --git a/numpy/distutils/mingw32ccompiler.py b/numpy/distutils/mingw32ccompiler.py
index 7cb6fadcc..3358695a8 100644
--- a/numpy/distutils/mingw32ccompiler.py
+++ b/numpy/distutils/mingw32ccompiler.py
@@ -8,6 +8,7 @@ Support code for building Python extensions on Windows.
"""
import os
+import platform
import sys
import subprocess
import re
@@ -265,16 +266,19 @@ def find_python_dll():
# search in the file system for possible candidates
major_version, minor_version = tuple(sys.version_info[:2])
- patterns = ['python%d%d.dll']
-
- for pat in patterns:
- dllname = pat % (major_version, minor_version)
- print("Looking for %s" % dllname)
- for folder in lib_dirs:
- dll = os.path.join(folder, dllname)
- if os.path.exists(dll):
- return dll
-
+ implementation = platform.python_implementation()
+ if implementation == 'CPython':
+ dllname = f'python{major_version}{minor_version}.dll'
+ elif implementation == 'PyPy':
+ dllname = f'libpypy{major_version}-c.dll'
+ else:
+ dllname = f'Unknown platform {implementation}'
+ print("Looking for %s" % dllname)
+ for folder in lib_dirs:
+ dll = os.path.join(folder, dllname)
+ if os.path.exists(dll):
+ return dll
+
raise ValueError("%s not found in %s" % (dllname, lib_dirs))
def dump_table(dll):
diff --git a/numpy/distutils/misc_util.py b/numpy/distutils/misc_util.py
index 9f9e9f1ac..d3073ab2d 100644
--- a/numpy/distutils/misc_util.py
+++ b/numpy/distutils/misc_util.py
@@ -9,6 +9,7 @@ import subprocess
import shutil
import multiprocessing
import textwrap
+import importlib.util
import distutils
from distutils.errors import DistutilsError
@@ -1900,15 +1901,16 @@ class Configuration:
revision0 = f.read().strip()
branch_map = {}
- for line in file(branch_cache_fn, 'r'):
- branch1, revision1 = line.split()[:2]
- if revision1==revision0:
- branch0 = branch1
- try:
- revision1 = int(revision1)
- except ValueError:
- continue
- branch_map[branch1] = revision1
+ with open(branch_cache_fn, 'r') as f:
+ for line in f:
+ branch1, revision1 = line.split()[:2]
+ if revision1==revision0:
+ branch0 = branch1
+ try:
+ revision1 = int(revision1)
+ except ValueError:
+ continue
+ branch_map[branch1] = revision1
return branch_map.get(branch0)
@@ -1966,6 +1968,13 @@ class Configuration:
version = getattr(version_module, a, None)
if version is not None:
break
+
+ # Try if versioneer module
+ try:
+ version = version_module.get_versions()['version']
+ except AttributeError:
+ version = None
+
if version is not None:
break
@@ -2121,12 +2130,11 @@ def get_npy_pkg_dir():
environment, and using them when cross-compiling.
"""
- # XXX: import here for bootstrapping reasons
- import numpy
d = os.environ.get('NPY_PKG_CONFIG_PATH')
if d is not None:
return d
- d = os.path.join(os.path.dirname(numpy.__file__),
+ spec = importlib.util.find_spec('numpy')
+ d = os.path.join(os.path.dirname(spec.origin),
'core', 'lib', 'npy-pkg-config')
return d
@@ -2355,6 +2363,7 @@ def generate_config_py(target):
Examples
--------
+ >>> import numpy as np
>>> np.show_config()
blas_opt_info:
language = c
diff --git a/numpy/distutils/setup.py b/numpy/distutils/setup.py
index 798c3686f..522756fc9 100644
--- a/numpy/distutils/setup.py
+++ b/numpy/distutils/setup.py
@@ -8,6 +8,7 @@ def configuration(parent_package='',top_path=None):
config.add_data_files('site.cfg')
config.add_data_files('mingw/gfortran_vs2003_hack.c')
config.add_data_dir('checks')
+ config.add_data_files('*.pyi')
config.make_config_py()
return config
diff --git a/numpy/distutils/system_info.py b/numpy/distutils/system_info.py
index df82683dc..13f9da0fb 100644
--- a/numpy/distutils/system_info.py
+++ b/numpy/distutils/system_info.py
@@ -171,7 +171,7 @@ from configparser import RawConfigParser as ConfigParser
from distutils.errors import DistutilsError
from distutils.dist import Distribution
-import distutils.sysconfig
+import sysconfig
from numpy.distutils import log
from distutils.util import get_platform
@@ -187,6 +187,7 @@ import distutils.ccompiler
import tempfile
import shutil
+__all__ = ['system_info']
# Determine number of bits
import platform
@@ -255,7 +256,7 @@ def libpaths(paths, bits):
if sys.platform == 'win32':
default_lib_dirs = ['C:\\',
- os.path.join(distutils.sysconfig.EXEC_PREFIX,
+ os.path.join(sysconfig.get_config_var('exec_prefix'),
'libs')]
default_runtime_dirs = []
default_include_dirs = []
@@ -289,7 +290,7 @@ if sys.platform == 'win32':
vcpkg = shutil.which('vcpkg')
if vcpkg:
vcpkg_dir = os.path.dirname(vcpkg)
- if platform.architecture() == '32bit':
+ if platform.architecture()[0] == '32bit':
specifier = 'x86'
else:
specifier = 'x64'
@@ -414,6 +415,89 @@ def get_standard_file(fname):
return filenames
+def _parse_env_order(base_order, env):
+ """ Parse an environment variable `env` by splitting with "," and only returning elements from `base_order`
+
+ This method will split the environment variable and check each individual element against `base_order`.
+
+ The items in the environment variable may be negated via '^item' or '!itema,itemb'.
+ It must start with ^/! to negate all options.
+
+ Raises
+ ------
+ ValueError: for mixed negated and non-negated orders or multiple negated orders
+
+ Parameters
+ ----------
+ base_order : list of str
+ the base list of orders
+ env : str
+ the environment variable to be parsed, if none is found, `base_order` is returned
+
+ Returns
+ -------
+ allow_order : list of str
+ allowed orders in lower-case
+ unknown_order : list of str
+ for values not overlapping with `base_order`
+ """
+ order_str = os.environ.get(env, None)
+
+ # ensure all base-orders are lower-case (for easier comparison)
+ base_order = [order.lower() for order in base_order]
+ if order_str is None:
+ return base_order, []
+
+ neg = order_str.startswith('^') or order_str.startswith('!')
+ # Check format
+ order_str_l = list(order_str)
+ sum_neg = order_str_l.count('^') + order_str_l.count('!')
+ if neg:
+ if sum_neg > 1:
+ raise ValueError(f"Environment variable '{env}' may only contain a single (prefixed) negation: {order_str}")
+ # remove prefix
+ order_str = order_str[1:]
+ elif sum_neg > 0:
+ raise ValueError(f"Environment variable '{env}' may not mix negated an non-negated items: {order_str}")
+
+ # Split and lower case
+ orders = order_str.lower().split(',')
+
+ # to inform callee about non-overlapping elements
+ unknown_order = []
+
+ # if negated, we have to remove from the order
+ if neg:
+ allow_order = base_order.copy()
+
+ for order in orders:
+ if not order:
+ continue
+
+ if order not in base_order:
+ unknown_order.append(order)
+ continue
+
+ if order in allow_order:
+ allow_order.remove(order)
+
+ else:
+ allow_order = []
+
+ for order in orders:
+ if not order:
+ continue
+
+ if order not in base_order:
+ unknown_order.append(order)
+ continue
+
+ if order not in allow_order:
+ allow_order.append(order)
+
+ return allow_order, unknown_order
+
+
def get_info(name, notfound_action=0):
"""
notfound_action:
@@ -715,8 +799,7 @@ class system_info:
AliasedOptionError :
in case more than one of the options are found
"""
- found = map(lambda opt: self.cp.has_option(self.section, opt), options)
- found = list(found)
+ found = [self.cp.has_option(self.section, opt) for opt in options]
if sum(found) == 1:
return options[found.index(True)]
elif sum(found) == 0:
@@ -1766,24 +1849,11 @@ class lapack_opt_info(system_info):
return getattr(self, '_calc_info_{}'.format(name))()
def calc_info(self):
- user_order = os.environ.get(self.order_env_var_name, None)
- if user_order is None:
- lapack_order = self.lapack_order
- else:
- # the user has requested the order of the
- # check they are all in the available list, a COMMA SEPARATED list
- user_order = user_order.lower().split(',')
- non_existing = []
- lapack_order = []
- for order in user_order:
- if order in self.lapack_order:
- lapack_order.append(order)
- elif len(order) > 0:
- non_existing.append(order)
- if len(non_existing) > 0:
- raise ValueError("lapack_opt_info user defined "
- "LAPACK order has unacceptable "
- "values: {}".format(non_existing))
+ lapack_order, unknown_order = _parse_env_order(self.lapack_order, self.order_env_var_name)
+ if len(unknown_order) > 0:
+ raise ValueError("lapack_opt_info user defined "
+ "LAPACK order has unacceptable "
+ "values: {}".format(unknown_order))
for lapack in lapack_order:
if self._calc_info(lapack):
@@ -1911,22 +1981,9 @@ class blas_opt_info(system_info):
return getattr(self, '_calc_info_{}'.format(name))()
def calc_info(self):
- user_order = os.environ.get(self.order_env_var_name, None)
- if user_order is None:
- blas_order = self.blas_order
- else:
- # the user has requested the order of the
- # check they are all in the available list
- user_order = user_order.lower().split(',')
- non_existing = []
- blas_order = []
- for order in user_order:
- if order in self.blas_order:
- blas_order.append(order)
- elif len(order) > 0:
- non_existing.append(order)
- if len(non_existing) > 0:
- raise ValueError("blas_opt_info user defined BLAS order has unacceptable values: {}".format(non_existing))
+ blas_order, unknown_order = _parse_env_order(self.blas_order, self.order_env_var_name)
+ if len(unknown_order) > 0:
+ raise ValueError("blas_opt_info user defined BLAS order has unacceptable values: {}".format(unknown_order))
for blas in blas_order:
if self._calc_info(blas):
@@ -1962,6 +2019,14 @@ class blas64__opt_info(blas_ilp64_opt_info):
symbol_suffix = '64_'
+class cblas_info(system_info):
+ section = 'cblas'
+ dir_env_var = 'CBLAS'
+ # No default as it's used only in blas_info
+ _lib_names = []
+ notfounderror = BlasNotFoundError
+
+
class blas_info(system_info):
section = 'blas'
dir_env_var = 'BLAS'
@@ -1983,6 +2048,13 @@ class blas_info(system_info):
# often not installed when mingw is being used. This rough
# treatment is not desirable, but windows is tricky.
info['language'] = 'f77' # XXX: is it generally true?
+ # If cblas is given as an option, use those
+ cblas_info_obj = cblas_info()
+ cblas_opt = cblas_info_obj.get_option_single('cblas_libs', 'libraries')
+ cblas_libs = cblas_info_obj.get_libs(cblas_opt, None)
+ if cblas_libs:
+ info['libraries'] = cblas_libs + blas_libs
+ info['define_macros'] = [('HAVE_CBLAS', None)]
else:
lib = self.get_cblas_libs(info)
if lib is not None:
@@ -2499,13 +2571,12 @@ class _numpy_info(system_info):
except AttributeError:
pass
- include_dirs.append(distutils.sysconfig.get_python_inc(
- prefix=os.sep.join(prefix)))
+ include_dirs.append(sysconfig.get_path('include'))
except ImportError:
pass
- py_incl_dir = distutils.sysconfig.get_python_inc()
+ py_incl_dir = sysconfig.get_path('include')
include_dirs.append(py_incl_dir)
- py_pincl_dir = distutils.sysconfig.get_python_inc(plat_specific=True)
+ py_pincl_dir = sysconfig.get_path('platinclude')
if py_pincl_dir not in include_dirs:
include_dirs.append(py_pincl_dir)
for d in default_include_dirs:
@@ -2632,8 +2703,8 @@ class boost_python_info(system_info):
break
if not src_dir:
return
- py_incl_dirs = [distutils.sysconfig.get_python_inc()]
- py_pincl_dir = distutils.sysconfig.get_python_inc(plat_specific=True)
+ py_incl_dirs = [sysconfig.get_path('include')]
+ py_pincl_dir = sysconfig.get_path('platinclude')
if py_pincl_dir not in py_incl_dirs:
py_incl_dirs.append(py_pincl_dir)
srcs_dir = os.path.join(src_dir, 'libs', 'python', 'src')
diff --git a/numpy/distutils/tests/test_ccompiler_opt_conf.py b/numpy/distutils/tests/test_ccompiler_opt_conf.py
index 2f83a59e0..244748e58 100644
--- a/numpy/distutils/tests/test_ccompiler_opt_conf.py
+++ b/numpy/distutils/tests/test_ccompiler_opt_conf.py
@@ -66,11 +66,12 @@ class _TestConfFeatures(FakeCCompilerOpt):
self.test_implies(error_msg, search_in, feature_name, feature_dict)
self.test_group(error_msg, search_in, feature_name, feature_dict)
+ self.test_extra_checks(error_msg, search_in, feature_name, feature_dict)
def test_option_types(self, error_msg, option, val):
for tp, available in (
((str, list), (
- "implies", "headers", "flags", "group", "detect"
+ "implies", "headers", "flags", "group", "detect", "extra_checks"
)),
((str,), ("disable",)),
((int,), ("interest",)),
@@ -83,29 +84,25 @@ class _TestConfFeatures(FakeCCompilerOpt):
if not isinstance(val, tp):
error_tp = [t.__name__ for t in (*tp,)]
error_tp = ' or '.join(error_tp)
- raise AssertionError(error_msg + \
+ raise AssertionError(error_msg +
"expected '%s' type for option '%s' not '%s'" % (
error_tp, option, type(val).__name__
))
break
if not found_it:
- raise AssertionError(error_msg + \
- "invalid option name '%s'" % option
- )
+ raise AssertionError(error_msg + "invalid option name '%s'" % option)
def test_duplicates(self, error_msg, option, val):
if option not in (
- "implies", "headers", "flags", "group", "detect"
+ "implies", "headers", "flags", "group", "detect", "extra_checks"
) : return
if isinstance(val, str):
val = val.split()
if len(val) != len(set(val)):
- raise AssertionError(error_msg + \
- "duplicated values in option '%s'" % option
- )
+ raise AssertionError(error_msg + "duplicated values in option '%s'" % option)
def test_implies(self, error_msg, search_in, feature_name, feature_dict):
if feature_dict.get("disabled") is not None:
@@ -117,21 +114,15 @@ class _TestConfFeatures(FakeCCompilerOpt):
implies = implies.split()
if feature_name in implies:
- raise AssertionError(error_msg + \
- "feature implies itself"
- )
+ raise AssertionError(error_msg + "feature implies itself")
for impl in implies:
impl_dict = search_in.get(impl)
if impl_dict is not None:
if "disable" in impl_dict:
- raise AssertionError(error_msg + \
- "implies disabled feature '%s'" % impl
- )
+ raise AssertionError(error_msg + "implies disabled feature '%s'" % impl)
continue
- raise AssertionError(error_msg + \
- "implies non-exist feature '%s'" % impl
- )
+ raise AssertionError(error_msg + "implies non-exist feature '%s'" % impl)
def test_group(self, error_msg, search_in, feature_name, feature_dict):
if feature_dict.get("disabled") is not None:
@@ -146,10 +137,26 @@ class _TestConfFeatures(FakeCCompilerOpt):
impl_dict = search_in.get(f)
if not impl_dict or "disable" in impl_dict:
continue
- raise AssertionError(error_msg + \
- "in option '%s', '%s' already exists as a feature name" % (
- option, f
- ))
+ raise AssertionError(error_msg +
+ "in option 'group', '%s' already exists as a feature name" % f
+ )
+
+ def test_extra_checks(self, error_msg, search_in, feature_name, feature_dict):
+ if feature_dict.get("disabled") is not None:
+ return
+ extra_checks = feature_dict.get("extra_checks", "")
+ if not extra_checks:
+ return
+ if isinstance(extra_checks, str):
+ extra_checks = extra_checks.split()
+
+ for f in extra_checks:
+ impl_dict = search_in.get(f)
+ if not impl_dict or "disable" in impl_dict:
+ continue
+ raise AssertionError(error_msg +
+ "in option 'extra_checks', extra test case '%s' already exists as a feature name" % f
+ )
class TestConfFeatures(unittest.TestCase):
def __init__(self, methodName="runTest"):
diff --git a/numpy/distutils/tests/test_system_info.py b/numpy/distutils/tests/test_system_info.py
index 0768ffdde..ec15126f7 100644
--- a/numpy/distutils/tests/test_system_info.py
+++ b/numpy/distutils/tests/test_system_info.py
@@ -284,4 +284,37 @@ class TestSystemInfoReading:
assert info.get_lib_dirs() == lib_dirs
finally:
os.chdir(previousDir)
-
+
+
+def test_distutils_parse_env_order(monkeypatch):
+ from numpy.distutils.system_info import _parse_env_order
+ env = 'NPY_TESTS_DISTUTILS_PARSE_ENV_ORDER'
+
+ base_order = list('abcdef')
+
+ monkeypatch.setenv(env, 'b,i,e,f')
+ order, unknown = _parse_env_order(base_order, env)
+ assert len(order) == 3
+ assert order == list('bef')
+ assert len(unknown) == 1
+
+ # For when LAPACK/BLAS optimization is disabled
+ monkeypatch.setenv(env, '')
+ order, unknown = _parse_env_order(base_order, env)
+ assert len(order) == 0
+ assert len(unknown) == 0
+
+ for prefix in '^!':
+ monkeypatch.setenv(env, f'{prefix}b,i,e')
+ order, unknown = _parse_env_order(base_order, env)
+ assert len(order) == 4
+ assert order == list('acdf')
+ assert len(unknown) == 1
+
+ with pytest.raises(ValueError):
+ monkeypatch.setenv(env, 'b,^e,i')
+ _parse_env_order(base_order, env)
+
+ with pytest.raises(ValueError):
+ monkeypatch.setenv(env, '!b,^e,i')
+ _parse_env_order(base_order, env)
diff --git a/numpy/distutils/unixccompiler.py b/numpy/distutils/unixccompiler.py
index 5f36c439f..0cd2d243e 100644
--- a/numpy/distutils/unixccompiler.py
+++ b/numpy/distutils/unixccompiler.py
@@ -3,6 +3,8 @@ unixccompiler - can handle very long argument lists for ar.
"""
import os
+import sys
+import subprocess
from distutils.errors import CompileError, DistutilsExecError, LibError
from distutils.unixccompiler import UnixCCompiler
@@ -26,7 +28,8 @@ def UnixCCompiler__compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts
self.compiler_so = ccomp
# ensure OPT environment variable is read
if 'OPT' in os.environ:
- from distutils.sysconfig import get_config_vars
+ # XXX who uses this?
+ from sysconfig import get_config_vars
opt = " ".join(os.environ['OPT'].split())
gcv_opt = " ".join(get_config_vars('OPT')[0].split())
ccomp_s = " ".join(self.compiler_so)
@@ -55,6 +58,11 @@ def UnixCCompiler__compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts
# add commandline flags to dependency file
if deps:
+ # After running the compiler, the file created will be in EBCDIC
+ # but will not be tagged as such. This tags it so the file does not
+ # have multiple different encodings being written to it
+ if sys.platform == 'zos':
+ subprocess.check_output(['chtag', '-tc', 'IBM1047', obj + '.d'])
with open(obj + '.d', 'a') as f:
f.write(_commandline_dep_string(cc_args, extra_postargs, pp_opts))
diff --git a/numpy/doc/basics.py b/numpy/doc/basics.py
deleted file mode 100644
index 635c1b1b8..000000000
--- a/numpy/doc/basics.py
+++ /dev/null
@@ -1,341 +0,0 @@
-"""
-============
-Array basics
-============
-
-Array types and conversions between types
-=========================================
-
-NumPy supports a much greater variety of numerical types than Python does.
-This section shows which are available, and how to modify an array's data-type.
-
-The primitive types supported are tied closely to those in C:
-
-.. list-table::
- :header-rows: 1
-
- * - Numpy type
- - C type
- - Description
-
- * - `np.bool_`
- - ``bool``
- - Boolean (True or False) stored as a byte
-
- * - `np.byte`
- - ``signed char``
- - Platform-defined
-
- * - `np.ubyte`
- - ``unsigned char``
- - Platform-defined
-
- * - `np.short`
- - ``short``
- - Platform-defined
-
- * - `np.ushort`
- - ``unsigned short``
- - Platform-defined
-
- * - `np.intc`
- - ``int``
- - Platform-defined
-
- * - `np.uintc`
- - ``unsigned int``
- - Platform-defined
-
- * - `np.int_`
- - ``long``
- - Platform-defined
-
- * - `np.uint`
- - ``unsigned long``
- - Platform-defined
-
- * - `np.longlong`
- - ``long long``
- - Platform-defined
-
- * - `np.ulonglong`
- - ``unsigned long long``
- - Platform-defined
-
- * - `np.half` / `np.float16`
- -
- - Half precision float:
- sign bit, 5 bits exponent, 10 bits mantissa
-
- * - `np.single`
- - ``float``
- - Platform-defined single precision float:
- typically sign bit, 8 bits exponent, 23 bits mantissa
-
- * - `np.double`
- - ``double``
- - Platform-defined double precision float:
- typically sign bit, 11 bits exponent, 52 bits mantissa.
-
- * - `np.longdouble`
- - ``long double``
- - Platform-defined extended-precision float
-
- * - `np.csingle`
- - ``float complex``
- - Complex number, represented by two single-precision floats (real and imaginary components)
-
- * - `np.cdouble`
- - ``double complex``
- - Complex number, represented by two double-precision floats (real and imaginary components).
-
- * - `np.clongdouble`
- - ``long double complex``
- - Complex number, represented by two extended-precision floats (real and imaginary components).
-
-
-Since many of these have platform-dependent definitions, a set of fixed-size
-aliases are provided:
-
-.. list-table::
- :header-rows: 1
-
- * - Numpy type
- - C type
- - Description
-
- * - `np.int8`
- - ``int8_t``
- - Byte (-128 to 127)
-
- * - `np.int16`
- - ``int16_t``
- - Integer (-32768 to 32767)
-
- * - `np.int32`
- - ``int32_t``
- - Integer (-2147483648 to 2147483647)
-
- * - `np.int64`
- - ``int64_t``
- - Integer (-9223372036854775808 to 9223372036854775807)
-
- * - `np.uint8`
- - ``uint8_t``
- - Unsigned integer (0 to 255)
-
- * - `np.uint16`
- - ``uint16_t``
- - Unsigned integer (0 to 65535)
-
- * - `np.uint32`
- - ``uint32_t``
- - Unsigned integer (0 to 4294967295)
-
- * - `np.uint64`
- - ``uint64_t``
- - Unsigned integer (0 to 18446744073709551615)
-
- * - `np.intp`
- - ``intptr_t``
- - Integer used for indexing, typically the same as ``ssize_t``
-
- * - `np.uintp`
- - ``uintptr_t``
- - Integer large enough to hold a pointer
-
- * - `np.float32`
- - ``float``
- -
-
- * - `np.float64` / `np.float_`
- - ``double``
- - Note that this matches the precision of the builtin python `float`.
-
- * - `np.complex64`
- - ``float complex``
- - Complex number, represented by two 32-bit floats (real and imaginary components)
-
- * - `np.complex128` / `np.complex_`
- - ``double complex``
- - Note that this matches the precision of the builtin python `complex`.
-
-
-NumPy numerical types are instances of ``dtype`` (data-type) objects, each
-having unique characteristics. Once you have imported NumPy using
-
- ::
-
- >>> import numpy as np
-
-the dtypes are available as ``np.bool_``, ``np.float32``, etc.
-
-Advanced types, not listed in the table above, are explored in
-section :ref:`structured_arrays`.
-
-There are 5 basic numerical types representing booleans (bool), integers (int),
-unsigned integers (uint) floating point (float) and complex. Those with numbers
-in their name indicate the bitsize of the type (i.e. how many bits are needed
-to represent a single value in memory). Some types, such as ``int`` and
-``intp``, have differing bitsizes, dependent on the platforms (e.g. 32-bit
-vs. 64-bit machines). This should be taken into account when interfacing
-with low-level code (such as C or Fortran) where the raw memory is addressed.
-
-Data-types can be used as functions to convert python numbers to array scalars
-(see the array scalar section for an explanation), python sequences of numbers
-to arrays of that type, or as arguments to the dtype keyword that many numpy
-functions or methods accept. Some examples::
-
- >>> import numpy as np
- >>> x = np.float32(1.0)
- >>> x
- 1.0
- >>> y = np.int_([1,2,4])
- >>> y
- array([1, 2, 4])
- >>> z = np.arange(3, dtype=np.uint8)
- >>> z
- array([0, 1, 2], dtype=uint8)
-
-Array types can also be referred to by character codes, mostly to retain
-backward compatibility with older packages such as Numeric. Some
-documentation may still refer to these, for example::
-
- >>> np.array([1, 2, 3], dtype='f')
- array([ 1., 2., 3.], dtype=float32)
-
-We recommend using dtype objects instead.
-
-To convert the type of an array, use the .astype() method (preferred) or
-the type itself as a function. For example: ::
-
- >>> z.astype(float) #doctest: +NORMALIZE_WHITESPACE
- array([ 0., 1., 2.])
- >>> np.int8(z)
- array([0, 1, 2], dtype=int8)
-
-Note that, above, we use the *Python* float object as a dtype. NumPy knows
-that ``int`` refers to ``np.int_``, ``bool`` means ``np.bool_``,
-that ``float`` is ``np.float_`` and ``complex`` is ``np.complex_``.
-The other data-types do not have Python equivalents.
-
-To determine the type of an array, look at the dtype attribute::
-
- >>> z.dtype
- dtype('uint8')
-
-dtype objects also contain information about the type, such as its bit-width
-and its byte-order. The data type can also be used indirectly to query
-properties of the type, such as whether it is an integer::
-
- >>> d = np.dtype(int)
- >>> d
- dtype('int32')
-
- >>> np.issubdtype(d, np.integer)
- True
-
- >>> np.issubdtype(d, np.floating)
- False
-
-
-Array Scalars
-=============
-
-NumPy generally returns elements of arrays as array scalars (a scalar
-with an associated dtype). Array scalars differ from Python scalars, but
-for the most part they can be used interchangeably (the primary
-exception is for versions of Python older than v2.x, where integer array
-scalars cannot act as indices for lists and tuples). There are some
-exceptions, such as when code requires very specific attributes of a scalar
-or when it checks specifically whether a value is a Python scalar. Generally,
-problems are easily fixed by explicitly converting array scalars
-to Python scalars, using the corresponding Python type function
-(e.g., ``int``, ``float``, ``complex``, ``str``, ``unicode``).
-
-The primary advantage of using array scalars is that
-they preserve the array type (Python may not have a matching scalar type
-available, e.g. ``int16``). Therefore, the use of array scalars ensures
-identical behaviour between arrays and scalars, irrespective of whether the
-value is inside an array or not. NumPy scalars also have many of the same
-methods arrays do.
-
-Overflow Errors
-===============
-
-The fixed size of NumPy numeric types may cause overflow errors when a value
-requires more memory than available in the data type. For example,
-`numpy.power` evaluates ``100 * 10 ** 8`` correctly for 64-bit integers,
-but gives 1874919424 (incorrect) for a 32-bit integer.
-
- >>> np.power(100, 8, dtype=np.int64)
- 10000000000000000
- >>> np.power(100, 8, dtype=np.int32)
- 1874919424
-
-The behaviour of NumPy and Python integer types differs significantly for
-integer overflows and may confuse users expecting NumPy integers to behave
-similar to Python's ``int``. Unlike NumPy, the size of Python's ``int`` is
-flexible. This means Python integers may expand to accommodate any integer and
-will not overflow.
-
-NumPy provides `numpy.iinfo` and `numpy.finfo` to verify the
-minimum or maximum values of NumPy integer and floating point values
-respectively ::
-
- >>> np.iinfo(int) # Bounds of the default integer on this system.
- iinfo(min=-9223372036854775808, max=9223372036854775807, dtype=int64)
- >>> np.iinfo(np.int32) # Bounds of a 32-bit integer
- iinfo(min=-2147483648, max=2147483647, dtype=int32)
- >>> np.iinfo(np.int64) # Bounds of a 64-bit integer
- iinfo(min=-9223372036854775808, max=9223372036854775807, dtype=int64)
-
-If 64-bit integers are still too small the result may be cast to a
-floating point number. Floating point numbers offer a larger, but inexact,
-range of possible values.
-
- >>> np.power(100, 100, dtype=np.int64) # Incorrect even with 64-bit int
- 0
- >>> np.power(100, 100, dtype=np.float64)
- 1e+200
-
-Extended Precision
-==================
-
-Python's floating-point numbers are usually 64-bit floating-point numbers,
-nearly equivalent to ``np.float64``. In some unusual situations it may be
-useful to use floating-point numbers with more precision. Whether this
-is possible in numpy depends on the hardware and on the development
-environment: specifically, x86 machines provide hardware floating-point
-with 80-bit precision, and while most C compilers provide this as their
-``long double`` type, MSVC (standard for Windows builds) makes
-``long double`` identical to ``double`` (64 bits). NumPy makes the
-compiler's ``long double`` available as ``np.longdouble`` (and
-``np.clongdouble`` for the complex numbers). You can find out what your
-numpy provides with ``np.finfo(np.longdouble)``.
-
-NumPy does not provide a dtype with more precision than C's
-``long double``\\; in particular, the 128-bit IEEE quad precision
-data type (FORTRAN's ``REAL*16``\\) is not available.
-
-For efficient memory alignment, ``np.longdouble`` is usually stored
-padded with zero bits, either to 96 or 128 bits. Which is more efficient
-depends on hardware and development environment; typically on 32-bit
-systems they are padded to 96 bits, while on 64-bit systems they are
-typically padded to 128 bits. ``np.longdouble`` is padded to the system
-default; ``np.float96`` and ``np.float128`` are provided for users who
-want specific padding. In spite of the names, ``np.float96`` and
-``np.float128`` provide only as much precision as ``np.longdouble``,
-that is, 80 bits on most x86 machines and 64 bits in standard
-Windows builds.
-
-Be warned that even if ``np.longdouble`` offers more precision than
-python ``float``, it is easy to lose that extra precision, since
-python often forces values to pass through ``float``. For example,
-the ``%`` formatting operator requires its arguments to be converted
-to standard python types, and it is therefore impossible to preserve
-extended precision even if many decimal places are requested. It can
-be useful to test your code with the value
-``1 + np.finfo(np.longdouble).eps``.
-
-"""
diff --git a/numpy/doc/broadcasting.py b/numpy/doc/broadcasting.py
deleted file mode 100644
index 4ac1fd129..000000000
--- a/numpy/doc/broadcasting.py
+++ /dev/null
@@ -1,180 +0,0 @@
-"""
-========================
-Broadcasting over arrays
-========================
-
-.. note::
- See `this article
- <https://numpy.org/devdocs/user/theory.broadcasting.html>`_
- for illustrations of broadcasting concepts.
-
-
-The term broadcasting describes how numpy treats arrays with different
-shapes during arithmetic operations. Subject to certain constraints,
-the smaller array is "broadcast" across the larger array so that they
-have compatible shapes. Broadcasting provides a means of vectorizing
-array operations so that looping occurs in C instead of Python. It does
-this without making needless copies of data and usually leads to
-efficient algorithm implementations. There are, however, cases where
-broadcasting is a bad idea because it leads to inefficient use of memory
-that slows computation.
-
-NumPy operations are usually done on pairs of arrays on an
-element-by-element basis. In the simplest case, the two arrays must
-have exactly the same shape, as in the following example:
-
- >>> a = np.array([1.0, 2.0, 3.0])
- >>> b = np.array([2.0, 2.0, 2.0])
- >>> a * b
- array([ 2., 4., 6.])
-
-NumPy's broadcasting rule relaxes this constraint when the arrays'
-shapes meet certain constraints. The simplest broadcasting example occurs
-when an array and a scalar value are combined in an operation:
-
->>> a = np.array([1.0, 2.0, 3.0])
->>> b = 2.0
->>> a * b
-array([ 2., 4., 6.])
-
-The result is equivalent to the previous example where ``b`` was an array.
-We can think of the scalar ``b`` being *stretched* during the arithmetic
-operation into an array with the same shape as ``a``. The new elements in
-``b`` are simply copies of the original scalar. The stretching analogy is
-only conceptual. NumPy is smart enough to use the original scalar value
-without actually making copies so that broadcasting operations are as
-memory and computationally efficient as possible.
-
-The code in the second example is more efficient than that in the first
-because broadcasting moves less memory around during the multiplication
-(``b`` is a scalar rather than an array).
-
-General Broadcasting Rules
-==========================
-When operating on two arrays, NumPy compares their shapes element-wise.
-It starts with the trailing (i.e. rightmost) dimensions and works its
-way left. Two dimensions are compatible when
-
-1) they are equal, or
-2) one of them is 1
-
-If these conditions are not met, a
-``ValueError: operands could not be broadcast together`` exception is
-thrown, indicating that the arrays have incompatible shapes. The size of
-the resulting array is the size that is not 1 along each axis of the inputs.
-
-Arrays do not need to have the same *number* of dimensions. For example,
-if you have a ``256x256x3`` array of RGB values, and you want to scale
-each color in the image by a different value, you can multiply the image
-by a one-dimensional array with 3 values. Lining up the sizes of the
-trailing axes of these arrays according to the broadcast rules, shows that
-they are compatible::
-
- Image (3d array): 256 x 256 x 3
- Scale (1d array): 3
- Result (3d array): 256 x 256 x 3
-
-When either of the dimensions compared is one, the other is
-used. In other words, dimensions with size 1 are stretched or "copied"
-to match the other.
-
-In the following example, both the ``A`` and ``B`` arrays have axes with
-length one that are expanded to a larger size during the broadcast
-operation::
-
- A (4d array): 8 x 1 x 6 x 1
- B (3d array): 7 x 1 x 5
- Result (4d array): 8 x 7 x 6 x 5
-
-Here are some more examples::
-
- A (2d array): 5 x 4
- B (1d array): 1
- Result (2d array): 5 x 4
-
- A (2d array): 5 x 4
- B (1d array): 4
- Result (2d array): 5 x 4
-
- A (3d array): 15 x 3 x 5
- B (3d array): 15 x 1 x 5
- Result (3d array): 15 x 3 x 5
-
- A (3d array): 15 x 3 x 5
- B (2d array): 3 x 5
- Result (3d array): 15 x 3 x 5
-
- A (3d array): 15 x 3 x 5
- B (2d array): 3 x 1
- Result (3d array): 15 x 3 x 5
-
-Here are examples of shapes that do not broadcast::
-
- A (1d array): 3
- B (1d array): 4 # trailing dimensions do not match
-
- A (2d array): 2 x 1
- B (3d array): 8 x 4 x 3 # second from last dimensions mismatched
-
-An example of broadcasting in practice::
-
- >>> x = np.arange(4)
- >>> xx = x.reshape(4,1)
- >>> y = np.ones(5)
- >>> z = np.ones((3,4))
-
- >>> x.shape
- (4,)
-
- >>> y.shape
- (5,)
-
- >>> x + y
- ValueError: operands could not be broadcast together with shapes (4,) (5,)
-
- >>> xx.shape
- (4, 1)
-
- >>> y.shape
- (5,)
-
- >>> (xx + y).shape
- (4, 5)
-
- >>> xx + y
- array([[ 1., 1., 1., 1., 1.],
- [ 2., 2., 2., 2., 2.],
- [ 3., 3., 3., 3., 3.],
- [ 4., 4., 4., 4., 4.]])
-
- >>> x.shape
- (4,)
-
- >>> z.shape
- (3, 4)
-
- >>> (x + z).shape
- (3, 4)
-
- >>> x + z
- array([[ 1., 2., 3., 4.],
- [ 1., 2., 3., 4.],
- [ 1., 2., 3., 4.]])
-
-Broadcasting provides a convenient way of taking the outer product (or
-any other outer operation) of two arrays. The following example shows an
-outer addition operation of two 1-d arrays::
-
- >>> a = np.array([0.0, 10.0, 20.0, 30.0])
- >>> b = np.array([1.0, 2.0, 3.0])
- >>> a[:, np.newaxis] + b
- array([[ 1., 2., 3.],
- [ 11., 12., 13.],
- [ 21., 22., 23.],
- [ 31., 32., 33.]])
-
-Here the ``newaxis`` index operator inserts a new axis into ``a``,
-making it a two-dimensional ``4x1`` array. Combining the ``4x1`` array
-with ``b``, which has shape ``(3,)``, yields a ``4x3`` array.
-
-"""
diff --git a/numpy/doc/byteswapping.py b/numpy/doc/byteswapping.py
deleted file mode 100644
index fe9461977..000000000
--- a/numpy/doc/byteswapping.py
+++ /dev/null
@@ -1,155 +0,0 @@
-"""
-
-=============================
- Byteswapping and byte order
-=============================
-
-Introduction to byte ordering and ndarrays
-==========================================
-
-The ``ndarray`` is an object that provide a python array interface to data
-in memory.
-
-It often happens that the memory that you want to view with an array is
-not of the same byte ordering as the computer on which you are running
-Python.
-
-For example, I might be working on a computer with a little-endian CPU -
-such as an Intel Pentium, but I have loaded some data from a file
-written by a computer that is big-endian. Let's say I have loaded 4
-bytes from a file written by a Sun (big-endian) computer. I know that
-these 4 bytes represent two 16-bit integers. On a big-endian machine, a
-two-byte integer is stored with the Most Significant Byte (MSB) first,
-and then the Least Significant Byte (LSB). Thus the bytes are, in memory order:
-
-#. MSB integer 1
-#. LSB integer 1
-#. MSB integer 2
-#. LSB integer 2
-
-Let's say the two integers were in fact 1 and 770. Because 770 = 256 *
-3 + 2, the 4 bytes in memory would contain respectively: 0, 1, 3, 2.
-The bytes I have loaded from the file would have these contents:
-
->>> big_end_buffer = bytearray([0,1,3,2])
->>> big_end_buffer
-bytearray(b'\\x00\\x01\\x03\\x02')
-
-We might want to use an ``ndarray`` to access these integers. In that
-case, we can create an array around this memory, and tell numpy that
-there are two integers, and that they are 16 bit and big-endian:
-
->>> import numpy as np
->>> big_end_arr = np.ndarray(shape=(2,),dtype='>i2', buffer=big_end_buffer)
->>> big_end_arr[0]
-1
->>> big_end_arr[1]
-770
-
-Note the array ``dtype`` above of ``>i2``. The ``>`` means 'big-endian'
-(``<`` is little-endian) and ``i2`` means 'signed 2-byte integer'. For
-example, if our data represented a single unsigned 4-byte little-endian
-integer, the dtype string would be ``<u4``.
-
-In fact, why don't we try that?
-
->>> little_end_u4 = np.ndarray(shape=(1,),dtype='<u4', buffer=big_end_buffer)
->>> little_end_u4[0] == 1 * 256**1 + 3 * 256**2 + 2 * 256**3
-True
-
-Returning to our ``big_end_arr`` - in this case our underlying data is
-big-endian (data endianness) and we've set the dtype to match (the dtype
-is also big-endian). However, sometimes you need to flip these around.
-
-.. warning::
-
- Scalars currently do not include byte order information, so extracting
- a scalar from an array will return an integer in native byte order.
- Hence:
-
- >>> big_end_arr[0].dtype.byteorder == little_end_u4[0].dtype.byteorder
- True
-
-Changing byte ordering
-======================
-
-As you can imagine from the introduction, there are two ways you can
-affect the relationship between the byte ordering of the array and the
-underlying memory it is looking at:
-
-* Change the byte-ordering information in the array dtype so that it
- interprets the underlying data as being in a different byte order.
- This is the role of ``arr.newbyteorder()``
-* Change the byte-ordering of the underlying data, leaving the dtype
- interpretation as it was. This is what ``arr.byteswap()`` does.
-
-The common situations in which you need to change byte ordering are:
-
-#. Your data and dtype endianness don't match, and you want to change
- the dtype so that it matches the data.
-#. Your data and dtype endianness don't match, and you want to swap the
- data so that they match the dtype
-#. Your data and dtype endianness match, but you want the data swapped
- and the dtype to reflect this
-
-Data and dtype endianness don't match, change dtype to match data
------------------------------------------------------------------
-
-We make something where they don't match:
-
->>> wrong_end_dtype_arr = np.ndarray(shape=(2,),dtype='<i2', buffer=big_end_buffer)
->>> wrong_end_dtype_arr[0]
-256
-
-The obvious fix for this situation is to change the dtype so it gives
-the correct endianness:
-
->>> fixed_end_dtype_arr = wrong_end_dtype_arr.newbyteorder()
->>> fixed_end_dtype_arr[0]
-1
-
-Note the array has not changed in memory:
-
->>> fixed_end_dtype_arr.tobytes() == big_end_buffer
-True
-
-Data and type endianness don't match, change data to match dtype
-----------------------------------------------------------------
-
-You might want to do this if you need the data in memory to be a certain
-ordering. For example you might be writing the memory out to a file
-that needs a certain byte ordering.
-
->>> fixed_end_mem_arr = wrong_end_dtype_arr.byteswap()
->>> fixed_end_mem_arr[0]
-1
-
-Now the array *has* changed in memory:
-
->>> fixed_end_mem_arr.tobytes() == big_end_buffer
-False
-
-Data and dtype endianness match, swap data and dtype
-----------------------------------------------------
-
-You may have a correctly specified array dtype, but you need the array
-to have the opposite byte order in memory, and you want the dtype to
-match so the array values make sense. In this case you just do both of
-the previous operations:
-
->>> swapped_end_arr = big_end_arr.byteswap().newbyteorder()
->>> swapped_end_arr[0]
-1
->>> swapped_end_arr.tobytes() == big_end_buffer
-False
-
-An easier way of casting the data to a specific dtype and byte ordering
-can be achieved with the ndarray astype method:
-
->>> swapped_end_arr = big_end_arr.astype('<i2')
->>> swapped_end_arr[0]
-1
->>> swapped_end_arr.tobytes() == big_end_buffer
-False
-
-"""
diff --git a/numpy/doc/constants.py b/numpy/doc/constants.py
index 2c629ad33..128493d90 100644
--- a/numpy/doc/constants.py
+++ b/numpy/doc/constants.py
@@ -135,10 +135,6 @@ add_newdoc('numpy', 'newaxis',
"""
A convenient alias for None, useful for indexing arrays.
- See Also
- --------
- `numpy.doc.indexing`
-
Examples
--------
>>> newaxis is None
diff --git a/numpy/doc/creation.py b/numpy/doc/creation.py
deleted file mode 100644
index 067f8bb33..000000000
--- a/numpy/doc/creation.py
+++ /dev/null
@@ -1,143 +0,0 @@
-"""
-==============
-Array Creation
-==============
-
-Introduction
-============
-
-There are 5 general mechanisms for creating arrays:
-
-1) Conversion from other Python structures (e.g., lists, tuples)
-2) Intrinsic numpy array creation objects (e.g., arange, ones, zeros,
- etc.)
-3) Reading arrays from disk, either from standard or custom formats
-4) Creating arrays from raw bytes through the use of strings or buffers
-5) Use of special library functions (e.g., random)
-
-This section will not cover means of replicating, joining, or otherwise
-expanding or mutating existing arrays. Nor will it cover creating object
-arrays or structured arrays. Both of those are covered in their own sections.
-
-Converting Python array_like Objects to NumPy Arrays
-====================================================
-
-In general, numerical data arranged in an array-like structure in Python can
-be converted to arrays through the use of the array() function. The most
-obvious examples are lists and tuples. See the documentation for array() for
-details for its use. Some objects may support the array-protocol and allow
-conversion to arrays this way. A simple way to find out if the object can be
-converted to a numpy array using array() is simply to try it interactively and
-see if it works! (The Python Way).
-
-Examples: ::
-
- >>> x = np.array([2,3,1,0])
- >>> x = np.array([2, 3, 1, 0])
- >>> x = np.array([[1,2.0],[0,0],(1+1j,3.)]) # note mix of tuple and lists,
- and types
- >>> x = np.array([[ 1.+0.j, 2.+0.j], [ 0.+0.j, 0.+0.j], [ 1.+1.j, 3.+0.j]])
-
-Intrinsic NumPy Array Creation
-==============================
-
-NumPy has built-in functions for creating arrays from scratch:
-
-zeros(shape) will create an array filled with 0 values with the specified
-shape. The default dtype is float64. ::
-
- >>> np.zeros((2, 3))
- array([[ 0., 0., 0.], [ 0., 0., 0.]])
-
-ones(shape) will create an array filled with 1 values. It is identical to
-zeros in all other respects.
-
-arange() will create arrays with regularly incrementing values. Check the
-docstring for complete information on the various ways it can be used. A few
-examples will be given here: ::
-
- >>> np.arange(10)
- array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
- >>> np.arange(2, 10, dtype=float)
- array([ 2., 3., 4., 5., 6., 7., 8., 9.])
- >>> np.arange(2, 3, 0.1)
- array([ 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9])
-
-Note that there are some subtleties regarding the last usage that the user
-should be aware of that are described in the arange docstring.
-
-linspace() will create arrays with a specified number of elements, and
-spaced equally between the specified beginning and end values. For
-example: ::
-
- >>> np.linspace(1., 4., 6)
- array([ 1. , 1.6, 2.2, 2.8, 3.4, 4. ])
-
-The advantage of this creation function is that one can guarantee the
-number of elements and the starting and end point, which arange()
-generally will not do for arbitrary start, stop, and step values.
-
-indices() will create a set of arrays (stacked as a one-higher dimensioned
-array), one per dimension with each representing variation in that dimension.
-An example illustrates much better than a verbal description: ::
-
- >>> np.indices((3,3))
- array([[[0, 0, 0], [1, 1, 1], [2, 2, 2]], [[0, 1, 2], [0, 1, 2], [0, 1, 2]]])
-
-This is particularly useful for evaluating functions of multiple dimensions on
-a regular grid.
-
-Reading Arrays From Disk
-========================
-
-This is presumably the most common case of large array creation. The details,
-of course, depend greatly on the format of data on disk and so this section
-can only give general pointers on how to handle various formats.
-
-Standard Binary Formats
------------------------
-
-Various fields have standard formats for array data. The following lists the
-ones with known python libraries to read them and return numpy arrays (there
-may be others for which it is possible to read and convert to numpy arrays so
-check the last section as well)
-::
-
- HDF5: h5py
- FITS: Astropy
-
-Examples of formats that cannot be read directly but for which it is not hard to
-convert are those formats supported by libraries like PIL (able to read and
-write many image formats such as jpg, png, etc).
-
-Common ASCII Formats
-------------------------
-
-Comma Separated Value files (CSV) are widely used (and an export and import
-option for programs like Excel). There are a number of ways of reading these
-files in Python. There are CSV functions in Python and functions in pylab
-(part of matplotlib).
-
-More generic ascii files can be read using the io package in scipy.
-
-Custom Binary Formats
----------------------
-
-There are a variety of approaches one can use. If the file has a relatively
-simple format then one can write a simple I/O library and use the numpy
-fromfile() function and .tofile() method to read and write numpy arrays
-directly (mind your byteorder though!) If a good C or C++ library exists that
-read the data, one can wrap that library with a variety of techniques though
-that certainly is much more work and requires significantly more advanced
-knowledge to interface with C or C++.
-
-Use of Special Libraries
-------------------------
-
-There are libraries that can be used to generate arrays for special purposes
-and it isn't possible to enumerate all of them. The most common uses are use
-of the many array generation functions in random that can generate arrays of
-random values, and some utility functions to generate special matrices (e.g.
-diagonal).
-
-"""
diff --git a/numpy/doc/dispatch.py b/numpy/doc/dispatch.py
deleted file mode 100644
index af70ed836..000000000
--- a/numpy/doc/dispatch.py
+++ /dev/null
@@ -1,271 +0,0 @@
-""".. _dispatch_mechanism:
-
-Numpy's dispatch mechanism, introduced in numpy version v1.16 is the
-recommended approach for writing custom N-dimensional array containers that are
-compatible with the numpy API and provide custom implementations of numpy
-functionality. Applications include `dask <http://dask.pydata.org>`_ arrays, an
-N-dimensional array distributed across multiple nodes, and `cupy
-<https://docs-cupy.chainer.org/en/stable/>`_ arrays, an N-dimensional array on
-a GPU.
-
-To get a feel for writing custom array containers, we'll begin with a simple
-example that has rather narrow utility but illustrates the concepts involved.
-
->>> import numpy as np
->>> class DiagonalArray:
-... def __init__(self, N, value):
-... self._N = N
-... self._i = value
-... def __repr__(self):
-... return f"{self.__class__.__name__}(N={self._N}, value={self._i})"
-... def __array__(self):
-... return self._i * np.eye(self._N)
-...
-
-Our custom array can be instantiated like:
-
->>> arr = DiagonalArray(5, 1)
->>> arr
-DiagonalArray(N=5, value=1)
-
-We can convert to a numpy array using :func:`numpy.array` or
-:func:`numpy.asarray`, which will call its ``__array__`` method to obtain a
-standard ``numpy.ndarray``.
-
->>> np.asarray(arr)
-array([[1., 0., 0., 0., 0.],
- [0., 1., 0., 0., 0.],
- [0., 0., 1., 0., 0.],
- [0., 0., 0., 1., 0.],
- [0., 0., 0., 0., 1.]])
-
-If we operate on ``arr`` with a numpy function, numpy will again use the
-``__array__`` interface to convert it to an array and then apply the function
-in the usual way.
-
->>> np.multiply(arr, 2)
-array([[2., 0., 0., 0., 0.],
- [0., 2., 0., 0., 0.],
- [0., 0., 2., 0., 0.],
- [0., 0., 0., 2., 0.],
- [0., 0., 0., 0., 2.]])
-
-
-Notice that the return type is a standard ``numpy.ndarray``.
-
->>> type(np.multiply(arr, 2))
-numpy.ndarray
-
-How can we pass our custom array type through this function? Numpy allows a
-class to indicate that it would like to handle computations in a custom-defined
-way through the interfaces ``__array_ufunc__`` and ``__array_function__``. Let's
-take one at a time, starting with ``__array_ufunc__``. This method covers
-:ref:`ufuncs`, a class of functions that includes, for example,
-:func:`numpy.multiply` and :func:`numpy.sin`.
-
-The ``__array_ufunc__`` receives:
-
-- ``ufunc``, a function like ``numpy.multiply``
-- ``method``, a string, differentiating between ``numpy.multiply(...)`` and
- variants like ``numpy.multiply.outer``, ``numpy.multiply.accumulate``, and so
- on. For the common case, ``numpy.multiply(...)``, ``method == '__call__'``.
-- ``inputs``, which could be a mixture of different types
-- ``kwargs``, keyword arguments passed to the function
-
-For this example we will only handle the method ``__call__``.
-
->>> from numbers import Number
->>> class DiagonalArray:
-... def __init__(self, N, value):
-... self._N = N
-... self._i = value
-... def __repr__(self):
-... return f"{self.__class__.__name__}(N={self._N}, value={self._i})"
-... def __array__(self):
-... return self._i * np.eye(self._N)
-... def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
-... if method == '__call__':
-... N = None
-... scalars = []
-... for input in inputs:
-... if isinstance(input, Number):
-... scalars.append(input)
-... elif isinstance(input, self.__class__):
-... scalars.append(input._i)
-... if N is not None:
-... if N != self._N:
-... raise TypeError("inconsistent sizes")
-... else:
-... N = self._N
-... else:
-... return NotImplemented
-... return self.__class__(N, ufunc(*scalars, **kwargs))
-... else:
-... return NotImplemented
-...
-
-Now our custom array type passes through numpy functions.
-
->>> arr = DiagonalArray(5, 1)
->>> np.multiply(arr, 3)
-DiagonalArray(N=5, value=3)
->>> np.add(arr, 3)
-DiagonalArray(N=5, value=4)
->>> np.sin(arr)
-DiagonalArray(N=5, value=0.8414709848078965)
-
-At this point ``arr + 3`` does not work.
-
->>> arr + 3
-TypeError: unsupported operand type(s) for +: 'DiagonalArray' and 'int'
-
-To support it, we need to define the Python interfaces ``__add__``, ``__lt__``,
-and so on to dispatch to the corresponding ufunc. We can achieve this
-conveniently by inheriting from the mixin
-:class:`~numpy.lib.mixins.NDArrayOperatorsMixin`.
-
->>> import numpy.lib.mixins
->>> class DiagonalArray(numpy.lib.mixins.NDArrayOperatorsMixin):
-... def __init__(self, N, value):
-... self._N = N
-... self._i = value
-... def __repr__(self):
-... return f"{self.__class__.__name__}(N={self._N}, value={self._i})"
-... def __array__(self):
-... return self._i * np.eye(self._N)
-... def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
-... if method == '__call__':
-... N = None
-... scalars = []
-... for input in inputs:
-... if isinstance(input, Number):
-... scalars.append(input)
-... elif isinstance(input, self.__class__):
-... scalars.append(input._i)
-... if N is not None:
-... if N != self._N:
-... raise TypeError("inconsistent sizes")
-... else:
-... N = self._N
-... else:
-... return NotImplemented
-... return self.__class__(N, ufunc(*scalars, **kwargs))
-... else:
-... return NotImplemented
-...
-
->>> arr = DiagonalArray(5, 1)
->>> arr + 3
-DiagonalArray(N=5, value=4)
->>> arr > 0
-DiagonalArray(N=5, value=True)
-
-Now let's tackle ``__array_function__``. We'll create dict that maps numpy
-functions to our custom variants.
-
->>> HANDLED_FUNCTIONS = {}
->>> class DiagonalArray(numpy.lib.mixins.NDArrayOperatorsMixin):
-... def __init__(self, N, value):
-... self._N = N
-... self._i = value
-... def __repr__(self):
-... return f"{self.__class__.__name__}(N={self._N}, value={self._i})"
-... def __array__(self):
-... return self._i * np.eye(self._N)
-... def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
-... if method == '__call__':
-... N = None
-... scalars = []
-... for input in inputs:
-... # In this case we accept only scalar numbers or DiagonalArrays.
-... if isinstance(input, Number):
-... scalars.append(input)
-... elif isinstance(input, self.__class__):
-... scalars.append(input._i)
-... if N is not None:
-... if N != self._N:
-... raise TypeError("inconsistent sizes")
-... else:
-... N = self._N
-... else:
-... return NotImplemented
-... return self.__class__(N, ufunc(*scalars, **kwargs))
-... else:
-... return NotImplemented
-... def __array_function__(self, func, types, args, kwargs):
-... if func not in HANDLED_FUNCTIONS:
-... return NotImplemented
-... # Note: this allows subclasses that don't override
-... # __array_function__ to handle DiagonalArray objects.
-... if not all(issubclass(t, self.__class__) for t in types):
-... return NotImplemented
-... return HANDLED_FUNCTIONS[func](*args, **kwargs)
-...
-
-A convenient pattern is to define a decorator ``implements`` that can be used
-to add functions to ``HANDLED_FUNCTIONS``.
-
->>> def implements(np_function):
-... "Register an __array_function__ implementation for DiagonalArray objects."
-... def decorator(func):
-... HANDLED_FUNCTIONS[np_function] = func
-... return func
-... return decorator
-...
-
-Now we write implementations of numpy functions for ``DiagonalArray``.
-For completeness, to support the usage ``arr.sum()`` add a method ``sum`` that
-calls ``numpy.sum(self)``, and the same for ``mean``.
-
->>> @implements(np.sum)
-... def sum(arr):
-... "Implementation of np.sum for DiagonalArray objects"
-... return arr._i * arr._N
-...
->>> @implements(np.mean)
-... def mean(arr):
-... "Implementation of np.mean for DiagonalArray objects"
-... return arr._i / arr._N
-...
->>> arr = DiagonalArray(5, 1)
->>> np.sum(arr)
-5
->>> np.mean(arr)
-0.2
-
-If the user tries to use any numpy functions not included in
-``HANDLED_FUNCTIONS``, a ``TypeError`` will be raised by numpy, indicating that
-this operation is not supported. For example, concatenating two
-``DiagonalArrays`` does not produce another diagonal array, so it is not
-supported.
-
->>> np.concatenate([arr, arr])
-TypeError: no implementation found for 'numpy.concatenate' on types that implement __array_function__: [<class '__main__.DiagonalArray'>]
-
-Additionally, our implementations of ``sum`` and ``mean`` do not accept the
-optional arguments that numpy's implementation does.
-
->>> np.sum(arr, axis=0)
-TypeError: sum() got an unexpected keyword argument 'axis'
-
-The user always has the option of converting to a normal ``numpy.ndarray`` with
-:func:`numpy.asarray` and using standard numpy from there.
-
->>> np.concatenate([np.asarray(arr), np.asarray(arr)])
-array([[1., 0., 0., 0., 0.],
- [0., 1., 0., 0., 0.],
- [0., 0., 1., 0., 0.],
- [0., 0., 0., 1., 0.],
- [0., 0., 0., 0., 1.],
- [1., 0., 0., 0., 0.],
- [0., 1., 0., 0., 0.],
- [0., 0., 1., 0., 0.],
- [0., 0., 0., 1., 0.],
- [0., 0., 0., 0., 1.]])
-
-Refer to the `dask source code <https://github.com/dask/dask>`_ and
-`cupy source code <https://github.com/cupy/cupy>`_ for more fully-worked
-examples of custom array containers.
-
-See also :doc:`NEP 18<neps:nep-0018-array-function-protocol>`.
-"""
diff --git a/numpy/doc/glossary.py b/numpy/doc/glossary.py
deleted file mode 100644
index 31130559b..000000000
--- a/numpy/doc/glossary.py
+++ /dev/null
@@ -1,475 +0,0 @@
-"""
-========
-Glossary
-========
-
-.. glossary::
-
- along an axis
- Axes are defined for arrays with more than one dimension. A
- 2-dimensional array has two corresponding axes: the first running
- vertically downwards across rows (axis 0), and the second running
- horizontally across columns (axis 1).
-
- Many operations can take place along one of these axes. For example,
- we can sum each row of an array, in which case we operate along
- columns, or axis 1::
-
- >>> x = np.arange(12).reshape((3,4))
-
- >>> x
- array([[ 0, 1, 2, 3],
- [ 4, 5, 6, 7],
- [ 8, 9, 10, 11]])
-
- >>> x.sum(axis=1)
- array([ 6, 22, 38])
-
- array
- A homogeneous container of numerical elements. Each element in the
- array occupies a fixed amount of memory (hence homogeneous), and
- can be a numerical element of a single type (such as float, int
- or complex) or a combination (such as ``(float, int, float)``). Each
- array has an associated data-type (or ``dtype``), which describes
- the numerical type of its elements::
-
- >>> x = np.array([1, 2, 3], float)
-
- >>> x
- array([ 1., 2., 3.])
-
- >>> x.dtype # floating point number, 64 bits of memory per element
- dtype('float64')
-
-
- # More complicated data type: each array element is a combination of
- # an integer and a floating point number
- >>> np.array([(1, 2.0), (3, 4.0)], dtype=[('x', int), ('y', float)])
- array([(1, 2.0), (3, 4.0)],
- dtype=[('x', '<i4'), ('y', '<f8')])
-
- Fast element-wise operations, called a :term:`ufunc`, operate on arrays.
-
- array_like
- Any sequence that can be interpreted as an ndarray. This includes
- nested lists, tuples, scalars and existing arrays.
-
- attribute
- A property of an object that can be accessed using ``obj.attribute``,
- e.g., ``shape`` is an attribute of an array::
-
- >>> x = np.array([1, 2, 3])
- >>> x.shape
- (3,)
-
- big-endian
- When storing a multi-byte value in memory as a sequence of bytes, the
- sequence addresses/sends/stores the most significant byte first (lowest
- address) and the least significant byte last (highest address). Common in
- micro-processors and used for transmission of data over network protocols.
-
- BLAS
- `Basic Linear Algebra Subprograms <https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms>`_
-
- broadcast
- NumPy can do operations on arrays whose shapes are mismatched::
-
- >>> x = np.array([1, 2])
- >>> y = np.array([[3], [4]])
-
- >>> x
- array([1, 2])
-
- >>> y
- array([[3],
- [4]])
-
- >>> x + y
- array([[4, 5],
- [5, 6]])
-
- See `numpy.doc.broadcasting` for more information.
-
- C order
- See `row-major`
-
- column-major
- A way to represent items in a N-dimensional array in the 1-dimensional
- computer memory. In column-major order, the leftmost index "varies the
- fastest": for example the array::
-
- [[1, 2, 3],
- [4, 5, 6]]
-
- is represented in the column-major order as::
-
- [1, 4, 2, 5, 3, 6]
-
- Column-major order is also known as the Fortran order, as the Fortran
- programming language uses it.
-
- decorator
- An operator that transforms a function. For example, a ``log``
- decorator may be defined to print debugging information upon
- function execution::
-
- >>> def log(f):
- ... def new_logging_func(*args, **kwargs):
- ... print("Logging call with parameters:", args, kwargs)
- ... return f(*args, **kwargs)
- ...
- ... return new_logging_func
-
- Now, when we define a function, we can "decorate" it using ``log``::
-
- >>> @log
- ... def add(a, b):
- ... return a + b
-
- Calling ``add`` then yields:
-
- >>> add(1, 2)
- Logging call with parameters: (1, 2) {}
- 3
-
- dictionary
- Resembling a language dictionary, which provides a mapping between
- words and descriptions thereof, a Python dictionary is a mapping
- between two objects::
-
- >>> x = {1: 'one', 'two': [1, 2]}
-
- Here, `x` is a dictionary mapping keys to values, in this case
- the integer 1 to the string "one", and the string "two" to
- the list ``[1, 2]``. The values may be accessed using their
- corresponding keys::
-
- >>> x[1]
- 'one'
-
- >>> x['two']
- [1, 2]
-
- Note that dictionaries are not stored in any specific order. Also,
- most mutable (see *immutable* below) objects, such as lists, may not
- be used as keys.
-
- For more information on dictionaries, read the
- `Python tutorial <https://docs.python.org/tutorial/>`_.
-
- field
- In a :term:`structured data type`, each sub-type is called a `field`.
- The `field` has a name (a string), a type (any valid dtype), and
- an optional `title`. See :ref:`arrays.dtypes`
-
- Fortran order
- See `column-major`
-
- flattened
- Collapsed to a one-dimensional array. See `numpy.ndarray.flatten`
- for details.
-
- homogeneous
- Describes a block of memory comprised of blocks, each block comprised of
- items and of the same size, and blocks are interpreted in exactly the
- same way. In the simplest case each block contains a single item, for
- instance int32 or float64.
-
- immutable
- An object that cannot be modified after execution is called
- immutable. Two common examples are strings and tuples.
-
- instance
- A class definition gives the blueprint for constructing an object::
-
- >>> class House:
- ... wall_colour = 'white'
-
- Yet, we have to *build* a house before it exists::
-
- >>> h = House() # build a house
-
- Now, ``h`` is called a ``House`` instance. An instance is therefore
- a specific realisation of a class.
-
- iterable
- A sequence that allows "walking" (iterating) over items, typically
- using a loop such as::
-
- >>> x = [1, 2, 3]
- >>> [item**2 for item in x]
- [1, 4, 9]
-
- It is often used in combination with ``enumerate``::
- >>> keys = ['a','b','c']
- >>> for n, k in enumerate(keys):
- ... print("Key %d: %s" % (n, k))
- ...
- Key 0: a
- Key 1: b
- Key 2: c
-
- itemsize
- The size of the dtype element in bytes.
-
- list
- A Python container that can hold any number of objects or items.
- The items do not have to be of the same type, and can even be
- lists themselves::
-
- >>> x = [2, 2.0, "two", [2, 2.0]]
-
- The list `x` contains 4 items, each which can be accessed individually::
-
- >>> x[2] # the string 'two'
- 'two'
-
- >>> x[3] # a list, containing an integer 2 and a float 2.0
- [2, 2.0]
-
- It is also possible to select more than one item at a time,
- using *slicing*::
-
- >>> x[0:2] # or, equivalently, x[:2]
- [2, 2.0]
-
- In code, arrays are often conveniently expressed as nested lists::
-
-
- >>> np.array([[1, 2], [3, 4]])
- array([[1, 2],
- [3, 4]])
-
- For more information, read the section on lists in the `Python
- tutorial <https://docs.python.org/tutorial/>`_. For a mapping
- type (key-value), see *dictionary*.
-
- little-endian
- When storing a multi-byte value in memory as a sequence of bytes, the
- sequence addresses/sends/stores the least significant byte first (lowest
- address) and the most significant byte last (highest address). Common in
- x86 processors.
-
- mask
- A boolean array, used to select only certain elements for an operation::
-
- >>> x = np.arange(5)
- >>> x
- array([0, 1, 2, 3, 4])
-
- >>> mask = (x > 2)
- >>> mask
- array([False, False, False, True, True])
-
- >>> x[mask] = -1
- >>> x
- array([ 0, 1, 2, -1, -1])
-
- masked array
- Array that suppresses values indicated by a mask::
-
- >>> x = np.ma.masked_array([np.nan, 2, np.nan], [True, False, True])
- >>> x
- masked_array(data = [-- 2.0 --],
- mask = [ True False True],
- fill_value = 1e+20)
-
- >>> x + [1, 2, 3]
- masked_array(data = [-- 4.0 --],
- mask = [ True False True],
- fill_value = 1e+20)
-
-
- Masked arrays are often used when operating on arrays containing
- missing or invalid entries.
-
- matrix
- A 2-dimensional ndarray that preserves its two-dimensional nature
- throughout operations. It has certain special operations, such as ``*``
- (matrix multiplication) and ``**`` (matrix power), defined::
-
- >>> x = np.mat([[1, 2], [3, 4]])
- >>> x
- matrix([[1, 2],
- [3, 4]])
-
- >>> x**2
- matrix([[ 7, 10],
- [15, 22]])
-
- method
- A function associated with an object. For example, each ndarray has a
- method called ``repeat``::
-
- >>> x = np.array([1, 2, 3])
- >>> x.repeat(2)
- array([1, 1, 2, 2, 3, 3])
-
- ndarray
- See *array*.
-
- record array
- An :term:`ndarray` with :term:`structured data type` which has been
- subclassed as ``np.recarray`` and whose dtype is of type ``np.record``,
- making the fields of its data type to be accessible by attribute.
-
- reference
- If ``a`` is a reference to ``b``, then ``(a is b) == True``. Therefore,
- ``a`` and ``b`` are different names for the same Python object.
-
- row-major
- A way to represent items in a N-dimensional array in the 1-dimensional
- computer memory. In row-major order, the rightmost index "varies
- the fastest": for example the array::
-
- [[1, 2, 3],
- [4, 5, 6]]
-
- is represented in the row-major order as::
-
- [1, 2, 3, 4, 5, 6]
-
- Row-major order is also known as the C order, as the C programming
- language uses it. New NumPy arrays are by default in row-major order.
-
- self
- Often seen in method signatures, ``self`` refers to the instance
- of the associated class. For example:
-
- >>> class Paintbrush:
- ... color = 'blue'
- ...
- ... def paint(self):
- ... print("Painting the city %s!" % self.color)
- ...
- >>> p = Paintbrush()
- >>> p.color = 'red'
- >>> p.paint() # self refers to 'p'
- Painting the city red!
-
- slice
- Used to select only certain elements from a sequence:
-
- >>> x = range(5)
- >>> x
- [0, 1, 2, 3, 4]
-
- >>> x[1:3] # slice from 1 to 3 (excluding 3 itself)
- [1, 2]
-
- >>> x[1:5:2] # slice from 1 to 5, but skipping every second element
- [1, 3]
-
- >>> x[::-1] # slice a sequence in reverse
- [4, 3, 2, 1, 0]
-
- Arrays may have more than one dimension, each which can be sliced
- individually:
-
- >>> x = np.array([[1, 2], [3, 4]])
- >>> x
- array([[1, 2],
- [3, 4]])
-
- >>> x[:, 1]
- array([2, 4])
-
- structure
- See :term:`structured data type`
-
- structured data type
- A data type composed of other datatypes
-
- subarray data type
- A :term:`structured data type` may contain a :term:`ndarray` with its
- own dtype and shape:
-
- >>> dt = np.dtype([('a', np.int32), ('b', np.float32, (3,))])
- >>> np.zeros(3, dtype=dt)
- array([(0, [0., 0., 0.]), (0, [0., 0., 0.]), (0, [0., 0., 0.])],
- dtype=[('a', '<i4'), ('b', '<f4', (3,))])
-
- title
- In addition to field names, structured array fields may have an
- associated :ref:`title <titles>` which is an alias to the name and is
- commonly used for plotting.
-
- tuple
- A sequence that may contain a variable number of types of any
- kind. A tuple is immutable, i.e., once constructed it cannot be
- changed. Similar to a list, it can be indexed and sliced::
-
- >>> x = (1, 'one', [1, 2])
- >>> x
- (1, 'one', [1, 2])
-
- >>> x[0]
- 1
-
- >>> x[:2]
- (1, 'one')
-
- A useful concept is "tuple unpacking", which allows variables to
- be assigned to the contents of a tuple::
-
- >>> x, y = (1, 2)
- >>> x, y = 1, 2
-
- This is often used when a function returns multiple values:
-
- >>> def return_many():
- ... return 1, 'alpha', None
-
- >>> a, b, c = return_many()
- >>> a, b, c
- (1, 'alpha', None)
-
- >>> a
- 1
- >>> b
- 'alpha'
-
- ufunc
- Universal function. A fast element-wise, :term:`vectorized
- <vectorization>` array operation. Examples include ``add``, ``sin`` and
- ``logical_or``.
-
- vectorization
- Optimizing a looping block by specialized code. In a traditional sense,
- vectorization performs the same operation on multiple elements with
- fixed strides between them via specialized hardware. Compilers know how
- to take advantage of well-constructed loops to implement such
- optimizations. NumPy uses :ref:`vectorization <whatis-vectorization>`
- to mean any optimization via specialized code performing the same
- operations on multiple elements, typically achieving speedups by
- avoiding some of the overhead in looking up and converting the elements.
-
- view
- An array that does not own its data, but refers to another array's
- data instead. For example, we may create a view that only shows
- every second element of another array::
-
- >>> x = np.arange(5)
- >>> x
- array([0, 1, 2, 3, 4])
-
- >>> y = x[::2]
- >>> y
- array([0, 2, 4])
-
- >>> x[0] = 3 # changing x changes y as well, since y is a view on x
- >>> y
- array([3, 2, 4])
-
- wrapper
- Python is a high-level (highly abstracted, or English-like) language.
- This abstraction comes at a price in execution speed, and sometimes
- it becomes necessary to use lower level languages to do fast
- computations. A wrapper is code that provides a bridge between
- high and the low level languages, allowing, e.g., Python to execute
- code written in C or Fortran.
-
- Examples include ctypes, SWIG and Cython (which wraps C and C++)
- and f2py (which wraps Fortran).
-
-"""
diff --git a/numpy/doc/indexing.py b/numpy/doc/indexing.py
deleted file mode 100644
index c7dda2790..000000000
--- a/numpy/doc/indexing.py
+++ /dev/null
@@ -1,456 +0,0 @@
-"""
-==============
-Array indexing
-==============
-
-Array indexing refers to any use of the square brackets ([]) to index
-array values. There are many options to indexing, which give numpy
-indexing great power, but with power comes some complexity and the
-potential for confusion. This section is just an overview of the
-various options and issues related to indexing. Aside from single
-element indexing, the details on most of these options are to be
-found in related sections.
-
-Assignment vs referencing
-=========================
-
-Most of the following examples show the use of indexing when
-referencing data in an array. The examples work just as well
-when assigning to an array. See the section at the end for
-specific examples and explanations on how assignments work.
-
-Single element indexing
-=======================
-
-Single element indexing for a 1-D array is what one expects. It works
-exactly like that for other standard Python sequences. It is 0-based,
-and accepts negative indices for indexing from the end of the array. ::
-
- >>> x = np.arange(10)
- >>> x[2]
- 2
- >>> x[-2]
- 8
-
-Unlike lists and tuples, numpy arrays support multidimensional indexing
-for multidimensional arrays. That means that it is not necessary to
-separate each dimension's index into its own set of square brackets. ::
-
- >>> x.shape = (2,5) # now x is 2-dimensional
- >>> x[1,3]
- 8
- >>> x[1,-1]
- 9
-
-Note that if one indexes a multidimensional array with fewer indices
-than dimensions, one gets a subdimensional array. For example: ::
-
- >>> x[0]
- array([0, 1, 2, 3, 4])
-
-That is, each index specified selects the array corresponding to the
-rest of the dimensions selected. In the above example, choosing 0
-means that the remaining dimension of length 5 is being left unspecified,
-and that what is returned is an array of that dimensionality and size.
-It must be noted that the returned array is not a copy of the original,
-but points to the same values in memory as does the original array.
-In this case, the 1-D array at the first position (0) is returned.
-So using a single index on the returned array, results in a single
-element being returned. That is: ::
-
- >>> x[0][2]
- 2
-
-So note that ``x[0,2] = x[0][2]`` though the second case is more
-inefficient as a new temporary array is created after the first index
-that is subsequently indexed by 2.
-
-Note to those used to IDL or Fortran memory order as it relates to
-indexing. NumPy uses C-order indexing. That means that the last
-index usually represents the most rapidly changing memory location,
-unlike Fortran or IDL, where the first index represents the most
-rapidly changing location in memory. This difference represents a
-great potential for confusion.
-
-Other indexing options
-======================
-
-It is possible to slice and stride arrays to extract arrays of the
-same number of dimensions, but of different sizes than the original.
-The slicing and striding works exactly the same way it does for lists
-and tuples except that they can be applied to multiple dimensions as
-well. A few examples illustrate best: ::
-
- >>> x = np.arange(10)
- >>> x[2:5]
- array([2, 3, 4])
- >>> x[:-7]
- array([0, 1, 2])
- >>> x[1:7:2]
- array([1, 3, 5])
- >>> y = np.arange(35).reshape(5,7)
- >>> y[1:5:2,::3]
- array([[ 7, 10, 13],
- [21, 24, 27]])
-
-Note that slices of arrays do not copy the internal array data but
-only produce new views of the original data. This is different from
-list or tuple slicing and an explicit ``copy()`` is recommended if
-the original data is not required anymore.
-
-It is possible to index arrays with other arrays for the purposes of
-selecting lists of values out of arrays into new arrays. There are
-two different ways of accomplishing this. One uses one or more arrays
-of index values. The other involves giving a boolean array of the proper
-shape to indicate the values to be selected. Index arrays are a very
-powerful tool that allow one to avoid looping over individual elements in
-arrays and thus greatly improve performance.
-
-It is possible to use special features to effectively increase the
-number of dimensions in an array through indexing so the resulting
-array acquires the shape needed for use in an expression or with a
-specific function.
-
-Index arrays
-============
-
-NumPy arrays may be indexed with other arrays (or any other sequence-
-like object that can be converted to an array, such as lists, with the
-exception of tuples; see the end of this document for why this is). The
-use of index arrays ranges from simple, straightforward cases to
-complex, hard-to-understand cases. For all cases of index arrays, what
-is returned is a copy of the original data, not a view as one gets for
-slices.
-
-Index arrays must be of integer type. Each value in the array indicates
-which value in the array to use in place of the index. To illustrate: ::
-
- >>> x = np.arange(10,1,-1)
- >>> x
- array([10, 9, 8, 7, 6, 5, 4, 3, 2])
- >>> x[np.array([3, 3, 1, 8])]
- array([7, 7, 9, 2])
-
-
-The index array consisting of the values 3, 3, 1 and 8 correspondingly
-create an array of length 4 (same as the index array) where each index
-is replaced by the value the index array has in the array being indexed.
-
-Negative values are permitted and work as they do with single indices
-or slices: ::
-
- >>> x[np.array([3,3,-3,8])]
- array([7, 7, 4, 2])
-
-It is an error to have index values out of bounds: ::
-
- >>> x[np.array([3, 3, 20, 8])]
- <type 'exceptions.IndexError'>: index 20 out of bounds 0<=index<9
-
-Generally speaking, what is returned when index arrays are used is
-an array with the same shape as the index array, but with the type
-and values of the array being indexed. As an example, we can use a
-multidimensional index array instead: ::
-
- >>> x[np.array([[1,1],[2,3]])]
- array([[9, 9],
- [8, 7]])
-
-Indexing Multi-dimensional arrays
-=================================
-
-Things become more complex when multidimensional arrays are indexed,
-particularly with multidimensional index arrays. These tend to be
-more unusual uses, but they are permitted, and they are useful for some
-problems. We'll start with the simplest multidimensional case (using
-the array y from the previous examples): ::
-
- >>> y[np.array([0,2,4]), np.array([0,1,2])]
- array([ 0, 15, 30])
-
-In this case, if the index arrays have a matching shape, and there is
-an index array for each dimension of the array being indexed, the
-resultant array has the same shape as the index arrays, and the values
-correspond to the index set for each position in the index arrays. In
-this example, the first index value is 0 for both index arrays, and
-thus the first value of the resultant array is y[0,0]. The next value
-is y[2,1], and the last is y[4,2].
-
-If the index arrays do not have the same shape, there is an attempt to
-broadcast them to the same shape. If they cannot be broadcast to the
-same shape, an exception is raised: ::
-
- >>> y[np.array([0,2,4]), np.array([0,1])]
- <type 'exceptions.ValueError'>: shape mismatch: objects cannot be
- broadcast to a single shape
-
-The broadcasting mechanism permits index arrays to be combined with
-scalars for other indices. The effect is that the scalar value is used
-for all the corresponding values of the index arrays: ::
-
- >>> y[np.array([0,2,4]), 1]
- array([ 1, 15, 29])
-
-Jumping to the next level of complexity, it is possible to only
-partially index an array with index arrays. It takes a bit of thought
-to understand what happens in such cases. For example if we just use
-one index array with y: ::
-
- >>> y[np.array([0,2,4])]
- array([[ 0, 1, 2, 3, 4, 5, 6],
- [14, 15, 16, 17, 18, 19, 20],
- [28, 29, 30, 31, 32, 33, 34]])
-
-What results is the construction of a new array where each value of
-the index array selects one row from the array being indexed and the
-resultant array has the resulting shape (number of index elements,
-size of row).
-
-An example of where this may be useful is for a color lookup table
-where we want to map the values of an image into RGB triples for
-display. The lookup table could have a shape (nlookup, 3). Indexing
-such an array with an image with shape (ny, nx) with dtype=np.uint8
- (or any integer type so long as values are within the bounds of the
-lookup table) will result in an array of shape (ny, nx, 3) where a
-triple of RGB values is associated with each pixel location.
-
-In general, the shape of the resultant array will be the concatenation
-of the shape of the index array (or the shape that all the index arrays
-were broadcast to) with the shape of any unused dimensions (those not
-indexed) in the array being indexed.
-
-Boolean or "mask" index arrays
-==============================
-
-Boolean arrays used as indices are treated in a different manner
-entirely than index arrays. Boolean arrays must be of the same shape
-as the initial dimensions of the array being indexed. In the
-most straightforward case, the boolean array has the same shape: ::
-
- >>> b = y>20
- >>> y[b]
- array([21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34])
-
-Unlike in the case of integer index arrays, in the boolean case, the
-result is a 1-D array containing all the elements in the indexed array
-corresponding to all the true elements in the boolean array. The
-elements in the indexed array are always iterated and returned in
-:term:`row-major` (C-style) order. The result is also identical to
-``y[np.nonzero(b)]``. As with index arrays, what is returned is a copy
-of the data, not a view as one gets with slices.
-
-The result will be multidimensional if y has more dimensions than b.
-For example: ::
-
- >>> b[:,5] # use a 1-D boolean whose first dim agrees with the first dim of y
- array([False, False, False, True, True])
- >>> y[b[:,5]]
- array([[21, 22, 23, 24, 25, 26, 27],
- [28, 29, 30, 31, 32, 33, 34]])
-
-Here the 4th and 5th rows are selected from the indexed array and
-combined to make a 2-D array.
-
-In general, when the boolean array has fewer dimensions than the array
-being indexed, this is equivalent to y[b, ...], which means
-y is indexed by b followed by as many : as are needed to fill
-out the rank of y.
-Thus the shape of the result is one dimension containing the number
-of True elements of the boolean array, followed by the remaining
-dimensions of the array being indexed.
-
-For example, using a 2-D boolean array of shape (2,3)
-with four True elements to select rows from a 3-D array of shape
-(2,3,5) results in a 2-D result of shape (4,5): ::
-
- >>> x = np.arange(30).reshape(2,3,5)
- >>> x
- array([[[ 0, 1, 2, 3, 4],
- [ 5, 6, 7, 8, 9],
- [10, 11, 12, 13, 14]],
- [[15, 16, 17, 18, 19],
- [20, 21, 22, 23, 24],
- [25, 26, 27, 28, 29]]])
- >>> b = np.array([[True, True, False], [False, True, True]])
- >>> x[b]
- array([[ 0, 1, 2, 3, 4],
- [ 5, 6, 7, 8, 9],
- [20, 21, 22, 23, 24],
- [25, 26, 27, 28, 29]])
-
-For further details, consult the numpy reference documentation on array indexing.
-
-Combining index arrays with slices
-==================================
-
-Index arrays may be combined with slices. For example: ::
-
- >>> y[np.array([0, 2, 4]), 1:3]
- array([[ 1, 2],
- [15, 16],
- [29, 30]])
-
-In effect, the slice and index array operation are independent.
-The slice operation extracts columns with index 1 and 2,
-(i.e. the 2nd and 3rd columns),
-followed by the index array operation which extracts rows with
-index 0, 2 and 4 (i.e the first, third and fifth rows).
-
-This is equivalent to::
-
- >>> y[:, 1:3][np.array([0, 2, 4]), :]
- array([[ 1, 2],
- [15, 16],
- [29, 30]])
-
-Likewise, slicing can be combined with broadcasted boolean indices: ::
-
- >>> b = y > 20
- >>> b
- array([[False, False, False, False, False, False, False],
- [False, False, False, False, False, False, False],
- [False, False, False, False, False, False, False],
- [ True, True, True, True, True, True, True],
- [ True, True, True, True, True, True, True]])
- >>> y[b[:,5],1:3]
- array([[22, 23],
- [29, 30]])
-
-Structural indexing tools
-=========================
-
-To facilitate easy matching of array shapes with expressions and in
-assignments, the np.newaxis object can be used within array indices
-to add new dimensions with a size of 1. For example: ::
-
- >>> y.shape
- (5, 7)
- >>> y[:,np.newaxis,:].shape
- (5, 1, 7)
-
-Note that there are no new elements in the array, just that the
-dimensionality is increased. This can be handy to combine two
-arrays in a way that otherwise would require explicitly reshaping
-operations. For example: ::
-
- >>> x = np.arange(5)
- >>> x[:,np.newaxis] + x[np.newaxis,:]
- array([[0, 1, 2, 3, 4],
- [1, 2, 3, 4, 5],
- [2, 3, 4, 5, 6],
- [3, 4, 5, 6, 7],
- [4, 5, 6, 7, 8]])
-
-The ellipsis syntax may be used to indicate selecting in full any
-remaining unspecified dimensions. For example: ::
-
- >>> z = np.arange(81).reshape(3,3,3,3)
- >>> z[1,...,2]
- array([[29, 32, 35],
- [38, 41, 44],
- [47, 50, 53]])
-
-This is equivalent to: ::
-
- >>> z[1,:,:,2]
- array([[29, 32, 35],
- [38, 41, 44],
- [47, 50, 53]])
-
-Assigning values to indexed arrays
-==================================
-
-As mentioned, one can select a subset of an array to assign to using
-a single index, slices, and index and mask arrays. The value being
-assigned to the indexed array must be shape consistent (the same shape
-or broadcastable to the shape the index produces). For example, it is
-permitted to assign a constant to a slice: ::
-
- >>> x = np.arange(10)
- >>> x[2:7] = 1
-
-or an array of the right size: ::
-
- >>> x[2:7] = np.arange(5)
-
-Note that assignments may result in changes if assigning
-higher types to lower types (like floats to ints) or even
-exceptions (assigning complex to floats or ints): ::
-
- >>> x[1] = 1.2
- >>> x[1]
- 1
- >>> x[1] = 1.2j
- TypeError: can't convert complex to int
-
-
-Unlike some of the references (such as array and mask indices)
-assignments are always made to the original data in the array
-(indeed, nothing else would make sense!). Note though, that some
-actions may not work as one may naively expect. This particular
-example is often surprising to people: ::
-
- >>> x = np.arange(0, 50, 10)
- >>> x
- array([ 0, 10, 20, 30, 40])
- >>> x[np.array([1, 1, 3, 1])] += 1
- >>> x
- array([ 0, 11, 20, 31, 40])
-
-Where people expect that the 1st location will be incremented by 3.
-In fact, it will only be incremented by 1. The reason is because
-a new array is extracted from the original (as a temporary) containing
-the values at 1, 1, 3, 1, then the value 1 is added to the temporary,
-and then the temporary is assigned back to the original array. Thus
-the value of the array at x[1]+1 is assigned to x[1] three times,
-rather than being incremented 3 times.
-
-Dealing with variable numbers of indices within programs
-========================================================
-
-The index syntax is very powerful but limiting when dealing with
-a variable number of indices. For example, if you want to write
-a function that can handle arguments with various numbers of
-dimensions without having to write special case code for each
-number of possible dimensions, how can that be done? If one
-supplies to the index a tuple, the tuple will be interpreted
-as a list of indices. For example (using the previous definition
-for the array z): ::
-
- >>> indices = (1,1,1,1)
- >>> z[indices]
- 40
-
-So one can use code to construct tuples of any number of indices
-and then use these within an index.
-
-Slices can be specified within programs by using the slice() function
-in Python. For example: ::
-
- >>> indices = (1,1,1,slice(0,2)) # same as [1,1,1,0:2]
- >>> z[indices]
- array([39, 40])
-
-Likewise, ellipsis can be specified by code by using the Ellipsis
-object: ::
-
- >>> indices = (1, Ellipsis, 1) # same as [1,...,1]
- >>> z[indices]
- array([[28, 31, 34],
- [37, 40, 43],
- [46, 49, 52]])
-
-For this reason it is possible to use the output from the np.nonzero()
-function directly as an index since it always returns a tuple of index
-arrays.
-
-Because of the special treatment of tuples, they are not automatically
-converted to an array as a list would be. As an example: ::
-
- >>> z[[1,1,1,1]] # produces a large array
- array([[[[27, 28, 29],
- [30, 31, 32], ...
- >>> z[(1,1,1,1)] # returns a single value
- 40
-
-"""
diff --git a/numpy/doc/internals.py b/numpy/doc/internals.py
deleted file mode 100644
index 6718f1108..000000000
--- a/numpy/doc/internals.py
+++ /dev/null
@@ -1,162 +0,0 @@
-"""
-===============
-Array Internals
-===============
-
-Internal organization of numpy arrays
-=====================================
-
-It helps to understand a bit about how numpy arrays are handled under the covers in order to understand numpy better. This section will not go into great detail. Those wishing to understand the full details are referred to Travis Oliphant's book "Guide to NumPy".
-
-NumPy arrays consist of two major components, the raw array data (from now on,
-referred to as the data buffer), and the information about the raw array data.
-The data buffer is typically what people think of as arrays in C or Fortran,
-a contiguous (and fixed) block of memory containing fixed sized data items.
-NumPy also contains a significant set of data that describes how to interpret
-the data in the data buffer. This extra information contains (among other things):
-
- 1) The basic data element's size in bytes
- 2) The start of the data within the data buffer (an offset relative to the
- beginning of the data buffer).
- 3) The number of dimensions and the size of each dimension
- 4) The separation between elements for each dimension (the 'stride'). This
- does not have to be a multiple of the element size
- 5) The byte order of the data (which may not be the native byte order)
- 6) Whether the buffer is read-only
- 7) Information (via the dtype object) about the interpretation of the basic
- data element. The basic data element may be as simple as a int or a float,
- or it may be a compound object (e.g., struct-like), a fixed character field,
- or Python object pointers.
- 8) Whether the array is to be interpreted as C-order or Fortran-order.
-
-This arrangement allows for very flexible use of arrays. One thing that it allows
-is simple changes of the metadata to change the interpretation of the array buffer.
-Changing the byteorder of the array is a simple change involving no rearrangement
-of the data. The shape of the array can be changed very easily without changing
-anything in the data buffer or any data copying at all.
-
-Among other things that are made possible is one can create a new array metadata
-object that uses the same data buffer
-to create a new view of that data buffer that has a different interpretation
-of the buffer (e.g., different shape, offset, byte order, strides, etc) but
-shares the same data bytes. Many operations in numpy do just this such as
-slices. Other operations, such as transpose, don't move data elements
-around in the array, but rather change the information about the shape and strides so that the indexing of the array changes, but the data in the buffer doesn't move.
-
-Typically these new versions of the array metadata, which share the same data
-buffer, are new 'views' into the data buffer. There is a different ndarray object, but it
-uses the same data buffer. This is why it is necessary to force copies through
-use of the .copy() method if one really wants to make a new and independent
-copy of the data buffer.
-
-New views into arrays mean the object reference counts for the data buffer
-increase. Simply doing away with the original array object will not remove the
-data buffer if other views of it still exist.
-
-Multidimensional Array Indexing Order Issues
-============================================
-
-What is the right way to index
-multi-dimensional arrays? Before you jump to conclusions about the one and
-true way to index multi-dimensional arrays, it pays to understand why this is
-a confusing issue. This section will try to explain in detail how numpy
-indexing works and why we adopt the convention we do for images, and when it
-may be appropriate to adopt other conventions.
-
-The first thing to understand is
-that there are two conflicting conventions for indexing 2-dimensional arrays.
-Matrix notation uses the first index to indicate which row is being selected and
-the second index to indicate which column is selected. This is opposite the
-geometrically oriented-convention for images where people generally think the
-first index represents x position (i.e., column) and the second represents y
-position (i.e., row). This alone is the source of much confusion;
-matrix-oriented users and image-oriented users expect two different things with
-regard to indexing.
-
-The second issue to understand is how indices correspond
-to the order the array is stored in memory. In Fortran the first index is the
-most rapidly varying index when moving through the elements of a two
-dimensional array as it is stored in memory. If you adopt the matrix
-convention for indexing, then this means the matrix is stored one column at a
-time (since the first index moves to the next row as it changes). Thus Fortran
-is considered a Column-major language. C has just the opposite convention. In
-C, the last index changes most rapidly as one moves through the array as
-stored in memory. Thus C is a Row-major language. The matrix is stored by
-rows. Note that in both cases it presumes that the matrix convention for
-indexing is being used, i.e., for both Fortran and C, the first index is the
-row. Note this convention implies that the indexing convention is invariant
-and that the data order changes to keep that so.
-
-But that's not the only way
-to look at it. Suppose one has large two-dimensional arrays (images or
-matrices) stored in data files. Suppose the data are stored by rows rather than
-by columns. If we are to preserve our index convention (whether matrix or
-image) that means that depending on the language we use, we may be forced to
-reorder the data if it is read into memory to preserve our indexing
-convention. For example if we read row-ordered data into memory without
-reordering, it will match the matrix indexing convention for C, but not for
-Fortran. Conversely, it will match the image indexing convention for Fortran,
-but not for C. For C, if one is using data stored in row order, and one wants
-to preserve the image index convention, the data must be reordered when
-reading into memory.
-
-In the end, which you do for Fortran or C depends on
-which is more important, not reordering data or preserving the indexing
-convention. For large images, reordering data is potentially expensive, and
-often the indexing convention is inverted to avoid that.
-
-The situation with
-numpy makes this issue yet more complicated. The internal machinery of numpy
-arrays is flexible enough to accept any ordering of indices. One can simply
-reorder indices by manipulating the internal stride information for arrays
-without reordering the data at all. NumPy will know how to map the new index
-order to the data without moving the data.
-
-So if this is true, why not choose
-the index order that matches what you most expect? In particular, why not define
-row-ordered images to use the image convention? (This is sometimes referred
-to as the Fortran convention vs the C convention, thus the 'C' and 'FORTRAN'
-order options for array ordering in numpy.) The drawback of doing this is
-potential performance penalties. It's common to access the data sequentially,
-either implicitly in array operations or explicitly by looping over rows of an
-image. When that is done, then the data will be accessed in non-optimal order.
-As the first index is incremented, what is actually happening is that elements
-spaced far apart in memory are being sequentially accessed, with usually poor
-memory access speeds. For example, for a two dimensional image 'im' defined so
-that im[0, 10] represents the value at x=0, y=10. To be consistent with usual
-Python behavior then im[0] would represent a column at x=0. Yet that data
-would be spread over the whole array since the data are stored in row order.
-Despite the flexibility of numpy's indexing, it can't really paper over the fact
-basic operations are rendered inefficient because of data order or that getting
-contiguous subarrays is still awkward (e.g., im[:,0] for the first row, vs
-im[0]). Thus one can't use an idiom such as ``for row in im``; using
-``for col in im`` does work, but doesn't yield contiguous column data.
-
-As it turns out, numpy is
-smart enough when dealing with ufuncs to determine which index is the most
-rapidly varying one in memory and uses that for the innermost loop. Thus for
-ufuncs there is no large intrinsic advantage to either approach in most cases.
-On the other hand, use of .flat with a FORTRAN ordered array will lead to
-non-optimal memory access as adjacent elements in the flattened array (iterator,
-actually) are not contiguous in memory.
-
-Indeed, the fact is that Python
-indexing on lists and other sequences naturally leads to an outside-to inside
-ordering (the first index gets the largest grouping, the next the next largest,
-and the last gets the smallest element). Since image data are normally stored
-by rows, this corresponds to position within rows being the last item indexed.
-
-If you do want to use Fortran ordering realize that
-there are two approaches to consider: 1) accept that the first index is just not
-the most rapidly changing in memory and have all your I/O routines reorder
-your data when going from memory to disk or vice versa, or use numpy's
-mechanism for mapping the first index to the most rapidly varying data. We
-recommend the former if possible. The disadvantage of the latter is that many
-of numpy's functions will yield arrays without Fortran ordering unless you are
-careful to use the 'order' keyword. Doing this would be highly inconvenient.
-
-Otherwise we recommend simply learning to reverse the usual order of indices
-when accessing elements of an array. Granted, it goes against the grain, but
-it is more in line with Python semantics and the natural order of the data.
-
-"""
diff --git a/numpy/doc/misc.py b/numpy/doc/misc.py
deleted file mode 100644
index fc1c4cd01..000000000
--- a/numpy/doc/misc.py
+++ /dev/null
@@ -1,226 +0,0 @@
-"""
-=============
-Miscellaneous
-=============
-
-IEEE 754 Floating Point Special Values
---------------------------------------
-
-Special values defined in numpy: nan, inf,
-
-NaNs can be used as a poor-man's mask (if you don't care what the
-original value was)
-
-Note: cannot use equality to test NaNs. E.g.: ::
-
- >>> myarr = np.array([1., 0., np.nan, 3.])
- >>> np.nonzero(myarr == np.nan)
- (array([], dtype=int64),)
- >>> np.nan == np.nan # is always False! Use special numpy functions instead.
- False
- >>> myarr[myarr == np.nan] = 0. # doesn't work
- >>> myarr
- array([ 1., 0., NaN, 3.])
- >>> myarr[np.isnan(myarr)] = 0. # use this instead
- >>> myarr
- array([ 1., 0., 0., 3.])
-
-Other related special value functions: ::
-
- isinf(): True if value is inf
- isfinite(): True if not nan or inf
- nan_to_num(): Map nan to 0, inf to max float, -inf to min float
-
-The following corresponds to the usual functions except that nans are excluded
-from the results: ::
-
- nansum()
- nanmax()
- nanmin()
- nanargmax()
- nanargmin()
-
- >>> x = np.arange(10.)
- >>> x[3] = np.nan
- >>> x.sum()
- nan
- >>> np.nansum(x)
- 42.0
-
-How numpy handles numerical exceptions
---------------------------------------
-
-The default is to ``'warn'`` for ``invalid``, ``divide``, and ``overflow``
-and ``'ignore'`` for ``underflow``. But this can be changed, and it can be
-set individually for different kinds of exceptions. The different behaviors
-are:
-
- - 'ignore' : Take no action when the exception occurs.
- - 'warn' : Print a `RuntimeWarning` (via the Python `warnings` module).
- - 'raise' : Raise a `FloatingPointError`.
- - 'call' : Call a function specified using the `seterrcall` function.
- - 'print' : Print a warning directly to ``stdout``.
- - 'log' : Record error in a Log object specified by `seterrcall`.
-
-These behaviors can be set for all kinds of errors or specific ones:
-
- - all : apply to all numeric exceptions
- - invalid : when NaNs are generated
- - divide : divide by zero (for integers as well!)
- - overflow : floating point overflows
- - underflow : floating point underflows
-
-Note that integer divide-by-zero is handled by the same machinery.
-These behaviors are set on a per-thread basis.
-
-Examples
---------
-
-::
-
- >>> oldsettings = np.seterr(all='warn')
- >>> np.zeros(5,dtype=np.float32)/0.
- invalid value encountered in divide
- >>> j = np.seterr(under='ignore')
- >>> np.array([1.e-100])**10
- >>> j = np.seterr(invalid='raise')
- >>> np.sqrt(np.array([-1.]))
- FloatingPointError: invalid value encountered in sqrt
- >>> def errorhandler(errstr, errflag):
- ... print("saw stupid error!")
- >>> np.seterrcall(errorhandler)
- <function err_handler at 0x...>
- >>> j = np.seterr(all='call')
- >>> np.zeros(5, dtype=np.int32)/0
- FloatingPointError: invalid value encountered in divide
- saw stupid error!
- >>> j = np.seterr(**oldsettings) # restore previous
- ... # error-handling settings
-
-Interfacing to C
-----------------
-Only a survey of the choices. Little detail on how each works.
-
-1) Bare metal, wrap your own C-code manually.
-
- - Plusses:
-
- - Efficient
- - No dependencies on other tools
-
- - Minuses:
-
- - Lots of learning overhead:
-
- - need to learn basics of Python C API
- - need to learn basics of numpy C API
- - need to learn how to handle reference counting and love it.
-
- - Reference counting often difficult to get right.
-
- - getting it wrong leads to memory leaks, and worse, segfaults
-
- - API will change for Python 3.0!
-
-2) Cython
-
- - Plusses:
-
- - avoid learning C API's
- - no dealing with reference counting
- - can code in pseudo python and generate C code
- - can also interface to existing C code
- - should shield you from changes to Python C api
- - has become the de-facto standard within the scientific Python community
- - fast indexing support for arrays
-
- - Minuses:
-
- - Can write code in non-standard form which may become obsolete
- - Not as flexible as manual wrapping
-
-3) ctypes
-
- - Plusses:
-
- - part of Python standard library
- - good for interfacing to existing sharable libraries, particularly
- Windows DLLs
- - avoids API/reference counting issues
- - good numpy support: arrays have all these in their ctypes
- attribute: ::
-
- a.ctypes.data a.ctypes.get_strides
- a.ctypes.data_as a.ctypes.shape
- a.ctypes.get_as_parameter a.ctypes.shape_as
- a.ctypes.get_data a.ctypes.strides
- a.ctypes.get_shape a.ctypes.strides_as
-
- - Minuses:
-
- - can't use for writing code to be turned into C extensions, only a wrapper
- tool.
-
-4) SWIG (automatic wrapper generator)
-
- - Plusses:
-
- - around a long time
- - multiple scripting language support
- - C++ support
- - Good for wrapping large (many functions) existing C libraries
-
- - Minuses:
-
- - generates lots of code between Python and the C code
- - can cause performance problems that are nearly impossible to optimize
- out
- - interface files can be hard to write
- - doesn't necessarily avoid reference counting issues or needing to know
- API's
-
-5) scipy.weave
-
- - Plusses:
-
- - can turn many numpy expressions into C code
- - dynamic compiling and loading of generated C code
- - can embed pure C code in Python module and have weave extract, generate
- interfaces and compile, etc.
-
- - Minuses:
-
- - Future very uncertain: it's the only part of Scipy not ported to Python 3
- and is effectively deprecated in favor of Cython.
-
-6) Psyco
-
- - Plusses:
-
- - Turns pure python into efficient machine code through jit-like
- optimizations
- - very fast when it optimizes well
-
- - Minuses:
-
- - Only on intel (windows?)
- - Doesn't do much for numpy?
-
-Interfacing to Fortran:
------------------------
-The clear choice to wrap Fortran code is
-`f2py <https://docs.scipy.org/doc/numpy/f2py/>`_.
-
-Pyfort is an older alternative, but not supported any longer.
-Fwrap is a newer project that looked promising but isn't being developed any
-longer.
-
-Interfacing to C++:
--------------------
- 1) Cython
- 2) CXX
- 3) Boost.python
- 4) SWIG
- 5) SIP (used mainly in PyQT)
-
-"""
diff --git a/numpy/doc/structured_arrays.py b/numpy/doc/structured_arrays.py
deleted file mode 100644
index 359d4f7f4..000000000
--- a/numpy/doc/structured_arrays.py
+++ /dev/null
@@ -1,646 +0,0 @@
-"""
-=================
-Structured Arrays
-=================
-
-Introduction
-============
-
-Structured arrays are ndarrays whose datatype is a composition of simpler
-datatypes organized as a sequence of named :term:`fields <field>`. For example,
-::
-
- >>> x = np.array([('Rex', 9, 81.0), ('Fido', 3, 27.0)],
- ... dtype=[('name', 'U10'), ('age', 'i4'), ('weight', 'f4')])
- >>> x
- array([('Rex', 9, 81.), ('Fido', 3, 27.)],
- dtype=[('name', 'U10'), ('age', '<i4'), ('weight', '<f4')])
-
-Here ``x`` is a one-dimensional array of length two whose datatype is a
-structure with three fields: 1. A string of length 10 or less named 'name', 2.
-a 32-bit integer named 'age', and 3. a 32-bit float named 'weight'.
-
-If you index ``x`` at position 1 you get a structure::
-
- >>> x[1]
- ('Fido', 3, 27.0)
-
-You can access and modify individual fields of a structured array by indexing
-with the field name::
-
- >>> x['age']
- array([9, 3], dtype=int32)
- >>> x['age'] = 5
- >>> x
- array([('Rex', 5, 81.), ('Fido', 5, 27.)],
- dtype=[('name', 'U10'), ('age', '<i4'), ('weight', '<f4')])
-
-Structured datatypes are designed to be able to mimic 'structs' in the C
-language, and share a similar memory layout. They are meant for interfacing with
-C code and for low-level manipulation of structured buffers, for example for
-interpreting binary blobs. For these purposes they support specialized features
-such as subarrays, nested datatypes, and unions, and allow control over the
-memory layout of the structure.
-
-Users looking to manipulate tabular data, such as stored in csv files, may find
-other pydata projects more suitable, such as xarray, pandas, or DataArray.
-These provide a high-level interface for tabular data analysis and are better
-optimized for that use. For instance, the C-struct-like memory layout of
-structured arrays in numpy can lead to poor cache behavior in comparison.
-
-.. _defining-structured-types:
-
-Structured Datatypes
-====================
-
-A structured datatype can be thought of as a sequence of bytes of a certain
-length (the structure's :term:`itemsize`) which is interpreted as a collection
-of fields. Each field has a name, a datatype, and a byte offset within the
-structure. The datatype of a field may be any numpy datatype including other
-structured datatypes, and it may also be a :term:`subarray data type` which
-behaves like an ndarray of a specified shape. The offsets of the fields are
-arbitrary, and fields may even overlap. These offsets are usually determined
-automatically by numpy, but can also be specified.
-
-Structured Datatype Creation
-----------------------------
-
-Structured datatypes may be created using the function :func:`numpy.dtype`.
-There are 4 alternative forms of specification which vary in flexibility and
-conciseness. These are further documented in the
-:ref:`Data Type Objects <arrays.dtypes.constructing>` reference page, and in
-summary they are:
-
-1. A list of tuples, one tuple per field
-
- Each tuple has the form ``(fieldname, datatype, shape)`` where shape is
- optional. ``fieldname`` is a string (or tuple if titles are used, see
- :ref:`Field Titles <titles>` below), ``datatype`` may be any object
- convertible to a datatype, and ``shape`` is a tuple of integers specifying
- subarray shape.
-
- >>> np.dtype([('x', 'f4'), ('y', np.float32), ('z', 'f4', (2, 2))])
- dtype([('x', '<f4'), ('y', '<f4'), ('z', '<f4', (2, 2))])
-
- If ``fieldname`` is the empty string ``''``, the field will be given a
- default name of the form ``f#``, where ``#`` is the integer index of the
- field, counting from 0 from the left::
-
- >>> np.dtype([('x', 'f4'), ('', 'i4'), ('z', 'i8')])
- dtype([('x', '<f4'), ('f1', '<i4'), ('z', '<i8')])
-
- The byte offsets of the fields within the structure and the total
- structure itemsize are determined automatically.
-
-2. A string of comma-separated dtype specifications
-
- In this shorthand notation any of the :ref:`string dtype specifications
- <arrays.dtypes.constructing>` may be used in a string and separated by
- commas. The itemsize and byte offsets of the fields are determined
- automatically, and the field names are given the default names ``f0``,
- ``f1``, etc. ::
-
- >>> np.dtype('i8, f4, S3')
- dtype([('f0', '<i8'), ('f1', '<f4'), ('f2', 'S3')])
- >>> np.dtype('3int8, float32, (2, 3)float64')
- dtype([('f0', 'i1', (3,)), ('f1', '<f4'), ('f2', '<f8', (2, 3))])
-
-3. A dictionary of field parameter arrays
-
- This is the most flexible form of specification since it allows control
- over the byte-offsets of the fields and the itemsize of the structure.
-
- The dictionary has two required keys, 'names' and 'formats', and four
- optional keys, 'offsets', 'itemsize', 'aligned' and 'titles'. The values
- for 'names' and 'formats' should respectively be a list of field names and
- a list of dtype specifications, of the same length. The optional 'offsets'
- value should be a list of integer byte-offsets, one for each field within
- the structure. If 'offsets' is not given the offsets are determined
- automatically. The optional 'itemsize' value should be an integer
- describing the total size in bytes of the dtype, which must be large
- enough to contain all the fields.
- ::
-
- >>> np.dtype({'names': ['col1', 'col2'], 'formats': ['i4', 'f4']})
- dtype([('col1', '<i4'), ('col2', '<f4')])
- >>> np.dtype({'names': ['col1', 'col2'],
- ... 'formats': ['i4', 'f4'],
- ... 'offsets': [0, 4],
- ... 'itemsize': 12})
- dtype({'names':['col1','col2'], 'formats':['<i4','<f4'], 'offsets':[0,4], 'itemsize':12})
-
- Offsets may be chosen such that the fields overlap, though this will mean
- that assigning to one field may clobber any overlapping field's data. As
- an exception, fields of :class:`numpy.object` type cannot overlap with
- other fields, because of the risk of clobbering the internal object
- pointer and then dereferencing it.
-
- The optional 'aligned' value can be set to ``True`` to make the automatic
- offset computation use aligned offsets (see :ref:`offsets-and-alignment`),
- as if the 'align' keyword argument of :func:`numpy.dtype` had been set to
- True.
-
- The optional 'titles' value should be a list of titles of the same length
- as 'names', see :ref:`Field Titles <titles>` below.
-
-4. A dictionary of field names
-
- The use of this form of specification is discouraged, but documented here
- because older numpy code may use it. The keys of the dictionary are the
- field names and the values are tuples specifying type and offset::
-
- >>> np.dtype({'col1': ('i1', 0), 'col2': ('f4', 1)})
- dtype([('col1', 'i1'), ('col2', '<f4')])
-
- This form is discouraged because Python dictionaries do not preserve order
- in Python versions before Python 3.6, and the order of the fields in a
- structured dtype has meaning. :ref:`Field Titles <titles>` may be
- specified by using a 3-tuple, see below.
-
-Manipulating and Displaying Structured Datatypes
-------------------------------------------------
-
-The list of field names of a structured datatype can be found in the ``names``
-attribute of the dtype object::
-
- >>> d = np.dtype([('x', 'i8'), ('y', 'f4')])
- >>> d.names
- ('x', 'y')
-
-The field names may be modified by assigning to the ``names`` attribute using a
-sequence of strings of the same length.
-
-The dtype object also has a dictionary-like attribute, ``fields``, whose keys
-are the field names (and :ref:`Field Titles <titles>`, see below) and whose
-values are tuples containing the dtype and byte offset of each field. ::
-
- >>> d.fields
- mappingproxy({'x': (dtype('int64'), 0), 'y': (dtype('float32'), 8)})
-
-Both the ``names`` and ``fields`` attributes will equal ``None`` for
-unstructured arrays. The recommended way to test if a dtype is structured is
-with `if dt.names is not None` rather than `if dt.names`, to account for dtypes
-with 0 fields.
-
-The string representation of a structured datatype is shown in the "list of
-tuples" form if possible, otherwise numpy falls back to using the more general
-dictionary form.
-
-.. _offsets-and-alignment:
-
-Automatic Byte Offsets and Alignment
-------------------------------------
-
-Numpy uses one of two methods to automatically determine the field byte offsets
-and the overall itemsize of a structured datatype, depending on whether
-``align=True`` was specified as a keyword argument to :func:`numpy.dtype`.
-
-By default (``align=False``), numpy will pack the fields together such that
-each field starts at the byte offset the previous field ended, and the fields
-are contiguous in memory. ::
-
- >>> def print_offsets(d):
- ... print("offsets:", [d.fields[name][1] for name in d.names])
- ... print("itemsize:", d.itemsize)
- >>> print_offsets(np.dtype('u1, u1, i4, u1, i8, u2'))
- offsets: [0, 1, 2, 6, 7, 15]
- itemsize: 17
-
-If ``align=True`` is set, numpy will pad the structure in the same way many C
-compilers would pad a C-struct. Aligned structures can give a performance
-improvement in some cases, at the cost of increased datatype size. Padding
-bytes are inserted between fields such that each field's byte offset will be a
-multiple of that field's alignment, which is usually equal to the field's size
-in bytes for simple datatypes, see :c:member:`PyArray_Descr.alignment`. The
-structure will also have trailing padding added so that its itemsize is a
-multiple of the largest field's alignment. ::
-
- >>> print_offsets(np.dtype('u1, u1, i4, u1, i8, u2', align=True))
- offsets: [0, 1, 4, 8, 16, 24]
- itemsize: 32
-
-Note that although almost all modern C compilers pad in this way by default,
-padding in C structs is C-implementation-dependent so this memory layout is not
-guaranteed to exactly match that of a corresponding struct in a C program. Some
-work may be needed, either on the numpy side or the C side, to obtain exact
-correspondence.
-
-If offsets were specified using the optional ``offsets`` key in the
-dictionary-based dtype specification, setting ``align=True`` will check that
-each field's offset is a multiple of its size and that the itemsize is a
-multiple of the largest field size, and raise an exception if not.
-
-If the offsets of the fields and itemsize of a structured array satisfy the
-alignment conditions, the array will have the ``ALIGNED`` :attr:`flag
-<numpy.ndarray.flags>` set.
-
-A convenience function :func:`numpy.lib.recfunctions.repack_fields` converts an
-aligned dtype or array to a packed one and vice versa. It takes either a dtype
-or structured ndarray as an argument, and returns a copy with fields re-packed,
-with or without padding bytes.
-
-.. _titles:
-
-Field Titles
-------------
-
-In addition to field names, fields may also have an associated :term:`title`,
-an alternate name, which is sometimes used as an additional description or
-alias for the field. The title may be used to index an array, just like a
-field name.
-
-To add titles when using the list-of-tuples form of dtype specification, the
-field name may be specified as a tuple of two strings instead of a single
-string, which will be the field's title and field name respectively. For
-example::
-
- >>> np.dtype([(('my title', 'name'), 'f4')])
- dtype([(('my title', 'name'), '<f4')])
-
-When using the first form of dictionary-based specification, the titles may be
-supplied as an extra ``'titles'`` key as described above. When using the second
-(discouraged) dictionary-based specification, the title can be supplied by
-providing a 3-element tuple ``(datatype, offset, title)`` instead of the usual
-2-element tuple::
-
- >>> np.dtype({'name': ('i4', 0, 'my title')})
- dtype([(('my title', 'name'), '<i4')])
-
-The ``dtype.fields`` dictionary will contain titles as keys, if any
-titles are used. This means effectively that a field with a title will be
-represented twice in the fields dictionary. The tuple values for these fields
-will also have a third element, the field title. Because of this, and because
-the ``names`` attribute preserves the field order while the ``fields``
-attribute may not, it is recommended to iterate through the fields of a dtype
-using the ``names`` attribute of the dtype, which will not list titles, as
-in::
-
- >>> for name in d.names:
- ... print(d.fields[name][:2])
- (dtype('int64'), 0)
- (dtype('float32'), 8)
-
-Union types
------------
-
-Structured datatypes are implemented in numpy to have base type
-:class:`numpy.void` by default, but it is possible to interpret other numpy
-types as structured types using the ``(base_dtype, dtype)`` form of dtype
-specification described in
-:ref:`Data Type Objects <arrays.dtypes.constructing>`. Here, ``base_dtype`` is
-the desired underlying dtype, and fields and flags will be copied from
-``dtype``. This dtype is similar to a 'union' in C.
-
-Indexing and Assignment to Structured arrays
-============================================
-
-Assigning data to a Structured Array
-------------------------------------
-
-There are a number of ways to assign values to a structured array: Using python
-tuples, using scalar values, or using other structured arrays.
-
-Assignment from Python Native Types (Tuples)
-````````````````````````````````````````````
-
-The simplest way to assign values to a structured array is using python tuples.
-Each assigned value should be a tuple of length equal to the number of fields
-in the array, and not a list or array as these will trigger numpy's
-broadcasting rules. The tuple's elements are assigned to the successive fields
-of the array, from left to right::
-
- >>> x = np.array([(1, 2, 3), (4, 5, 6)], dtype='i8, f4, f8')
- >>> x[1] = (7, 8, 9)
- >>> x
- array([(1, 2., 3.), (7, 8., 9.)],
- dtype=[('f0', '<i8'), ('f1', '<f4'), ('f2', '<f8')])
-
-Assignment from Scalars
-```````````````````````
-
-A scalar assigned to a structured element will be assigned to all fields. This
-happens when a scalar is assigned to a structured array, or when an
-unstructured array is assigned to a structured array::
-
- >>> x = np.zeros(2, dtype='i8, f4, ?, S1')
- >>> x[:] = 3
- >>> x
- array([(3, 3., True, b'3'), (3, 3., True, b'3')],
- dtype=[('f0', '<i8'), ('f1', '<f4'), ('f2', '?'), ('f3', 'S1')])
- >>> x[:] = np.arange(2)
- >>> x
- array([(0, 0., False, b'0'), (1, 1., True, b'1')],
- dtype=[('f0', '<i8'), ('f1', '<f4'), ('f2', '?'), ('f3', 'S1')])
-
-Structured arrays can also be assigned to unstructured arrays, but only if the
-structured datatype has just a single field::
-
- >>> twofield = np.zeros(2, dtype=[('A', 'i4'), ('B', 'i4')])
- >>> onefield = np.zeros(2, dtype=[('A', 'i4')])
- >>> nostruct = np.zeros(2, dtype='i4')
- >>> nostruct[:] = twofield
- Traceback (most recent call last):
- ...
- TypeError: Cannot cast array data from dtype([('A', '<i4'), ('B', '<i4')]) to dtype('int32') according to the rule 'unsafe'
-
-Assignment from other Structured Arrays
-```````````````````````````````````````
-
-Assignment between two structured arrays occurs as if the source elements had
-been converted to tuples and then assigned to the destination elements. That
-is, the first field of the source array is assigned to the first field of the
-destination array, and the second field likewise, and so on, regardless of
-field names. Structured arrays with a different number of fields cannot be
-assigned to each other. Bytes of the destination structure which are not
-included in any of the fields are unaffected. ::
-
- >>> a = np.zeros(3, dtype=[('a', 'i8'), ('b', 'f4'), ('c', 'S3')])
- >>> b = np.ones(3, dtype=[('x', 'f4'), ('y', 'S3'), ('z', 'O')])
- >>> b[:] = a
- >>> b
- array([(0., b'0.0', b''), (0., b'0.0', b''), (0., b'0.0', b'')],
- dtype=[('x', '<f4'), ('y', 'S3'), ('z', 'O')])
-
-
-Assignment involving subarrays
-``````````````````````````````
-
-When assigning to fields which are subarrays, the assigned value will first be
-broadcast to the shape of the subarray.
-
-Indexing Structured Arrays
---------------------------
-
-Accessing Individual Fields
-```````````````````````````
-
-Individual fields of a structured array may be accessed and modified by indexing
-the array with the field name. ::
-
- >>> x = np.array([(1, 2), (3, 4)], dtype=[('foo', 'i8'), ('bar', 'f4')])
- >>> x['foo']
- array([1, 3])
- >>> x['foo'] = 10
- >>> x
- array([(10, 2.), (10, 4.)],
- dtype=[('foo', '<i8'), ('bar', '<f4')])
-
-The resulting array is a view into the original array. It shares the same
-memory locations and writing to the view will modify the original array. ::
-
- >>> y = x['bar']
- >>> y[:] = 11
- >>> x
- array([(10, 11.), (10, 11.)],
- dtype=[('foo', '<i8'), ('bar', '<f4')])
-
-This view has the same dtype and itemsize as the indexed field, so it is
-typically a non-structured array, except in the case of nested structures.
-
- >>> y.dtype, y.shape, y.strides
- (dtype('float32'), (2,), (12,))
-
-If the accessed field is a subarray, the dimensions of the subarray
-are appended to the shape of the result::
-
- >>> x = np.zeros((2, 2), dtype=[('a', np.int32), ('b', np.float64, (3, 3))])
- >>> x['a'].shape
- (2, 2)
- >>> x['b'].shape
- (2, 2, 3, 3)
-
-Accessing Multiple Fields
-```````````````````````````
-
-One can index and assign to a structured array with a multi-field index, where
-the index is a list of field names.
-
-.. warning::
- The behavior of multi-field indexes changed from Numpy 1.15 to Numpy 1.16.
-
-The result of indexing with a multi-field index is a view into the original
-array, as follows::
-
- >>> a = np.zeros(3, dtype=[('a', 'i4'), ('b', 'i4'), ('c', 'f4')])
- >>> a[['a', 'c']]
- array([(0, 0.), (0, 0.), (0, 0.)],
- dtype={'names':['a','c'], 'formats':['<i4','<f4'], 'offsets':[0,8], 'itemsize':12})
-
-Assignment to the view modifies the original array. The view's fields will be
-in the order they were indexed. Note that unlike for single-field indexing, the
-dtype of the view has the same itemsize as the original array, and has fields
-at the same offsets as in the original array, and unindexed fields are merely
-missing.
-
-.. warning::
- In Numpy 1.15, indexing an array with a multi-field index returned a copy of
- the result above, but with fields packed together in memory as if
- passed through :func:`numpy.lib.recfunctions.repack_fields`.
-
- The new behavior as of Numpy 1.16 leads to extra "padding" bytes at the
- location of unindexed fields compared to 1.15. You will need to update any
- code which depends on the data having a "packed" layout. For instance code
- such as::
-
- >>> a[['a', 'c']].view('i8') # Fails in Numpy 1.16
- Traceback (most recent call last):
- File "<stdin>", line 1, in <module>
- ValueError: When changing to a smaller dtype, its size must be a divisor of the size of original dtype
-
- will need to be changed. This code has raised a ``FutureWarning`` since
- Numpy 1.12, and similar code has raised ``FutureWarning`` since 1.7.
-
- In 1.16 a number of functions have been introduced in the
- :mod:`numpy.lib.recfunctions` module to help users account for this
- change. These are
- :func:`numpy.lib.recfunctions.repack_fields`.
- :func:`numpy.lib.recfunctions.structured_to_unstructured`,
- :func:`numpy.lib.recfunctions.unstructured_to_structured`,
- :func:`numpy.lib.recfunctions.apply_along_fields`,
- :func:`numpy.lib.recfunctions.assign_fields_by_name`, and
- :func:`numpy.lib.recfunctions.require_fields`.
-
- The function :func:`numpy.lib.recfunctions.repack_fields` can always be
- used to reproduce the old behavior, as it will return a packed copy of the
- structured array. The code above, for example, can be replaced with:
-
- >>> from numpy.lib.recfunctions import repack_fields
- >>> repack_fields(a[['a', 'c']]).view('i8') # supported in 1.16
- array([0, 0, 0])
-
- Furthermore, numpy now provides a new function
- :func:`numpy.lib.recfunctions.structured_to_unstructured` which is a safer
- and more efficient alternative for users who wish to convert structured
- arrays to unstructured arrays, as the view above is often intended to do.
- This function allows safe conversion to an unstructured type taking into
- account padding, often avoids a copy, and also casts the datatypes
- as needed, unlike the view. Code such as:
-
- >>> b = np.zeros(3, dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4')])
- >>> b[['x', 'z']].view('f4')
- array([0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)
-
- can be made safer by replacing with:
-
- >>> from numpy.lib.recfunctions import structured_to_unstructured
- >>> structured_to_unstructured(b[['x', 'z']])
- array([0, 0, 0])
-
-
-Assignment to an array with a multi-field index modifies the original array::
-
- >>> a[['a', 'c']] = (2, 3)
- >>> a
- array([(2, 0, 3.), (2, 0, 3.), (2, 0, 3.)],
- dtype=[('a', '<i4'), ('b', '<i4'), ('c', '<f4')])
-
-This obeys the structured array assignment rules described above. For example,
-this means that one can swap the values of two fields using appropriate
-multi-field indexes::
-
- >>> a[['a', 'c']] = a[['c', 'a']]
-
-Indexing with an Integer to get a Structured Scalar
-```````````````````````````````````````````````````
-
-Indexing a single element of a structured array (with an integer index) returns
-a structured scalar::
-
- >>> x = np.array([(1, 2., 3.)], dtype='i, f, f')
- >>> scalar = x[0]
- >>> scalar
- (1, 2., 3.)
- >>> type(scalar)
- <class 'numpy.void'>
-
-Unlike other numpy scalars, structured scalars are mutable and act like views
-into the original array, such that modifying the scalar will modify the
-original array. Structured scalars also support access and assignment by field
-name::
-
- >>> x = np.array([(1, 2), (3, 4)], dtype=[('foo', 'i8'), ('bar', 'f4')])
- >>> s = x[0]
- >>> s['bar'] = 100
- >>> x
- array([(1, 100.), (3, 4.)],
- dtype=[('foo', '<i8'), ('bar', '<f4')])
-
-Similarly to tuples, structured scalars can also be indexed with an integer::
-
- >>> scalar = np.array([(1, 2., 3.)], dtype='i, f, f')[0]
- >>> scalar[0]
- 1
- >>> scalar[1] = 4
-
-Thus, tuples might be thought of as the native Python equivalent to numpy's
-structured types, much like native python integers are the equivalent to
-numpy's integer types. Structured scalars may be converted to a tuple by
-calling :func:`ndarray.item`::
-
- >>> scalar.item(), type(scalar.item())
- ((1, 4.0, 3.0), <class 'tuple'>)
-
-Viewing Structured Arrays Containing Objects
---------------------------------------------
-
-In order to prevent clobbering object pointers in fields of
-:class:`numpy.object` type, numpy currently does not allow views of structured
-arrays containing objects.
-
-Structure Comparison
---------------------
-
-If the dtypes of two void structured arrays are equal, testing the equality of
-the arrays will result in a boolean array with the dimensions of the original
-arrays, with elements set to ``True`` where all fields of the corresponding
-structures are equal. Structured dtypes are equal if the field names,
-dtypes and titles are the same, ignoring endianness, and the fields are in
-the same order::
-
- >>> a = np.zeros(2, dtype=[('a', 'i4'), ('b', 'i4')])
- >>> b = np.ones(2, dtype=[('a', 'i4'), ('b', 'i4')])
- >>> a == b
- array([False, False])
-
-Currently, if the dtypes of two void structured arrays are not equivalent the
-comparison fails, returning the scalar value ``False``. This behavior is
-deprecated as of numpy 1.10 and will raise an error or perform elementwise
-comparison in the future.
-
-The ``<`` and ``>`` operators always return ``False`` when comparing void
-structured arrays, and arithmetic and bitwise operations are not supported.
-
-Record Arrays
-=============
-
-As an optional convenience numpy provides an ndarray subclass,
-:class:`numpy.recarray`, and associated helper functions in the
-:mod:`numpy.rec` submodule, that allows access to fields of structured arrays
-by attribute instead of only by index. Record arrays also use a special
-datatype, :class:`numpy.record`, that allows field access by attribute on the
-structured scalars obtained from the array.
-
-The simplest way to create a record array is with :func:`numpy.rec.array`::
-
- >>> recordarr = np.rec.array([(1, 2., 'Hello'), (2, 3., "World")],
- ... dtype=[('foo', 'i4'),('bar', 'f4'), ('baz', 'S10')])
- >>> recordarr.bar
- array([ 2., 3.], dtype=float32)
- >>> recordarr[1:2]
- rec.array([(2, 3., b'World')],
- dtype=[('foo', '<i4'), ('bar', '<f4'), ('baz', 'S10')])
- >>> recordarr[1:2].foo
- array([2], dtype=int32)
- >>> recordarr.foo[1:2]
- array([2], dtype=int32)
- >>> recordarr[1].baz
- b'World'
-
-:func:`numpy.rec.array` can convert a wide variety of arguments into record
-arrays, including structured arrays::
-
- >>> arr = np.array([(1, 2., 'Hello'), (2, 3., "World")],
- ... dtype=[('foo', 'i4'), ('bar', 'f4'), ('baz', 'S10')])
- >>> recordarr = np.rec.array(arr)
-
-The :mod:`numpy.rec` module provides a number of other convenience functions for
-creating record arrays, see :ref:`record array creation routines
-<routines.array-creation.rec>`.
-
-A record array representation of a structured array can be obtained using the
-appropriate `view <numpy-ndarray-view>`_::
-
- >>> arr = np.array([(1, 2., 'Hello'), (2, 3., "World")],
- ... dtype=[('foo', 'i4'),('bar', 'f4'), ('baz', 'a10')])
- >>> recordarr = arr.view(dtype=np.dtype((np.record, arr.dtype)),
- ... type=np.recarray)
-
-For convenience, viewing an ndarray as type :class:`np.recarray` will
-automatically convert to :class:`np.record` datatype, so the dtype can be left
-out of the view::
-
- >>> recordarr = arr.view(np.recarray)
- >>> recordarr.dtype
- dtype((numpy.record, [('foo', '<i4'), ('bar', '<f4'), ('baz', 'S10')]))
-
-To get back to a plain ndarray both the dtype and type must be reset. The
-following view does so, taking into account the unusual case that the
-recordarr was not a structured type::
-
- >>> arr2 = recordarr.view(recordarr.dtype.fields or recordarr.dtype, np.ndarray)
-
-Record array fields accessed by index or by attribute are returned as a record
-array if the field has a structured type but as a plain ndarray otherwise. ::
-
- >>> recordarr = np.rec.array([('Hello', (1, 2)), ("World", (3, 4))],
- ... dtype=[('foo', 'S6'),('bar', [('A', int), ('B', int)])])
- >>> type(recordarr.foo)
- <class 'numpy.ndarray'>
- >>> type(recordarr.bar)
- <class 'numpy.recarray'>
-
-Note that if a field has the same name as an ndarray attribute, the ndarray
-attribute takes precedence. Such fields will be inaccessible by attribute but
-will still be accessible by index.
-
-"""
diff --git a/numpy/doc/subclassing.py b/numpy/doc/subclassing.py
deleted file mode 100644
index 7dc10e1c8..000000000
--- a/numpy/doc/subclassing.py
+++ /dev/null
@@ -1,752 +0,0 @@
-"""=============================
-Subclassing ndarray in python
-=============================
-
-Introduction
-------------
-
-Subclassing ndarray is relatively simple, but it has some complications
-compared to other Python objects. On this page we explain the machinery
-that allows you to subclass ndarray, and the implications for
-implementing a subclass.
-
-ndarrays and object creation
-============================
-
-Subclassing ndarray is complicated by the fact that new instances of
-ndarray classes can come about in three different ways. These are:
-
-#. Explicit constructor call - as in ``MySubClass(params)``. This is
- the usual route to Python instance creation.
-#. View casting - casting an existing ndarray as a given subclass
-#. New from template - creating a new instance from a template
- instance. Examples include returning slices from a subclassed array,
- creating return types from ufuncs, and copying arrays. See
- :ref:`new-from-template` for more details
-
-The last two are characteristics of ndarrays - in order to support
-things like array slicing. The complications of subclassing ndarray are
-due to the mechanisms numpy has to support these latter two routes of
-instance creation.
-
-.. _view-casting:
-
-View casting
-------------
-
-*View casting* is the standard ndarray mechanism by which you take an
-ndarray of any subclass, and return a view of the array as another
-(specified) subclass:
-
->>> import numpy as np
->>> # create a completely useless ndarray subclass
->>> class C(np.ndarray): pass
->>> # create a standard ndarray
->>> arr = np.zeros((3,))
->>> # take a view of it, as our useless subclass
->>> c_arr = arr.view(C)
->>> type(c_arr)
-<class 'C'>
-
-.. _new-from-template:
-
-Creating new from template
---------------------------
-
-New instances of an ndarray subclass can also come about by a very
-similar mechanism to :ref:`view-casting`, when numpy finds it needs to
-create a new instance from a template instance. The most obvious place
-this has to happen is when you are taking slices of subclassed arrays.
-For example:
-
->>> v = c_arr[1:]
->>> type(v) # the view is of type 'C'
-<class 'C'>
->>> v is c_arr # but it's a new instance
-False
-
-The slice is a *view* onto the original ``c_arr`` data. So, when we
-take a view from the ndarray, we return a new ndarray, of the same
-class, that points to the data in the original.
-
-There are other points in the use of ndarrays where we need such views,
-such as copying arrays (``c_arr.copy()``), creating ufunc output arrays
-(see also :ref:`array-wrap`), and reducing methods (like
-``c_arr.mean()``).
-
-Relationship of view casting and new-from-template
---------------------------------------------------
-
-These paths both use the same machinery. We make the distinction here,
-because they result in different input to your methods. Specifically,
-:ref:`view-casting` means you have created a new instance of your array
-type from any potential subclass of ndarray. :ref:`new-from-template`
-means you have created a new instance of your class from a pre-existing
-instance, allowing you - for example - to copy across attributes that
-are particular to your subclass.
-
-Implications for subclassing
-----------------------------
-
-If we subclass ndarray, we need to deal not only with explicit
-construction of our array type, but also :ref:`view-casting` or
-:ref:`new-from-template`. NumPy has the machinery to do this, and it is
-this machinery that makes subclassing slightly non-standard.
-
-There are two aspects to the machinery that ndarray uses to support
-views and new-from-template in subclasses.
-
-The first is the use of the ``ndarray.__new__`` method for the main work
-of object initialization, rather than the more usual ``__init__``
-method. The second is the use of the ``__array_finalize__`` method to
-allow subclasses to clean up after the creation of views and new
-instances from templates.
-
-A brief Python primer on ``__new__`` and ``__init__``
-=====================================================
-
-``__new__`` is a standard Python method, and, if present, is called
-before ``__init__`` when we create a class instance. See the `python
-__new__ documentation
-<https://docs.python.org/reference/datamodel.html#object.__new__>`_ for more detail.
-
-For example, consider the following Python code:
-
-.. testcode::
-
- class C:
- def __new__(cls, *args):
- print('Cls in __new__:', cls)
- print('Args in __new__:', args)
- # The `object` type __new__ method takes a single argument.
- return object.__new__(cls)
-
- def __init__(self, *args):
- print('type(self) in __init__:', type(self))
- print('Args in __init__:', args)
-
-meaning that we get:
-
->>> c = C('hello')
-Cls in __new__: <class 'C'>
-Args in __new__: ('hello',)
-type(self) in __init__: <class 'C'>
-Args in __init__: ('hello',)
-
-When we call ``C('hello')``, the ``__new__`` method gets its own class
-as first argument, and the passed argument, which is the string
-``'hello'``. After python calls ``__new__``, it usually (see below)
-calls our ``__init__`` method, with the output of ``__new__`` as the
-first argument (now a class instance), and the passed arguments
-following.
-
-As you can see, the object can be initialized in the ``__new__``
-method or the ``__init__`` method, or both, and in fact ndarray does
-not have an ``__init__`` method, because all the initialization is
-done in the ``__new__`` method.
-
-Why use ``__new__`` rather than just the usual ``__init__``? Because
-in some cases, as for ndarray, we want to be able to return an object
-of some other class. Consider the following:
-
-.. testcode::
-
- class D(C):
- def __new__(cls, *args):
- print('D cls is:', cls)
- print('D args in __new__:', args)
- return C.__new__(C, *args)
-
- def __init__(self, *args):
- # we never get here
- print('In D __init__')
-
-meaning that:
-
->>> obj = D('hello')
-D cls is: <class 'D'>
-D args in __new__: ('hello',)
-Cls in __new__: <class 'C'>
-Args in __new__: ('hello',)
->>> type(obj)
-<class 'C'>
-
-The definition of ``C`` is the same as before, but for ``D``, the
-``__new__`` method returns an instance of class ``C`` rather than
-``D``. Note that the ``__init__`` method of ``D`` does not get
-called. In general, when the ``__new__`` method returns an object of
-class other than the class in which it is defined, the ``__init__``
-method of that class is not called.
-
-This is how subclasses of the ndarray class are able to return views
-that preserve the class type. When taking a view, the standard
-ndarray machinery creates the new ndarray object with something
-like::
-
- obj = ndarray.__new__(subtype, shape, ...
-
-where ``subtype`` is the subclass. Thus the returned view is of the
-same class as the subclass, rather than being of class ``ndarray``.
-
-That solves the problem of returning views of the same type, but now
-we have a new problem. The machinery of ndarray can set the class
-this way, in its standard methods for taking views, but the ndarray
-``__new__`` method knows nothing of what we have done in our own
-``__new__`` method in order to set attributes, and so on. (Aside -
-why not call ``obj = subtype.__new__(...`` then? Because we may not
-have a ``__new__`` method with the same call signature).
-
-The role of ``__array_finalize__``
-==================================
-
-``__array_finalize__`` is the mechanism that numpy provides to allow
-subclasses to handle the various ways that new instances get created.
-
-Remember that subclass instances can come about in these three ways:
-
-#. explicit constructor call (``obj = MySubClass(params)``). This will
- call the usual sequence of ``MySubClass.__new__`` then (if it exists)
- ``MySubClass.__init__``.
-#. :ref:`view-casting`
-#. :ref:`new-from-template`
-
-Our ``MySubClass.__new__`` method only gets called in the case of the
-explicit constructor call, so we can't rely on ``MySubClass.__new__`` or
-``MySubClass.__init__`` to deal with the view casting and
-new-from-template. It turns out that ``MySubClass.__array_finalize__``
-*does* get called for all three methods of object creation, so this is
-where our object creation housekeeping usually goes.
-
-* For the explicit constructor call, our subclass will need to create a
- new ndarray instance of its own class. In practice this means that
- we, the authors of the code, will need to make a call to
- ``ndarray.__new__(MySubClass,...)``, a class-hierarchy prepared call to
- ``super(MySubClass, cls).__new__(cls, ...)``, or do view casting of an
- existing array (see below)
-* For view casting and new-from-template, the equivalent of
- ``ndarray.__new__(MySubClass,...`` is called, at the C level.
-
-The arguments that ``__array_finalize__`` receives differ for the three
-methods of instance creation above.
-
-The following code allows us to look at the call sequences and arguments:
-
-.. testcode::
-
- import numpy as np
-
- class C(np.ndarray):
- def __new__(cls, *args, **kwargs):
- print('In __new__ with class %s' % cls)
- return super(C, cls).__new__(cls, *args, **kwargs)
-
- def __init__(self, *args, **kwargs):
- # in practice you probably will not need or want an __init__
- # method for your subclass
- print('In __init__ with class %s' % self.__class__)
-
- def __array_finalize__(self, obj):
- print('In array_finalize:')
- print(' self type is %s' % type(self))
- print(' obj type is %s' % type(obj))
-
-
-Now:
-
->>> # Explicit constructor
->>> c = C((10,))
-In __new__ with class <class 'C'>
-In array_finalize:
- self type is <class 'C'>
- obj type is <class 'NoneType'>
-In __init__ with class <class 'C'>
->>> # View casting
->>> a = np.arange(10)
->>> cast_a = a.view(C)
-In array_finalize:
- self type is <class 'C'>
- obj type is <class 'numpy.ndarray'>
->>> # Slicing (example of new-from-template)
->>> cv = c[:1]
-In array_finalize:
- self type is <class 'C'>
- obj type is <class 'C'>
-
-The signature of ``__array_finalize__`` is::
-
- def __array_finalize__(self, obj):
-
-One sees that the ``super`` call, which goes to
-``ndarray.__new__``, passes ``__array_finalize__`` the new object, of our
-own class (``self``) as well as the object from which the view has been
-taken (``obj``). As you can see from the output above, the ``self`` is
-always a newly created instance of our subclass, and the type of ``obj``
-differs for the three instance creation methods:
-
-* When called from the explicit constructor, ``obj`` is ``None``
-* When called from view casting, ``obj`` can be an instance of any
- subclass of ndarray, including our own.
-* When called in new-from-template, ``obj`` is another instance of our
- own subclass, that we might use to update the new ``self`` instance.
-
-Because ``__array_finalize__`` is the only method that always sees new
-instances being created, it is the sensible place to fill in instance
-defaults for new object attributes, among other tasks.
-
-This may be clearer with an example.
-
-Simple example - adding an extra attribute to ndarray
------------------------------------------------------
-
-.. testcode::
-
- import numpy as np
-
- class InfoArray(np.ndarray):
-
- def __new__(subtype, shape, dtype=float, buffer=None, offset=0,
- strides=None, order=None, info=None):
- # Create the ndarray instance of our type, given the usual
- # ndarray input arguments. This will call the standard
- # ndarray constructor, but return an object of our type.
- # It also triggers a call to InfoArray.__array_finalize__
- obj = super(InfoArray, subtype).__new__(subtype, shape, dtype,
- buffer, offset, strides,
- order)
- # set the new 'info' attribute to the value passed
- obj.info = info
- # Finally, we must return the newly created object:
- return obj
-
- def __array_finalize__(self, obj):
- # ``self`` is a new object resulting from
- # ndarray.__new__(InfoArray, ...), therefore it only has
- # attributes that the ndarray.__new__ constructor gave it -
- # i.e. those of a standard ndarray.
- #
- # We could have got to the ndarray.__new__ call in 3 ways:
- # From an explicit constructor - e.g. InfoArray():
- # obj is None
- # (we're in the middle of the InfoArray.__new__
- # constructor, and self.info will be set when we return to
- # InfoArray.__new__)
- if obj is None: return
- # From view casting - e.g. arr.view(InfoArray):
- # obj is arr
- # (type(obj) can be InfoArray)
- # From new-from-template - e.g. infoarr[:3]
- # type(obj) is InfoArray
- #
- # Note that it is here, rather than in the __new__ method,
- # that we set the default value for 'info', because this
- # method sees all creation of default objects - with the
- # InfoArray.__new__ constructor, but also with
- # arr.view(InfoArray).
- self.info = getattr(obj, 'info', None)
- # We do not need to return anything
-
-
-Using the object looks like this:
-
- >>> obj = InfoArray(shape=(3,)) # explicit constructor
- >>> type(obj)
- <class 'InfoArray'>
- >>> obj.info is None
- True
- >>> obj = InfoArray(shape=(3,), info='information')
- >>> obj.info
- 'information'
- >>> v = obj[1:] # new-from-template - here - slicing
- >>> type(v)
- <class 'InfoArray'>
- >>> v.info
- 'information'
- >>> arr = np.arange(10)
- >>> cast_arr = arr.view(InfoArray) # view casting
- >>> type(cast_arr)
- <class 'InfoArray'>
- >>> cast_arr.info is None
- True
-
-This class isn't very useful, because it has the same constructor as the
-bare ndarray object, including passing in buffers and shapes and so on.
-We would probably prefer the constructor to be able to take an already
-formed ndarray from the usual numpy calls to ``np.array`` and return an
-object.
-
-Slightly more realistic example - attribute added to existing array
--------------------------------------------------------------------
-
-Here is a class that takes a standard ndarray that already exists, casts
-as our type, and adds an extra attribute.
-
-.. testcode::
-
- import numpy as np
-
- class RealisticInfoArray(np.ndarray):
-
- def __new__(cls, input_array, info=None):
- # Input array is an already formed ndarray instance
- # We first cast to be our class type
- obj = np.asarray(input_array).view(cls)
- # add the new attribute to the created instance
- obj.info = info
- # Finally, we must return the newly created object:
- return obj
-
- def __array_finalize__(self, obj):
- # see InfoArray.__array_finalize__ for comments
- if obj is None: return
- self.info = getattr(obj, 'info', None)
-
-
-So:
-
- >>> arr = np.arange(5)
- >>> obj = RealisticInfoArray(arr, info='information')
- >>> type(obj)
- <class 'RealisticInfoArray'>
- >>> obj.info
- 'information'
- >>> v = obj[1:]
- >>> type(v)
- <class 'RealisticInfoArray'>
- >>> v.info
- 'information'
-
-.. _array-ufunc:
-
-``__array_ufunc__`` for ufuncs
-------------------------------
-
- .. versionadded:: 1.13
-
-A subclass can override what happens when executing numpy ufuncs on it by
-overriding the default ``ndarray.__array_ufunc__`` method. This method is
-executed *instead* of the ufunc and should return either the result of the
-operation, or :obj:`NotImplemented` if the operation requested is not
-implemented.
-
-The signature of ``__array_ufunc__`` is::
-
- def __array_ufunc__(ufunc, method, *inputs, **kwargs):
-
- - *ufunc* is the ufunc object that was called.
- - *method* is a string indicating how the Ufunc was called, either
- ``"__call__"`` to indicate it was called directly, or one of its
- :ref:`methods<ufuncs.methods>`: ``"reduce"``, ``"accumulate"``,
- ``"reduceat"``, ``"outer"``, or ``"at"``.
- - *inputs* is a tuple of the input arguments to the ``ufunc``
- - *kwargs* contains any optional or keyword arguments passed to the
- function. This includes any ``out`` arguments, which are always
- contained in a tuple.
-
-A typical implementation would convert any inputs or outputs that are
-instances of one's own class, pass everything on to a superclass using
-``super()``, and finally return the results after possible
-back-conversion. An example, taken from the test case
-``test_ufunc_override_with_super`` in ``core/tests/test_umath.py``, is the
-following.
-
-.. testcode::
-
-    import numpy as np
-
- class A(np.ndarray):
- def __array_ufunc__(self, ufunc, method, *inputs, out=None, **kwargs):
- args = []
- in_no = []
- for i, input_ in enumerate(inputs):
- if isinstance(input_, A):
- in_no.append(i)
- args.append(input_.view(np.ndarray))
- else:
- args.append(input_)
-
- outputs = out
- out_no = []
- if outputs:
- out_args = []
- for j, output in enumerate(outputs):
- if isinstance(output, A):
- out_no.append(j)
- out_args.append(output.view(np.ndarray))
- else:
- out_args.append(output)
- kwargs['out'] = tuple(out_args)
- else:
- outputs = (None,) * ufunc.nout
-
- info = {}
- if in_no:
- info['inputs'] = in_no
- if out_no:
- info['outputs'] = out_no
-
- results = super(A, self).__array_ufunc__(ufunc, method,
- *args, **kwargs)
- if results is NotImplemented:
- return NotImplemented
-
- if method == 'at':
- if isinstance(inputs[0], A):
- inputs[0].info = info
- return
-
- if ufunc.nout == 1:
- results = (results,)
-
- results = tuple((np.asarray(result).view(A)
- if output is None else output)
- for result, output in zip(results, outputs))
- if results and isinstance(results[0], A):
- results[0].info = info
-
- return results[0] if len(results) == 1 else results
-
-So, this class does not actually do anything interesting: it just
-converts any instances of its own to regular ndarray (otherwise, we'd
-get infinite recursion!), and adds an ``info`` dictionary that tells
-which inputs and outputs it converted. Hence, e.g.,
-
->>> a = np.arange(5.).view(A)
->>> b = np.sin(a)
->>> b.info
-{'inputs': [0]}
->>> b = np.sin(np.arange(5.), out=(a,))
->>> b.info
-{'outputs': [0]}
->>> a = np.arange(5.).view(A)
->>> b = np.ones(1).view(A)
->>> c = a + b
->>> c.info
-{'inputs': [0, 1]}
->>> a += b
->>> a.info
-{'inputs': [0, 1], 'outputs': [0]}
-
-Note that another approach would be to use ``getattr(ufunc,
-method)(*inputs, **kwargs)`` instead of the ``super`` call. For this example,
-the result would be identical, but there is a difference if another operand
-also defines ``__array_ufunc__``. E.g., let's assume that we evaluate
-``np.add(a, b)``, where ``b`` is an instance of another class ``B`` that has
-an override. If you use ``super`` as in the example,
-``ndarray.__array_ufunc__`` will notice that ``b`` has an override, which
-means it cannot evaluate the result itself. Thus, it will return
-`NotImplemented` and so will our class ``A``. Then, control will be passed
-over to ``b``, which either knows how to deal with us and produces a result,
-or does not and returns `NotImplemented`, raising a ``TypeError``.
-
-If instead, we replace our ``super`` call with ``getattr(ufunc, method)``, we
-effectively do ``np.add(a.view(np.ndarray), b)``. Again, ``B.__array_ufunc__``
-will be called, but now it sees an ``ndarray`` as the other argument. Likely,
-it will know how to handle this, and return a new instance of the ``B`` class
-to us. Our example class is not set up to handle this, but it might well be
-the best approach if, e.g., one were to re-implement ``MaskedArray`` using
-``__array_ufunc__``.
-
-As a final note: if the ``super`` route is suited to a given class, an
-advantage of using it is that it helps in constructing class hierarchies.
-E.g., suppose that our other class ``B`` also used the ``super`` in its
-``__array_ufunc__`` implementation, and we created a class ``C`` that depended
-on both, i.e., ``class C(A, B)`` (with, for simplicity, not another
-``__array_ufunc__`` override). Then any ufunc on an instance of ``C`` would
-pass on to ``A.__array_ufunc__``, the ``super`` call in ``A`` would go to
-``B.__array_ufunc__``, and the ``super`` call in ``B`` would go to
-``ndarray.__array_ufunc__``, thus allowing ``A`` and ``B`` to collaborate.
-
-.. _array-wrap:
-
-``__array_wrap__`` for ufuncs and other functions
--------------------------------------------------
-
-Prior to numpy 1.13, the behaviour of ufuncs could only be tuned using
-``__array_wrap__`` and ``__array_prepare__``. These two allowed one to
-change the output type of a ufunc, but, in contrast to
-``__array_ufunc__``, did not allow one to make any changes to the inputs.
-It is hoped to eventually deprecate these, but ``__array_wrap__`` is also
-used by other numpy functions and methods, such as ``squeeze``, so at the
-present time is still needed for full functionality.
-
-Conceptually, ``__array_wrap__`` "wraps up the action" in the sense of
-allowing a subclass to set the type of the return value and update
-attributes and metadata. Let's show how this works with an example. First
-we return to the simpler example subclass, but with a different name and
-some print statements:
-
-.. testcode::
-
- import numpy as np
-
- class MySubClass(np.ndarray):
-
- def __new__(cls, input_array, info=None):
- obj = np.asarray(input_array).view(cls)
- obj.info = info
- return obj
-
- def __array_finalize__(self, obj):
- print('In __array_finalize__:')
- print(' self is %s' % repr(self))
- print(' obj is %s' % repr(obj))
- if obj is None: return
- self.info = getattr(obj, 'info', None)
-
- def __array_wrap__(self, out_arr, context=None):
- print('In __array_wrap__:')
- print(' self is %s' % repr(self))
- print(' arr is %s' % repr(out_arr))
- # then just call the parent
- return super(MySubClass, self).__array_wrap__(self, out_arr, context)
-
-We run a ufunc on an instance of our new array:
-
->>> obj = MySubClass(np.arange(5), info='spam')
-In __array_finalize__:
- self is MySubClass([0, 1, 2, 3, 4])
- obj is array([0, 1, 2, 3, 4])
->>> arr2 = np.arange(5)+1
->>> ret = np.add(arr2, obj)
-In __array_wrap__:
- self is MySubClass([0, 1, 2, 3, 4])
- arr is array([1, 3, 5, 7, 9])
-In __array_finalize__:
- self is MySubClass([1, 3, 5, 7, 9])
- obj is MySubClass([0, 1, 2, 3, 4])
->>> ret
-MySubClass([1, 3, 5, 7, 9])
->>> ret.info
-'spam'
-
-Note that the ufunc (``np.add``) has called the ``__array_wrap__`` method
-with arguments ``self`` as ``obj``, and ``out_arr`` as the (ndarray) result
-of the addition. In turn, the default ``__array_wrap__``
-(``ndarray.__array_wrap__``) has cast the result to class ``MySubClass``,
-and called ``__array_finalize__`` - hence the copying of the ``info``
-attribute. This has all happened at the C level.
-
-But, we could do anything we wanted:
-
-.. testcode::
-
- class SillySubClass(np.ndarray):
-
- def __array_wrap__(self, arr, context=None):
- return 'I lost your data'
-
->>> arr1 = np.arange(5)
->>> obj = arr1.view(SillySubClass)
->>> arr2 = np.arange(5)
->>> ret = np.multiply(obj, arr2)
->>> ret
-'I lost your data'
-
-So, by defining a specific ``__array_wrap__`` method for our subclass,
-we can tweak the output from ufuncs. The ``__array_wrap__`` method
-requires ``self``, then an argument - which is the result of the ufunc -
-and an optional parameter *context*. This parameter is returned by
-ufuncs as a 3-element tuple: (name of the ufunc, arguments of the ufunc,
-domain of the ufunc), but is not set by other numpy functions. Though,
-as seen above, it is possible to do otherwise, ``__array_wrap__`` should
-return an instance of its containing class. See the masked array
-subclass for an implementation.
-
-In addition to ``__array_wrap__``, which is called on the way out of the
-ufunc, there is also an ``__array_prepare__`` method which is called on
-the way into the ufunc, after the output arrays are created but before any
-computation has been performed. The default implementation does nothing
-but pass through the array. ``__array_prepare__`` should not attempt to
-access the array data or resize the array, it is intended for setting the
-output array type, updating attributes and metadata, and performing any
-checks based on the input that may be desired before computation begins.
-Like ``__array_wrap__``, ``__array_prepare__`` must return an ndarray or
-subclass thereof or raise an error.
-
-Extra gotchas - custom ``__del__`` methods and ndarray.base
------------------------------------------------------------
-
-One of the problems that ndarray solves is keeping track of memory
-ownership of ndarrays and their views. Consider the case where we have
-created an ndarray, ``arr`` and have taken a slice with ``v = arr[1:]``.
-The two objects are looking at the same memory. NumPy keeps track of
-where the data came from for a particular array or view, with the
-``base`` attribute:
-
->>> # A normal ndarray, that owns its own data
->>> arr = np.zeros((4,))
->>> # In this case, base is None
->>> arr.base is None
-True
->>> # We take a view
->>> v1 = arr[1:]
->>> # base now points to the array that it derived from
->>> v1.base is arr
-True
->>> # Take a view of a view
->>> v2 = v1[1:]
->>> # base points to the view it derived from
->>> v2.base is v1
-True
-
-In general, if the array owns its own memory, as for ``arr`` in this
-case, then ``arr.base`` will be None - there are some exceptions to this
-- see the numpy book for more details.
-
-The ``base`` attribute is useful in being able to tell whether we have
-a view or the original array. This in turn can be useful if we need
-to know whether or not to do some specific cleanup when the subclassed
-array is deleted. For example, we may only want to do the cleanup if
-the original array is deleted, but not the views. For an example of
-how this can work, have a look at the ``memmap`` class in
-``numpy.core``.
-
-Subclassing and Downstream Compatibility
-----------------------------------------
-
-When sub-classing ``ndarray`` or creating duck-types that mimic the ``ndarray``
-interface, it is your responsibility to decide how aligned your APIs will be
-with those of numpy. For convenience, many numpy functions that have a corresponding
-``ndarray`` method (e.g., ``sum``, ``mean``, ``take``, ``reshape``) work by checking
-if the first argument to a function has a method of the same name. If it exists, the
-method is called instead of coercing the arguments to a numpy array.
-
-For example, if you want your sub-class or duck-type to be compatible with
-numpy's ``sum`` function, the method signature for this object's ``sum`` method
-should be the following:
-
-.. testcode::
-
- def sum(self, axis=None, dtype=None, out=None, keepdims=False):
- ...
-
-This is the exact same method signature for ``np.sum``, so now if a user calls
-``np.sum`` on this object, numpy will call the object's own ``sum`` method and
-pass in these arguments enumerated above in the signature, and no errors will
-be raised because the signatures are completely compatible with each other.
-
-If, however, you decide to deviate from this signature and do something like this:
-
-.. testcode::
-
- def sum(self, axis=None, dtype=None):
- ...
-
-This object is no longer compatible with ``np.sum`` because if you call ``np.sum``,
-it will pass in unexpected arguments ``out`` and ``keepdims``, causing a TypeError
-to be raised.
-
-If you wish to maintain compatibility with numpy and its subsequent versions (which
-might add new keyword arguments) but do not want to surface all of numpy's arguments,
-your function's signature should accept ``**kwargs``. For example:
-
-.. testcode::
-
- def sum(self, axis=None, dtype=None, **unused_kwargs):
- ...
-
-This object is now compatible with ``np.sum`` again because any extraneous arguments
-(i.e. keywords that are not ``axis`` or ``dtype``) will be hidden away in the
-``**unused_kwargs`` parameter.
-
-"""
diff --git a/numpy/emath.pyi b/numpy/emath.pyi
new file mode 100644
index 000000000..032ec9505
--- /dev/null
+++ b/numpy/emath.pyi
@@ -0,0 +1,11 @@
+from typing import Any
+
+sqrt: Any
+log: Any
+log2: Any
+logn: Any
+log10: Any
+power: Any
+arccos: Any
+arcsin: Any
+arctanh: Any
diff --git a/numpy/f2py/__init__.py b/numpy/f2py/__init__.py
index 949bac0ff..07ab6cd7d 100644
--- a/numpy/f2py/__init__.py
+++ b/numpy/f2py/__init__.py
@@ -9,7 +9,6 @@ import subprocess
import os
from . import f2py2e
-from . import f2py_testing
from . import diagnose
run_main = f2py2e.run_main
@@ -21,7 +20,8 @@ def compile(source,
extra_args='',
verbose=True,
source_fn=None,
- extension='.f'
+ extension='.f',
+ full_output=False
):
"""
Build extension module from a Fortran 77 source string with f2py.
@@ -55,10 +55,19 @@ def compile(source,
.. versionadded:: 1.11.0
+ full_output : bool, optional
+ If True, return a `subprocess.CompletedProcess` containing
+ the stdout and stderr of the compile process, instead of just
+ the status code.
+
+ .. versionadded:: 1.20.0
+
+
Returns
-------
- result : int
- 0 on success
+ result : int or `subprocess.CompletedProcess`
+ 0 on success, or a `subprocess.CompletedProcess` if
+ ``full_output=True``
Examples
--------
@@ -95,24 +104,50 @@ def compile(source,
'-c',
'import numpy.f2py as f2py2e;f2py2e.main()'] + args
try:
- output = subprocess.check_output(c)
- except subprocess.CalledProcessError as exc:
- status = exc.returncode
- output = ''
+ cp = subprocess.run(c, stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
except OSError:
# preserve historic status code used by exec_command()
- status = 127
- output = ''
+ cp = subprocess.CompletedProcess(c, 127, stdout=b'', stderr=b'')
else:
- status = 0
- output = output.decode()
- if verbose:
- print(output)
+ if verbose:
+ print(cp.stdout.decode())
finally:
if source_fn is None:
os.remove(fname)
- return status
-from numpy._pytesttester import PytestTester
-test = PytestTester(__name__)
-del PytestTester
+ if full_output:
+ return cp
+ else:
+ return cp.returncode
+
+
+if sys.version_info[:2] >= (3, 7):
+ # module level getattr is only supported in 3.7 onwards
+ # https://www.python.org/dev/peps/pep-0562/
+ def __getattr__(attr):
+
+ # Avoid importing things that aren't needed for building
+ # which might import the main numpy module
+ if attr == "f2py_testing":
+ import numpy.f2py.f2py_testing as f2py_testing
+ return f2py_testing
+
+ elif attr == "test":
+ from numpy._pytesttester import PytestTester
+ test = PytestTester(__name__)
+ return test
+
+ else:
+ raise AttributeError("module {!r} has no attribute "
+ "{!r}".format(__name__, attr))
+
+ def __dir__():
+ return list(globals().keys() | {"f2py_testing", "test"})
+
+else:
+ from . import f2py_testing
+
+ from numpy._pytesttester import PytestTester
+ test = PytestTester(__name__)
+ del PytestTester
diff --git a/numpy/f2py/__init__.pyi b/numpy/f2py/__init__.pyi
new file mode 100644
index 000000000..602517957
--- /dev/null
+++ b/numpy/f2py/__init__.pyi
@@ -0,0 +1,5 @@
+from typing import Any
+
+run_main: Any
+compile: Any
+f2py_testing: Any
diff --git a/numpy/f2py/__version__.py b/numpy/f2py/__version__.py
index 104c2e1a8..e20d7c1db 100644
--- a/numpy/f2py/__version__.py
+++ b/numpy/f2py/__version__.py
@@ -1,8 +1 @@
-major = 2
-
-try:
- from __svn_version__ import version
- version_info = (major, version)
- version = '%s_%s' % version_info
-except (ImportError, ValueError):
- version = str(major)
+from numpy.version import version
diff --git a/numpy/f2py/capi_maps.py b/numpy/f2py/capi_maps.py
index fabbfc4c2..472ddde43 100644
--- a/numpy/f2py/capi_maps.py
+++ b/numpy/f2py/capi_maps.py
@@ -11,8 +11,6 @@ $Date: 2005/05/06 10:57:33 $
Pearu Peterson
"""
-__version__ = "$Revision: 1.60 $"[10:-1]
-
from . import __version__
f2py_version = __version__.version
diff --git a/numpy/f2py/cfuncs.py b/numpy/f2py/cfuncs.py
index ccbc9b0fb..26b43e7e6 100644
--- a/numpy/f2py/cfuncs.py
+++ b/numpy/f2py/cfuncs.py
@@ -286,11 +286,11 @@ static int f2py_size(PyArrayObject* var, ...)
"""
cppmacros[
- 'pyobj_from_char1'] = '#define pyobj_from_char1(v) (PyInt_FromLong(v))'
+ 'pyobj_from_char1'] = '#define pyobj_from_char1(v) (PyLong_FromLong(v))'
cppmacros[
- 'pyobj_from_short1'] = '#define pyobj_from_short1(v) (PyInt_FromLong(v))'
+ 'pyobj_from_short1'] = '#define pyobj_from_short1(v) (PyLong_FromLong(v))'
needs['pyobj_from_int1'] = ['signed_char']
-cppmacros['pyobj_from_int1'] = '#define pyobj_from_int1(v) (PyInt_FromLong(v))'
+cppmacros['pyobj_from_int1'] = '#define pyobj_from_int1(v) (PyLong_FromLong(v))'
cppmacros[
'pyobj_from_long1'] = '#define pyobj_from_long1(v) (PyLong_FromLong(v))'
needs['pyobj_from_long_long1'] = ['long_long']
@@ -320,10 +320,10 @@ cppmacros[
'pyobj_from_complex_float1'] = '#define pyobj_from_complex_float1(v) (PyComplex_FromDoubles(v.r,v.i))'
needs['pyobj_from_string1'] = ['string']
cppmacros[
- 'pyobj_from_string1'] = '#define pyobj_from_string1(v) (PyString_FromString((char *)v))'
+ 'pyobj_from_string1'] = '#define pyobj_from_string1(v) (PyUnicode_FromString((char *)v))'
needs['pyobj_from_string1size'] = ['string']
cppmacros[
- 'pyobj_from_string1size'] = '#define pyobj_from_string1size(v,len) (PyUString_FromStringAndSize((char *)v, len))'
+ 'pyobj_from_string1size'] = '#define pyobj_from_string1size(v,len) (PyUnicode_FromStringAndSize((char *)v, len))'
needs['TRYPYARRAYTEMPLATE'] = ['PRINTPYOBJERR']
cppmacros['TRYPYARRAYTEMPLATE'] = """\
/* New SciPy */
@@ -436,9 +436,9 @@ cppmacros['GETSTRFROMPYTUPLE'] = """\
PyObject *rv_cb_str = PyTuple_GetItem((tuple),(index));\\
if (rv_cb_str == NULL)\\
goto capi_fail;\\
- if (PyString_Check(rv_cb_str)) {\\
+ if (PyBytes_Check(rv_cb_str)) {\\
str[len-1]='\\0';\\
- STRINGCOPYN((str),PyString_AS_STRING((PyStringObject*)rv_cb_str),(len));\\
+ STRINGCOPYN((str),PyBytes_AS_STRING((PyBytesObject*)rv_cb_str),(len));\\
} else {\\
PRINTPYOBJERR(rv_cb_str);\\
PyErr_SetString(#modulename#_error,\"string object expected\");\\
@@ -629,7 +629,9 @@ capi_fail:
"""
needs['string_from_pyobj'] = ['string', 'STRINGMALLOC', 'STRINGCOPYN']
cfuncs['string_from_pyobj'] = """\
-static int string_from_pyobj(string *str,int *len,const string inistr,PyObject *obj,const char *errmess) {
+static int
+string_from_pyobj(string *str,int *len,const string inistr,PyObject *obj,const char *errmess)
+{
PyArrayObject *arr = NULL;
PyObject *tmp = NULL;
#ifdef DEBUGCFUNCS
@@ -655,7 +657,7 @@ fprintf(stderr,\"string_from_pyobj(str='%s',len=%d,inistr='%s',obj=%p)\\n\",(cha
STRINGCOPYN(*str,PyArray_DATA(arr),*len+1);
return 1;
}
- if (PyString_Check(obj)) {
+ if (PyBytes_Check(obj)) {
tmp = obj;
Py_INCREF(tmp);
}
@@ -675,200 +677,244 @@ fprintf(stderr,\"string_from_pyobj(str='%s',len=%d,inistr='%s',obj=%p)\\n\",(cha
}
if (tmp == NULL) goto capi_fail;
if (*len == -1)
- *len = PyString_GET_SIZE(tmp);
+ *len = PyBytes_GET_SIZE(tmp);
STRINGMALLOC(*str,*len);
- STRINGCOPYN(*str,PyString_AS_STRING(tmp),*len+1);
+ STRINGCOPYN(*str,PyBytes_AS_STRING(tmp),*len+1);
Py_DECREF(tmp);
return 1;
capi_fail:
Py_XDECREF(tmp);
{
PyObject* err = PyErr_Occurred();
- if (err==NULL) err = #modulename#_error;
- PyErr_SetString(err,errmess);
+ if (err == NULL) {
+ err = #modulename#_error;
+ }
+ PyErr_SetString(err, errmess);
}
return 0;
}
"""
+
+
needs['char_from_pyobj'] = ['int_from_pyobj']
cfuncs['char_from_pyobj'] = """\
-static int char_from_pyobj(char* v,PyObject *obj,const char *errmess) {
- int i=0;
- if (int_from_pyobj(&i,obj,errmess)) {
+static int
+char_from_pyobj(char* v, PyObject *obj, const char *errmess) {
+ int i = 0;
+ if (int_from_pyobj(&i, obj, errmess)) {
*v = (char)i;
return 1;
}
return 0;
}
"""
+
+
needs['signed_char_from_pyobj'] = ['int_from_pyobj', 'signed_char']
cfuncs['signed_char_from_pyobj'] = """\
-static int signed_char_from_pyobj(signed_char* v,PyObject *obj,const char *errmess) {
- int i=0;
- if (int_from_pyobj(&i,obj,errmess)) {
+static int
+signed_char_from_pyobj(signed_char* v, PyObject *obj, const char *errmess) {
+ int i = 0;
+ if (int_from_pyobj(&i, obj, errmess)) {
*v = (signed_char)i;
return 1;
}
return 0;
}
"""
+
+
needs['short_from_pyobj'] = ['int_from_pyobj']
cfuncs['short_from_pyobj'] = """\
-static int short_from_pyobj(short* v,PyObject *obj,const char *errmess) {
- int i=0;
- if (int_from_pyobj(&i,obj,errmess)) {
+static int
+short_from_pyobj(short* v, PyObject *obj, const char *errmess) {
+ int i = 0;
+ if (int_from_pyobj(&i, obj, errmess)) {
*v = (short)i;
return 1;
}
return 0;
}
"""
+
+
cfuncs['int_from_pyobj'] = """\
-static int int_from_pyobj(int* v,PyObject *obj,const char *errmess) {
+static int
+int_from_pyobj(int* v, PyObject *obj, const char *errmess)
+{
PyObject* tmp = NULL;
- if (PyInt_Check(obj)) {
- *v = (int)PyInt_AS_LONG(obj);
- return 1;
+
+ if (PyLong_Check(obj)) {
+ *v = Npy__PyLong_AsInt(obj);
+ return !(*v == -1 && PyErr_Occurred());
}
- tmp = PyNumber_Int(obj);
+
+ tmp = PyNumber_Long(obj);
if (tmp) {
- *v = PyInt_AS_LONG(tmp);
+ *v = Npy__PyLong_AsInt(tmp);
Py_DECREF(tmp);
- return 1;
+ return !(*v == -1 && PyErr_Occurred());
}
+
if (PyComplex_Check(obj))
tmp = PyObject_GetAttrString(obj,\"real\");
- else if (PyString_Check(obj) || PyUnicode_Check(obj))
+ else if (PyBytes_Check(obj) || PyUnicode_Check(obj))
/*pass*/;
else if (PySequence_Check(obj))
- tmp = PySequence_GetItem(obj,0);
+ tmp = PySequence_GetItem(obj, 0);
if (tmp) {
PyErr_Clear();
- if (int_from_pyobj(v,tmp,errmess)) {Py_DECREF(tmp); return 1;}
+ if (int_from_pyobj(v, tmp, errmess)) {
+ Py_DECREF(tmp);
+ return 1;
+ }
Py_DECREF(tmp);
}
{
PyObject* err = PyErr_Occurred();
- if (err==NULL) err = #modulename#_error;
- PyErr_SetString(err,errmess);
+ if (err == NULL) {
+ err = #modulename#_error;
+ }
+ PyErr_SetString(err, errmess);
}
return 0;
}
"""
+
+
cfuncs['long_from_pyobj'] = """\
-static int long_from_pyobj(long* v,PyObject *obj,const char *errmess) {
+static int
+long_from_pyobj(long* v, PyObject *obj, const char *errmess) {
PyObject* tmp = NULL;
- if (PyInt_Check(obj)) {
- *v = PyInt_AS_LONG(obj);
- return 1;
+
+ if (PyLong_Check(obj)) {
+ *v = PyLong_AsLong(obj);
+ return !(*v == -1 && PyErr_Occurred());
}
- tmp = PyNumber_Int(obj);
+
+ tmp = PyNumber_Long(obj);
if (tmp) {
- *v = PyInt_AS_LONG(tmp);
+ *v = PyLong_AsLong(tmp);
Py_DECREF(tmp);
- return 1;
+ return !(*v == -1 && PyErr_Occurred());
}
+
if (PyComplex_Check(obj))
tmp = PyObject_GetAttrString(obj,\"real\");
- else if (PyString_Check(obj) || PyUnicode_Check(obj))
+ else if (PyBytes_Check(obj) || PyUnicode_Check(obj))
/*pass*/;
else if (PySequence_Check(obj))
tmp = PySequence_GetItem(obj,0);
+
if (tmp) {
PyErr_Clear();
- if (long_from_pyobj(v,tmp,errmess)) {Py_DECREF(tmp); return 1;}
+ if (long_from_pyobj(v, tmp, errmess)) {
+ Py_DECREF(tmp);
+ return 1;
+ }
Py_DECREF(tmp);
}
{
PyObject* err = PyErr_Occurred();
- if (err==NULL) err = #modulename#_error;
- PyErr_SetString(err,errmess);
+ if (err == NULL) {
+ err = #modulename#_error;
+ }
+ PyErr_SetString(err, errmess);
}
return 0;
}
"""
+
+
needs['long_long_from_pyobj'] = ['long_long']
cfuncs['long_long_from_pyobj'] = """\
-static int long_long_from_pyobj(long_long* v,PyObject *obj,const char *errmess) {
+static int
+long_long_from_pyobj(long_long* v, PyObject *obj, const char *errmess)
+{
PyObject* tmp = NULL;
+
if (PyLong_Check(obj)) {
*v = PyLong_AsLongLong(obj);
- return (!PyErr_Occurred());
- }
- if (PyInt_Check(obj)) {
- *v = (long_long)PyInt_AS_LONG(obj);
- return 1;
+ return !(*v == -1 && PyErr_Occurred());
}
+
tmp = PyNumber_Long(obj);
if (tmp) {
*v = PyLong_AsLongLong(tmp);
Py_DECREF(tmp);
- return (!PyErr_Occurred());
+ return !(*v == -1 && PyErr_Occurred());
}
+
if (PyComplex_Check(obj))
tmp = PyObject_GetAttrString(obj,\"real\");
- else if (PyString_Check(obj) || PyUnicode_Check(obj))
+ else if (PyBytes_Check(obj) || PyUnicode_Check(obj))
/*pass*/;
else if (PySequence_Check(obj))
tmp = PySequence_GetItem(obj,0);
if (tmp) {
PyErr_Clear();
- if (long_long_from_pyobj(v,tmp,errmess)) {Py_DECREF(tmp); return 1;}
+ if (long_long_from_pyobj(v, tmp, errmess)) {
+ Py_DECREF(tmp);
+ return 1;
+ }
Py_DECREF(tmp);
}
{
PyObject* err = PyErr_Occurred();
- if (err==NULL) err = #modulename#_error;
+ if (err == NULL) {
+ err = #modulename#_error;
+ }
PyErr_SetString(err,errmess);
}
return 0;
}
"""
+
+
needs['long_double_from_pyobj'] = ['double_from_pyobj', 'long_double']
cfuncs['long_double_from_pyobj'] = """\
-static int long_double_from_pyobj(long_double* v,PyObject *obj,const char *errmess) {
+static int
+long_double_from_pyobj(long_double* v, PyObject *obj, const char *errmess)
+{
double d=0;
if (PyArray_CheckScalar(obj)){
if PyArray_IsScalar(obj, LongDouble) {
PyArray_ScalarAsCtype(obj, v);
return 1;
}
- else if (PyArray_Check(obj) && PyArray_TYPE(obj)==NPY_LONGDOUBLE) {
+ else if (PyArray_Check(obj) && PyArray_TYPE(obj) == NPY_LONGDOUBLE) {
(*v) = *((npy_longdouble *)PyArray_DATA(obj));
return 1;
}
}
- if (double_from_pyobj(&d,obj,errmess)) {
+ if (double_from_pyobj(&d, obj, errmess)) {
*v = (long_double)d;
return 1;
}
return 0;
}
"""
+
+
cfuncs['double_from_pyobj'] = """\
-static int double_from_pyobj(double* v,PyObject *obj,const char *errmess) {
+static int
+double_from_pyobj(double* v, PyObject *obj, const char *errmess)
+{
PyObject* tmp = NULL;
if (PyFloat_Check(obj)) {
-#ifdef __sgi
*v = PyFloat_AsDouble(obj);
-#else
- *v = PyFloat_AS_DOUBLE(obj);
-#endif
- return 1;
+ return !(*v == -1.0 && PyErr_Occurred());
}
+
tmp = PyNumber_Float(obj);
if (tmp) {
-#ifdef __sgi
*v = PyFloat_AsDouble(tmp);
-#else
- *v = PyFloat_AS_DOUBLE(tmp);
-#endif
Py_DECREF(tmp);
- return 1;
+ return !(*v == -1.0 && PyErr_Occurred());
}
if (PyComplex_Check(obj))
tmp = PyObject_GetAttrString(obj,\"real\");
- else if (PyString_Check(obj) || PyUnicode_Check(obj))
+ else if (PyBytes_Check(obj) || PyUnicode_Check(obj))
/*pass*/;
else if (PySequence_Check(obj))
tmp = PySequence_GetItem(obj,0);
@@ -885,9 +931,13 @@ static int double_from_pyobj(double* v,PyObject *obj,const char *errmess) {
return 0;
}
"""
+
+
needs['float_from_pyobj'] = ['double_from_pyobj']
cfuncs['float_from_pyobj'] = """\
-static int float_from_pyobj(float* v,PyObject *obj,const char *errmess) {
+static int
+float_from_pyobj(float* v, PyObject *obj, const char *errmess)
+{
double d=0.0;
if (double_from_pyobj(&d,obj,errmess)) {
*v = (float)d;
@@ -896,11 +946,15 @@ static int float_from_pyobj(float* v,PyObject *obj,const char *errmess) {
return 0;
}
"""
+
+
needs['complex_long_double_from_pyobj'] = ['complex_long_double', 'long_double',
'complex_double_from_pyobj']
cfuncs['complex_long_double_from_pyobj'] = """\
-static int complex_long_double_from_pyobj(complex_long_double* v,PyObject *obj,const char *errmess) {
- complex_double cd={0.0,0.0};
+static int
+complex_long_double_from_pyobj(complex_long_double* v, PyObject *obj, const char *errmess)
+{
+ complex_double cd = {0.0,0.0};
if (PyArray_CheckScalar(obj)){
if PyArray_IsScalar(obj, CLongDouble) {
PyArray_ScalarAsCtype(obj, v);
@@ -920,13 +974,17 @@ static int complex_long_double_from_pyobj(complex_long_double* v,PyObject *obj,c
return 0;
}
"""
+
+
needs['complex_double_from_pyobj'] = ['complex_double']
cfuncs['complex_double_from_pyobj'] = """\
-static int complex_double_from_pyobj(complex_double* v,PyObject *obj,const char *errmess) {
+static int
+complex_double_from_pyobj(complex_double* v, PyObject *obj, const char *errmess) {
Py_complex c;
if (PyComplex_Check(obj)) {
- c=PyComplex_AsCComplex(obj);
- (*v).r=c.real, (*v).i=c.imag;
+ c = PyComplex_AsCComplex(obj);
+ (*v).r = c.real;
+ (*v).i = c.imag;
return 1;
}
if (PyArray_IsScalar(obj, ComplexFloating)) {
@@ -955,30 +1013,24 @@ static int complex_double_from_pyobj(complex_double* v,PyObject *obj,const char
else {
arr = PyArray_FromScalar(obj, PyArray_DescrFromType(NPY_CDOUBLE));
}
- if (arr==NULL) return 0;
+ if (arr == NULL) {
+ return 0;
+ }
(*v).r = ((npy_cdouble *)PyArray_DATA(arr))->real;
(*v).i = ((npy_cdouble *)PyArray_DATA(arr))->imag;
return 1;
}
/* Python does not provide PyNumber_Complex function :-( */
- (*v).i=0.0;
+ (*v).i = 0.0;
if (PyFloat_Check(obj)) {
-#ifdef __sgi
(*v).r = PyFloat_AsDouble(obj);
-#else
- (*v).r = PyFloat_AS_DOUBLE(obj);
-#endif
- return 1;
- }
- if (PyInt_Check(obj)) {
- (*v).r = (double)PyInt_AS_LONG(obj);
- return 1;
+ return !((*v).r == -1.0 && PyErr_Occurred());
}
if (PyLong_Check(obj)) {
(*v).r = PyLong_AsDouble(obj);
- return (!PyErr_Occurred());
+ return !((*v).r == -1.0 && PyErr_Occurred());
}
- if (PySequence_Check(obj) && !(PyString_Check(obj) || PyUnicode_Check(obj))) {
+ if (PySequence_Check(obj) && !(PyBytes_Check(obj) || PyUnicode_Check(obj))) {
PyObject *tmp = PySequence_GetItem(obj,0);
if (tmp) {
if (complex_double_from_pyobj(v,tmp,errmess)) {
@@ -997,10 +1049,14 @@ static int complex_double_from_pyobj(complex_double* v,PyObject *obj,const char
return 0;
}
"""
+
+
needs['complex_float_from_pyobj'] = [
'complex_float', 'complex_double_from_pyobj']
cfuncs['complex_float_from_pyobj'] = """\
-static int complex_float_from_pyobj(complex_float* v,PyObject *obj,const char *errmess) {
+static int
+complex_float_from_pyobj(complex_float* v,PyObject *obj,const char *errmess)
+{
complex_double cd={0.0,0.0};
if (complex_double_from_pyobj(&cd,obj,errmess)) {
(*v).r = (float)cd.r;
@@ -1010,6 +1066,8 @@ static int complex_float_from_pyobj(complex_float* v,PyObject *obj,const char *e
return 0;
}
"""
+
+
needs['try_pyarr_from_char'] = ['pyobj_from_char1', 'TRYPYARRAYTEMPLATE']
cfuncs[
'try_pyarr_from_char'] = 'static int try_pyarr_from_char(PyObject* obj,char* v) {\n TRYPYARRAYTEMPLATE(char,\'c\');\n}\n'
@@ -1047,14 +1105,18 @@ needs['try_pyarr_from_complex_double'] = [
cfuncs[
'try_pyarr_from_complex_double'] = 'static int try_pyarr_from_complex_double(PyObject* obj,complex_double* v) {\n TRYCOMPLEXPYARRAYTEMPLATE(double,\'D\');\n}\n'
-needs['create_cb_arglist'] = ['CFUNCSMESS', 'PRINTPYOBJERR', 'MINMAX']
+needs['create_cb_arglist'] = ['CFUNCSMESS', 'PRINTPYOBJERR', 'MINMAX']
# create the list of arguments to be used when calling back to python
cfuncs['create_cb_arglist'] = """\
-static int create_cb_arglist(PyObject* fun,PyTupleObject* xa,const int maxnofargs,const int nofoptargs,int *nofargs,PyTupleObject **args,const char *errmess) {
+static int
+create_cb_arglist(PyObject* fun, PyTupleObject* xa , const int maxnofargs,
+ const int nofoptargs, int *nofargs, PyTupleObject **args,
+ const char *errmess)
+{
PyObject *tmp = NULL;
PyObject *tmp_fun = NULL;
- int tot,opt,ext,siz,i,di=0;
+ Py_ssize_t tot, opt, ext, siz, i, di = 0;
CFUNCSMESS(\"create_cb_arglist\\n\");
tot=opt=ext=siz=0;
/* Get the total number of arguments */
@@ -1103,10 +1165,15 @@ static int create_cb_arglist(PyObject* fun,PyTupleObject* xa,const int maxnofarg
Py_INCREF(tmp_fun);
}
}
-if (tmp_fun==NULL) {
-fprintf(stderr,\"Call-back argument must be function|instance|instance.__call__|f2py-function but got %s.\\n\",(fun==NULL?\"NULL\":Py_TYPE(fun)->tp_name));
-goto capi_fail;
-}
+
+ if (tmp_fun == NULL) {
+ fprintf(stderr,
+ \"Call-back argument must be function|instance|instance.__call__|f2py-function \"
+ \"but got %s.\\n\",
+ ((fun == NULL) ? \"NULL\" : Py_TYPE(fun)->tp_name));
+ goto capi_fail;
+ }
+
if (PyObject_HasAttrString(tmp_fun,\"__code__\")) {
if (PyObject_HasAttrString(tmp = PyObject_GetAttrString(tmp_fun,\"__code__\"),\"co_argcount\")) {
PyObject *tmp_argcount = PyObject_GetAttrString(tmp,\"co_argcount\");
@@ -1114,7 +1181,7 @@ goto capi_fail;
if (tmp_argcount == NULL) {
goto capi_fail;
}
- tot = PyInt_AsLong(tmp_argcount) - di;
+ tot = PyLong_AsSsize_t(tmp_argcount) - di;
Py_DECREF(tmp_argcount);
}
}
@@ -1130,13 +1197,23 @@ goto capi_fail;
/* Calculate the size of call-backs argument list */
siz = MIN(maxnofargs+ext,tot);
*nofargs = MAX(0,siz-ext);
+
#ifdef DEBUGCFUNCS
- fprintf(stderr,\"debug-capi:create_cb_arglist:maxnofargs(-nofoptargs),tot,opt,ext,siz,nofargs=%d(-%d),%d,%d,%d,%d,%d\\n\",maxnofargs,nofoptargs,tot,opt,ext,siz,*nofargs);
+ fprintf(stderr,
+ \"debug-capi:create_cb_arglist:maxnofargs(-nofoptargs),\"
+ \"tot,opt,ext,siz,nofargs = %d(-%d), %zd, %zd, %zd, %zd, %d\\n\",
+ maxnofargs, nofoptargs, tot, opt, ext, siz, *nofargs);
#endif
- if (siz<tot-opt) {
- fprintf(stderr,\"create_cb_arglist: Failed to build argument list (siz) with enough arguments (tot-opt) required by user-supplied function (siz,tot,opt=%d,%d,%d).\\n\",siz,tot,opt);
+
+ if (siz < tot-opt) {
+ fprintf(stderr,
+ \"create_cb_arglist: Failed to build argument list \"
+ \"(siz) with enough arguments (tot-opt) required by \"
+ \"user-supplied function (siz,tot,opt=%zd, %zd, %zd).\\n\",
+ siz, tot, opt);
goto capi_fail;
}
+
/* Initialize argument list */
*args = (PyTupleObject *)PyTuple_New(siz);
for (i=0;i<*nofargs;i++) {
@@ -1152,9 +1229,10 @@ goto capi_fail;
CFUNCSMESS(\"create_cb_arglist-end\\n\");
Py_DECREF(tmp_fun);
return 1;
+
capi_fail:
- if ((PyErr_Occurred())==NULL)
- PyErr_SetString(#modulename#_error,errmess);
+ if (PyErr_Occurred() == NULL)
+ PyErr_SetString(#modulename#_error, errmess);
Py_XDECREF(tmp_fun);
return 0;
}
diff --git a/numpy/f2py/common_rules.py b/numpy/f2py/common_rules.py
index 90483e55b..937d8bc72 100644
--- a/numpy/f2py/common_rules.py
+++ b/numpy/f2py/common_rules.py
@@ -13,8 +13,6 @@ $Date: 2005/05/06 10:57:33 $
Pearu Peterson
"""
-__version__ = "$Revision: 1.19 $"[10:-1]
-
from . import __version__
f2py_version = __version__.version
diff --git a/numpy/f2py/crackfortran.py b/numpy/f2py/crackfortran.py
index 3d2f97a56..2e95e4596 100755
--- a/numpy/f2py/crackfortran.py
+++ b/numpy/f2py/crackfortran.py
@@ -2103,8 +2103,9 @@ def buildimplicitrules(block):
def myeval(e, g=None, l=None):
+ """ Like `eval` but returns only integers and floats """
r = eval(e, g, l)
- if type(r) in [type(0), type(0.0)]:
+ if type(r) in [int, float]:
return r
raise ValueError('r=%r' % (r))
@@ -2112,6 +2113,26 @@ getlincoef_re_1 = re.compile(r'\A\b\w+\b\Z', re.I)
def getlincoef(e, xset): # e = a*x+b ; x in xset
+ """
+ Obtain ``a`` and ``b`` when ``e == "a*x+b"``, where ``x`` is a symbol in
+ xset.
+
+ >>> getlincoef('2*x + 1', {'x'})
+ (2, 1, 'x')
+ >>> getlincoef('3*x + x*2 + 2 + 1', {'x'})
+ (5, 3, 'x')
+ >>> getlincoef('0', {'x'})
+ (0, 0, None)
+ >>> getlincoef('0*x', {'x'})
+ (0, 0, 'x')
+ >>> getlincoef('x*x', {'x'})
+ (None, None, None)
+
+ This can be tricked by sufficiently complex expressions
+
+ >>> getlincoef('(x - 0.5)*(x - 1.5)*(x - 1)*x + 2*x + 3', {'x'})
+ (2.0, 3.0, 'x')
+ """
try:
c = int(myeval(e, {}, {}))
return 0, c, None
@@ -2166,6 +2187,37 @@ _varname_match = re.compile(r'\A[a-z]\w*\Z').match
def getarrlen(dl, args, star='*'):
+ """
+ Parameters
+ ----------
+ dl : sequence of two str objects
+ dimensions of the array
+ args : Iterable[str]
+ symbols used in the expression
+ star : Any
+ unused
+
+ Returns
+ -------
+ expr : str
+ Some numeric expression as a string
+ arg : Optional[str]
+ If understood, the argument from `args` present in `expr`
+ expr2 : Optional[str]
+ If understood, an expression fragment that should be used as
+        ``"(%s%s" % (something, expr2)``.
+
+ Examples
+ --------
+ >>> getarrlen(['10*x + 20', '40*x'], {'x'})
+ ('30 * x - 19', 'x', '+19)/(30)')
+ >>> getarrlen(['1', '10*x + 20'], {'x'})
+ ('10 * x + 20', 'x', '-20)/(10)')
+ >>> getarrlen(['10*x + 20', '1'], {'x'})
+ ('-10 * x - 18', 'x', '+18)/(-10)')
+ >>> getarrlen(['20', '1'], {'x'})
+ ('-18', None, None)
+ """
edl = []
try:
edl.append(myeval(dl[0], {}, {}))
diff --git a/numpy/f2py/f2py2e.py b/numpy/f2py/f2py2e.py
index 71a049e41..b45d985aa 100755
--- a/numpy/f2py/f2py2e.py
+++ b/numpy/f2py/f2py2e.py
@@ -29,18 +29,14 @@ from . import __version__
from . import capi_maps
f2py_version = __version__.version
+numpy_version = __version__.version
errmess = sys.stderr.write
# outmess=sys.stdout.write
show = pprint.pprint
outmess = auxfuncs.outmess
-try:
- from numpy import __version__ as numpy_version
-except ImportError:
- numpy_version = 'N/A'
-
-__usage__ = """\
-Usage:
+__usage__ =\
+f"""Usage:
1) To construct extension module sources:
@@ -97,8 +93,8 @@ Options:
--[no-]latex-doc Create (or not) <modulename>module.tex.
Default is --no-latex-doc.
--short-latex Create 'incomplete' LaTeX document (without commands
- \\documentclass, \\tableofcontents, and \\begin{document},
- \\end{document}).
+ \\documentclass, \\tableofcontents, and \\begin{{document}},
+ \\end{{document}}).
--[no-]rest-doc Create (or not) <modulename>module.rst.
Default is --no-rest-doc.
@@ -167,12 +163,12 @@ Extra options (only effective with -c):
array. Integer <int> sets the threshold for array sizes when
a message should be shown.
-Version: %s
-numpy Version: %s
+Version: {f2py_version}
+numpy Version: {numpy_version}
Requires: Python 3.5 or higher.
License: NumPy license (see LICENSE.txt in the NumPy source code)
Copyright 1999 - 2011 Pearu Peterson all rights reserved.
-http://cens.ioc.ee/projects/f2py2e/""" % (f2py_version, numpy_version)
+http://cens.ioc.ee/projects/f2py2e/"""
def scaninputline(inputline):
@@ -604,7 +600,7 @@ def run_compile():
if modulename:
break
- extra_objects, sources = filter_files('', '[.](o|a|so)', sources)
+ extra_objects, sources = filter_files('', '[.](o|a|so|dylib)', sources)
include_dirs, sources = filter_files('-I', '', sources, remove_prefix=1)
library_dirs, sources = filter_files('-L', '', sources, remove_prefix=1)
libraries, sources = filter_files('-l', '', sources, remove_prefix=1)
@@ -650,7 +646,9 @@ def run_compile():
sys.argv.extend(['build',
'--build-temp', build_dir,
'--build-base', build_dir,
- '--build-platlib', '.'])
+ '--build-platlib', '.',
+ # disable CCompilerOpt
+ '--disable-optimization'])
if fc_flags:
sys.argv.extend(['config_fc'] + fc_flags)
if flib_flags:
diff --git a/numpy/f2py/rules.py b/numpy/f2py/rules.py
index 56f2033ff..f1490527e 100755
--- a/numpy/f2py/rules.py
+++ b/numpy/f2py/rules.py
@@ -50,18 +50,15 @@ $Date: 2005/08/30 08:58:42 $
Pearu Peterson
"""
-__version__ = "$Revision: 1.129 $"[10:-1]
-
-from . import __version__
-f2py_version = __version__.version
-
-from .. import version as _numpy_version
-numpy_version = _numpy_version.version
-
import os
import time
import copy
+# __version__.version is now the same as the NumPy version
+from . import __version__
+f2py_version = __version__.version
+numpy_version = __version__.version
+
from .auxfuncs import (
applyrules, debugcapi, dictappend, errmess, gentitle, getargs2,
hascallstatement, hasexternals, hasinitvalue, hasnote, hasresultnote,
@@ -202,7 +199,7 @@ PyMODINIT_FUNC PyInit_#modulename#(void) {
\tif (PyErr_Occurred())
\t\t{PyErr_SetString(PyExc_ImportError, \"can't initialize module #modulename# (failed to import numpy)\"); return m;}
\td = PyModule_GetDict(m);
-\ts = PyString_FromString(\"$R""" + """evision: $\");
+\ts = PyUnicode_FromString(\"#f2py_version#\");
\tPyDict_SetItemString(d, \"__version__\", s);
\tPy_DECREF(s);
\ts = PyUnicode_FromString(
diff --git a/numpy/f2py/setup.py b/numpy/f2py/setup.py
index 80b47e527..0a35db477 100644
--- a/numpy/f2py/setup.py
+++ b/numpy/f2py/setup.py
@@ -30,6 +30,7 @@ def configuration(parent_package='', top_path=None):
config.add_data_files(
'src/fortranobject.c',
'src/fortranobject.h')
+ config.add_data_files('*.pyi')
return config
diff --git a/numpy/f2py/src/fortranobject.c b/numpy/f2py/src/fortranobject.c
index aa46c57d0..3275f90ad 100644
--- a/numpy/f2py/src/fortranobject.c
+++ b/numpy/f2py/src/fortranobject.c
@@ -213,6 +213,8 @@ format_def(char *buf, Py_ssize_t size, FortranDataDef def)
return -1;
}
memcpy(p, notalloc, sizeof(notalloc));
+ p += sizeof(notalloc);
+ size -= sizeof(notalloc);
}
return p - buf;
@@ -255,7 +257,7 @@ fortran_doc(FortranDataDef def)
}
else {
PyArray_Descr *d = PyArray_DescrFromType(def.type);
- n = PyOS_snprintf(p, size, "'%c'-", d->type);
+ n = PyOS_snprintf(p, size, "%s : '%c'-", def.name, d->type);
Py_DECREF(d);
if (n < 0 || n >= size) {
goto fail;
@@ -264,7 +266,7 @@ fortran_doc(FortranDataDef def)
size -= n;
if (def.data == NULL) {
- n = format_def(p, size, def) == -1;
+ n = format_def(p, size, def);
if (n < 0) {
goto fail;
}
@@ -288,6 +290,7 @@ fortran_doc(FortranDataDef def)
p += n;
size -= n;
}
+
}
if (size <= 1) {
goto fail;
diff --git a/numpy/f2py/src/test/foomodule.c b/numpy/f2py/src/test/foomodule.c
index caf3590d4..88ec62440 100644
--- a/numpy/f2py/src/test/foomodule.c
+++ b/numpy/f2py/src/test/foomodule.c
@@ -121,7 +121,7 @@ void initfoo() {
m = Py_InitModule("foo", foo_module_methods);
d = PyModule_GetDict(m);
- s = PyString_FromString("This module 'foo' demonstrates the usage of fortranobject.");
+ s = PyUnicode_FromString("This module 'foo' demonstrates the usage of fortranobject.");
PyDict_SetItemString(d, "__doc__", s);
/* Fortran objects: */
diff --git a/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c b/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c
index 0db33e714..0411b62e0 100644
--- a/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c
+++ b/numpy/f2py/tests/src/array_from_pyobj/wrapmodule.c
@@ -1,14 +1,9 @@
-/* File: wrapmodule.c
- * This file is auto-generated with f2py (version:2_1330).
- * Hand edited by Pearu.
- * f2py is a Fortran to Python Interface Generator (FPIG), Second Edition,
- * written by Pearu Peterson <pearu@cens.ioc.ee>.
- * See http://cens.ioc.ee/projects/f2py2e/
- * Generation date: Fri Oct 21 22:41:12 2005
- * $Revision:$
- * $Date:$
- * Do not edit this file directly unless you know what you are doing!!!
+/*
+ * This file was auto-generated with f2py (version:2_1330) and hand edited by
+ * Pearu for testing purposes. Do not edit this file unless you know what you
+ * are doing!!!
*/
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -55,7 +50,7 @@ static PyObject *f2py_rout_wrap_call(PyObject *capi_self,
if (tmp == NULL) {
goto fail;
}
- dims[i] = (npy_intp)PyInt_AsLong(tmp);
+ dims[i] = (npy_intp)PyLong_AsLong(tmp);
Py_DECREF(tmp);
if (dims[i] == -1 && PyErr_Occurred()) {
goto fail;
@@ -107,8 +102,8 @@ static PyObject *f2py_rout_wrap_attrs(PyObject *capi_self,
dimensions = PyTuple_New(PyArray_NDIM(arr));
strides = PyTuple_New(PyArray_NDIM(arr));
for (i=0;i<PyArray_NDIM(arr);++i) {
- PyTuple_SetItem(dimensions,i,PyInt_FromLong(PyArray_DIM(arr,i)));
- PyTuple_SetItem(strides,i,PyInt_FromLong(PyArray_STRIDE(arr,i)));
+ PyTuple_SetItem(dimensions,i,PyLong_FromLong(PyArray_DIM(arr,i)));
+ PyTuple_SetItem(strides,i,PyLong_FromLong(PyArray_STRIDE(arr,i)));
}
return Py_BuildValue("siNNO(cciii)ii",s,PyArray_NDIM(arr),
dimensions,strides,
@@ -149,15 +144,15 @@ PyMODINIT_FUNC PyInit_test_array_from_pyobj_ext(void) {
if (PyErr_Occurred())
Py_FatalError("can't initialize module wrap (failed to import numpy)");
d = PyModule_GetDict(m);
- s = PyString_FromString("This module 'wrap' is auto-generated with f2py (version:2_1330).\nFunctions:\n"
-" arr = call(type_num,dims,intent,obj)\n"
-".");
+ s = PyUnicode_FromString("This module 'wrap' is auto-generated with f2py (version:2_1330).\nFunctions:\n"
+ " arr = call(type_num,dims,intent,obj)\n"
+ ".");
PyDict_SetItemString(d, "__doc__", s);
wrap_error = PyErr_NewException ("wrap.error", NULL, NULL);
Py_DECREF(s);
#define ADDCONST(NAME, CONST) \
- s = PyInt_FromLong(CONST); \
+ s = PyLong_FromLong(CONST); \
PyDict_SetItemString(d, NAME, s); \
Py_DECREF(s)
diff --git a/numpy/f2py/tests/src/module_data/mod.mod b/numpy/f2py/tests/src/module_data/mod.mod
new file mode 100644
index 000000000..8670a97e9
--- /dev/null
+++ b/numpy/f2py/tests/src/module_data/mod.mod
Binary files differ
diff --git a/numpy/f2py/tests/src/module_data/module_data_docstring.f90 b/numpy/f2py/tests/src/module_data/module_data_docstring.f90
new file mode 100644
index 000000000..4505e0cbc
--- /dev/null
+++ b/numpy/f2py/tests/src/module_data/module_data_docstring.f90
@@ -0,0 +1,12 @@
+module mod
+ integer :: i
+ integer :: x(4)
+ real, dimension(2,3) :: a
+ real, allocatable, dimension(:,:) :: b
+contains
+ subroutine foo
+ integer :: k
+ k = 1
+ a(1,2) = a(1,2)+3
+ end subroutine foo
+end module mod
diff --git a/numpy/f2py/tests/test_block_docstring.py b/numpy/f2py/tests/test_block_docstring.py
index e431f5ba6..7d725165b 100644
--- a/numpy/f2py/tests/test_block_docstring.py
+++ b/numpy/f2py/tests/test_block_docstring.py
@@ -19,5 +19,5 @@ class TestBlockDocString(util.F2PyTest):
@pytest.mark.xfail(IS_PYPY,
reason="PyPy cannot modify tp_doc after PyType_Ready")
def test_block_docstring(self):
- expected = "'i'-array(2,3)\n"
+ expected = "bar : 'i'-array(2,3)\n"
assert_equal(self.module.block.__doc__, expected)
diff --git a/numpy/f2py/tests/test_module_doc.py b/numpy/f2py/tests/test_module_doc.py
new file mode 100644
index 000000000..4b9555cee
--- /dev/null
+++ b/numpy/f2py/tests/test_module_doc.py
@@ -0,0 +1,30 @@
+import os
+import sys
+import pytest
+import textwrap
+
+from . import util
+from numpy.testing import assert_equal, IS_PYPY
+
+
+def _path(*a):
+ return os.path.join(*((os.path.dirname(__file__),) + a))
+
+
+class TestModuleDocString(util.F2PyTest):
+ sources = [_path('src', 'module_data', 'module_data_docstring.f90')]
+
+ @pytest.mark.skipif(sys.platform=='win32',
+ reason='Fails with MinGW64 Gfortran (Issue #9673)')
+ @pytest.mark.xfail(IS_PYPY,
+ reason="PyPy cannot modify tp_doc after PyType_Ready")
+ def test_module_docstring(self):
+ assert_equal(self.module.mod.__doc__,
+ textwrap.dedent('''\
+ i : 'i'-scalar
+ x : 'i'-array(4)
+ a : 'f'-array(2,3)
+ b : 'f'-array(-1,-1), not allocated\x00
+ foo()\n
+ Wrapper for ``foo``.\n\n''')
+ )
diff --git a/numpy/f2py/tests/util.py b/numpy/f2py/tests/util.py
index c5b06697d..d5fa76fed 100644
--- a/numpy/f2py/tests/util.py
+++ b/numpy/f2py/tests/util.py
@@ -19,8 +19,6 @@ from numpy.compat import asbytes, asstr
from numpy.testing import temppath
from importlib import import_module
-from hashlib import md5
-
#
# Maintaining a temporary module directory
#
diff --git a/numpy/fft/__init__.py b/numpy/fft/__init__.py
index af0859def..a86bb3ac0 100644
--- a/numpy/fft/__init__.py
+++ b/numpy/fft/__init__.py
@@ -4,6 +4,9 @@ Discrete Fourier Transform (:mod:`numpy.fft`)
.. currentmodule:: numpy.fft
+The SciPy module `scipy.fft` is a more comprehensive superset
+of ``numpy.fft``, which includes only a basic set of routines.
+
Standard FFTs
-------------
diff --git a/numpy/fft/__init__.pyi b/numpy/fft/__init__.pyi
new file mode 100644
index 000000000..45190517f
--- /dev/null
+++ b/numpy/fft/__init__.pyi
@@ -0,0 +1,20 @@
+from typing import Any
+
+fft: Any
+ifft: Any
+rfft: Any
+irfft: Any
+hfft: Any
+ihfft: Any
+rfftn: Any
+irfftn: Any
+rfft2: Any
+irfft2: Any
+fft2: Any
+ifft2: Any
+fftn: Any
+ifftn: Any
+fftshift: Any
+ifftshift: Any
+fftfreq: Any
+rfftfreq: Any
diff --git a/numpy/fft/_pocketfft.c b/numpy/fft/_pocketfft.c
index 764116a84..ba9995f97 100644
--- a/numpy/fft/_pocketfft.c
+++ b/numpy/fft/_pocketfft.c
@@ -2206,7 +2206,6 @@ execute_complex(PyObject *a1, int is_forward, double fct)
double *dptr = (double *)PyArray_DATA(data);
int fail=0;
Py_BEGIN_ALLOW_THREADS;
- NPY_SIGINT_ON;
plan = make_cfft_plan(npts);
if (!plan) fail=1;
if (!fail)
@@ -2217,7 +2216,6 @@ execute_complex(PyObject *a1, int is_forward, double fct)
dptr += npts*2;
}
if (plan) destroy_cfft_plan(plan);
- NPY_SIGINT_OFF;
Py_END_ALLOW_THREADS;
if (fail) {
Py_XDECREF(data);
@@ -2258,7 +2256,6 @@ execute_real_forward(PyObject *a1, double fct)
*dptr = (double *)PyArray_DATA(data);
Py_BEGIN_ALLOW_THREADS;
- NPY_SIGINT_ON;
plan = make_rfft_plan(npts);
if (!plan) fail=1;
if (!fail)
@@ -2272,7 +2269,6 @@ execute_real_forward(PyObject *a1, double fct)
dptr += npts;
}
if (plan) destroy_rfft_plan(plan);
- NPY_SIGINT_OFF;
Py_END_ALLOW_THREADS;
}
if (fail) {
@@ -2303,7 +2299,6 @@ execute_real_backward(PyObject *a1, double fct)
*dptr = (double *)PyArray_DATA(data);
Py_BEGIN_ALLOW_THREADS;
- NPY_SIGINT_ON;
plan = make_rfft_plan(npts);
if (!plan) fail=1;
if (!fail) {
@@ -2316,7 +2311,6 @@ execute_real_backward(PyObject *a1, double fct)
}
}
if (plan) destroy_rfft_plan(plan);
- NPY_SIGINT_OFF;
Py_END_ALLOW_THREADS;
}
if (fail) {
diff --git a/numpy/fft/_pocketfft.py b/numpy/fft/_pocketfft.py
index 38ea69834..83ac86036 100644
--- a/numpy/fft/_pocketfft.py
+++ b/numpy/fft/_pocketfft.py
@@ -411,7 +411,7 @@ def rfft(a, n=None, axis=-1, norm=None):
@array_function_dispatch(_fft_dispatcher)
def irfft(a, n=None, axis=-1, norm=None):
"""
- Compute the inverse of the n-point DFT for real input.
+ Computes the inverse of `rfft`.
This function computes the inverse of the one-dimensional *n*-point
discrete Fourier Transform of real input computed by `rfft`.
@@ -1249,7 +1249,7 @@ def rfft2(a, s=None, axes=(-2, -1), norm=None):
@array_function_dispatch(_fftn_dispatcher)
def irfftn(a, s=None, axes=None, norm=None):
"""
- Compute the inverse of the N-dimensional FFT of real input.
+ Computes the inverse of `rfftn`.
This function computes the inverse of the N-dimensional discrete
Fourier Transform for real input over any number of axes in an
@@ -1359,7 +1359,7 @@ def irfftn(a, s=None, axes=None, norm=None):
@array_function_dispatch(_fftn_dispatcher)
def irfft2(a, s=None, axes=(-2, -1), norm=None):
"""
- Compute the 2-dimensional inverse FFT of a real array.
+ Computes the inverse of `rfft2`.
Parameters
----------
@@ -1388,6 +1388,10 @@ def irfft2(a, s=None, axes=(-2, -1), norm=None):
See Also
--------
+ rfft2 : The forward two-dimensional FFT of real input,
+ of which `irfft2` is the inverse.
+ rfft : The one-dimensional FFT for real input.
+ irfft : The inverse of the one-dimensional FFT of real input.
irfftn : Compute the inverse of the N-dimensional FFT of real input.
Notes
diff --git a/numpy/fft/helper.py b/numpy/fft/helper.py
index 3dacd9ee1..927ee1af1 100644
--- a/numpy/fft/helper.py
+++ b/numpy/fft/helper.py
@@ -2,7 +2,6 @@
Discrete Fourier Transforms - helper.py
"""
-from numpy.compat import integer_types
from numpy.core import integer, empty, arange, asarray, roll
from numpy.core.overrides import array_function_dispatch, set_module
@@ -10,7 +9,7 @@ from numpy.core.overrides import array_function_dispatch, set_module
__all__ = ['fftshift', 'ifftshift', 'fftfreq', 'rfftfreq']
-integer_types = integer_types + (integer,)
+integer_types = (int, integer)
def _fftshift_dispatcher(x, axes=None):
diff --git a/numpy/fft/setup.py b/numpy/fft/setup.py
index 9ed824e4f..477948a09 100644
--- a/numpy/fft/setup.py
+++ b/numpy/fft/setup.py
@@ -14,6 +14,7 @@ def configuration(parent_package='',top_path=None):
define_macros=defs,
)
+ config.add_data_files('*.pyi')
return config
if __name__ == '__main__':
diff --git a/numpy/fft/tests/test_helper.py b/numpy/fft/tests/test_helper.py
index 68f5990af..3fb700bb3 100644
--- a/numpy/fft/tests/test_helper.py
+++ b/numpy/fft/tests/test_helper.py
@@ -85,7 +85,6 @@ class TestFFTShift:
def test_equal_to_original(self):
""" Test that the new (>=v1.15) implementation (see #10073) is equal to the original (<=v1.14) """
- from numpy.compat import integer_types
from numpy.core import asarray, concatenate, arange, take
def original_fftshift(x, axes=None):
@@ -94,7 +93,7 @@ class TestFFTShift:
ndim = tmp.ndim
if axes is None:
axes = list(range(ndim))
- elif isinstance(axes, integer_types):
+ elif isinstance(axes, int):
axes = (axes,)
y = tmp
for k in axes:
@@ -110,7 +109,7 @@ class TestFFTShift:
ndim = tmp.ndim
if axes is None:
axes = list(range(ndim))
- elif isinstance(axes, integer_types):
+ elif isinstance(axes, int):
axes = (axes,)
y = tmp
for k in axes:
diff --git a/numpy/lib/__init__.py b/numpy/lib/__init__.py
index cb0de0d15..ad88ba347 100644
--- a/numpy/lib/__init__.py
+++ b/numpy/lib/__init__.py
@@ -35,7 +35,6 @@ from .polynomial import *
from .utils import *
from .arraysetops import *
from .npyio import *
-from .financial import *
from .arrayterator import Arrayterator
from .arraypad import *
from ._version import *
@@ -54,7 +53,6 @@ __all__ += polynomial.__all__
__all__ += utils.__all__
__all__ += arraysetops.__all__
__all__ += npyio.__all__
-__all__ += financial.__all__
__all__ += nanfunctions.__all__
__all__ += histograms.__all__
diff --git a/numpy/lib/__init__.pyi b/numpy/lib/__init__.pyi
new file mode 100644
index 000000000..413e2ae1b
--- /dev/null
+++ b/numpy/lib/__init__.pyi
@@ -0,0 +1,177 @@
+from typing import Any
+
+emath: Any
+math: Any
+tracemalloc_domain: Any
+Arrayterator: Any
+iscomplexobj: Any
+isrealobj: Any
+imag: Any
+iscomplex: Any
+isreal: Any
+nan_to_num: Any
+real: Any
+real_if_close: Any
+typename: Any
+asfarray: Any
+mintypecode: Any
+asscalar: Any
+common_type: Any
+ravel_multi_index: Any
+unravel_index: Any
+mgrid: Any
+ogrid: Any
+r_: Any
+c_: Any
+s_: Any
+index_exp: Any
+ix_: Any
+ndenumerate: Any
+ndindex: Any
+fill_diagonal: Any
+diag_indices: Any
+diag_indices_from: Any
+select: Any
+piecewise: Any
+trim_zeros: Any
+copy: Any
+iterable: Any
+percentile: Any
+diff: Any
+gradient: Any
+angle: Any
+unwrap: Any
+sort_complex: Any
+disp: Any
+flip: Any
+rot90: Any
+extract: Any
+place: Any
+vectorize: Any
+asarray_chkfinite: Any
+average: Any
+bincount: Any
+digitize: Any
+cov: Any
+corrcoef: Any
+msort: Any
+median: Any
+sinc: Any
+hamming: Any
+hanning: Any
+bartlett: Any
+blackman: Any
+kaiser: Any
+trapz: Any
+i0: Any
+add_newdoc: Any
+add_docstring: Any
+meshgrid: Any
+delete: Any
+insert: Any
+append: Any
+interp: Any
+add_newdoc_ufunc: Any
+quantile: Any
+column_stack: Any
+row_stack: Any
+dstack: Any
+array_split: Any
+split: Any
+hsplit: Any
+vsplit: Any
+dsplit: Any
+apply_over_axes: Any
+expand_dims: Any
+apply_along_axis: Any
+kron: Any
+tile: Any
+get_array_wrap: Any
+take_along_axis: Any
+put_along_axis: Any
+broadcast_to: Any
+broadcast_arrays: Any
+diag: Any
+diagflat: Any
+eye: Any
+fliplr: Any
+flipud: Any
+tri: Any
+triu: Any
+tril: Any
+vander: Any
+histogram2d: Any
+mask_indices: Any
+tril_indices: Any
+tril_indices_from: Any
+triu_indices: Any
+triu_indices_from: Any
+fix: Any
+isneginf: Any
+isposinf: Any
+pad: Any
+poly: Any
+roots: Any
+polyint: Any
+polyder: Any
+polyadd: Any
+polysub: Any
+polymul: Any
+polydiv: Any
+polyval: Any
+poly1d: Any
+polyfit: Any
+RankWarning: Any
+issubclass_: Any
+issubsctype: Any
+issubdtype: Any
+deprecate: Any
+deprecate_with_doc: Any
+get_include: Any
+info: Any
+source: Any
+who: Any
+lookfor: Any
+byte_bounds: Any
+safe_eval: Any
+ediff1d: Any
+intersect1d: Any
+setxor1d: Any
+union1d: Any
+setdiff1d: Any
+unique: Any
+in1d: Any
+isin: Any
+savetxt: Any
+loadtxt: Any
+genfromtxt: Any
+ndfromtxt: Any
+mafromtxt: Any
+recfromtxt: Any
+recfromcsv: Any
+load: Any
+loads: Any
+save: Any
+savez: Any
+savez_compressed: Any
+packbits: Any
+unpackbits: Any
+fromregex: Any
+DataSource: Any
+nansum: Any
+nanmax: Any
+nanmin: Any
+nanargmax: Any
+nanargmin: Any
+nanmean: Any
+nanmedian: Any
+nanpercentile: Any
+nanvar: Any
+nanstd: Any
+nanprod: Any
+nancumsum: Any
+nancumprod: Any
+nanquantile: Any
+histogram: Any
+histogramdd: Any
+histogram_bin_edges: Any
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 7560bf4da..a576925d6 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -5,7 +5,7 @@ __docformat__ = "restructuredtext en"
import numpy as np
import numpy.core.numeric as nx
-from numpy.compat import asbytes, asunicode, bytes
+from numpy.compat import asbytes, asunicode
def _decode_line(line, encoding=None):
@@ -18,6 +18,8 @@ def _decode_line(line, encoding=None):
----------
line : str or bytes
Line to be decoded.
+ encoding : str
+ Encoding used to decode `line`.
Returns
-------
@@ -27,9 +29,8 @@ def _decode_line(line, encoding=None):
"""
if type(line) is bytes:
if encoding is None:
- line = line.decode('latin1')
- else:
- line = line.decode(encoding)
+ encoding = "latin1"
+ line = line.decode(encoding)
return line
diff --git a/numpy/lib/arraypad.py b/numpy/lib/arraypad.py
index 7569e7651..8830b8147 100644
--- a/numpy/lib/arraypad.py
+++ b/numpy/lib/arraypad.py
@@ -207,23 +207,20 @@ def _get_linear_ramps(padded, axis, width_pair, end_value_pair):
"""
edge_pair = _get_edges(padded, axis, width_pair)
- left_ramp = np.linspace(
- start=end_value_pair[0],
- stop=edge_pair[0].squeeze(axis), # Dimensions is replaced by linspace
- num=width_pair[0],
- endpoint=False,
- dtype=padded.dtype,
- axis=axis,
- )
-
- right_ramp = np.linspace(
- start=end_value_pair[1],
- stop=edge_pair[1].squeeze(axis), # Dimension is replaced by linspace
- num=width_pair[1],
- endpoint=False,
- dtype=padded.dtype,
- axis=axis,
+ left_ramp, right_ramp = (
+ np.linspace(
+ start=end_value,
+ stop=edge.squeeze(axis), # Dimension is replaced by linspace
+ num=width,
+ endpoint=False,
+ dtype=padded.dtype,
+ axis=axis
+ )
+ for end_value, edge, width in zip(
+ end_value_pair, edge_pair, width_pair
+ )
)
+
# Reverse linear space in appropriate dimension
right_ramp = right_ramp[_slice_at_axis(slice(None, None, -1), axis)]
@@ -783,7 +780,7 @@ def pad(array, pad_width, mode='constant', **kwargs):
try:
unsupported_kwargs = set(kwargs) - set(allowed_kwargs[mode])
except KeyError:
- raise ValueError("mode '{}' is not supported".format(mode))
+ raise ValueError("mode '{}' is not supported".format(mode)) from None
if unsupported_kwargs:
raise ValueError("unsupported keyword arguments for mode '{}': {}"
.format(mode, unsupported_kwargs))
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index 6cca1738b..6c6c1ff80 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -1,28 +1,17 @@
"""
Set operations for arrays based on sorting.
-:Contains:
- unique,
- isin,
- ediff1d,
- intersect1d,
- setxor1d,
- in1d,
- union1d,
- setdiff1d
-
-:Notes:
+Notes
+-----
For floating point arrays, inaccurate results may appear due to usual round-off
and floating point comparison issues.
Speed could be gained in some operations by an implementation of
-sort(), that can provide directly the permutation vectors, avoiding
-thus calls to argsort().
+`numpy.sort`, that can provide directly the permutation vectors, thus avoiding
+calls to `numpy.argsort`.
-To do: Optionally return indices analogously to unique for all functions.
-
-:Author: Robert Cimrman
+Original author: Robert Cimrman
"""
import functools
@@ -104,7 +93,7 @@ def ediff1d(ary, to_end=None, to_begin=None):
else:
to_begin = np.asanyarray(to_begin)
if not np.can_cast(to_begin, dtype_req, casting="same_kind"):
- raise TypeError("dtype of `to_end` must be compatible "
+ raise TypeError("dtype of `to_begin` must be compatible "
"with input `ary` under the `same_kind` rule.")
to_begin = to_begin.ravel()
@@ -278,7 +267,7 @@ def unique(ar, return_index=False, return_inverse=False,
ar = np.moveaxis(ar, axis, 0)
except np.AxisError:
# this removes the "axis1" or "axis2" prefix from the error message
- raise np.AxisError(axis, ar.ndim)
+ raise np.AxisError(axis, ar.ndim) from None
# Must reshape to a contiguous 2D array for this to work...
orig_shape, orig_dtype = ar.shape, ar.dtype
@@ -300,10 +289,10 @@ def unique(ar, return_index=False, return_inverse=False,
# array with shape `(len(ar),)`. Because `dtype` in this case has
# itemsize 0, the total size of the result is still 0 bytes.
consolidated = np.empty(len(ar), dtype=dtype)
- except TypeError:
+ except TypeError as e:
# There's no good way to do this for object arrays, etc...
msg = 'The axis argument to unique is not supported for dtype {dt}'
- raise TypeError(msg.format(dt=ar.dtype))
+ raise TypeError(msg.format(dt=ar.dtype)) from e
def reshape_uniq(uniq):
n = len(uniq)
@@ -369,7 +358,9 @@ def intersect1d(ar1, ar2, assume_unique=False, return_indices=False):
Input arrays. Will be flattened if not already 1D.
assume_unique : bool
If True, the input arrays are both assumed to be unique, which
- can speed up the calculation. Default is False.
+ can speed up the calculation. If True but ``ar1`` or ``ar2`` are not
+ unique, incorrect results and out-of-bounds indices could result.
+ Default is False.
return_indices : bool
If True, the indices which correspond to the intersection of the two
arrays are returned. The first instance of a value is used if there are
diff --git a/numpy/lib/financial.py b/numpy/lib/financial.py
deleted file mode 100644
index 709a79dc0..000000000
--- a/numpy/lib/financial.py
+++ /dev/null
@@ -1,967 +0,0 @@
-"""Some simple financial calculations
-
-patterned after spreadsheet computations.
-
-There is some complexity in each function
-so that the functions behave like ufuncs with
-broadcasting and being able to be called with scalars
-or arrays (or other sequences).
-
-Functions support the :class:`decimal.Decimal` type unless
-otherwise stated.
-"""
-import warnings
-from decimal import Decimal
-import functools
-
-import numpy as np
-from numpy.core import overrides
-
-
-_depmsg = ("numpy.{name} is deprecated and will be removed from NumPy 1.20. "
- "Use numpy_financial.{name} instead "
- "(https://pypi.org/project/numpy-financial/).")
-
-array_function_dispatch = functools.partial(
- overrides.array_function_dispatch, module='numpy')
-
-
-__all__ = ['fv', 'pmt', 'nper', 'ipmt', 'ppmt', 'pv', 'rate',
- 'irr', 'npv', 'mirr']
-
-_when_to_num = {'end':0, 'begin':1,
- 'e':0, 'b':1,
- 0:0, 1:1,
- 'beginning':1,
- 'start':1,
- 'finish':0}
-
-def _convert_when(when):
- #Test to see if when has already been converted to ndarray
- #This will happen if one function calls another, for example ppmt
- if isinstance(when, np.ndarray):
- return when
- try:
- return _when_to_num[when]
- except (KeyError, TypeError):
- return [_when_to_num[x] for x in when]
-
-
-def _fv_dispatcher(rate, nper, pmt, pv, when=None):
- warnings.warn(_depmsg.format(name='fv'),
- DeprecationWarning, stacklevel=3)
- return (rate, nper, pmt, pv)
-
-
-@array_function_dispatch(_fv_dispatcher)
-def fv(rate, nper, pmt, pv, when='end'):
- """
- Compute the future value.
-
- .. deprecated:: 1.18
-
- `fv` is deprecated; for details, see NEP 32 [1]_.
- Use the corresponding function in the numpy-financial library,
- https://pypi.org/project/numpy-financial.
-
- Given:
- * a present value, `pv`
- * an interest `rate` compounded once per period, of which
- there are
- * `nper` total
- * a (fixed) payment, `pmt`, paid either
- * at the beginning (`when` = {'begin', 1}) or the end
- (`when` = {'end', 0}) of each period
-
- Return:
- the value at the end of the `nper` periods
-
- Parameters
- ----------
- rate : scalar or array_like of shape(M, )
- Rate of interest as decimal (not per cent) per period
- nper : scalar or array_like of shape(M, )
- Number of compounding periods
- pmt : scalar or array_like of shape(M, )
- Payment
- pv : scalar or array_like of shape(M, )
- Present value
- when : {{'begin', 1}, {'end', 0}}, {string, int}, optional
- When payments are due ('begin' (1) or 'end' (0)).
- Defaults to {'end', 0}.
-
- Returns
- -------
- out : ndarray
- Future values. If all input is scalar, returns a scalar float. If
- any input is array_like, returns future values for each input element.
- If multiple inputs are array_like, they all must have the same shape.
-
- Notes
- -----
- The future value is computed by solving the equation::
-
- fv +
- pv*(1+rate)**nper +
- pmt*(1 + rate*when)/rate*((1 + rate)**nper - 1) == 0
-
- or, when ``rate == 0``::
-
- fv + pv + pmt * nper == 0
-
- References
- ----------
- .. [1] NumPy Enhancement Proposal (NEP) 32,
- https://numpy.org/neps/nep-0032-remove-financial-functions.html
- .. [2] Wheeler, D. A., E. Rathke, and R. Weir (Eds.) (2009, May).
- Open Document Format for Office Applications (OpenDocument)v1.2,
- Part 2: Recalculated Formula (OpenFormula) Format - Annotated Version,
- Pre-Draft 12. Organization for the Advancement of Structured Information
- Standards (OASIS). Billerica, MA, USA. [ODT Document].
- Available:
- http://www.oasis-open.org/committees/documents.php?wg_abbrev=office-formula
- OpenDocument-formula-20090508.odt
-
-
- Examples
- --------
- What is the future value after 10 years of saving $100 now, with
- an additional monthly savings of $100. Assume the interest rate is
- 5% (annually) compounded monthly?
-
- >>> np.fv(0.05/12, 10*12, -100, -100)
- 15692.928894335748
-
- By convention, the negative sign represents cash flow out (i.e. money not
- available today). Thus, saving $100 a month at 5% annual interest leads
- to $15,692.93 available to spend in 10 years.
-
- If any input is array_like, returns an array of equal shape. Let's
- compare different interest rates from the example above.
-
- >>> a = np.array((0.05, 0.06, 0.07))/12
- >>> np.fv(a, 10*12, -100, -100)
- array([ 15692.92889434, 16569.87435405, 17509.44688102]) # may vary
-
- """
- when = _convert_when(when)
- (rate, nper, pmt, pv, when) = map(np.asarray, [rate, nper, pmt, pv, when])
- temp = (1+rate)**nper
- fact = np.where(rate == 0, nper,
- (1 + rate*when)*(temp - 1)/rate)
- return -(pv*temp + pmt*fact)
-
-
-def _pmt_dispatcher(rate, nper, pv, fv=None, when=None):
- warnings.warn(_depmsg.format(name='pmt'),
- DeprecationWarning, stacklevel=3)
- return (rate, nper, pv, fv)
-
-
-@array_function_dispatch(_pmt_dispatcher)
-def pmt(rate, nper, pv, fv=0, when='end'):
- """
- Compute the payment against loan principal plus interest.
-
- .. deprecated:: 1.18
-
- `pmt` is deprecated; for details, see NEP 32 [1]_.
- Use the corresponding function in the numpy-financial library,
- https://pypi.org/project/numpy-financial.
-
- Given:
- * a present value, `pv` (e.g., an amount borrowed)
- * a future value, `fv` (e.g., 0)
- * an interest `rate` compounded once per period, of which
- there are
- * `nper` total
- * and (optional) specification of whether payment is made
- at the beginning (`when` = {'begin', 1}) or the end
- (`when` = {'end', 0}) of each period
-
- Return:
- the (fixed) periodic payment.
-
- Parameters
- ----------
- rate : array_like
- Rate of interest (per period)
- nper : array_like
- Number of compounding periods
- pv : array_like
- Present value
- fv : array_like, optional
- Future value (default = 0)
- when : {{'begin', 1}, {'end', 0}}, {string, int}
- When payments are due ('begin' (1) or 'end' (0))
-
- Returns
- -------
- out : ndarray
- Payment against loan plus interest. If all input is scalar, returns a
- scalar float. If any input is array_like, returns payment for each
- input element. If multiple inputs are array_like, they all must have
- the same shape.
-
- Notes
- -----
- The payment is computed by solving the equation::
-
- fv +
- pv*(1 + rate)**nper +
- pmt*(1 + rate*when)/rate*((1 + rate)**nper - 1) == 0
-
- or, when ``rate == 0``::
-
- fv + pv + pmt * nper == 0
-
- for ``pmt``.
-
- Note that computing a monthly mortgage payment is only
- one use for this function. For example, pmt returns the
- periodic deposit one must make to achieve a specified
- future balance given an initial deposit, a fixed,
- periodically compounded interest rate, and the total
- number of periods.
-
- References
- ----------
- .. [1] NumPy Enhancement Proposal (NEP) 32,
- https://numpy.org/neps/nep-0032-remove-financial-functions.html
- .. [2] Wheeler, D. A., E. Rathke, and R. Weir (Eds.) (2009, May).
- Open Document Format for Office Applications (OpenDocument)v1.2,
- Part 2: Recalculated Formula (OpenFormula) Format - Annotated Version,
- Pre-Draft 12. Organization for the Advancement of Structured Information
- Standards (OASIS). Billerica, MA, USA. [ODT Document].
- Available:
- http://www.oasis-open.org/committees/documents.php
- ?wg_abbrev=office-formulaOpenDocument-formula-20090508.odt
-
- Examples
- --------
- What is the monthly payment needed to pay off a $200,000 loan in 15
- years at an annual interest rate of 7.5%?
-
- >>> np.pmt(0.075/12, 12*15, 200000)
- -1854.0247200054619
-
- In order to pay-off (i.e., have a future-value of 0) the $200,000 obtained
- today, a monthly payment of $1,854.02 would be required. Note that this
- example illustrates usage of `fv` having a default value of 0.
-
- """
- when = _convert_when(when)
- (rate, nper, pv, fv, when) = map(np.array, [rate, nper, pv, fv, when])
- temp = (1 + rate)**nper
- mask = (rate == 0)
- masked_rate = np.where(mask, 1, rate)
- fact = np.where(mask != 0, nper,
- (1 + masked_rate*when)*(temp - 1)/masked_rate)
- return -(fv + pv*temp) / fact
-
-
-def _nper_dispatcher(rate, pmt, pv, fv=None, when=None):
- warnings.warn(_depmsg.format(name='nper'),
- DeprecationWarning, stacklevel=3)
- return (rate, pmt, pv, fv)
-
-
-@array_function_dispatch(_nper_dispatcher)
-def nper(rate, pmt, pv, fv=0, when='end'):
- """
- Compute the number of periodic payments.
-
- .. deprecated:: 1.18
-
- `nper` is deprecated; for details, see NEP 32 [1]_.
- Use the corresponding function in the numpy-financial library,
- https://pypi.org/project/numpy-financial.
-
- :class:`decimal.Decimal` type is not supported.
-
- Parameters
- ----------
- rate : array_like
- Rate of interest (per period)
- pmt : array_like
- Payment
- pv : array_like
- Present value
- fv : array_like, optional
- Future value
- when : {{'begin', 1}, {'end', 0}}, {string, int}, optional
- When payments are due ('begin' (1) or 'end' (0))
-
- Notes
- -----
- The number of periods ``nper`` is computed by solving the equation::
-
- fv + pv*(1+rate)**nper + pmt*(1+rate*when)/rate*((1+rate)**nper-1) = 0
-
- but if ``rate = 0`` then::
-
- fv + pv + pmt*nper = 0
-
- References
- ----------
- .. [1] NumPy Enhancement Proposal (NEP) 32,
- https://numpy.org/neps/nep-0032-remove-financial-functions.html
-
- Examples
- --------
- If you only had $150/month to pay towards the loan, how long would it take
- to pay-off a loan of $8,000 at 7% annual interest?
-
- >>> print(np.round(np.nper(0.07/12, -150, 8000), 5))
- 64.07335
-
- So, over 64 months would be required to pay off the loan.
-
- The same analysis could be done with several different interest rates
- and/or payments and/or total amounts to produce an entire table.
-
- >>> np.nper(*(np.ogrid[0.07/12: 0.08/12: 0.01/12,
- ... -150 : -99 : 50 ,
- ... 8000 : 9001 : 1000]))
- array([[[ 64.07334877, 74.06368256],
- [108.07548412, 127.99022654]],
- [[ 66.12443902, 76.87897353],
- [114.70165583, 137.90124779]]])
-
- """
- when = _convert_when(when)
- (rate, pmt, pv, fv, when) = map(np.asarray, [rate, pmt, pv, fv, when])
-
- use_zero_rate = False
- with np.errstate(divide="raise"):
- try:
- z = pmt*(1+rate*when)/rate
- except FloatingPointError:
- use_zero_rate = True
-
- if use_zero_rate:
- return (-fv + pv) / pmt
- else:
- A = -(fv + pv)/(pmt+0)
- B = np.log((-fv+z) / (pv+z))/np.log(1+rate)
- return np.where(rate == 0, A, B)
-
-
-def _ipmt_dispatcher(rate, per, nper, pv, fv=None, when=None):
- warnings.warn(_depmsg.format(name='ipmt'),
- DeprecationWarning, stacklevel=3)
- return (rate, per, nper, pv, fv)
-
-
-@array_function_dispatch(_ipmt_dispatcher)
-def ipmt(rate, per, nper, pv, fv=0, when='end'):
- """
- Compute the interest portion of a payment.
-
- .. deprecated:: 1.18
-
- `ipmt` is deprecated; for details, see NEP 32 [1]_.
- Use the corresponding function in the numpy-financial library,
- https://pypi.org/project/numpy-financial.
-
- Parameters
- ----------
- rate : scalar or array_like of shape(M, )
- Rate of interest as decimal (not per cent) per period
- per : scalar or array_like of shape(M, )
- Interest paid against the loan changes during the life or the loan.
- The `per` is the payment period to calculate the interest amount.
- nper : scalar or array_like of shape(M, )
- Number of compounding periods
- pv : scalar or array_like of shape(M, )
- Present value
- fv : scalar or array_like of shape(M, ), optional
- Future value
- when : {{'begin', 1}, {'end', 0}}, {string, int}, optional
- When payments are due ('begin' (1) or 'end' (0)).
- Defaults to {'end', 0}.
-
- Returns
- -------
- out : ndarray
- Interest portion of payment. If all input is scalar, returns a scalar
- float. If any input is array_like, returns interest payment for each
- input element. If multiple inputs are array_like, they all must have
- the same shape.
-
- See Also
- --------
- ppmt, pmt, pv
-
- Notes
- -----
- The total payment is made up of payment against principal plus interest.
-
- ``pmt = ppmt + ipmt``
-
- References
- ----------
- .. [1] NumPy Enhancement Proposal (NEP) 32,
- https://numpy.org/neps/nep-0032-remove-financial-functions.html
-
- Examples
- --------
- What is the amortization schedule for a 1 year loan of $2500 at
- 8.24% interest per year compounded monthly?
-
- >>> principal = 2500.00
-
- The 'per' variable represents the periods of the loan. Remember that
- financial equations start the period count at 1!
-
- >>> per = np.arange(1*12) + 1
- >>> ipmt = np.ipmt(0.0824/12, per, 1*12, principal)
- >>> ppmt = np.ppmt(0.0824/12, per, 1*12, principal)
-
- Each element of the sum of the 'ipmt' and 'ppmt' arrays should equal
- 'pmt'.
-
- >>> pmt = np.pmt(0.0824/12, 1*12, principal)
- >>> np.allclose(ipmt + ppmt, pmt)
- True
-
- >>> fmt = '{0:2d} {1:8.2f} {2:8.2f} {3:8.2f}'
- >>> for payment in per:
- ... index = payment - 1
- ... principal = principal + ppmt[index]
- ... print(fmt.format(payment, ppmt[index], ipmt[index], principal))
- 1 -200.58 -17.17 2299.42
- 2 -201.96 -15.79 2097.46
- 3 -203.35 -14.40 1894.11
- 4 -204.74 -13.01 1689.37
- 5 -206.15 -11.60 1483.22
- 6 -207.56 -10.18 1275.66
- 7 -208.99 -8.76 1066.67
- 8 -210.42 -7.32 856.25
- 9 -211.87 -5.88 644.38
- 10 -213.32 -4.42 431.05
- 11 -214.79 -2.96 216.26
- 12 -216.26 -1.49 -0.00
-
- >>> interestpd = np.sum(ipmt)
- >>> np.round(interestpd, 2)
- -112.98
-
- """
- when = _convert_when(when)
- rate, per, nper, pv, fv, when = np.broadcast_arrays(rate, per, nper,
- pv, fv, when)
- total_pmt = pmt(rate, nper, pv, fv, when)
- ipmt = _rbl(rate, per, total_pmt, pv, when)*rate
- try:
- ipmt = np.where(when == 1, ipmt/(1 + rate), ipmt)
- ipmt = np.where(np.logical_and(when == 1, per == 1), 0, ipmt)
- except IndexError:
- pass
- return ipmt
-
-
-def _rbl(rate, per, pmt, pv, when):
- """
- This function is here to simply have a different name for the 'fv'
- function to not interfere with the 'fv' keyword argument within the 'ipmt'
- function. It is the 'remaining balance on loan' which might be useful as
- its own function, but is easily calculated with the 'fv' function.
- """
- return fv(rate, (per - 1), pmt, pv, when)
-
-
-def _ppmt_dispatcher(rate, per, nper, pv, fv=None, when=None):
- warnings.warn(_depmsg.format(name='ppmt'),
- DeprecationWarning, stacklevel=3)
- return (rate, per, nper, pv, fv)
-
-
-@array_function_dispatch(_ppmt_dispatcher)
-def ppmt(rate, per, nper, pv, fv=0, when='end'):
- """
- Compute the payment against loan principal.
-
- .. deprecated:: 1.18
-
- `ppmt` is deprecated; for details, see NEP 32 [1]_.
- Use the corresponding function in the numpy-financial library,
- https://pypi.org/project/numpy-financial.
-
- Parameters
- ----------
- rate : array_like
- Rate of interest (per period)
- per : array_like, int
- Amount paid against the loan changes. The `per` is the period of
- interest.
- nper : array_like
- Number of compounding periods
- pv : array_like
- Present value
- fv : array_like, optional
- Future value
- when : {{'begin', 1}, {'end', 0}}, {string, int}
- When payments are due ('begin' (1) or 'end' (0))
-
- See Also
- --------
- pmt, pv, ipmt
-
- References
- ----------
- .. [1] NumPy Enhancement Proposal (NEP) 32,
- https://numpy.org/neps/nep-0032-remove-financial-functions.html
-
- """
- total = pmt(rate, nper, pv, fv, when)
- return total - ipmt(rate, per, nper, pv, fv, when)
-
-
-def _pv_dispatcher(rate, nper, pmt, fv=None, when=None):
- warnings.warn(_depmsg.format(name='pv'),
- DeprecationWarning, stacklevel=3)
- return (rate, nper, nper, pv, fv)
-
-
-@array_function_dispatch(_pv_dispatcher)
-def pv(rate, nper, pmt, fv=0, when='end'):
- """
- Compute the present value.
-
- .. deprecated:: 1.18
-
- `pv` is deprecated; for details, see NEP 32 [1]_.
- Use the corresponding function in the numpy-financial library,
- https://pypi.org/project/numpy-financial.
-
- Given:
- * a future value, `fv`
- * an interest `rate` compounded once per period, of which
- there are
- * `nper` total
- * a (fixed) payment, `pmt`, paid either
- * at the beginning (`when` = {'begin', 1}) or the end
- (`when` = {'end', 0}) of each period
-
- Return:
- the value now
-
- Parameters
- ----------
- rate : array_like
- Rate of interest (per period)
- nper : array_like
- Number of compounding periods
- pmt : array_like
- Payment
- fv : array_like, optional
- Future value
- when : {{'begin', 1}, {'end', 0}}, {string, int}, optional
- When payments are due ('begin' (1) or 'end' (0))
-
- Returns
- -------
- out : ndarray, float
- Present value of a series of payments or investments.
-
- Notes
- -----
- The present value is computed by solving the equation::
-
- fv +
- pv*(1 + rate)**nper +
- pmt*(1 + rate*when)/rate*((1 + rate)**nper - 1) = 0
-
- or, when ``rate = 0``::
-
- fv + pv + pmt * nper = 0
-
- for `pv`, which is then returned.
-
- References
- ----------
- .. [1] NumPy Enhancement Proposal (NEP) 32,
- https://numpy.org/neps/nep-0032-remove-financial-functions.html
- .. [2] Wheeler, D. A., E. Rathke, and R. Weir (Eds.) (2009, May).
- Open Document Format for Office Applications (OpenDocument)v1.2,
- Part 2: Recalculated Formula (OpenFormula) Format - Annotated Version,
- Pre-Draft 12. Organization for the Advancement of Structured Information
- Standards (OASIS). Billerica, MA, USA. [ODT Document].
- Available:
- http://www.oasis-open.org/committees/documents.php?wg_abbrev=office-formula
- OpenDocument-formula-20090508.odt
-
- Examples
- --------
- What is the present value (e.g., the initial investment)
- of an investment that needs to total $15692.93
- after 10 years of saving $100 every month? Assume the
- interest rate is 5% (annually) compounded monthly.
-
- >>> np.pv(0.05/12, 10*12, -100, 15692.93)
- -100.00067131625819
-
- By convention, the negative sign represents cash flow out
- (i.e., money not available today). Thus, to end up with
- $15,692.93 in 10 years saving $100 a month at 5% annual
- interest, one's initial deposit should also be $100.
-
- If any input is array_like, ``pv`` returns an array of equal shape.
- Let's compare different interest rates in the example above:
-
- >>> a = np.array((0.05, 0.04, 0.03))/12
- >>> np.pv(a, 10*12, -100, 15692.93)
- array([ -100.00067132, -649.26771385, -1273.78633713]) # may vary
-
- So, to end up with the same $15692.93 under the same $100 per month
- "savings plan," for annual interest rates of 4% and 3%, one would
- need initial investments of $649.27 and $1273.79, respectively.
-
- """
- when = _convert_when(when)
- (rate, nper, pmt, fv, when) = map(np.asarray, [rate, nper, pmt, fv, when])
- temp = (1+rate)**nper
- fact = np.where(rate == 0, nper, (1+rate*when)*(temp-1)/rate)
- return -(fv + pmt*fact)/temp
-
-# Computed with Sage
-# (y + (r + 1)^n*x + p*((r + 1)^n - 1)*(r*w + 1)/r)/(n*(r + 1)^(n - 1)*x -
-# p*((r + 1)^n - 1)*(r*w + 1)/r^2 + n*p*(r + 1)^(n - 1)*(r*w + 1)/r +
-# p*((r + 1)^n - 1)*w/r)
-
-def _g_div_gp(r, n, p, x, y, w):
- t1 = (r+1)**n
- t2 = (r+1)**(n-1)
- return ((y + t1*x + p*(t1 - 1)*(r*w + 1)/r) /
- (n*t2*x - p*(t1 - 1)*(r*w + 1)/(r**2) + n*p*t2*(r*w + 1)/r +
- p*(t1 - 1)*w/r))
-
-
-def _rate_dispatcher(nper, pmt, pv, fv, when=None, guess=None, tol=None,
- maxiter=None):
- warnings.warn(_depmsg.format(name='rate'),
- DeprecationWarning, stacklevel=3)
- return (nper, pmt, pv, fv)
-
-
-# Use Newton's iteration until the change is less than 1e-6
-# for all values or a maximum of 100 iterations is reached.
-# Newton's rule is
-# r_{n+1} = r_{n} - g(r_n)/g'(r_n)
-# where
-# g(r) is the formula
-# g'(r) is the derivative with respect to r.
-@array_function_dispatch(_rate_dispatcher)
-def rate(nper, pmt, pv, fv, when='end', guess=None, tol=None, maxiter=100):
- """
- Compute the rate of interest per period.
-
- .. deprecated:: 1.18
-
- `rate` is deprecated; for details, see NEP 32 [1]_.
- Use the corresponding function in the numpy-financial library,
- https://pypi.org/project/numpy-financial.
-
- Parameters
- ----------
- nper : array_like
- Number of compounding periods
- pmt : array_like
- Payment
- pv : array_like
- Present value
- fv : array_like
- Future value
- when : {{'begin', 1}, {'end', 0}}, {string, int}, optional
- When payments are due ('begin' (1) or 'end' (0))
- guess : Number, optional
- Starting guess for solving the rate of interest, default 0.1
- tol : Number, optional
- Required tolerance for the solution, default 1e-6
- maxiter : int, optional
- Maximum iterations in finding the solution
-
- Notes
- -----
- The rate of interest is computed by iteratively solving the
- (non-linear) equation::
-
- fv + pv*(1+rate)**nper + pmt*(1+rate*when)/rate * ((1+rate)**nper - 1) = 0
-
- for ``rate``.
-
- References
- ----------
- .. [1] NumPy Enhancement Proposal (NEP) 32,
- https://numpy.org/neps/nep-0032-remove-financial-functions.html
- .. [2] Wheeler, D. A., E. Rathke, and R. Weir (Eds.) (2009, May).
- Open Document Format for Office Applications (OpenDocument)v1.2,
- Part 2: Recalculated Formula (OpenFormula) Format - Annotated Version,
- Pre-Draft 12. Organization for the Advancement of Structured Information
- Standards (OASIS). Billerica, MA, USA. [ODT Document].
- Available:
- http://www.oasis-open.org/committees/documents.php?wg_abbrev=office-formula
- OpenDocument-formula-20090508.odt
-
- """
- when = _convert_when(when)
- default_type = Decimal if isinstance(pmt, Decimal) else float
-
- # Handle casting defaults to Decimal if/when pmt is a Decimal and
- # guess and/or tol are not given default values
- if guess is None:
- guess = default_type('0.1')
-
- if tol is None:
- tol = default_type('1e-6')
-
- (nper, pmt, pv, fv, when) = map(np.asarray, [nper, pmt, pv, fv, when])
-
- rn = guess
- iterator = 0
- close = False
- while (iterator < maxiter) and not close:
- rnp1 = rn - _g_div_gp(rn, nper, pmt, pv, fv, when)
- diff = abs(rnp1-rn)
- close = np.all(diff < tol)
- iterator += 1
- rn = rnp1
- if not close:
- # Return nan's in array of the same shape as rn
- return np.nan + rn
- else:
- return rn
-
-
-def _irr_dispatcher(values):
- warnings.warn(_depmsg.format(name='irr'),
- DeprecationWarning, stacklevel=3)
- return (values,)
-
-
-@array_function_dispatch(_irr_dispatcher)
-def irr(values):
- """
- Return the Internal Rate of Return (IRR).
-
- .. deprecated:: 1.18
-
- `irr` is deprecated; for details, see NEP 32 [1]_.
- Use the corresponding function in the numpy-financial library,
- https://pypi.org/project/numpy-financial.
-
- This is the "average" periodically compounded rate of return
- that gives a net present value of 0.0; for a more complete explanation,
- see Notes below.
-
- :class:`decimal.Decimal` type is not supported.
-
- Parameters
- ----------
- values : array_like, shape(N,)
- Input cash flows per time period. By convention, net "deposits"
- are negative and net "withdrawals" are positive. Thus, for
- example, at least the first element of `values`, which represents
- the initial investment, will typically be negative.
-
- Returns
- -------
- out : float
- Internal Rate of Return for periodic input values.
-
- Notes
- -----
- The IRR is perhaps best understood through an example (illustrated
- using np.irr in the Examples section below). Suppose one invests 100
- units and then makes the following withdrawals at regular (fixed)
- intervals: 39, 59, 55, 20. Assuming the ending value is 0, one's 100
- unit investment yields 173 units; however, due to the combination of
- compounding and the periodic withdrawals, the "average" rate of return
- is neither simply 0.73/4 nor (1.73)^0.25-1. Rather, it is the solution
- (for :math:`r`) of the equation:
-
- .. math:: -100 + \\frac{39}{1+r} + \\frac{59}{(1+r)^2}
- + \\frac{55}{(1+r)^3} + \\frac{20}{(1+r)^4} = 0
-
- In general, for `values` :math:`= [v_0, v_1, ... v_M]`,
- irr is the solution of the equation: [2]_
-
- .. math:: \\sum_{t=0}^M{\\frac{v_t}{(1+irr)^{t}}} = 0
-
- References
- ----------
- .. [1] NumPy Enhancement Proposal (NEP) 32,
- https://numpy.org/neps/nep-0032-remove-financial-functions.html
- .. [2] L. J. Gitman, "Principles of Managerial Finance, Brief," 3rd ed.,
- Addison-Wesley, 2003, pg. 348.
-
- Examples
- --------
- >>> round(np.irr([-100, 39, 59, 55, 20]), 5)
- 0.28095
- >>> round(np.irr([-100, 0, 0, 74]), 5)
- -0.0955
- >>> round(np.irr([-100, 100, 0, -7]), 5)
- -0.0833
- >>> round(np.irr([-100, 100, 0, 7]), 5)
- 0.06206
- >>> round(np.irr([-5, 10.5, 1, -8, 1]), 5)
- 0.0886
-
- """
- # `np.roots` call is why this function does not support Decimal type.
- #
- # Ultimately Decimal support needs to be added to np.roots, which has
- # greater implications on the entire linear algebra module and how it does
- # eigenvalue computations.
- res = np.roots(values[::-1])
- mask = (res.imag == 0) & (res.real > 0)
- if not mask.any():
- return np.nan
- res = res[mask].real
- # NPV(rate) = 0 can have more than one solution so we return
- # only the solution closest to zero.
- rate = 1/res - 1
- rate = rate.item(np.argmin(np.abs(rate)))
- return rate
-
-
-def _npv_dispatcher(rate, values):
- warnings.warn(_depmsg.format(name='npv'),
- DeprecationWarning, stacklevel=3)
- return (values,)
-
-
-@array_function_dispatch(_npv_dispatcher)
-def npv(rate, values):
- """
- Returns the NPV (Net Present Value) of a cash flow series.
-
- .. deprecated:: 1.18
-
- `npv` is deprecated; for details, see NEP 32 [1]_.
- Use the corresponding function in the numpy-financial library,
- https://pypi.org/project/numpy-financial.
-
- Parameters
- ----------
- rate : scalar
- The discount rate.
- values : array_like, shape(M, )
- The values of the time series of cash flows. The (fixed) time
- interval between cash flow "events" must be the same as that for
- which `rate` is given (i.e., if `rate` is per year, then precisely
- a year is understood to elapse between each cash flow event). By
- convention, investments or "deposits" are negative, income or
- "withdrawals" are positive; `values` must begin with the initial
- investment, thus `values[0]` will typically be negative.
-
- Returns
- -------
- out : float
- The NPV of the input cash flow series `values` at the discount
- `rate`.
-
- Warnings
- --------
- ``npv`` considers a series of cashflows starting in the present (t = 0).
- NPV can also be defined with a series of future cashflows, paid at the
- end, rather than the start, of each period. If future cashflows are used,
- the first cashflow `values[0]` must be zeroed and added to the net
- present value of the future cashflows. This is demonstrated in the
- examples.
-
- Notes
- -----
- Returns the result of: [2]_
-
- .. math :: \\sum_{t=0}^{M-1}{\\frac{values_t}{(1+rate)^{t}}}
-
- References
- ----------
- .. [1] NumPy Enhancement Proposal (NEP) 32,
- https://numpy.org/neps/nep-0032-remove-financial-functions.html
- .. [2] L. J. Gitman, "Principles of Managerial Finance, Brief," 3rd ed.,
- Addison-Wesley, 2003, pg. 346.
-
- Examples
- --------
- Consider a potential project with an initial investment of $40 000 and
- projected cashflows of $5 000, $8 000, $12 000 and $30 000 at the end of
- each period discounted at a rate of 8% per period. To find the project's
- net present value:
-
- >>> rate, cashflows = 0.08, [-40_000, 5_000, 8_000, 12_000, 30_000]
- >>> np.npv(rate, cashflows).round(5)
- 3065.22267
-
- It may be preferable to split the projected cashflow into an initial
- investment and expected future cashflows. In this case, the value of
- the initial cashflow is zero and the initial investment is later added
- to the future cashflows net present value:
-
- >>> initial_cashflow = cashflows[0]
- >>> cashflows[0] = 0
- >>> np.round(np.npv(rate, cashflows) + initial_cashflow, 5)
- 3065.22267
-
- """
- values = np.asarray(values)
- return (values / (1+rate)**np.arange(0, len(values))).sum(axis=0)
-
-
-def _mirr_dispatcher(values, finance_rate, reinvest_rate):
- warnings.warn(_depmsg.format(name='mirr'),
- DeprecationWarning, stacklevel=3)
- return (values,)
-
-
-@array_function_dispatch(_mirr_dispatcher)
-def mirr(values, finance_rate, reinvest_rate):
- """
- Modified internal rate of return.
-
- .. deprecated:: 1.18
-
- `mirr` is deprecated; for details, see NEP 32 [1]_.
- Use the corresponding function in the numpy-financial library,
- https://pypi.org/project/numpy-financial.
-
- Parameters
- ----------
- values : array_like
- Cash flows (must contain at least one positive and one negative
- value) or nan is returned. The first value is considered a sunk
- cost at time zero.
- finance_rate : scalar
- Interest rate paid on the cash flows
- reinvest_rate : scalar
- Interest rate received on the cash flows upon reinvestment
-
- Returns
- -------
- out : float
- Modified internal rate of return
-
- References
- ----------
- .. [1] NumPy Enhancement Proposal (NEP) 32,
- https://numpy.org/neps/nep-0032-remove-financial-functions.html
- """
- values = np.asarray(values)
- n = values.size
-
- # Without this explicit cast the 1/(n - 1) computation below
- # becomes a float, which causes TypeError when using Decimal
- # values.
- if isinstance(finance_rate, Decimal):
- n = Decimal(n)
-
- pos = values > 0
- neg = values < 0
- if not (pos.any() and neg.any()):
- return np.nan
- numer = np.abs(npv(reinvest_rate, values*pos))
- denom = np.abs(npv(finance_rate, values*neg))
- return (numer/denom)**(1/(n - 1))*(1 + reinvest_rate) - 1
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 4e6e731c1..5d951e262 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -41,7 +41,7 @@ Capabilities
- Is straightforward to reverse engineer. Datasets often live longer than
the programs that created them. A competent developer should be
able to create a solution in their preferred programming language to
- read most ``.npy`` files that he has been given without much
+ read most ``.npy`` files that they have been given without much
documentation.
- Allows memory-mapping of the data. See `open_memmep`.
@@ -746,7 +746,7 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
# Friendlier error message
raise UnicodeError("Unpickling a python object failed: %r\n"
"You may need to pass the encoding= option "
- "to numpy.load" % (err,))
+ "to numpy.load" % (err,)) from err
else:
if isfileobj(fp):
# We can use the fast fromfile() function.
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 42ea8c7a7..696fe617b 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -431,10 +431,13 @@ def asarray_chkfinite(a, dtype=None, order=None):
of lists and ndarrays. Success requires no NaNs or Infs.
dtype : data-type, optional
By default, the data-type is inferred from the input data.
- order : {'C', 'F'}, optional
- Whether to use row-major (C-style) or
- column-major (Fortran-style) memory representation.
- Defaults to 'C'.
+ order : {'C', 'F', 'A', 'K'}, optional
+ Memory layout. 'A' and 'K' depend on the order of input array a.
+ 'C' row-major (C-style),
+ 'F' column-major (Fortran-style) memory representation.
+ 'A' (any) means 'F' if `a` is Fortran contiguous, 'C' otherwise
+ 'K' (keep) preserve input order
+ Defaults to 'C'.
Returns
-------
@@ -1287,7 +1290,7 @@ def _interp_dispatcher(x, xp, fp, left=None, right=None, period=None):
@array_function_dispatch(_interp_dispatcher)
def interp(x, xp, fp, left=None, right=None, period=None):
"""
- One-dimensional linear interpolation.
+ One-dimensional linear interpolation for monotonically increasing sample points.
Returns the one-dimensional piecewise linear interpolant to a function
with given discrete data points (`xp`, `fp`), evaluated at `x`.
@@ -1334,8 +1337,8 @@ def interp(x, xp, fp, left=None, right=None, period=None):
--------
scipy.interpolate
- Notes
- -----
+ Warnings
+ --------
The x-coordinate sequence is expected to be increasing, but this is not
explicitly enforced. However, if the sequence `xp` is non-increasing,
interpolation results are meaningless.
@@ -1447,7 +1450,7 @@ def angle(z, deg=False):
The counterclockwise angle from the positive real axis on the complex
plane in the range ``(-pi, pi]``, with dtype as numpy.float64.
- ..versionchanged:: 1.16.0
+ .. versionchanged:: 1.16.0
This function works on subclasses of ndarray like `ma.array`.
See Also
@@ -1622,6 +1625,7 @@ def trim_zeros(filt, trim='fb'):
[1, 2]
"""
+
first = 0
trim = trim.upper()
if 'F' in trim:
@@ -1931,8 +1935,8 @@ class vectorize:
.. versionadded:: 1.7.0
cache : bool, optional
- If `True`, then cache the first function call that determines the number
- of outputs if `otypes` is not provided.
+ If `True`, then cache the first function call that determines the number
+ of outputs if `otypes` is not provided.
.. versionadded:: 1.7.0
@@ -2264,13 +2268,13 @@ class vectorize:
def _cov_dispatcher(m, y=None, rowvar=None, bias=None, ddof=None,
- fweights=None, aweights=None):
+ fweights=None, aweights=None, *, dtype=None):
return (m, y, fweights, aweights)
@array_function_dispatch(_cov_dispatcher)
def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None,
- aweights=None):
+ aweights=None, *, dtype=None):
"""
Estimate a covariance matrix, given data and weights.
@@ -2321,6 +2325,11 @@ def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None,
weights can be used to assign probabilities to observation vectors.
.. versionadded:: 1.10
+ dtype : data-type, optional
+ Data-type of the result. By default, the return data-type will have
+ at least `numpy.float64` precision.
+
+ .. versionadded:: 1.20
Returns
-------
@@ -2396,13 +2405,16 @@ def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None,
if m.ndim > 2:
raise ValueError("m has more than 2 dimensions")
- if y is None:
- dtype = np.result_type(m, np.float64)
- else:
+ if y is not None:
y = np.asarray(y)
if y.ndim > 2:
raise ValueError("y has more than 2 dimensions")
- dtype = np.result_type(m, y, np.float64)
+
+ if dtype is None:
+ if y is None:
+ dtype = np.result_type(m, np.float64)
+ else:
+ dtype = np.result_type(m, y, np.float64)
X = array(m, ndmin=2, dtype=dtype)
if not rowvar and X.shape[0] != 1:
@@ -2482,12 +2494,14 @@ def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None,
return c.squeeze()
-def _corrcoef_dispatcher(x, y=None, rowvar=None, bias=None, ddof=None):
+def _corrcoef_dispatcher(x, y=None, rowvar=None, bias=None, ddof=None, *,
+ dtype=None):
return (x, y)
@array_function_dispatch(_corrcoef_dispatcher)
-def corrcoef(x, y=None, rowvar=True, bias=np._NoValue, ddof=np._NoValue):
+def corrcoef(x, y=None, rowvar=True, bias=np._NoValue, ddof=np._NoValue, *,
+ dtype=None):
"""
Return Pearson product-moment correlation coefficients.
@@ -2521,6 +2535,11 @@ def corrcoef(x, y=None, rowvar=True, bias=np._NoValue, ddof=np._NoValue):
Has no effect, do not use.
.. deprecated:: 1.10.0
+ dtype : data-type, optional
+ Data-type of the result. By default, the return data-type will have
+ at least `numpy.float64` precision.
+
+ .. versionadded:: 1.20
Returns
-------
@@ -2543,11 +2562,11 @@ def corrcoef(x, y=None, rowvar=True, bias=np._NoValue, ddof=np._NoValue):
for backwards compatibility with previous versions of this function. These
arguments had no effect on the return values of the function and can be
safely ignored in this and previous versions of numpy.
-
+
Examples
- --------
+ --------
In this example we generate two random arrays, ``xarr`` and ``yarr``, and
- compute the row-wise and column-wise Pearson correlation coefficients,
+ compute the row-wise and column-wise Pearson correlation coefficients,
``R``. Since ``rowvar`` is true by default, we first find the row-wise
Pearson correlation coefficients between the variables of ``xarr``.
@@ -2563,11 +2582,11 @@ def corrcoef(x, y=None, rowvar=True, bias=np._NoValue, ddof=np._NoValue):
array([[ 1. , 0.99256089, -0.68080986],
[ 0.99256089, 1. , -0.76492172],
[-0.68080986, -0.76492172, 1. ]])
-
- If we add another set of variables and observations ``yarr``, we can
+
+ If we add another set of variables and observations ``yarr``, we can
compute the row-wise Pearson correlation coefficients between the
variables in ``xarr`` and ``yarr``.
-
+
>>> yarr = rng.random((3, 3))
>>> yarr
array([[0.45038594, 0.37079802, 0.92676499],
@@ -2589,7 +2608,7 @@ def corrcoef(x, y=None, rowvar=True, bias=np._NoValue, ddof=np._NoValue):
1. ]])
Finally if we use the option ``rowvar=False``, the columns are now
- being treated as the variables and we will find the column-wise Pearson
+ being treated as the variables and we will find the column-wise Pearson
correlation coefficients between variables in ``xarr`` and ``yarr``.
>>> R3 = np.corrcoef(xarr, yarr, rowvar=False)
@@ -2612,7 +2631,7 @@ def corrcoef(x, y=None, rowvar=True, bias=np._NoValue, ddof=np._NoValue):
# 2015-03-15, 1.10
warnings.warn('bias and ddof have no effect and are deprecated',
DeprecationWarning, stacklevel=3)
- c = cov(x, y, rowvar)
+ c = cov(x, y, rowvar, dtype=dtype)
try:
d = diag(c)
except ValueError:
@@ -2729,8 +2748,8 @@ def blackman(M):
return array([])
if M == 1:
return ones(1, float)
- n = arange(0, M)
- return 0.42 - 0.5*cos(2.0*pi*n/(M-1)) + 0.08*cos(4.0*pi*n/(M-1))
+ n = arange(1-M, M, 2)
+ return 0.42 + 0.5*cos(pi*n/(M-1)) + 0.08*cos(2.0*pi*n/(M-1))
@set_module('numpy')
@@ -2838,8 +2857,8 @@ def bartlett(M):
return array([])
if M == 1:
return ones(1, float)
- n = arange(0, M)
- return where(less_equal(n, (M-1)/2.0), 2.0*n/(M-1), 2.0 - 2.0*n/(M-1))
+ n = arange(1-M, M, 2)
+ return where(less_equal(n, 0), 1 + n/(M-1), 1 - n/(M-1))
@set_module('numpy')
@@ -2942,8 +2961,8 @@ def hanning(M):
return array([])
if M == 1:
return ones(1, float)
- n = arange(0, M)
- return 0.5 - 0.5*cos(2.0*pi*n/(M-1))
+ n = arange(1-M, M, 2)
+ return 0.5 + 0.5*cos(pi*n/(M-1))
@set_module('numpy')
@@ -3042,8 +3061,8 @@ def hamming(M):
return array([])
if M == 1:
return ones(1, float)
- n = arange(0, M)
- return 0.54 - 0.46*cos(2.0*pi*n/(M-1))
+ n = arange(1-M, M, 2)
+ return 0.54 + 0.46*cos(pi*n/(M-1))
## Code from cephes for i0
@@ -3139,25 +3158,18 @@ def i0(x):
"""
Modified Bessel function of the first kind, order 0.
- Usually denoted :math:`I_0`. This function does broadcast, but will *not*
- "up-cast" int dtype arguments unless accompanied by at least one float or
- complex dtype argument (see Raises below).
+ Usually denoted :math:`I_0`.
Parameters
----------
- x : array_like, dtype float or complex
+ x : array_like of float
Argument of the Bessel function.
Returns
-------
- out : ndarray, shape = x.shape, dtype = x.dtype
+ out : ndarray, shape = x.shape, dtype = float
The modified Bessel function evaluated at each of the elements of `x`.
- Raises
- ------
- TypeError: array cannot be safely cast to required type
- If argument consists exclusively of int dtypes.
-
See Also
--------
scipy.special.i0, scipy.special.iv, scipy.special.ive
@@ -3187,12 +3199,16 @@ def i0(x):
Examples
--------
>>> np.i0(0.)
- array(1.0) # may vary
- >>> np.i0([0., 1. + 2j])
- array([ 1.00000000+0.j , 0.18785373+0.64616944j]) # may vary
+ array(1.0)
+ >>> np.i0([0, 1, 2, 3])
+ array([1. , 1.26606588, 2.2795853 , 4.88079259])
"""
x = np.asanyarray(x)
+ if x.dtype.kind == 'c':
+ raise TypeError("i0 not supported for complex values")
+ if x.dtype.kind != 'f':
+ x = x.astype(float)
x = np.abs(x)
return piecewise(x, [x <= 8.0], [_i0_1, _i0_2])
@@ -4228,10 +4244,9 @@ def meshgrid(*xi, copy=True, sparse=False, indexing='xy'):
See Also
--------
- index_tricks.mgrid : Construct a multi-dimensional "meshgrid"
- using indexing notation.
- index_tricks.ogrid : Construct an open multi-dimensional "meshgrid"
- using indexing notation.
+ mgrid : Construct a multi-dimensional "meshgrid" using indexing notation.
+ ogrid : Construct an open multi-dimensional "meshgrid" using indexing
+ notation.
Examples
--------
diff --git a/numpy/lib/index_tricks.py b/numpy/lib/index_tricks.py
index cba713ede..9d3de69dd 100644
--- a/numpy/lib/index_tricks.py
+++ b/numpy/lib/index_tricks.py
@@ -1,6 +1,7 @@
import functools
import sys
import math
+import warnings
import numpy.core.numeric as _nx
from numpy.core.numeric import (
@@ -154,15 +155,15 @@ class nd_grid:
start = 0
if step is None:
step = 1
- if isinstance(step, complex):
+ if isinstance(step, (_nx.complexfloating, complex)):
size.append(int(abs(step)))
typ = float
else:
size.append(
int(math.ceil((key[k].stop - start)/(step*1.0))))
- if (isinstance(step, float) or
- isinstance(start, float) or
- isinstance(key[k].stop, float)):
+ if (isinstance(step, (_nx.floating, float)) or
+ isinstance(start, (_nx.floating, float)) or
+ isinstance(key[k].stop, (_nx.floating, float))):
typ = float
if self.sparse:
nn = [_nx.arange(_x, dtype=_t)
@@ -176,7 +177,7 @@ class nd_grid:
start = 0
if step is None:
step = 1
- if isinstance(step, complex):
+ if isinstance(step, (_nx.complexfloating, complex)):
step = int(abs(step))
if step != 1:
step = (key[k].stop - start)/float(step-1)
@@ -194,7 +195,7 @@ class nd_grid:
start = key.start
if start is None:
start = 0
- if isinstance(step, complex):
+ if isinstance(step, (_nx.complexfloating, complex)):
step = abs(step)
length = int(step)
if step != 1:
@@ -344,7 +345,7 @@ class AxisConcatenator:
start = 0
if step is None:
step = 1
- if isinstance(step, complex):
+ if isinstance(step, (_nx.complexfloating, complex)):
size = int(abs(step))
newobj = linspace(start, stop, num=size)
else:
@@ -659,7 +660,15 @@ class ndindex:
Increment the multi-dimensional index by one.
This method is for backward compatibility only: do not use.
+
+ .. deprecated:: 1.20.0
+ This method has been advised against since numpy 1.8.0, but only
+ started emitting DeprecationWarning as of this version.
"""
+ # NumPy 1.20.0, 2020-09-08
+ warnings.warn(
+ "`ndindex.ndincr()` is deprecated, use `next(ndindex)` instead",
+ DeprecationWarning, stacklevel=2)
next(self)
def __next__(self):
@@ -769,9 +778,11 @@ def fill_diagonal(a, val, wrap=False):
a : array, at least 2-D.
Array whose diagonal is to be filled, it gets modified in-place.
- val : scalar
- Value to be written on the diagonal, its type must be compatible with
- that of the array a.
+ val : scalar or array_like
+ Value(s) to write on the diagonal. If `val` is scalar, the value is
+ written along the diagonal. If array-like, the flattened `val` is
+ written along the diagonal, repeating if necessary to fill all
+ diagonal entries.
wrap : bool
For tall matrices in NumPy version up to 1.6.2, the
diff --git a/numpy/lib/nanfunctions.py b/numpy/lib/nanfunctions.py
index 003550432..409016adb 100644
--- a/numpy/lib/nanfunctions.py
+++ b/numpy/lib/nanfunctions.py
@@ -244,7 +244,7 @@ def nanmin(a, axis=None, out=None, keepdims=np._NoValue):
Alternate output array in which to place the result. The default
is ``None``; if provided, it must have the same shape as the
expected output, but the type will be cast if necessary. See
- `ufuncs-output-type` for more details.
+ :ref:`ufuncs-output-type` for more details.
.. versionadded:: 1.8.0
keepdims : bool, optional
@@ -359,7 +359,7 @@ def nanmax(a, axis=None, out=None, keepdims=np._NoValue):
Alternate output array in which to place the result. The default
is ``None``; if provided, it must have the same shape as the
expected output, but the type will be cast if necessary. See
- `ufuncs-output-type` for more details.
+ :ref:`ufuncs-output-type` for more details.
.. versionadded:: 1.8.0
keepdims : bool, optional
@@ -584,7 +584,7 @@ def nansum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue):
Alternate output array in which to place the result. The default
is ``None``. If provided, it must have the same shape as the
expected output, but the type will be cast if necessary. See
- `ufuncs-output-type` for more details. The casting of NaN to integer
+ :ref:`ufuncs-output-type` for more details. The casting of NaN to integer
can yield unexpected results.
.. versionadded:: 1.8.0
@@ -681,7 +681,7 @@ def nanprod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue):
Alternate output array in which to place the result. The default
is ``None``. If provided, it must have the same shape as the
expected output, but the type will be cast if necessary. See
- `ufuncs-output-type` for more details. The casting of NaN to integer
+ :ref:`ufuncs-output-type` for more details. The casting of NaN to integer
can yield unexpected results.
keepdims : bool, optional
If True, the axes which are reduced are left in the result as
@@ -749,7 +749,7 @@ def nancumsum(a, axis=None, dtype=None, out=None):
out : ndarray, optional
Alternative output array in which to place the result. It must
have the same shape and buffer length as the expected output
- but the type will be cast if necessary. See `ufuncs-output-type` for
+ but the type will be cast if necessary. See :ref:`ufuncs-output-type` for
more details.
Returns
@@ -888,7 +888,7 @@ def nanmean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue):
Alternate output array in which to place the result. The default
is ``None``; if provided, it must have the same shape as the
expected output, but the type will be cast if necessary. See
- `ufuncs-output-type` for more details.
+ :ref:`ufuncs-output-type` for more details.
keepdims : bool, optional
If this is set to True, the axes which are reduced are left
in the result as dimensions with size one. With this option,
@@ -1256,7 +1256,7 @@ def nanquantile(a, q, axis=None, out=None, overwrite_input=False,
Compute the qth quantile of the data along the specified axis,
while ignoring nan values.
Returns the qth quantile(s) of the array elements.
-
+
.. versionadded:: 1.15.0
Parameters
@@ -1472,7 +1472,7 @@ def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
mean : Average
var : Variance while not ignoring NaNs
nanstd, nanmean
- ufuncs-output-type
+ :ref:`ufuncs-output-type`
Notes
-----
@@ -1624,7 +1624,7 @@ def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
--------
var, mean, std
nanvar, nanmean
- ufuncs-output-type
+ :ref:`ufuncs-output-type`
Notes
-----
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 520e9c9ec..af8e28e42 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -14,7 +14,7 @@ from . import format
from ._datasource import DataSource
from numpy.core import overrides
from numpy.core.multiarray import packbits, unpackbits
-from numpy.core.overrides import set_module
+from numpy.core.overrides import set_array_function_like_doc, set_module
from numpy.core._internal import recursive
from ._iotools import (
LineSplitter, NameValidator, StringConverter, ConverterError,
@@ -23,8 +23,8 @@ from ._iotools import (
)
from numpy.compat import (
- asbytes, asstr, asunicode, bytes, os_fspath, os_PathLike,
- pickle, contextlib_nullcontext
+ asbytes, asstr, asunicode, os_fspath, os_PathLike,
+ pickle
)
@@ -86,7 +86,7 @@ class BagObj:
try:
return object.__getattribute__(self, '_obj')[key]
except KeyError:
- raise AttributeError(key)
+ raise AttributeError(key) from None
def __dir__(self):
"""
@@ -178,6 +178,9 @@ class NpzFile(Mapping):
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
"""
+ # Make __exit__ safe if zipfile_factory raises an exception
+ zip = None
+ fid = None
def __init__(self, fid, own_fid=False, allow_pickle=False,
pickle_kwargs=None):
@@ -197,8 +200,6 @@ class NpzFile(Mapping):
self.f = BagObj(self)
if own_fid:
self.fid = fid
- else:
- self.fid = None
def __enter__(self):
return self
@@ -445,9 +446,9 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
"when allow_pickle=False")
try:
return pickle.load(fid, **pickle_kwargs)
- except Exception:
+ except Exception as e:
raise IOError(
- "Failed to interpret file %s as a pickle" % repr(file))
+ "Failed to interpret file %s as a pickle" % repr(file)) from e
def _save_dispatcher(file, arr, allow_pickle=None, fix_imports=None):
@@ -516,7 +517,7 @@ def save(file, arr, allow_pickle=True, fix_imports=True):
# [1 2] [1 3]
"""
if hasattr(file, 'write'):
- file_ctx = contextlib_nullcontext(file)
+ file_ctx = contextlib.nullcontext(file)
else:
file = os_fspath(file)
if not file.endswith('.npy'):
@@ -711,44 +712,14 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None):
zipf = zipfile_factory(file, mode="w", compression=compression)
- if sys.version_info >= (3, 6):
- # Since Python 3.6 it is possible to write directly to a ZIP file.
- for key, val in namedict.items():
- fname = key + '.npy'
- val = np.asanyarray(val)
- # always force zip64, gh-10776
- with zipf.open(fname, 'w', force_zip64=True) as fid:
- format.write_array(fid, val,
- allow_pickle=allow_pickle,
- pickle_kwargs=pickle_kwargs)
- else:
- # Stage arrays in a temporary file on disk, before writing to zip.
-
- # Import deferred for startup time improvement
- import tempfile
- # Since target file might be big enough to exceed capacity of a global
- # temporary directory, create temp file side-by-side with the target file.
- file_dir, file_prefix = os.path.split(file) if _is_string_like(file) else (None, 'tmp')
- fd, tmpfile = tempfile.mkstemp(prefix=file_prefix, dir=file_dir, suffix='-numpy.npy')
- os.close(fd)
- try:
- for key, val in namedict.items():
- fname = key + '.npy'
- fid = open(tmpfile, 'wb')
- try:
- format.write_array(fid, np.asanyarray(val),
- allow_pickle=allow_pickle,
- pickle_kwargs=pickle_kwargs)
- fid.close()
- fid = None
- zipf.write(tmpfile, arcname=fname)
- except IOError as exc:
- raise IOError("Failed to write to %s: %s" % (tmpfile, exc))
- finally:
- if fid:
- fid.close()
- finally:
- os.remove(tmpfile)
+ for key, val in namedict.items():
+ fname = key + '.npy'
+ val = np.asanyarray(val)
+ # always force zip64, gh-10776
+ with zipf.open(fname, 'w', force_zip64=True) as fid:
+ format.write_array(fid, val,
+ allow_pickle=allow_pickle,
+ pickle_kwargs=pickle_kwargs)
zipf.close()
@@ -789,10 +760,17 @@ def _getconv(dtype):
_loadtxt_chunksize = 50000
+def _loadtxt_dispatcher(fname, dtype=None, comments=None, delimiter=None,
+ converters=None, skiprows=None, usecols=None, unpack=None,
+ ndmin=None, encoding=None, max_rows=None, *, like=None):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
def loadtxt(fname, dtype=float, comments='#', delimiter=None,
converters=None, skiprows=0, usecols=None, unpack=False,
- ndmin=0, encoding='bytes', max_rows=None):
+ ndmin=0, encoding='bytes', max_rows=None, *, like=None):
r"""
Load data from a text file.
@@ -837,8 +815,9 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
fourth column the same way as ``usecols = (3,)`` would.
unpack : bool, optional
If True, the returned array is transposed, so that arguments may be
- unpacked using ``x, y, z = loadtxt(...)``. When used with a structured
- data-type, arrays are returned for each field. Default is False.
+ unpacked using ``x, y, z = loadtxt(...)``. When used with a
+ structured data-type, arrays are returned for each field.
+ Default is False.
ndmin : int, optional
The returned array will have at least `ndmin` dimensions.
Otherwise mono-dimensional axes will be squeezed.
@@ -859,6 +838,9 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
is to read all the lines.
.. versionadded:: 1.16.0
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -916,6 +898,14 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
[-17.57, 63.94]])
"""
+ if like is not None:
+ return _loadtxt_with_like(
+ fname, dtype=dtype, comments=comments, delimiter=delimiter,
+ converters=converters, skiprows=skiprows, usecols=usecols,
+ unpack=unpack, ndmin=ndmin, encoding=encoding,
+ max_rows=max_rows, like=like
+ )
+
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Nested functions used by loadtxt.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -975,10 +965,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
if comments is not None:
line = regex_comments.split(line, maxsplit=1)[0]
line = line.strip('\r\n')
- if line:
- return line.split(delimiter)
- else:
- return []
+ return line.split(delimiter) if line else []
def read_data(chunk_size):
"""Parse each line, including the first.
@@ -1040,11 +1027,10 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
user_converters = converters
+ byte_converters = False
if encoding == 'bytes':
encoding = None
byte_converters = True
- else:
- byte_converters = False
if usecols is not None:
# Allow usecols to be a single int or a sequence of ints
@@ -1200,6 +1186,11 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
return X
+_loadtxt_with_like = array_function_dispatch(
+ _loadtxt_dispatcher
+)(loadtxt)
+
+
def _savetxt_dispatcher(fname, X, fmt=None, delimiter=None, newline=None,
header=None, footer=None, comments=None,
encoding=None):
@@ -1440,10 +1431,10 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
for row in X:
try:
v = format % tuple(row) + newline
- except TypeError:
+ except TypeError as e:
raise TypeError("Mismatch between array dtype ('%s') and "
"format specifier ('%s')"
- % (str(X.dtype), format))
+ % (str(X.dtype), format)) from e
fh.write(v)
if len(footer) > 0:
@@ -1496,7 +1487,7 @@ def fromregex(file, regexp, dtype, encoding=None):
-----
Dtypes for structured arrays can be specified in several forms, but all
forms specify at least the data type and field name. For details see
- `doc.structured_arrays`.
+ `basics.rec`.
Examples
--------
@@ -1553,6 +1544,18 @@ def fromregex(file, regexp, dtype, encoding=None):
#####--------------------------------------------------------------------------
+def _genfromtxt_dispatcher(fname, dtype=None, comments=None, delimiter=None,
+ skip_header=None, skip_footer=None, converters=None,
+ missing_values=None, filling_values=None, usecols=None,
+ names=None, excludelist=None, deletechars=None,
+ replace_space=None, autostrip=None, case_sensitive=None,
+ defaultfmt=None, unpack=None, usemask=None, loose=None,
+ invalid_raise=None, max_rows=None, encoding=None, *,
+ like=None):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
skip_header=0, skip_footer=0, converters=None,
@@ -1561,7 +1564,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
deletechars=''.join(sorted(NameValidator.defaultdeletechars)),
replace_space='_', autostrip=False, case_sensitive=True,
defaultfmt="f%i", unpack=None, usemask=False, loose=True,
- invalid_raise=True, max_rows=None, encoding='bytes'):
+ invalid_raise=True, max_rows=None, encoding='bytes', *,
+ like=None):
"""
Load data from a text file, with missing values handled as specified.
@@ -1633,7 +1637,9 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
If 'lower', field names are converted to lower case.
unpack : bool, optional
If True, the returned array is transposed, so that arguments may be
- unpacked using ``x, y, z = loadtxt(...)``
+ unpacked using ``x, y, z = genfromtxt(...)``. When used with a
+ structured data-type, arrays are returned for each field.
+ Default is False.
usemask : bool, optional
If True, return a masked array.
If False, return a regular array.
@@ -1658,6 +1664,9 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
to None the system default is used. The default value is 'bytes'.
.. versionadded:: 1.14.0
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -1736,6 +1745,21 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
dtype=[('f0', 'S12'), ('f1', 'S12')])
"""
+
+ if like is not None:
+ return _genfromtxt_with_like(
+ fname, dtype=dtype, comments=comments, delimiter=delimiter,
+ skip_header=skip_header, skip_footer=skip_footer,
+ converters=converters, missing_values=missing_values,
+ filling_values=filling_values, usecols=usecols, names=names,
+ excludelist=excludelist, deletechars=deletechars,
+ replace_space=replace_space, autostrip=autostrip,
+ case_sensitive=case_sensitive, defaultfmt=defaultfmt,
+ unpack=unpack, usemask=usemask, loose=loose,
+ invalid_raise=invalid_raise, max_rows=max_rows, encoding=encoding,
+ like=like
+ )
+
if max_rows is not None:
if skip_footer:
raise ValueError(
@@ -1768,7 +1792,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
fid_ctx = contextlib.closing(fid)
else:
fid = fname
- fid_ctx = contextlib_nullcontext(fid)
+ fid_ctx = contextlib.nullcontext(fid)
fhd = iter(fid)
except TypeError as e:
raise TypeError(
@@ -2244,9 +2268,23 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
if usemask:
output = output.view(MaskedArray)
output._mask = outputmask
+ output = np.squeeze(output)
if unpack:
- return output.squeeze().T
- return output.squeeze()
+ if names is None:
+ return output.T
+ elif len(names) == 1:
+ # squeeze single-name dtypes too
+ return output[names[0]]
+ else:
+ # For structured arrays with multiple fields,
+ # return an array for each field.
+ return [output[field] for field in names]
+ return output
+
+
+_genfromtxt_with_like = array_function_dispatch(
+ _genfromtxt_dispatcher
+)(genfromtxt)
def ndfromtxt(fname, **kwargs):
diff --git a/numpy/lib/polynomial.py b/numpy/lib/polynomial.py
index 420ec245b..0fd9bbd79 100644
--- a/numpy/lib/polynomial.py
+++ b/numpy/lib/polynomial.py
@@ -426,7 +426,7 @@ def polyder(p, m=1):
>>> np.polyder(p, 3)
poly1d([6])
>>> np.polyder(p, 4)
- poly1d([0.])
+ poly1d([0])
"""
m = int(m)
@@ -494,11 +494,12 @@ def polyfit(x, y, deg, rcond=None, full=False, w=None, cov=False):
cov : bool or str, optional
If given and not `False`, return not just the estimate but also its
covariance matrix. By default, the covariance are scaled by
- chi2/sqrt(N-dof), i.e., the weights are presumed to be unreliable
- except in a relative sense and everything is scaled such that the
- reduced chi2 is unity. This scaling is omitted if ``cov='unscaled'``,
- as is relevant for the case that the weights are 1/sigma**2, with
- sigma known to be a reliable estimate of the uncertainty.
+ chi2/dof, where dof = M - (deg + 1), i.e., the weights are presumed
+ to be unreliable except in a relative sense and everything is scaled
+ such that the reduced chi2 is unity. This scaling is omitted if
+ ``cov='unscaled'``, as is relevant for the case that the weights are
+ 1/sigma**2, with sigma known to be a reliable estimate of the
+ uncertainty.
Returns
-------
@@ -753,11 +754,11 @@ def polyval(p, x):
>>> np.polyval([3,0,1], 5) # 3 * 5**2 + 0 * 5**1 + 1
76
>>> np.polyval([3,0,1], np.poly1d(5))
- poly1d([76.])
+ poly1d([76])
>>> np.polyval(np.poly1d([3,0,1]), 5)
76
>>> np.polyval(np.poly1d([3,0,1]), np.poly1d(5))
- poly1d([76.])
+ poly1d([76])
"""
p = NX.asarray(p)
@@ -1016,7 +1017,7 @@ def polydiv(u, v):
(array([1.5 , 1.75]), array([0.25]))
"""
- truepoly = (isinstance(u, poly1d) or isinstance(u, poly1d))
+ truepoly = (isinstance(u, poly1d) or isinstance(v, poly1d))
u = atleast_1d(u) + 0.0
v = atleast_1d(v) + 0.0
# w has the common type
@@ -1235,7 +1236,7 @@ class poly1d:
raise ValueError("Polynomial must be 1d only.")
c_or_r = trim_zeros(c_or_r, trim='f')
if len(c_or_r) == 0:
- c_or_r = NX.array([0.])
+ c_or_r = NX.array([0], dtype=c_or_r.dtype)
self._coeffs = c_or_r
if variable is None:
variable = 'x'
diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
index a11d5f2c7..fbfbca73d 100644
--- a/numpy/lib/recfunctions.py
+++ b/numpy/lib/recfunctions.py
@@ -513,7 +513,7 @@ def drop_fields(base, drop_names, usemask=True, asrecarray=False):
Nested fields are supported.
- ..versionchanged: 1.18.0
+ .. versionchanged:: 1.18.0
`drop_fields` returns an array with 0 fields if all fields are dropped,
rather than returning ``None`` as it did previously.
@@ -899,7 +899,7 @@ def _structured_to_unstructured_dispatcher(arr, dtype=None, copy=None,
@array_function_dispatch(_structured_to_unstructured_dispatcher)
def structured_to_unstructured(arr, dtype=None, copy=False, casting='unsafe'):
"""
- Converts and n-D structured array into an (n+1)-D unstructured array.
+ Converts an n-D structured array into an (n+1)-D unstructured array.
The new array will have a new last dimension equal in size to the
number of field-elements of the input array. If not supplied, the output
@@ -996,7 +996,7 @@ def _unstructured_to_structured_dispatcher(arr, dtype=None, names=None,
def unstructured_to_structured(arr, dtype=None, names=None, align=False,
copy=False, casting='unsafe'):
"""
- Converts and n-D unstructured array into an (n-1)-D structured array.
+ Converts an n-D unstructured array into an (n-1)-D structured array.
The last dimension of the input array is converted into a structure, with
number of field-elements equal to the size of the last dimension of the
diff --git a/numpy/lib/scimath.py b/numpy/lib/scimath.py
index 555a3d5a8..2b0d38c37 100644
--- a/numpy/lib/scimath.py
+++ b/numpy/lib/scimath.py
@@ -14,6 +14,22 @@ module provide the mathematically valid answers in the complex plane::
Similarly, `sqrt`, other base logarithms, `power` and trig functions are
correctly handled. See their respective docstrings for specific examples.
+Functions
+---------
+
+.. autosummary::
+ :toctree: generated/
+
+ sqrt
+ log
+ log2
+ logn
+ log10
+ power
+ arccos
+ arcsin
+ arctanh
+
"""
import numpy.core.numeric as nx
import numpy.core.numerictypes as nt
diff --git a/numpy/lib/setup.py b/numpy/lib/setup.py
index b3f441f38..7520b72d7 100644
--- a/numpy/lib/setup.py
+++ b/numpy/lib/setup.py
@@ -4,6 +4,7 @@ def configuration(parent_package='',top_path=None):
config = Configuration('lib', parent_package, top_path)
config.add_subpackage('tests')
config.add_data_dir('tests/data')
+ config.add_data_files('*.pyi')
return config
if __name__ == '__main__':
diff --git a/numpy/lib/shape_base.py b/numpy/lib/shape_base.py
index bc6718eca..cbc4641d8 100644
--- a/numpy/lib/shape_base.py
+++ b/numpy/lib/shape_base.py
@@ -756,11 +756,11 @@ def array_split(ary, indices_or_sections, axis=0):
--------
>>> x = np.arange(8.0)
>>> np.array_split(x, 3)
- [array([0., 1., 2.]), array([3., 4., 5.]), array([6., 7.])]
+ [array([0., 1., 2.]), array([3., 4., 5.]), array([6., 7.])]
- >>> x = np.arange(7.0)
- >>> np.array_split(x, 3)
- [array([0., 1., 2.]), array([3., 4.]), array([5., 6.])]
+ >>> x = np.arange(9)
+ >>> np.array_split(x, 4)
+ [array([0, 1, 2]), array([3, 4]), array([5, 6]), array([7, 8])]
"""
try:
@@ -870,7 +870,7 @@ def split(ary, indices_or_sections, axis=0):
N = ary.shape[axis]
if N % sections:
raise ValueError(
- 'array split does not result in an equal division')
+ 'array split does not result in an equal division') from None
return array_split(ary, indices_or_sections, axis)
diff --git a/numpy/lib/stride_tricks.py b/numpy/lib/stride_tricks.py
index 502235bdf..82c8a57c8 100644
--- a/numpy/lib/stride_tricks.py
+++ b/numpy/lib/stride_tricks.py
@@ -6,9 +6,10 @@ NumPy reference guide.
"""
import numpy as np
-from numpy.core.overrides import array_function_dispatch
+from numpy.core.numeric import normalize_axis_tuple
+from numpy.core.overrides import array_function_dispatch, set_module
-__all__ = ['broadcast_to', 'broadcast_arrays']
+__all__ = ['broadcast_to', 'broadcast_arrays', 'broadcast_shapes']
class DummyArray:
@@ -65,8 +66,10 @@ def as_strided(x, shape=None, strides=None, subok=False, writeable=True):
See also
--------
- broadcast_to: broadcast an array to a given shape.
+ broadcast_to : broadcast an array to a given shape.
reshape : reshape an array.
+ lib.stride_tricks.sliding_window_view :
+ user-friendly and safe function for the creation of sliding window views.
Notes
-----
@@ -111,6 +114,228 @@ def as_strided(x, shape=None, strides=None, subok=False, writeable=True):
return view
+def _sliding_window_view_dispatcher(x, window_shape, axis=None, *,
+ subok=None, writeable=None):
+ return (x,)
+
+
+@array_function_dispatch(_sliding_window_view_dispatcher)
+def sliding_window_view(x, window_shape, axis=None, *,
+ subok=False, writeable=False):
+ """
+ Create a sliding window view into the array with the given window shape.
+
+ Also known as rolling or moving window, the window slides across all
+ dimensions of the array and extracts subsets of the array at all window
+ positions.
+
+ .. versionadded:: 1.20.0
+
+ Parameters
+ ----------
+ x : array_like
+ Array to create the sliding window view from.
+ window_shape : int or tuple of int
+ Size of window over each axis that takes part in the sliding window.
+ If `axis` is not present, must have same length as the number of input
+ array dimensions. Single integers `i` are treated as if they were the
+ tuple `(i,)`.
+ axis : int or tuple of int, optional
+ Axis or axes along which the sliding window is applied.
+ By default, the sliding window is applied to all axes and
+ `window_shape[i]` will refer to axis `i` of `x`.
+ If `axis` is given as a `tuple of int`, `window_shape[i]` will refer to
+ the axis `axis[i]` of `x`.
+ Single integers `i` are treated as if they were the tuple `(i,)`.
+ subok : bool, optional
+ If True, sub-classes will be passed-through, otherwise the returned
+ array will be forced to be a base-class array (default).
+ writeable : bool, optional
+ When true, allow writing to the returned view. The default is false,
+ as this should be used with caution: the returned view contains the
+ same memory location multiple times, so writing to one location will
+ cause others to change.
+
+ Returns
+ -------
+ view : ndarray
+ Sliding window view of the array. The sliding window dimensions are
+ inserted at the end, and the original dimensions are trimmed as
+ required by the size of the sliding window.
+ That is, ``view.shape = x_shape_trimmed + window_shape``, where
+ ``x_shape_trimmed`` is ``x.shape`` with every entry reduced by one less
+ than the corresponding window size.
+
+ See Also
+ --------
+ lib.stride_tricks.as_strided: A lower-level and less safe routine for
+ creating arbitrary views from custom shape and strides.
+ broadcast_to: broadcast an array to a given shape.
+
+ Notes
+ -----
+ For many applications using a sliding window view can be convenient, but
+ potentially very slow. Often specialized solutions exist, for example:
+
+ - `scipy.signal.fftconvolve`
+
+ - filtering functions in `scipy.ndimage`
+
+ - moving window functions provided by
+ `bottleneck <https://github.com/pydata/bottleneck>`_.
+
+ As a rough estimate, a sliding window approach with an input size of `N`
+ and a window size of `W` will scale as `O(N*W)` where frequently a special
+ algorithm can achieve `O(N)`. That means that the sliding window variant
+ for a window size of 100 can be a 100 times slower than a more specialized
+ version.
+
+ Nevertheless, for small window sizes, when no custom algorithm exists, or
+ as a prototyping and developing tool, this function can be a good solution.
+
+ Examples
+ --------
+ >>> x = np.arange(6)
+ >>> x.shape
+ (6,)
+ >>> v = sliding_window_view(x, 3)
+ >>> v.shape
+ (4, 3)
+ >>> v
+ array([[0, 1, 2],
+ [1, 2, 3],
+ [2, 3, 4],
+ [3, 4, 5]])
+
+ This also works in more dimensions, e.g.
+
+ >>> i, j = np.ogrid[:3, :4]
+ >>> x = 10*i + j
+ >>> x.shape
+ (3, 4)
+ >>> x
+ array([[ 0, 1, 2, 3],
+ [10, 11, 12, 13],
+ [20, 21, 22, 23]])
+ >>> shape = (2,2)
+ >>> v = sliding_window_view(x, shape)
+ >>> v.shape
+ (2, 3, 2, 2)
+ >>> v
+ array([[[[ 0, 1],
+ [10, 11]],
+ [[ 1, 2],
+ [11, 12]],
+ [[ 2, 3],
+ [12, 13]]],
+ [[[10, 11],
+ [20, 21]],
+ [[11, 12],
+ [21, 22]],
+ [[12, 13],
+ [22, 23]]]])
+
+ The axis can be specified explicitly:
+
+ >>> v = sliding_window_view(x, 3, 0)
+ >>> v.shape
+ (1, 4, 3)
+ >>> v
+ array([[[ 0, 10, 20],
+ [ 1, 11, 21],
+ [ 2, 12, 22],
+ [ 3, 13, 23]]])
+
+ The same axis can be used several times. In that case, every use reduces
+ the corresponding original dimension:
+
+ >>> v = sliding_window_view(x, (2, 3), (1, 1))
+ >>> v.shape
+ (3, 1, 2, 3)
+ >>> v
+ array([[[[ 0, 1, 2],
+ [ 1, 2, 3]]],
+ [[[10, 11, 12],
+ [11, 12, 13]]],
+ [[[20, 21, 22],
+ [21, 22, 23]]]])
+
+ Combining with stepped slicing (`::step`), this can be used to take sliding
+ views which skip elements:
+
+ >>> x = np.arange(7)
+ >>> sliding_window_view(x, 5)[:, ::2]
+ array([[0, 2, 4],
+ [1, 3, 5],
+ [2, 4, 6]])
+
+ or views which move by multiple elements
+
+ >>> x = np.arange(7)
+ >>> sliding_window_view(x, 3)[::2, :]
+ array([[0, 1, 2],
+ [2, 3, 4],
+ [4, 5, 6]])
+
+ A common application of `sliding_window_view` is the calculation of running
+ statistics. The simplest example is the
+ `moving average <https://en.wikipedia.org/wiki/Moving_average>`_:
+
+ >>> x = np.arange(6)
+ >>> x.shape
+ (6,)
+ >>> v = sliding_window_view(x, 3)
+ >>> v.shape
+ (4, 3)
+ >>> v
+ array([[0, 1, 2],
+ [1, 2, 3],
+ [2, 3, 4],
+ [3, 4, 5]])
+ >>> moving_average = v.mean(axis=-1)
+ >>> moving_average
+ array([1., 2., 3., 4.])
+
+ Note that a sliding window approach is often **not** optimal (see Notes).
+ """
+ window_shape = (tuple(window_shape)
+ if np.iterable(window_shape)
+ else (window_shape,))
+ # first convert input to array, possibly keeping subclass
+ x = np.array(x, copy=False, subok=subok)
+
+ window_shape_array = np.array(window_shape)
+ if np.any(window_shape_array < 0):
+ raise ValueError('`window_shape` cannot contain negative values')
+
+ if axis is None:
+ axis = tuple(range(x.ndim))
+ if len(window_shape) != len(axis):
+ raise ValueError(f'Since axis is `None`, must provide '
+ f'window_shape for all dimensions of `x`; '
+ f'got {len(window_shape)} window_shape elements '
+ f'and `x.ndim` is {x.ndim}.')
+ else:
+ axis = normalize_axis_tuple(axis, x.ndim, allow_duplicate=True)
+ if len(window_shape) != len(axis):
+ raise ValueError(f'Must provide matching length window_shape and '
+ f'axis; got {len(window_shape)} window_shape '
+ f'elements and {len(axis)} axes elements.')
+
+ out_strides = x.strides + tuple(x.strides[ax] for ax in axis)
+
+ # note: same axis can be windowed repeatedly
+ x_shape_trimmed = list(x.shape)
+ for ax, dim in zip(axis, window_shape):
+ if x_shape_trimmed[ax] < dim:
+ raise ValueError(
+ 'window shape cannot be larger than input array shape')
+ x_shape_trimmed[ax] -= dim - 1
+ out_shape = tuple(x_shape_trimmed) + window_shape
+ return as_strided(x, strides=out_strides, shape=out_shape,
+ subok=subok, writeable=writeable)
+
+
def _broadcast_to(array, shape, subok, readonly):
shape = tuple(shape) if np.iterable(shape) else (shape,)
array = np.array(array, copy=False, subok=subok)
@@ -165,6 +390,12 @@ def broadcast_to(array, shape, subok=False):
If the array is not compatible with the new shape according to NumPy's
broadcasting rules.
+ See Also
+ --------
+ broadcast
+ broadcast_arrays
+ broadcast_shapes
+
Notes
-----
.. versionadded:: 1.10.0
@@ -197,6 +428,49 @@ def _broadcast_shape(*args):
return b.shape
+@set_module('numpy')
+def broadcast_shapes(*args):
+ """
+ Broadcast the input shapes into a single shape.
+
+ :ref:`Learn more about broadcasting here <basics.broadcasting>`.
+
+ .. versionadded:: 1.20.0
+
+ Parameters
+ ----------
+ *args : tuples of ints, or ints
+ The shapes to be broadcast against each other.
+
+ Returns
+ -------
+ tuple
+ Broadcasted shape.
+
+ Raises
+ ------
+ ValueError
+ If the shapes are not compatible and cannot be broadcast according
+ to NumPy's broadcasting rules.
+
+ See Also
+ --------
+ broadcast
+ broadcast_arrays
+ broadcast_to
+
+ Examples
+ --------
+ >>> np.broadcast_shapes((1, 2), (3, 1), (3, 2))
+ (3, 2)
+
+ >>> np.broadcast_shapes((6, 7), (5, 6, 1), (7,), (5, 1, 7))
+ (5, 6, 7)
+ """
+ arrays = [np.empty(x, dtype=[]) for x in args]
+ return _broadcast_shape(*arrays)
+
+
def _broadcast_arrays_dispatcher(*args, subok=None):
return args
@@ -230,6 +504,12 @@ def broadcast_arrays(*args, subok=False):
warning will be emitted. A future version will set the
``writable`` flag False so writing to it will raise an error.
+ See Also
+ --------
+ broadcast
+ broadcast_to
+ broadcast_shapes
+
Examples
--------
>>> x = np.array([[1,2,3]])
diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py
index 81ba789e3..847e6cb8a 100644
--- a/numpy/lib/tests/test_arraysetops.py
+++ b/numpy/lib/tests/test_arraysetops.py
@@ -125,32 +125,36 @@ class TestSetOps:
assert_array_equal([7, 1], ediff1d(two_elem, to_begin=7))
assert_array_equal([5, 6, 1], ediff1d(two_elem, to_begin=[5, 6]))
- @pytest.mark.parametrize("ary, prepend, append", [
+ @pytest.mark.parametrize("ary, prepend, append, expected", [
# should fail because trying to cast
# np.nan standard floating point value
# into an integer array:
(np.array([1, 2, 3], dtype=np.int64),
None,
- np.nan),
+ np.nan,
+ 'to_end'),
# should fail because attempting
# to downcast to int type:
(np.array([1, 2, 3], dtype=np.int64),
np.array([5, 7, 2], dtype=np.float32),
- None),
+ None,
+ 'to_begin'),
# should fail because attempting to cast
# two special floating point values
- # to integers (on both sides of ary):
+ # to integers (on both sides of ary),
+ # `to_begin` is in the error message as the impl checks this first:
(np.array([1., 3., 9.], dtype=np.int8),
np.nan,
- np.nan),
+ np.nan,
+ 'to_begin'),
])
- def test_ediff1d_forbidden_type_casts(self, ary, prepend, append):
+ def test_ediff1d_forbidden_type_casts(self, ary, prepend, append, expected):
# verify resolution of gh-11490
# specifically, raise an appropriate
# Exception when attempting to append or
# prepend with an incompatible type
- msg = 'must be compatible'
+ msg = 'dtype of `{}` must be compatible'.format(expected)
with assert_raises_regex(TypeError, msg):
ediff1d(ary=ary,
to_end=append,
diff --git a/numpy/lib/tests/test_financial.py b/numpy/lib/tests/test_financial.py
deleted file mode 100644
index 26e79bc06..000000000
--- a/numpy/lib/tests/test_financial.py
+++ /dev/null
@@ -1,380 +0,0 @@
-import warnings
-from decimal import Decimal
-
-import numpy as np
-from numpy.testing import (
- assert_, assert_almost_equal, assert_allclose, assert_equal, assert_raises
- )
-
-
-def filter_deprecation(func):
- def newfunc(*args, **kwargs):
- with warnings.catch_warnings(record=True) as ws:
- warnings.filterwarnings('always', category=DeprecationWarning)
- func(*args, **kwargs)
- assert_(all(w.category is DeprecationWarning for w in ws))
- return newfunc
-
-
-class TestFinancial:
- @filter_deprecation
- def test_npv_irr_congruence(self):
- # IRR is defined as the rate required for the present value of a
- # a series of cashflows to be zero i.e. NPV(IRR(x), x) = 0
- cashflows = np.array([-40000, 5000, 8000, 12000, 30000])
- assert_allclose(np.npv(np.irr(cashflows), cashflows), 0, atol=1e-10, rtol=0)
-
- @filter_deprecation
- def test_rate(self):
- assert_almost_equal(
- np.rate(10, 0, -3500, 10000),
- 0.1107, 4)
-
- @filter_deprecation
- def test_rate_decimal(self):
- rate = np.rate(Decimal('10'), Decimal('0'), Decimal('-3500'), Decimal('10000'))
- assert_equal(Decimal('0.1106908537142689284704528100'), rate)
-
- @filter_deprecation
- def test_irr(self):
- v = [-150000, 15000, 25000, 35000, 45000, 60000]
- assert_almost_equal(np.irr(v), 0.0524, 2)
- v = [-100, 0, 0, 74]
- assert_almost_equal(np.irr(v), -0.0955, 2)
- v = [-100, 39, 59, 55, 20]
- assert_almost_equal(np.irr(v), 0.28095, 2)
- v = [-100, 100, 0, -7]
- assert_almost_equal(np.irr(v), -0.0833, 2)
- v = [-100, 100, 0, 7]
- assert_almost_equal(np.irr(v), 0.06206, 2)
- v = [-5, 10.5, 1, -8, 1]
- assert_almost_equal(np.irr(v), 0.0886, 2)
-
- # Test that if there is no solution then np.irr returns nan
- # Fixes gh-6744
- v = [-1, -2, -3]
- assert_equal(np.irr(v), np.nan)
-
- @filter_deprecation
- def test_pv(self):
- assert_almost_equal(np.pv(0.07, 20, 12000, 0), -127128.17, 2)
-
- @filter_deprecation
- def test_pv_decimal(self):
- assert_equal(np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0')),
- Decimal('-127128.1709461939327295222005'))
-
- @filter_deprecation
- def test_fv(self):
- assert_equal(np.fv(0.075, 20, -2000, 0, 0), 86609.362673042924)
-
- @filter_deprecation
- def test_fv_decimal(self):
- assert_equal(np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), 0, 0),
- Decimal('86609.36267304300040536731624'))
-
- @filter_deprecation
- def test_pmt(self):
- res = np.pmt(0.08 / 12, 5 * 12, 15000)
- tgt = -304.145914
- assert_allclose(res, tgt)
- # Test the edge case where rate == 0.0
- res = np.pmt(0.0, 5 * 12, 15000)
- tgt = -250.0
- assert_allclose(res, tgt)
- # Test the case where we use broadcast and
- # the arguments passed in are arrays.
- res = np.pmt([[0.0, 0.8], [0.3, 0.8]], [12, 3], [2000, 20000])
- tgt = np.array([[-166.66667, -19311.258], [-626.90814, -19311.258]])
- assert_allclose(res, tgt)
-
- @filter_deprecation
- def test_pmt_decimal(self):
- res = np.pmt(Decimal('0.08') / Decimal('12'), 5 * 12, 15000)
- tgt = Decimal('-304.1459143262052370338701494')
- assert_equal(res, tgt)
- # Test the edge case where rate == 0.0
- res = np.pmt(Decimal('0'), Decimal('60'), Decimal('15000'))
- tgt = -250
- assert_equal(res, tgt)
- # Test the case where we use broadcast and
- # the arguments passed in are arrays.
- res = np.pmt([[Decimal('0'), Decimal('0.8')], [Decimal('0.3'), Decimal('0.8')]],
- [Decimal('12'), Decimal('3')], [Decimal('2000'), Decimal('20000')])
- tgt = np.array([[Decimal('-166.6666666666666666666666667'), Decimal('-19311.25827814569536423841060')],
- [Decimal('-626.9081401700757748402586600'), Decimal('-19311.25827814569536423841060')]])
-
- # Cannot use the `assert_allclose` because it uses isfinite under the covers
- # which does not support the Decimal type
- # See issue: https://github.com/numpy/numpy/issues/9954
- assert_equal(res[0][0], tgt[0][0])
- assert_equal(res[0][1], tgt[0][1])
- assert_equal(res[1][0], tgt[1][0])
- assert_equal(res[1][1], tgt[1][1])
-
- @filter_deprecation
- def test_ppmt(self):
- assert_equal(np.round(np.ppmt(0.1 / 12, 1, 60, 55000), 2), -710.25)
-
- @filter_deprecation
- def test_ppmt_decimal(self):
- assert_equal(np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000')),
- Decimal('-710.2541257864217612489830917'))
-
- # Two tests showing how Decimal is actually getting at a more exact result
- # .23 / 12 does not come out nicely as a float but does as a decimal
- @filter_deprecation
- def test_ppmt_special_rate(self):
- assert_equal(np.round(np.ppmt(0.23 / 12, 1, 60, 10000000000), 8), -90238044.232277036)
-
- @filter_deprecation
- def test_ppmt_special_rate_decimal(self):
- # When rounded out to 8 decimal places like the float based test, this should not equal the same value
- # as the float, substituted for the decimal
- def raise_error_because_not_equal():
- assert_equal(
- round(np.ppmt(Decimal('0.23') / Decimal('12'), 1, 60, Decimal('10000000000')), 8),
- Decimal('-90238044.232277036'))
-
- assert_raises(AssertionError, raise_error_because_not_equal)
- assert_equal(np.ppmt(Decimal('0.23') / Decimal('12'), 1, 60, Decimal('10000000000')),
- Decimal('-90238044.2322778884413969909'))
-
- @filter_deprecation
- def test_ipmt(self):
- assert_almost_equal(np.round(np.ipmt(0.1 / 12, 1, 24, 2000), 2), -16.67)
-
- @filter_deprecation
- def test_ipmt_decimal(self):
- result = np.ipmt(Decimal('0.1') / Decimal('12'), 1, 24, 2000)
- assert_equal(result.flat[0], Decimal('-16.66666666666666666666666667'))
-
- @filter_deprecation
- def test_nper(self):
- assert_almost_equal(np.nper(0.075, -2000, 0, 100000.),
- 21.54, 2)
-
- @filter_deprecation
- def test_nper2(self):
- assert_almost_equal(np.nper(0.0, -2000, 0, 100000.),
- 50.0, 1)
-
- @filter_deprecation
- def test_npv(self):
- assert_almost_equal(
- np.npv(0.05, [-15000, 1500, 2500, 3500, 4500, 6000]),
- 122.89, 2)
-
- @filter_deprecation
- def test_npv_decimal(self):
- assert_equal(
- np.npv(Decimal('0.05'), [-15000, 1500, 2500, 3500, 4500, 6000]),
- Decimal('122.894854950942692161628715'))
-
- @filter_deprecation
- def test_mirr(self):
- val = [-4500, -800, 800, 800, 600, 600, 800, 800, 700, 3000]
- assert_almost_equal(np.mirr(val, 0.08, 0.055), 0.0666, 4)
-
- val = [-120000, 39000, 30000, 21000, 37000, 46000]
- assert_almost_equal(np.mirr(val, 0.10, 0.12), 0.126094, 6)
-
- val = [100, 200, -50, 300, -200]
- assert_almost_equal(np.mirr(val, 0.05, 0.06), 0.3428, 4)
-
- val = [39000, 30000, 21000, 37000, 46000]
- assert_(np.isnan(np.mirr(val, 0.10, 0.12)))
-
- @filter_deprecation
- def test_mirr_decimal(self):
- val = [Decimal('-4500'), Decimal('-800'), Decimal('800'), Decimal('800'),
- Decimal('600'), Decimal('600'), Decimal('800'), Decimal('800'),
- Decimal('700'), Decimal('3000')]
- assert_equal(np.mirr(val, Decimal('0.08'), Decimal('0.055')),
- Decimal('0.066597175031553548874239618'))
-
- val = [Decimal('-120000'), Decimal('39000'), Decimal('30000'),
- Decimal('21000'), Decimal('37000'), Decimal('46000')]
- assert_equal(np.mirr(val, Decimal('0.10'), Decimal('0.12')), Decimal('0.126094130365905145828421880'))
-
- val = [Decimal('100'), Decimal('200'), Decimal('-50'),
- Decimal('300'), Decimal('-200')]
- assert_equal(np.mirr(val, Decimal('0.05'), Decimal('0.06')), Decimal('0.342823387842176663647819868'))
-
- val = [Decimal('39000'), Decimal('30000'), Decimal('21000'), Decimal('37000'), Decimal('46000')]
- assert_(np.isnan(np.mirr(val, Decimal('0.10'), Decimal('0.12'))))
-
- @filter_deprecation
- def test_when(self):
- # begin
- assert_equal(np.rate(10, 20, -3500, 10000, 1),
- np.rate(10, 20, -3500, 10000, 'begin'))
- # end
- assert_equal(np.rate(10, 20, -3500, 10000),
- np.rate(10, 20, -3500, 10000, 'end'))
- assert_equal(np.rate(10, 20, -3500, 10000, 0),
- np.rate(10, 20, -3500, 10000, 'end'))
-
- # begin
- assert_equal(np.pv(0.07, 20, 12000, 0, 1),
- np.pv(0.07, 20, 12000, 0, 'begin'))
- # end
- assert_equal(np.pv(0.07, 20, 12000, 0),
- np.pv(0.07, 20, 12000, 0, 'end'))
- assert_equal(np.pv(0.07, 20, 12000, 0, 0),
- np.pv(0.07, 20, 12000, 0, 'end'))
-
- # begin
- assert_equal(np.fv(0.075, 20, -2000, 0, 1),
- np.fv(0.075, 20, -2000, 0, 'begin'))
- # end
- assert_equal(np.fv(0.075, 20, -2000, 0),
- np.fv(0.075, 20, -2000, 0, 'end'))
- assert_equal(np.fv(0.075, 20, -2000, 0, 0),
- np.fv(0.075, 20, -2000, 0, 'end'))
-
- # begin
- assert_equal(np.pmt(0.08 / 12, 5 * 12, 15000., 0, 1),
- np.pmt(0.08 / 12, 5 * 12, 15000., 0, 'begin'))
- # end
- assert_equal(np.pmt(0.08 / 12, 5 * 12, 15000., 0),
- np.pmt(0.08 / 12, 5 * 12, 15000., 0, 'end'))
- assert_equal(np.pmt(0.08 / 12, 5 * 12, 15000., 0, 0),
- np.pmt(0.08 / 12, 5 * 12, 15000., 0, 'end'))
-
- # begin
- assert_equal(np.ppmt(0.1 / 12, 1, 60, 55000, 0, 1),
- np.ppmt(0.1 / 12, 1, 60, 55000, 0, 'begin'))
- # end
- assert_equal(np.ppmt(0.1 / 12, 1, 60, 55000, 0),
- np.ppmt(0.1 / 12, 1, 60, 55000, 0, 'end'))
- assert_equal(np.ppmt(0.1 / 12, 1, 60, 55000, 0, 0),
- np.ppmt(0.1 / 12, 1, 60, 55000, 0, 'end'))
-
- # begin
- assert_equal(np.ipmt(0.1 / 12, 1, 24, 2000, 0, 1),
- np.ipmt(0.1 / 12, 1, 24, 2000, 0, 'begin'))
- # end
- assert_equal(np.ipmt(0.1 / 12, 1, 24, 2000, 0),
- np.ipmt(0.1 / 12, 1, 24, 2000, 0, 'end'))
- assert_equal(np.ipmt(0.1 / 12, 1, 24, 2000, 0, 0),
- np.ipmt(0.1 / 12, 1, 24, 2000, 0, 'end'))
-
- # begin
- assert_equal(np.nper(0.075, -2000, 0, 100000., 1),
- np.nper(0.075, -2000, 0, 100000., 'begin'))
- # end
- assert_equal(np.nper(0.075, -2000, 0, 100000.),
- np.nper(0.075, -2000, 0, 100000., 'end'))
- assert_equal(np.nper(0.075, -2000, 0, 100000., 0),
- np.nper(0.075, -2000, 0, 100000., 'end'))
-
- @filter_deprecation
- def test_decimal_with_when(self):
- """Test that decimals are still supported if the when argument is passed"""
- # begin
- assert_equal(np.rate(Decimal('10'), Decimal('20'), Decimal('-3500'), Decimal('10000'), Decimal('1')),
- np.rate(Decimal('10'), Decimal('20'), Decimal('-3500'), Decimal('10000'), 'begin'))
- # end
- assert_equal(np.rate(Decimal('10'), Decimal('20'), Decimal('-3500'), Decimal('10000')),
- np.rate(Decimal('10'), Decimal('20'), Decimal('-3500'), Decimal('10000'), 'end'))
- assert_equal(np.rate(Decimal('10'), Decimal('20'), Decimal('-3500'), Decimal('10000'), Decimal('0')),
- np.rate(Decimal('10'), Decimal('20'), Decimal('-3500'), Decimal('10000'), 'end'))
-
- # begin
- assert_equal(np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0'), Decimal('1')),
- np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0'), 'begin'))
- # end
- assert_equal(np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0')),
- np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0'), 'end'))
- assert_equal(np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0'), Decimal('0')),
- np.pv(Decimal('0.07'), Decimal('20'), Decimal('12000'), Decimal('0'), 'end'))
-
- # begin
- assert_equal(np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), Decimal('0'), Decimal('1')),
- np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), Decimal('0'), 'begin'))
- # end
- assert_equal(np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), Decimal('0')),
- np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), Decimal('0'), 'end'))
- assert_equal(np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), Decimal('0'), Decimal('0')),
- np.fv(Decimal('0.075'), Decimal('20'), Decimal('-2000'), Decimal('0'), 'end'))
-
- # begin
- assert_equal(np.pmt(Decimal('0.08') / Decimal('12'), Decimal('5') * Decimal('12'), Decimal('15000.'),
- Decimal('0'), Decimal('1')),
- np.pmt(Decimal('0.08') / Decimal('12'), Decimal('5') * Decimal('12'), Decimal('15000.'),
- Decimal('0'), 'begin'))
- # end
- assert_equal(np.pmt(Decimal('0.08') / Decimal('12'), Decimal('5') * Decimal('12'), Decimal('15000.'),
- Decimal('0')),
- np.pmt(Decimal('0.08') / Decimal('12'), Decimal('5') * Decimal('12'), Decimal('15000.'),
- Decimal('0'), 'end'))
- assert_equal(np.pmt(Decimal('0.08') / Decimal('12'), Decimal('5') * Decimal('12'), Decimal('15000.'),
- Decimal('0'), Decimal('0')),
- np.pmt(Decimal('0.08') / Decimal('12'), Decimal('5') * Decimal('12'), Decimal('15000.'),
- Decimal('0'), 'end'))
-
- # begin
- assert_equal(np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000'),
- Decimal('0'), Decimal('1')),
- np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000'),
- Decimal('0'), 'begin'))
- # end
- assert_equal(np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000'),
- Decimal('0')),
- np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000'),
- Decimal('0'), 'end'))
- assert_equal(np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000'),
- Decimal('0'), Decimal('0')),
- np.ppmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('60'), Decimal('55000'),
- Decimal('0'), 'end'))
-
- # begin
- assert_equal(np.ipmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('24'), Decimal('2000'),
- Decimal('0'), Decimal('1')).flat[0],
- np.ipmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('24'), Decimal('2000'),
- Decimal('0'), 'begin').flat[0])
- # end
- assert_equal(np.ipmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('24'), Decimal('2000'),
- Decimal('0')).flat[0],
- np.ipmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('24'), Decimal('2000'),
- Decimal('0'), 'end').flat[0])
- assert_equal(np.ipmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('24'), Decimal('2000'),
- Decimal('0'), Decimal('0')).flat[0],
- np.ipmt(Decimal('0.1') / Decimal('12'), Decimal('1'), Decimal('24'), Decimal('2000'),
- Decimal('0'), 'end').flat[0])
-
- @filter_deprecation
- def test_broadcast(self):
- assert_almost_equal(np.nper(0.075, -2000, 0, 100000., [0, 1]),
- [21.5449442, 20.76156441], 4)
-
- assert_almost_equal(np.ipmt(0.1 / 12, list(range(5)), 24, 2000),
- [-17.29165168, -16.66666667, -16.03647345,
- -15.40102862, -14.76028842], 4)
-
- assert_almost_equal(np.ppmt(0.1 / 12, list(range(5)), 24, 2000),
- [-74.998201, -75.62318601, -76.25337923,
- -76.88882405, -77.52956425], 4)
-
- assert_almost_equal(np.ppmt(0.1 / 12, list(range(5)), 24, 2000, 0,
- [0, 0, 1, 'end', 'begin']),
- [-74.998201, -75.62318601, -75.62318601,
- -76.88882405, -76.88882405], 4)
-
- @filter_deprecation
- def test_broadcast_decimal(self):
- # Use almost equal because precision is tested in the explicit tests, this test is to ensure
- # broadcast with Decimal is not broken.
- assert_almost_equal(np.ipmt(Decimal('0.1') / Decimal('12'), list(range(5)), Decimal('24'), Decimal('2000')),
- [Decimal('-17.29165168'), Decimal('-16.66666667'), Decimal('-16.03647345'),
- Decimal('-15.40102862'), Decimal('-14.76028842')], 4)
-
- assert_almost_equal(np.ppmt(Decimal('0.1') / Decimal('12'), list(range(5)), Decimal('24'), Decimal('2000')),
- [Decimal('-74.998201'), Decimal('-75.62318601'), Decimal('-76.25337923'),
- Decimal('-76.88882405'), Decimal('-77.52956425')], 4)
-
- assert_almost_equal(np.ppmt(Decimal('0.1') / Decimal('12'), list(range(5)), Decimal('24'), Decimal('2000'),
- Decimal('0'), [Decimal('0'), Decimal('0'), Decimal('1'), 'end', 'begin']),
- [Decimal('-74.998201'), Decimal('-75.62318601'), Decimal('-75.62318601'),
- Decimal('-76.88882405'), Decimal('-76.88882405')], 4)
diff --git a/numpy/lib/tests/test_financial_expired.py b/numpy/lib/tests/test_financial_expired.py
new file mode 100644
index 000000000..70b0cd790
--- /dev/null
+++ b/numpy/lib/tests/test_financial_expired.py
@@ -0,0 +1,13 @@
+import sys
+import pytest
+import numpy as np
+
+
+@pytest.mark.skipif(sys.version_info[:2] < (3, 7),
+ reason="requires python 3.7 or higher")
+def test_financial_expired():
+ match = 'NEP 32'
+ with pytest.warns(DeprecationWarning, match=match):
+ func = np.fv
+ with pytest.raises(RuntimeError, match=match):
+ func(1, 2, 3)
diff --git a/numpy/lib/tests/test_format.py b/numpy/lib/tests/test_format.py
index 2dbaeb8cb..bac42fad3 100644
--- a/numpy/lib/tests/test_format.py
+++ b/numpy/lib/tests/test_format.py
@@ -285,28 +285,11 @@ from io import BytesIO
import numpy as np
from numpy.testing import (
assert_, assert_array_equal, assert_raises, assert_raises_regex,
- assert_warns
+ assert_warns,
)
from numpy.lib import format
-tempdir = None
-
-# Module-level setup.
-
-
-def setup_module():
- global tempdir
- tempdir = tempfile.mkdtemp()
-
-
-def teardown_module():
- global tempdir
- if tempdir is not None and os.path.isdir(tempdir):
- shutil.rmtree(tempdir)
- tempdir = None
-
-
# Generate some basic arrays to test with.
scalars = [
np.uint8,
@@ -477,49 +460,42 @@ def test_long_str():
assert_array_equal(long_str_arr, long_str_arr2)
-@pytest.mark.slow
-def test_memmap_roundtrip():
- # Fixme: used to crash on windows
- if not (sys.platform == 'win32' or sys.platform == 'cygwin'):
- for arr in basic_arrays + record_arrays:
- if arr.dtype.hasobject:
- # Skip these since they can't be mmap'ed.
- continue
- # Write it out normally and through mmap.
- nfn = os.path.join(tempdir, 'normal.npy')
- mfn = os.path.join(tempdir, 'memmap.npy')
- fp = open(nfn, 'wb')
- try:
- format.write_array(fp, arr)
- finally:
- fp.close()
-
- fortran_order = (
- arr.flags.f_contiguous and not arr.flags.c_contiguous)
- ma = format.open_memmap(mfn, mode='w+', dtype=arr.dtype,
- shape=arr.shape, fortran_order=fortran_order)
- ma[...] = arr
- del ma
-
- # Check that both of these files' contents are the same.
- fp = open(nfn, 'rb')
+def test_memmap_roundtrip(tmpdir):
+ for i, arr in enumerate(basic_arrays + record_arrays):
+ if arr.dtype.hasobject:
+ # Skip these since they can't be mmap'ed.
+ continue
+ # Write it out normally and through mmap.
+ nfn = os.path.join(tmpdir, f'normal{i}.npy')
+ mfn = os.path.join(tmpdir, f'memmap{i}.npy')
+ with open(nfn, 'wb') as fp:
+ format.write_array(fp, arr)
+
+ fortran_order = (
+ arr.flags.f_contiguous and not arr.flags.c_contiguous)
+ ma = format.open_memmap(mfn, mode='w+', dtype=arr.dtype,
+ shape=arr.shape, fortran_order=fortran_order)
+ ma[...] = arr
+ ma.flush()
+
+ # Check that both of these files' contents are the same.
+ with open(nfn, 'rb') as fp:
normal_bytes = fp.read()
- fp.close()
- fp = open(mfn, 'rb')
+ with open(mfn, 'rb') as fp:
memmap_bytes = fp.read()
- fp.close()
- assert_equal_(normal_bytes, memmap_bytes)
+ assert_equal_(normal_bytes, memmap_bytes)
- # Check that reading the file using memmap works.
- ma = format.open_memmap(nfn, mode='r')
- del ma
+ # Check that reading the file using memmap works.
+ ma = format.open_memmap(nfn, mode='r')
+ ma.flush()
-def test_compressed_roundtrip():
+def test_compressed_roundtrip(tmpdir):
arr = np.random.rand(200, 200)
- npz_file = os.path.join(tempdir, 'compressed.npz')
+ npz_file = os.path.join(tmpdir, 'compressed.npz')
np.savez_compressed(npz_file, arr=arr)
- arr1 = np.load(npz_file)['arr']
+ with np.load(npz_file) as npz:
+ arr1 = npz['arr']
assert_array_equal(arr, arr1)
@@ -539,13 +515,14 @@ dt5 = np.dtype({'names': ['a', 'b'], 'formats': ['i4', 'i4'],
dt6 = np.dtype({'names': [], 'formats': [], 'itemsize': 8})
@pytest.mark.parametrize("dt", [dt1, dt2, dt3, dt4, dt5, dt6])
-def test_load_padded_dtype(dt):
+def test_load_padded_dtype(tmpdir, dt):
arr = np.zeros(3, dt)
for i in range(3):
arr[i] = i + 5
- npz_file = os.path.join(tempdir, 'aligned.npz')
+ npz_file = os.path.join(tmpdir, 'aligned.npz')
np.savez(npz_file, arr=arr)
- arr1 = np.load(npz_file)['arr']
+ with np.load(npz_file) as npz:
+ arr1 = npz['arr']
assert_array_equal(arr, arr1)
@@ -602,7 +579,7 @@ def test_pickle_python2_python3():
encoding='latin1')
-def test_pickle_disallow():
+def test_pickle_disallow(tmpdir):
data_dir = os.path.join(os.path.dirname(__file__), 'data')
path = os.path.join(data_dir, 'py2-objarr.npy')
@@ -610,10 +587,10 @@ def test_pickle_disallow():
allow_pickle=False, encoding='latin1')
path = os.path.join(data_dir, 'py2-objarr.npz')
- f = np.load(path, allow_pickle=False, encoding='latin1')
- assert_raises(ValueError, f.__getitem__, 'x')
+ with np.load(path, allow_pickle=False, encoding='latin1') as f:
+ assert_raises(ValueError, f.__getitem__, 'x')
- path = os.path.join(tempdir, 'pickle-disabled.npy')
+ path = os.path.join(tmpdir, 'pickle-disabled.npy')
assert_raises(ValueError, np.save, path, np.array([None], dtype=object),
allow_pickle=False)
@@ -698,31 +675,33 @@ def test_version_2_0():
assert_raises(ValueError, format.write_array, f, d, (1, 0))
-@pytest.mark.slow
-def test_version_2_0_memmap():
+def test_version_2_0_memmap(tmpdir):
# requires more than 2 byte for header
dt = [(("%d" % i) * 100, float) for i in range(500)]
d = np.ones(1000, dtype=dt)
- tf = tempfile.mktemp('', 'mmap', dir=tempdir)
+ tf1 = os.path.join(tmpdir, f'version2_01.npy')
+ tf2 = os.path.join(tmpdir, f'version2_02.npy')
# 1.0 requested but data cannot be saved this way
- assert_raises(ValueError, format.open_memmap, tf, mode='w+', dtype=d.dtype,
+ assert_raises(ValueError, format.open_memmap, tf1, mode='w+', dtype=d.dtype,
shape=d.shape, version=(1, 0))
- ma = format.open_memmap(tf, mode='w+', dtype=d.dtype,
+ ma = format.open_memmap(tf1, mode='w+', dtype=d.dtype,
shape=d.shape, version=(2, 0))
ma[...] = d
- del ma
+ ma.flush()
+ ma = format.open_memmap(tf1, mode='r')
+ assert_array_equal(ma, d)
with warnings.catch_warnings(record=True) as w:
warnings.filterwarnings('always', '', UserWarning)
- ma = format.open_memmap(tf, mode='w+', dtype=d.dtype,
+ ma = format.open_memmap(tf2, mode='w+', dtype=d.dtype,
shape=d.shape, version=None)
assert_(w[0].category is UserWarning)
ma[...] = d
- del ma
+ ma.flush()
- ma = format.open_memmap(tf, mode='r')
+ ma = format.open_memmap(tf2, mode='r')
assert_array_equal(ma, d)
@@ -874,11 +853,11 @@ def test_bad_header():
assert_raises(ValueError, format.read_array_header_1_0, s)
-def test_large_file_support():
+def test_large_file_support(tmpdir):
if (sys.platform == 'win32' or sys.platform == 'cygwin'):
pytest.skip("Unknown if Windows has sparse filesystems")
# try creating a large sparse file
- tf_name = os.path.join(tempdir, 'sparse_file')
+ tf_name = os.path.join(tmpdir, 'sparse_file')
try:
# seek past end would work too, but linux truncate somewhat
# increases the chances that we have a sparse filesystem and can
@@ -902,7 +881,7 @@ def test_large_file_support():
@pytest.mark.skipif(np.dtype(np.intp).itemsize < 8,
reason="test requires 64-bit system")
@pytest.mark.slow
-def test_large_archive():
+def test_large_archive(tmpdir):
# Regression test for product of saving arrays with dimensions of array
# having a product that doesn't fit in int32. See gh-7598 for details.
try:
@@ -910,7 +889,7 @@ def test_large_archive():
except MemoryError:
pytest.skip("Could not create large file")
- fname = os.path.join(tempdir, "large_archive")
+ fname = os.path.join(tmpdir, "large_archive")
with open(fname, "wb") as f:
np.savez(f, arr=a)
@@ -921,14 +900,15 @@ def test_large_archive():
assert_(a.shape == new_a.shape)
-def test_empty_npz():
+def test_empty_npz(tmpdir):
# Test for gh-9989
- fname = os.path.join(tempdir, "nothing.npz")
+ fname = os.path.join(tmpdir, "nothing.npz")
np.savez(fname)
- np.load(fname)
+ with np.load(fname) as nps:
+ pass
-def test_unicode_field_names():
+def test_unicode_field_names(tmpdir):
# gh-7391
arr = np.array([
(1, 3),
@@ -939,7 +919,7 @@ def test_unicode_field_names():
('int', int),
(u'\N{CJK UNIFIED IDEOGRAPH-6574}\N{CJK UNIFIED IDEOGRAPH-5F62}', int)
])
- fname = os.path.join(tempdir, "unicode.npy")
+ fname = os.path.join(tmpdir, "unicode.npy")
with open(fname, 'wb') as f:
format.write_array(f, arr, version=(3, 0))
with open(fname, 'rb') as f:
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index eb2fc3311..4c7c0480c 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -1166,26 +1166,68 @@ class TestAngle:
class TestTrimZeros:
- """
- Only testing for integer splits.
+ a = np.array([0, 0, 1, 0, 2, 3, 4, 0])
+ b = a.astype(float)
+ c = a.astype(complex)
+ d = a.astype(object)
- """
+ def values(self):
+ attr_names = ('a', 'b', 'c', 'd')
+ return (getattr(self, name) for name in attr_names)
def test_basic(self):
- a = np.array([0, 0, 1, 2, 3, 4, 0])
- res = trim_zeros(a)
- assert_array_equal(res, np.array([1, 2, 3, 4]))
+ slc = np.s_[2:-1]
+ for arr in self.values():
+ res = trim_zeros(arr)
+ assert_array_equal(res, arr[slc])
def test_leading_skip(self):
- a = np.array([0, 0, 1, 0, 2, 3, 4, 0])
- res = trim_zeros(a)
- assert_array_equal(res, np.array([1, 0, 2, 3, 4]))
+ slc = np.s_[:-1]
+ for arr in self.values():
+ res = trim_zeros(arr, trim='b')
+ assert_array_equal(res, arr[slc])
def test_trailing_skip(self):
- a = np.array([0, 0, 1, 0, 2, 3, 0, 4, 0])
- res = trim_zeros(a)
- assert_array_equal(res, np.array([1, 0, 2, 3, 0, 4]))
+ slc = np.s_[2:]
+ for arr in self.values():
+ res = trim_zeros(arr, trim='F')
+ assert_array_equal(res, arr[slc])
+
+ def test_all_zero(self):
+ for _arr in self.values():
+ arr = np.zeros_like(_arr, dtype=_arr.dtype)
+
+ res1 = trim_zeros(arr, trim='B')
+ assert len(res1) == 0
+
+ res2 = trim_zeros(arr, trim='f')
+ assert len(res2) == 0
+
+ def test_size_zero(self):
+ arr = np.zeros(0)
+ res = trim_zeros(arr)
+ assert_array_equal(arr, res)
+
+ @pytest.mark.parametrize(
+ 'arr',
+ [np.array([0, 2**62, 0]),
+ np.array([0, 2**63, 0]),
+ np.array([0, 2**64, 0])]
+ )
+ def test_overflow(self, arr):
+ slc = np.s_[1:2]
+ res = trim_zeros(arr)
+ assert_array_equal(res, arr[slc])
+ def test_no_trim(self):
+ arr = np.array([None, 1, None])
+ res = trim_zeros(arr)
+ assert_array_equal(arr, res)
+
+
+ def test_list_to_list(self):
+ res = trim_zeros(self.a.tolist())
+ assert isinstance(res, list)
class TestExtins:
@@ -1763,28 +1805,28 @@ class TestFilterwindows:
def test_hanning(self):
# check symmetry
w = hanning(10)
- assert_array_almost_equal(w, flipud(w), 7)
+ assert_equal(w, flipud(w))
# check known value
assert_almost_equal(np.sum(w, axis=0), 4.500, 4)
def test_hamming(self):
# check symmetry
w = hamming(10)
- assert_array_almost_equal(w, flipud(w), 7)
+ assert_equal(w, flipud(w))
# check known value
assert_almost_equal(np.sum(w, axis=0), 4.9400, 4)
def test_bartlett(self):
# check symmetry
w = bartlett(10)
- assert_array_almost_equal(w, flipud(w), 7)
+ assert_equal(w, flipud(w))
# check known value
assert_almost_equal(np.sum(w, axis=0), 4.4444, 4)
def test_blackman(self):
# check symmetry
w = blackman(10)
- assert_array_almost_equal(w, flipud(w), 7)
+ assert_equal(w, flipud(w))
# check known value
assert_almost_equal(np.sum(w, axis=0), 3.7800, 4)
@@ -1981,6 +2023,12 @@ class TestCorrCoef:
assert_array_almost_equal(c, np.array([[1., -1.], [-1., 1.]]))
assert_(np.all(np.abs(c) <= 1.0))
+ @pytest.mark.parametrize("test_type", [np.half, np.single, np.double, np.longdouble])
+ def test_corrcoef_dtype(self, test_type):
+ cast_A = self.A.astype(test_type)
+ res = corrcoef(cast_A, dtype=test_type)
+ assert test_type == res.dtype
+
class TestCov:
x1 = np.array([[0, 2], [1, 1], [2, 0]]).T
@@ -2081,6 +2129,12 @@ class TestCov:
aweights=self.unit_weights),
self.res1)
+ @pytest.mark.parametrize("test_type", [np.half, np.single, np.double, np.longdouble])
+ def test_cov_dtype(self, test_type):
+ cast_x1 = self.x1.astype(test_type)
+ res = cov(cast_x1, dtype=test_type)
+ assert test_type == res.dtype
+
class Test_I0:
@@ -2089,8 +2143,9 @@ class Test_I0:
i0(0.5),
np.array(1.0634833707413234))
- A = np.array([0.49842636, 0.6969809, 0.22011976, 0.0155549])
- expected = np.array([1.06307822, 1.12518299, 1.01214991, 1.00006049])
+ # need at least one test above 8, as the implementation is piecewise
+ A = np.array([0.49842636, 0.6969809, 0.22011976, 0.0155549, 10.0])
+ expected = np.array([1.06307822, 1.12518299, 1.01214991, 1.00006049, 2815.71662847])
assert_almost_equal(i0(A), expected)
assert_almost_equal(i0(-A), expected)
@@ -2127,6 +2182,10 @@ class Test_I0:
assert_array_equal(exp, res)
+ def test_complex(self):
+ a = np.array([0, 1 + 2j])
+ with pytest.raises(TypeError, match="i0 not supported for complex values"):
+ res = i0(a)
class TestKaiser:
diff --git a/numpy/lib/tests/test_index_tricks.py b/numpy/lib/tests/test_index_tricks.py
index 905165a99..c21aefd1a 100644
--- a/numpy/lib/tests/test_index_tricks.py
+++ b/numpy/lib/tests/test_index_tricks.py
@@ -16,23 +16,13 @@ class TestRavelUnravelIndex:
def test_basic(self):
assert_equal(np.unravel_index(2, (2, 2)), (1, 0))
- # test backwards compatibility with older dims
- # keyword argument; see Issue #10586
- with assert_warns(DeprecationWarning):
- # we should achieve the correct result
- # AND raise the appropriate warning
- # when using older "dims" kw argument
- assert_equal(np.unravel_index(indices=2,
- dims=(2, 2)),
- (1, 0))
-
# test that new shape argument works properly
assert_equal(np.unravel_index(indices=2,
shape=(2, 2)),
(1, 0))
# test that an invalid second keyword argument
- # is properly handled
+ # is properly handled, including the old name `dims`.
with assert_raises(TypeError):
np.unravel_index(indices=2, hape=(2, 2))
@@ -42,6 +32,9 @@ class TestRavelUnravelIndex:
with assert_raises(TypeError):
np.unravel_index(254, ims=(17, 94))
+ with assert_raises(TypeError):
+ np.unravel_index(254, dims=(17, 94))
+
assert_equal(np.ravel_multi_index((1, 0), (2, 2)), 2)
assert_equal(np.unravel_index(254, (17, 94)), (2, 66))
assert_equal(np.ravel_multi_index((2, 66), (17, 94)), 254)
@@ -249,6 +242,29 @@ class TestGrid:
assert_equal(grid.size, expected[0])
assert_equal(grid_small.size, expected[1])
+ def test_accepts_npfloating(self):
+ # regression test for #16466
+ grid64 = mgrid[0.1:0.33:0.1, ]
+ grid32 = mgrid[np.float32(0.1):np.float32(0.33):np.float32(0.1), ]
+ assert_(grid32.dtype == np.float64)
+ assert_array_almost_equal(grid64, grid32)
+
+ # different code path for single slice
+ grid64 = mgrid[0.1:0.33:0.1]
+ grid32 = mgrid[np.float32(0.1):np.float32(0.33):np.float32(0.1)]
+ assert_(grid32.dtype == np.float64)
+ assert_array_almost_equal(grid64, grid32)
+
+ def test_accepts_npcomplexfloating(self):
+ # Related to #16466
+ assert_array_almost_equal(
+ mgrid[0.1:0.3:3j, ], mgrid[0.1:0.3:np.complex64(3j), ]
+ )
+
+ # different code path for single slice
+ assert_array_almost_equal(
+ mgrid[0.1:0.3:3j], mgrid[0.1:0.3:np.complex64(3j)]
+ )
class TestConcatenator:
def test_1d(self):
@@ -270,6 +286,10 @@ class TestConcatenator:
g = r_[0:36:100j]
assert_(g.shape == (100,))
+ # Related to #16466
+ g = r_[0:36:np.complex64(100j)]
+ assert_(g.shape == (100,))
+
def test_2d(self):
b = np.random.rand(5, 5)
c = np.random.rand(5, 5)
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index 664bfe6e5..aa4499764 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -13,17 +13,19 @@ from tempfile import NamedTemporaryFile
from io import BytesIO, StringIO
from datetime import datetime
import locale
-from multiprocessing import Process
+from multiprocessing import Process, Value
+from ctypes import c_bool
import numpy as np
import numpy.ma as ma
from numpy.lib._iotools import ConverterError, ConversionWarning
-from numpy.compat import asbytes, bytes
+from numpy.compat import asbytes
from numpy.ma.testutils import assert_equal
from numpy.testing import (
assert_warns, assert_, assert_raises_regex, assert_raises,
assert_allclose, assert_array_equal, temppath, tempdir, IS_PYPY,
- HAS_REFCOUNT, suppress_warnings, assert_no_gc_cycles, assert_no_warnings
+ HAS_REFCOUNT, suppress_warnings, assert_no_gc_cycles, assert_no_warnings,
+ break_cycles
)
from numpy.testing._private.utils import requires_memory
@@ -574,16 +576,29 @@ class TestSaveTxt:
@pytest.mark.slow
@requires_memory(free_bytes=7e9)
def test_large_zip(self):
- def check_large_zip():
- # The test takes at least 6GB of memory, writes a file larger than 4GB
- test_data = np.asarray([np.random.rand(np.random.randint(50,100),4)
- for i in range(800000)], dtype=object)
- with tempdir() as tmpdir:
- np.savez(os.path.join(tmpdir, 'test.npz'), test_data=test_data)
+ def check_large_zip(memoryerror_raised):
+ memoryerror_raised.value = False
+ try:
+ # The test takes at least 6GB of memory, writes a file larger
+ # than 4GB
+ test_data = np.asarray([np.random.rand(
+ np.random.randint(50,100),4)
+ for i in range(800000)], dtype=object)
+ with tempdir() as tmpdir:
+ np.savez(os.path.join(tmpdir, 'test.npz'),
+ test_data=test_data)
+ except MemoryError:
+ memoryerror_raised.value = True
+ raise
# run in a subprocess to ensure memory is released on PyPy, see gh-15775
- p = Process(target=check_large_zip)
+ # Use an object in shared memory to re-raise the MemoryError exception
+ # in our process if needed, see gh-16889
+ memoryerror_raised = Value(c_bool)
+ p = Process(target=check_large_zip, args=(memoryerror_raised,))
p.start()
p.join()
+ if memoryerror_raised.value:
+ raise MemoryError("Child process raised a MemoryError exception")
assert p.exitcode == 0
class LoadTxtBase:
@@ -1011,7 +1026,7 @@ class TestLoadTxt(LoadTxtBase):
a = np.array([b'start ', b' ', b''])
assert_array_equal(x['comment'], a)
- def test_structure_unpack(self):
+ def test_unpack_structured(self):
txt = TextIO("M 21 72\nF 35 58")
dt = {'names': ('a', 'b', 'c'), 'formats': ('|S1', '<i4', '<f4')}
a, b, c = np.loadtxt(txt, dtype=dt, unpack=True)
@@ -2343,6 +2358,51 @@ M 33 21.99
assert_equal(test['f1'], 17179869184)
assert_equal(test['f2'], 1024)
+ def test_unpack_structured(self):
+ # Regression test for gh-4341
+ # Unpacking should work on structured arrays
+ txt = TextIO("M 21 72\nF 35 58")
+ dt = {'names': ('a', 'b', 'c'), 'formats': ('S1', 'i4', 'f4')}
+ a, b, c = np.genfromtxt(txt, dtype=dt, unpack=True)
+ assert_equal(a.dtype, np.dtype('S1'))
+ assert_equal(b.dtype, np.dtype('i4'))
+ assert_equal(c.dtype, np.dtype('f4'))
+ assert_array_equal(a, np.array([b'M', b'F']))
+ assert_array_equal(b, np.array([21, 35]))
+ assert_array_equal(c, np.array([72., 58.]))
+
+ def test_unpack_auto_dtype(self):
+ # Regression test for gh-4341
+ # Unpacking should work when dtype=None
+ txt = TextIO("M 21 72.\nF 35 58.")
+ expected = (np.array(["M", "F"]), np.array([21, 35]), np.array([72., 58.]))
+ test = np.genfromtxt(txt, dtype=None, unpack=True, encoding="utf-8")
+ for arr, result in zip(expected, test):
+ assert_array_equal(arr, result)
+ assert_equal(arr.dtype, result.dtype)
+
+ def test_unpack_single_name(self):
+ # Regression test for gh-4341
+ # Unpacking should work when structured dtype has only one field
+ txt = TextIO("21\n35")
+ dt = {'names': ('a',), 'formats': ('i4',)}
+ expected = np.array([21, 35], dtype=np.int32)
+ test = np.genfromtxt(txt, dtype=dt, unpack=True)
+ assert_array_equal(expected, test)
+ assert_equal(expected.dtype, test.dtype)
+
+ def test_squeeze_scalar(self):
+ # Regression test for gh-4341
+ # Unpacking a scalar should give zero-dim output,
+ # even if dtype is structured
+ txt = TextIO("1")
+ dt = {'names': ('a',), 'formats': ('i4',)}
+ expected = np.array((1,), dtype=np.int32)
+ test = np.genfromtxt(txt, dtype=dt, unpack=True)
+ assert_array_equal(expected, test)
+ assert_equal((), test.shape)
+ assert_equal(expected.dtype, test.dtype)
+
class TestPathUsage:
# Test that pathlib.Path can be used
@@ -2373,6 +2433,9 @@ class TestPathUsage:
assert_array_equal(data, a)
# close the mem-mapped file
del data
+ if IS_PYPY:
+ break_cycles()
+ break_cycles()
def test_save_load_memmap_readwrite(self):
# Test that pathlib.Path instances can be written mem-mapped.
@@ -2384,6 +2447,9 @@ class TestPathUsage:
a[0][0] = 5
b[0][0] = 5
del b # closes the file
+ if IS_PYPY:
+ break_cycles()
+ break_cycles()
data = np.load(path)
assert_array_equal(data, a)
diff --git a/numpy/lib/tests/test_polynomial.py b/numpy/lib/tests/test_polynomial.py
index cd0b90dc4..6c3e4fa02 100644
--- a/numpy/lib/tests/test_polynomial.py
+++ b/numpy/lib/tests/test_polynomial.py
@@ -227,6 +227,20 @@ class TestPolynomial:
v = np.arange(1, 21)
assert_almost_equal(np.poly(v), np.poly(np.diag(v)))
+ def test_zero_poly_dtype(self):
+ """
+ Regression test for gh-16354.
+ """
+ z = np.array([0, 0, 0])
+ p = np.poly1d(z.astype(np.int64))
+ assert_equal(p.coeffs.dtype, np.int64)
+
+ p = np.poly1d(z.astype(np.float32))
+ assert_equal(p.coeffs.dtype, np.float32)
+
+ p = np.poly1d(z.astype(np.complex64))
+ assert_equal(p.coeffs.dtype, np.complex64)
+
def test_poly_eq(self):
p = np.poly1d([1, 2, 3])
p2 = np.poly1d([1, 2, 4])
@@ -243,6 +257,15 @@ class TestPolynomial:
assert_equal(q.coeffs.dtype, np.complex128)
assert_equal(r.coeffs.dtype, np.complex128)
assert_equal(q*a + r, b)
+
+ c = [1, 2, 3]
+ d = np.poly1d([1, 2, 3])
+ s, t = np.polydiv(c, d)
+ assert isinstance(s, np.poly1d)
+ assert isinstance(t, np.poly1d)
+ u, v = np.polydiv(d, c)
+ assert isinstance(u, np.poly1d)
+ assert isinstance(v, np.poly1d)
def test_poly_coeffs_mutable(self):
""" Coefficients should be modifiable """
diff --git a/numpy/lib/tests/test_stride_tricks.py b/numpy/lib/tests/test_stride_tricks.py
index 9d95eb9d0..efec5d24d 100644
--- a/numpy/lib/tests/test_stride_tricks.py
+++ b/numpy/lib/tests/test_stride_tricks.py
@@ -5,8 +5,11 @@ from numpy.testing import (
assert_raises_regex, assert_warns,
)
from numpy.lib.stride_tricks import (
- as_strided, broadcast_arrays, _broadcast_shape, broadcast_to
+ as_strided, broadcast_arrays, _broadcast_shape, broadcast_to,
+ broadcast_shapes, sliding_window_view,
)
+import pytest
+
def assert_shapes_correct(input_shapes, expected_shape):
# Broadcast a list of arrays with the given input shapes and check the
@@ -274,7 +277,9 @@ def test_broadcast_to_raises():
def test_broadcast_shape():
- # broadcast_shape is already exercized indirectly by broadcast_arrays
+ # tests internal _broadcast_shape
+ # _broadcast_shape is already exercised indirectly by broadcast_arrays
+ # _broadcast_shape is also exercised by the public broadcast_shapes function
assert_equal(_broadcast_shape(), ())
assert_equal(_broadcast_shape([1, 2]), (2,))
assert_equal(_broadcast_shape(np.ones((1, 1))), (1, 1))
@@ -288,6 +293,64 @@ def test_broadcast_shape():
assert_raises(ValueError, lambda: _broadcast_shape(*bad_args))
+def test_broadcast_shapes_succeeds():
+ # tests public broadcast_shapes
+ data = [
+ [[], ()],
+ [[()], ()],
+ [[(7,)], (7,)],
+ [[(1, 2), (2,)], (1, 2)],
+ [[(1, 1)], (1, 1)],
+ [[(1, 1), (3, 4)], (3, 4)],
+ [[(6, 7), (5, 6, 1), (7,), (5, 1, 7)], (5, 6, 7)],
+ [[(5, 6, 1)], (5, 6, 1)],
+ [[(1, 3), (3, 1)], (3, 3)],
+ [[(1, 0), (0, 0)], (0, 0)],
+ [[(0, 1), (0, 0)], (0, 0)],
+ [[(1, 0), (0, 1)], (0, 0)],
+ [[(1, 1), (0, 0)], (0, 0)],
+ [[(1, 1), (1, 0)], (1, 0)],
+ [[(1, 1), (0, 1)], (0, 1)],
+ [[(), (0,)], (0,)],
+ [[(0,), (0, 0)], (0, 0)],
+ [[(0,), (0, 1)], (0, 0)],
+ [[(1,), (0, 0)], (0, 0)],
+ [[(), (0, 0)], (0, 0)],
+ [[(1, 1), (0,)], (1, 0)],
+ [[(1,), (0, 1)], (0, 1)],
+ [[(1,), (1, 0)], (1, 0)],
+ [[(), (1, 0)], (1, 0)],
+ [[(), (0, 1)], (0, 1)],
+ [[(1,), (3,)], (3,)],
+ [[2, (3, 2)], (3, 2)],
+ ]
+ for input_shapes, target_shape in data:
+ assert_equal(broadcast_shapes(*input_shapes), target_shape)
+
+ assert_equal(broadcast_shapes(*([(1, 2)] * 32)), (1, 2))
+ assert_equal(broadcast_shapes(*([(1, 2)] * 100)), (1, 2))
+
+ # regression tests for gh-5862
+ assert_equal(broadcast_shapes(*([(2,)] * 32)), (2,))
+
+
+def test_broadcast_shapes_raises():
+ # tests public broadcast_shapes
+ data = [
+ [(3,), (4,)],
+ [(2, 3), (2,)],
+ [(3,), (3,), (4,)],
+ [(1, 3, 4), (2, 3, 3)],
+ [(1, 2), (3, 1), (3, 2), (10, 5)],
+ [2, (2, 3)],
+ ]
+ for input_shapes in data:
+ assert_raises(ValueError, lambda: broadcast_shapes(*input_shapes))
+
+ bad_args = [(2,)] * 32 + [(3,)] * 32
+ assert_raises(ValueError, lambda: broadcast_shapes(*bad_args))
+
+
def test_as_strided():
a = np.array([None])
a_view = as_strided(a)
@@ -333,6 +396,109 @@ def test_as_strided():
assert_equal(a.dtype, a_view.dtype)
assert_array_equal([r] * 3, a_view)
+
+class TestSlidingWindowView:
+ def test_1d(self):
+ arr = np.arange(5)
+ arr_view = sliding_window_view(arr, 2)
+ expected = np.array([[0, 1],
+ [1, 2],
+ [2, 3],
+ [3, 4]])
+ assert_array_equal(arr_view, expected)
+
+ def test_2d(self):
+ i, j = np.ogrid[:3, :4]
+ arr = 10*i + j
+ shape = (2, 2)
+ arr_view = sliding_window_view(arr, shape)
+ expected = np.array([[[[0, 1], [10, 11]],
+ [[1, 2], [11, 12]],
+ [[2, 3], [12, 13]]],
+ [[[10, 11], [20, 21]],
+ [[11, 12], [21, 22]],
+ [[12, 13], [22, 23]]]])
+ assert_array_equal(arr_view, expected)
+
+ def test_2d_with_axis(self):
+ i, j = np.ogrid[:3, :4]
+ arr = 10*i + j
+ arr_view = sliding_window_view(arr, 3, 0)
+ expected = np.array([[[0, 10, 20],
+ [1, 11, 21],
+ [2, 12, 22],
+ [3, 13, 23]]])
+ assert_array_equal(arr_view, expected)
+
+ def test_2d_repeated_axis(self):
+ i, j = np.ogrid[:3, :4]
+ arr = 10*i + j
+ arr_view = sliding_window_view(arr, (2, 3), (1, 1))
+ expected = np.array([[[[0, 1, 2],
+ [1, 2, 3]]],
+ [[[10, 11, 12],
+ [11, 12, 13]]],
+ [[[20, 21, 22],
+ [21, 22, 23]]]])
+ assert_array_equal(arr_view, expected)
+
+ def test_2d_without_axis(self):
+ i, j = np.ogrid[:4, :4]
+ arr = 10*i + j
+ shape = (2, 3)
+ arr_view = sliding_window_view(arr, shape)
+ expected = np.array([[[[0, 1, 2], [10, 11, 12]],
+ [[1, 2, 3], [11, 12, 13]]],
+ [[[10, 11, 12], [20, 21, 22]],
+ [[11, 12, 13], [21, 22, 23]]],
+ [[[20, 21, 22], [30, 31, 32]],
+ [[21, 22, 23], [31, 32, 33]]]])
+ assert_array_equal(arr_view, expected)
+
+ def test_errors(self):
+ i, j = np.ogrid[:4, :4]
+ arr = 10*i + j
+ with pytest.raises(ValueError, match='cannot contain negative values'):
+ sliding_window_view(arr, (-1, 3))
+ with pytest.raises(
+ ValueError,
+ match='must provide window_shape for all dimensions of `x`'):
+ sliding_window_view(arr, (1,))
+ with pytest.raises(
+ ValueError,
+ match='Must provide matching length window_shape and axis'):
+ sliding_window_view(arr, (1, 3, 4), axis=(0, 1))
+ with pytest.raises(
+ ValueError,
+ match='window shape cannot be larger than input array'):
+ sliding_window_view(arr, (5, 5))
+
+ def test_writeable(self):
+ arr = np.arange(5)
+ view = sliding_window_view(arr, 2, writeable=False)
+ assert_(not view.flags.writeable)
+ with pytest.raises(
+ ValueError,
+ match='assignment destination is read-only'):
+ view[0, 0] = 3
+ view = sliding_window_view(arr, 2, writeable=True)
+ assert_(view.flags.writeable)
+ view[0, 1] = 3
+ assert_array_equal(arr, np.array([0, 3, 2, 3, 4]))
+
+ def test_subok(self):
+ class MyArray(np.ndarray):
+ pass
+
+ arr = np.arange(5).view(MyArray)
+ assert_(not isinstance(sliding_window_view(arr, 2,
+ subok=False),
+ MyArray))
+ assert_(isinstance(sliding_window_view(arr, 2, subok=True), MyArray))
+ # Default behavior
+ assert_(not isinstance(sliding_window_view(arr, 2), MyArray))
+
+
def as_strided_writeable():
arr = np.ones(10)
view = as_strided(arr, writeable=False)
@@ -435,7 +601,7 @@ def test_writeable():
# check: no warning emitted
assert_equal(result.flags.writeable, True)
result[:] = 0
-
+
# keep readonly input readonly
original.flags.writeable = False
_, result = broadcast_arrays(0, original)
diff --git a/numpy/lib/tests/test_utils.py b/numpy/lib/tests/test_utils.py
index 261cfef5d..33951b92a 100644
--- a/numpy/lib/tests/test_utils.py
+++ b/numpy/lib/tests/test_utils.py
@@ -140,3 +140,22 @@ class TestByteBounds:
def test_assert_raises_regex_context_manager():
with assert_raises_regex(ValueError, 'no deprecation warning'):
raise ValueError('no deprecation warning')
+
+
+def test_info_method_heading():
+ # info(class) should only print "Methods:" heading if methods exist
+
+ class NoPublicMethods:
+ pass
+
+ class WithPublicMethods:
+ def first_method():
+ pass
+
+ def _has_method_heading(cls):
+ out = StringIO()
+ utils.info(cls, output=out)
+ return 'Methods:' in out.getvalue()
+
+ assert _has_method_heading(WithPublicMethods)
+ assert not _has_method_heading(NoPublicMethods)
diff --git a/numpy/lib/twodim_base.py b/numpy/lib/twodim_base.py
index 2bb4c78a5..2b4cbdfbb 100644
--- a/numpy/lib/twodim_base.py
+++ b/numpy/lib/twodim_base.py
@@ -8,7 +8,7 @@ from numpy.core.numeric import (
asarray, where, int8, int16, int32, int64, empty, promote_types, diagonal,
nonzero
)
-from numpy.core.overrides import set_module
+from numpy.core.overrides import set_array_function_like_doc, set_module
from numpy.core import overrides
from numpy.core import iinfo
@@ -149,8 +149,13 @@ def flipud(m):
return m[::-1, ...]
+def _eye_dispatcher(N, M=None, k=None, dtype=None, order=None, *, like=None):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def eye(N, M=None, k=0, dtype=float, order='C'):
+def eye(N, M=None, k=0, dtype=float, order='C', *, like=None):
"""
Return a 2-D array with ones on the diagonal and zeros elsewhere.
@@ -171,6 +176,9 @@ def eye(N, M=None, k=0, dtype=float, order='C'):
column-major (Fortran-style) order in memory.
.. versionadded:: 1.14.0
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -194,6 +202,8 @@ def eye(N, M=None, k=0, dtype=float, order='C'):
[0., 0., 0.]])
"""
+ if like is not None:
+ return _eye_with_like(N, M=M, k=k, dtype=dtype, order=order, like=like)
if M is None:
M = N
m = zeros((N, M), dtype=dtype, order=order)
@@ -207,6 +217,11 @@ def eye(N, M=None, k=0, dtype=float, order='C'):
return m
+_eye_with_like = array_function_dispatch(
+ _eye_dispatcher
+)(eye)
+
+
def _diag_dispatcher(v, k=None):
return (v,)
@@ -343,8 +358,13 @@ def diagflat(v, k=0):
return wrap(res)
+def _tri_dispatcher(N, M=None, k=None, dtype=None, *, like=None):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def tri(N, M=None, k=0, dtype=float):
+def tri(N, M=None, k=0, dtype=float, *, like=None):
"""
An array with ones at and below the given diagonal and zeros elsewhere.
@@ -361,6 +381,9 @@ def tri(N, M=None, k=0, dtype=float):
and `k` > 0 is above. The default is 0.
dtype : dtype, optional
Data type of the returned array. The default is float.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -381,6 +404,9 @@ def tri(N, M=None, k=0, dtype=float):
[1., 1., 0., 0., 0.]])
"""
+ if like is not None:
+ return _tri_with_like(N, M=M, k=k, dtype=dtype, like=like)
+
if M is None:
M = N
@@ -393,6 +419,11 @@ def tri(N, M=None, k=0, dtype=float):
return m
+_tri_with_like = array_function_dispatch(
+ _tri_dispatcher
+)(tri)
+
+
def _trilu_dispatcher(m, k=None):
return (m,)
@@ -675,7 +706,7 @@ def histogram2d(x, y, bins=10, range=None, normed=None, weights=None,
>>> fig = plt.figure(figsize=(7, 3))
>>> ax = fig.add_subplot(131, title='imshow: square bins')
- >>> plt.imshow(H, interpolation='nearest', origin='low',
+ >>> plt.imshow(H, interpolation='nearest', origin='lower',
... extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])
<matplotlib.image.AxesImage object at 0x...>
diff --git a/numpy/lib/utils.py b/numpy/lib/utils.py
index d511c2a40..5447608bf 100644
--- a/numpy/lib/utils.py
+++ b/numpy/lib/utils.py
@@ -587,11 +587,11 @@ def info(object=None, maxwidth=76, output=sys.stdout, toplevel='numpy'):
print(inspect.getdoc(object), file=output)
methods = pydoc.allmethods(object)
- if methods != []:
+
+ public_methods = [meth for meth in methods if meth[0] != '_']
+ if public_methods:
print("\n\nMethods:\n", file=output)
- for meth in methods:
- if meth[0] == '_':
- continue
+ for meth in public_methods:
thisobj = getattr(object, meth, None)
if thisobj is not None:
methstr, other = pydoc.splitdoc(
diff --git a/numpy/linalg/__init__.pyi b/numpy/linalg/__init__.pyi
new file mode 100644
index 000000000..ffb05bb81
--- /dev/null
+++ b/numpy/linalg/__init__.pyi
@@ -0,0 +1,23 @@
+from typing import Any
+
+matrix_power: Any
+solve: Any
+tensorsolve: Any
+tensorinv: Any
+inv: Any
+cholesky: Any
+eigvals: Any
+eigvalsh: Any
+pinv: Any
+slogdet: Any
+det: Any
+svd: Any
+eig: Any
+eigh: Any
+lstsq: Any
+norm: Any
+qr: Any
+cond: Any
+matrix_rank: Any
+LinAlgError: Any
+multi_dot: Any
diff --git a/numpy/linalg/lapack_lite/make_lite.py b/numpy/linalg/lapack_lite/make_lite.py
index 23921acf4..cf15b2541 100755
--- a/numpy/linalg/lapack_lite/make_lite.py
+++ b/numpy/linalg/lapack_lite/make_lite.py
@@ -81,7 +81,7 @@ class FortranRoutine:
return self._dependencies
def __repr__(self):
- return "FortranRoutine({!r}, filename={!r})".format(self.name, self.filename)
+ return f'FortranRoutine({self.name!r}, filename={self.filename!r})'
class UnknownFortranRoutine(FortranRoutine):
"""Wrapper for a Fortran routine for which the corresponding file
@@ -193,7 +193,7 @@ class LapackLibrary(FortranLibrary):
def printRoutineNames(desc, routines):
print(desc)
for r in routines:
- print('\t%s' % r.name)
+ print(f'\t{r.name}')
def getLapackRoutines(wrapped_routines, ignores, lapack_dir):
blas_src_dir = os.path.join(lapack_dir, 'BLAS', 'SRC')
@@ -243,7 +243,7 @@ def dumpRoutineNames(library, output_dir):
with open(filename, 'w') as fo:
for r in routines:
deps = r.dependencies()
- fo.write('%s: %s\n' % (r.name, ' '.join(deps)))
+ fo.write(f"{r.name}: {' '.join(deps)}\n")
def concatenateRoutines(routines, output_file):
with open(output_file, 'w') as output_fo:
@@ -316,13 +316,13 @@ def create_name_header(output_dir):
# Rename BLAS/LAPACK symbols
for name in sorted(symbols):
- f.write("#define %s_ BLAS_FUNC(%s)\n" % (name, name))
+ f.write(f'#define {name}_ BLAS_FUNC({name})\n')
# Rename also symbols that f2c exports itself
f.write("\n"
"/* Symbols exported by f2c.c */\n")
for name in sorted(f2c_symbols):
- f.write("#define %s numpy_lapack_lite_%s\n" % (name, name))
+ f.write(f'#define {name} numpy_lapack_lite_{name}\n')
def main():
if len(sys.argv) != 3:
@@ -348,9 +348,9 @@ def main():
dumpRoutineNames(library, output_dir)
for typename in types:
- fortran_file = os.path.join(output_dir, 'f2c_%s.f' % typename)
+ fortran_file = os.path.join(output_dir, f'f2c_{typename}.f')
c_file = fortran_file[:-2] + '.c'
- print('creating %s ...' % c_file)
+ print(f'creating {c_file} ...')
routines = library.allRoutinesByType(typename)
concatenateRoutines(routines, fortran_file)
@@ -358,11 +358,11 @@ def main():
patch_file = os.path.basename(fortran_file) + '.patch'
if os.path.exists(patch_file):
subprocess.check_call(['patch', '-u', fortran_file, patch_file])
- print("Patched {}".format(fortran_file))
+ print(f'Patched {fortran_file}')
try:
runF2C(fortran_file, output_dir)
except F2CError:
- print('f2c failed on %s' % fortran_file)
+ print(f'f2c failed on {fortran_file}')
break
scrubF2CSource(c_file)
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 513ea8219..7775e4c32 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -176,10 +176,9 @@ def _to_native_byte_order(*arrays):
def _fastCopyAndTranspose(type, *arrays):
cast_arrays = ()
for a in arrays:
- if a.dtype.type is type:
- cast_arrays = cast_arrays + (_fastCT(a),)
- else:
- cast_arrays = cast_arrays + (_fastCT(a.astype(type)),)
+ if a.dtype.type is not type:
+ a = a.astype(type)
+ cast_arrays = cast_arrays + (_fastCT(a),)
if len(cast_arrays) == 1:
return cast_arrays[0]
else:
@@ -362,13 +361,13 @@ def solve(a, b):
Examples
--------
- Solve the system of equations ``3 * x0 + x1 = 9`` and ``x0 + 2 * x1 = 8``:
+ Solve the system of equations ``x0 + 2 * x1 = 1`` and ``3 * x0 + 5 * x1 = 2``:
- >>> a = np.array([[3,1], [1,2]])
- >>> b = np.array([9,8])
+ >>> a = np.array([[1, 2], [3, 5]])
+ >>> b = np.array([1, 2])
>>> x = np.linalg.solve(a, b)
>>> x
- array([2., 3.])
+ array([-1., 1.])
Check that the solution is correct:
@@ -902,7 +901,7 @@ def qr(a, mode='reduced'):
warnings.warn(msg, DeprecationWarning, stacklevel=3)
mode = 'economic'
else:
- raise ValueError("Unrecognized mode '%s'" % mode)
+ raise ValueError(f"Unrecognized mode '{mode}'")
a, wrap = _makearray(a)
_assert_2d(a)
@@ -2207,8 +2206,8 @@ def lstsq(a, b, rcond="warn"):
Least-squares solution. If `b` is two-dimensional,
the solutions are in the `K` columns of `x`.
residuals : {(1,), (K,), (0,)} ndarray
- Sums of residuals; squared Euclidean 2-norm for each column in
- ``b - a*x``.
+ Sums of squared residuals: Squared Euclidean 2-norm for each column in
+ ``b - a @ x``.
If the rank of `a` is < N or M <= N, this is an empty array.
If `b` is 1-dimensional, this is a (1,) shape array.
Otherwise the shape is (K,).
@@ -2559,7 +2558,7 @@ def norm(x, ord=None, axis=None, keepdims=False):
# special case for speedup
s = (x.conj() * x).real
return sqrt(add.reduce(s, axis=axis, keepdims=keepdims))
- # None of the str-type keywords for ord ('fro', 'nuc')
+ # None of the str-type keywords for ord ('fro', 'nuc')
# are valid for vectors
elif isinstance(ord, str):
raise ValueError(f"Invalid norm order '{ord}' for vectors")
diff --git a/numpy/linalg/setup.py b/numpy/linalg/setup.py
index bb070ed9d..5c9f2a4cb 100644
--- a/numpy/linalg/setup.py
+++ b/numpy/linalg/setup.py
@@ -80,6 +80,7 @@ def configuration(parent_package='', top_path=None):
extra_info=lapack_info,
libraries=['npymath'],
)
+ config.add_data_files('*.pyi')
return config
if __name__ == '__main__':
diff --git a/numpy/linalg/tests/test_build.py b/numpy/linalg/tests/test_build.py
index cbf3089bc..4859226d9 100644
--- a/numpy/linalg/tests/test_build.py
+++ b/numpy/linalg/tests/test_build.py
@@ -16,13 +16,13 @@ class FindDependenciesLdd:
p = Popen(self.cmd, stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
except OSError:
- raise RuntimeError("command %s cannot be run" % self.cmd)
+ raise RuntimeError(f'command {self.cmd} cannot be run')
def get_dependencies(self, lfile):
p = Popen(self.cmd + [lfile], stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
if not (p.returncode == 0):
- raise RuntimeError("failed dependencies check for %s" % lfile)
+ raise RuntimeError(f'failed dependencies check for {lfile}')
return stdout
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index 3f3bf9f70..21fab58e1 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -85,7 +85,7 @@ class LinalgCase:
do(self.a, self.b, tags=self.tags)
def __repr__(self):
- return "<LinalgCase: %s>" % (self.name,)
+ return f'<LinalgCase: {self.name}>'
def apply_tag(tag, cases):
@@ -349,7 +349,7 @@ class LinalgTestCase:
try:
case.check(self.do)
except Exception:
- msg = "In test case: %r\n\n" % case
+ msg = f'In test case: {case!r}\n\n'
msg += traceback.format_exc()
raise AssertionError(msg)
@@ -1732,7 +1732,7 @@ class TestCholesky:
b = np.matmul(c, c.transpose(t).conj())
assert_allclose(b, a,
- err_msg="{} {}\n{}\n{}".format(shape, dtype, a, c),
+ err_msg=f'{shape} {dtype}\n{a}\n{c}',
atol=500 * a.shape[0] * np.finfo(dtype).eps)
def test_0_size(self):
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 59647c67d..1807aadcf 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -3665,7 +3665,7 @@ PyObject *PyInit__umath_linalg(void)
return NULL;
}
- version = PyString_FromString(umath_linalg_version_string);
+ version = PyUnicode_FromString(umath_linalg_version_string);
if (version == NULL) {
return NULL;
}
diff --git a/numpy/ma/__init__.pyi b/numpy/ma/__init__.pyi
new file mode 100644
index 000000000..d1259abcc
--- /dev/null
+++ b/numpy/ma/__init__.pyi
@@ -0,0 +1,225 @@
+from typing import Any
+
+core: Any
+extras: Any
+MAError: Any
+MaskError: Any
+MaskType: Any
+MaskedArray: Any
+abs: Any
+absolute: Any
+add: Any
+all: Any
+allclose: Any
+allequal: Any
+alltrue: Any
+amax: Any
+amin: Any
+angle: Any
+anom: Any
+anomalies: Any
+any: Any
+append: Any
+arange: Any
+arccos: Any
+arccosh: Any
+arcsin: Any
+arcsinh: Any
+arctan: Any
+arctan2: Any
+arctanh: Any
+argmax: Any
+argmin: Any
+argsort: Any
+around: Any
+array: Any
+asanyarray: Any
+asarray: Any
+bitwise_and: Any
+bitwise_or: Any
+bitwise_xor: Any
+bool_: Any
+ceil: Any
+choose: Any
+clip: Any
+common_fill_value: Any
+compress: Any
+compressed: Any
+concatenate: Any
+conjugate: Any
+convolve: Any
+copy: Any
+correlate: Any
+cos: Any
+cosh: Any
+count: Any
+cumprod: Any
+cumsum: Any
+default_fill_value: Any
+diag: Any
+diagonal: Any
+diff: Any
+divide: Any
+empty: Any
+empty_like: Any
+equal: Any
+exp: Any
+expand_dims: Any
+fabs: Any
+filled: Any
+fix_invalid: Any
+flatten_mask: Any
+flatten_structured_array: Any
+floor: Any
+floor_divide: Any
+fmod: Any
+frombuffer: Any
+fromflex: Any
+fromfunction: Any
+getdata: Any
+getmask: Any
+getmaskarray: Any
+greater: Any
+greater_equal: Any
+harden_mask: Any
+hypot: Any
+identity: Any
+ids: Any
+indices: Any
+inner: Any
+innerproduct: Any
+isMA: Any
+isMaskedArray: Any
+is_mask: Any
+is_masked: Any
+isarray: Any
+left_shift: Any
+less: Any
+less_equal: Any
+log: Any
+log10: Any
+log2: Any
+logical_and: Any
+logical_not: Any
+logical_or: Any
+logical_xor: Any
+make_mask: Any
+make_mask_descr: Any
+make_mask_none: Any
+mask_or: Any
+masked: Any
+masked_array: Any
+masked_equal: Any
+masked_greater: Any
+masked_greater_equal: Any
+masked_inside: Any
+masked_invalid: Any
+masked_less: Any
+masked_less_equal: Any
+masked_not_equal: Any
+masked_object: Any
+masked_outside: Any
+masked_print_option: Any
+masked_singleton: Any
+masked_values: Any
+masked_where: Any
+max: Any
+maximum: Any
+maximum_fill_value: Any
+mean: Any
+min: Any
+minimum: Any
+minimum_fill_value: Any
+mod: Any
+multiply: Any
+mvoid: Any
+ndim: Any
+negative: Any
+nomask: Any
+nonzero: Any
+not_equal: Any
+ones: Any
+outer: Any
+outerproduct: Any
+power: Any
+prod: Any
+product: Any
+ptp: Any
+put: Any
+putmask: Any
+ravel: Any
+remainder: Any
+repeat: Any
+reshape: Any
+resize: Any
+right_shift: Any
+round: Any
+round_: Any
+set_fill_value: Any
+shape: Any
+sin: Any
+sinh: Any
+size: Any
+soften_mask: Any
+sometrue: Any
+sort: Any
+sqrt: Any
+squeeze: Any
+std: Any
+subtract: Any
+sum: Any
+swapaxes: Any
+take: Any
+tan: Any
+tanh: Any
+trace: Any
+transpose: Any
+true_divide: Any
+var: Any
+where: Any
+zeros: Any
+apply_along_axis: Any
+apply_over_axes: Any
+atleast_1d: Any
+atleast_2d: Any
+atleast_3d: Any
+average: Any
+clump_masked: Any
+clump_unmasked: Any
+column_stack: Any
+compress_cols: Any
+compress_nd: Any
+compress_rowcols: Any
+compress_rows: Any
+count_masked: Any
+corrcoef: Any
+cov: Any
+diagflat: Any
+dot: Any
+dstack: Any
+ediff1d: Any
+flatnotmasked_contiguous: Any
+flatnotmasked_edges: Any
+hsplit: Any
+hstack: Any
+isin: Any
+in1d: Any
+intersect1d: Any
+mask_cols: Any
+mask_rowcols: Any
+mask_rows: Any
+masked_all: Any
+masked_all_like: Any
+median: Any
+mr_: Any
+notmasked_contiguous: Any
+notmasked_edges: Any
+polyfit: Any
+row_stack: Any
+setdiff1d: Any
+setxor1d: Any
+stack: Any
+unique: Any
+union1d: Any
+vander: Any
+vstack: Any
diff --git a/numpy/ma/bench.py b/numpy/ma/bench.py
index 83cc6aea7..e29d54365 100644
--- a/numpy/ma/bench.py
+++ b/numpy/ma/bench.py
@@ -58,7 +58,7 @@ def compare_functions_1v(func, nloop=500,
xs=xs, nmxs=nmxs, xl=xl, nmxl=nmxl):
funcname = func.__name__
print("-"*50)
- print("%s on small arrays" % funcname)
+ print(f'{funcname} on small arrays')
module, data = "numpy.ma", "nmxs"
timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop)
@@ -70,8 +70,8 @@ def compare_functions_1v(func, nloop=500,
def compare_methods(methodname, args, vars='x', nloop=500, test=True,
xs=xs, nmxs=nmxs, xl=xl, nmxl=nmxl):
print("-"*50)
- print("%s on small arrays" % methodname)
- data, ver = "nm%ss" % vars, 'numpy.ma'
+ print(f'{methodname} on small arrays')
+        data, ver = f'nm{vars}s', 'numpy.ma'
timer("%(data)s.%(methodname)s(%(args)s)" % locals(), v=ver, nloop=nloop)
print("%s on large arrays" % methodname)
@@ -86,11 +86,11 @@ def compare_functions_2v(func, nloop=500, test=True,
yl=yl, nmyl=nmyl):
funcname = func.__name__
print("-"*50)
- print("%s on small arrays" % funcname)
+ print(f'{funcname} on small arrays')
module, data = "numpy.ma", "nmxs,nmys"
timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop)
- print("%s on large arrays" % funcname)
+ print(f'{funcname} on large arrays')
module, data = "numpy.ma", "nmxl,nmyl"
timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop)
return
diff --git a/numpy/ma/core.py b/numpy/ma/core.py
index b5371f51a..d6af22337 100644
--- a/numpy/ma/core.py
+++ b/numpy/ma/core.py
@@ -443,9 +443,9 @@ def _check_fill_value(fill_value, ndtype):
if isinstance(fill_value, (ndarray, np.void)):
try:
fill_value = np.array(fill_value, copy=False, dtype=ndtype)
- except ValueError:
+ except ValueError as e:
err_msg = "Unable to transform %s to dtype %s"
- raise ValueError(err_msg % (fill_value, ndtype))
+ raise ValueError(err_msg % (fill_value, ndtype)) from e
else:
fill_value = np.asarray(fill_value, dtype=object)
fill_value = np.array(_recursive_set_fill_value(fill_value, ndtype),
@@ -460,12 +460,12 @@ def _check_fill_value(fill_value, ndtype):
# Also in case of converting string arrays.
try:
fill_value = np.array(fill_value, copy=False, dtype=ndtype)
- except (OverflowError, ValueError):
+ except (OverflowError, ValueError) as e:
# Raise TypeError instead of OverflowError or ValueError.
# OverflowError is seldom used, and the real problem here is
# that the passed fill_value is not compatible with the ndtype.
err_msg = "Cannot convert fill_value %s to dtype %s"
- raise TypeError(err_msg % (fill_value, ndtype))
+ raise TypeError(err_msg % (fill_value, ndtype)) from e
return np.array(fill_value)
@@ -1489,7 +1489,7 @@ def is_mask(m):
See Also
--------
- isMaskedArray : Test whether input is an instance of MaskedArray.
+ ma.isMaskedArray : Test whether input is an instance of MaskedArray.
Examples
--------
@@ -2812,7 +2812,7 @@ class MaskedArray(ndarray):
def __new__(cls, data=None, mask=nomask, dtype=None, copy=False,
subok=True, ndmin=0, fill_value=None, keep_mask=True,
- hard_mask=None, shrink=True, order=None, **options):
+ hard_mask=None, shrink=True, order=None):
"""
Create a new masked array from scratch.
@@ -3538,11 +3538,12 @@ class MaskedArray(ndarray):
Force the mask to hard.
Whether the mask of a masked array is hard or soft is determined by
- its `hardmask` property. `harden_mask` sets `hardmask` to True.
+ its `~ma.MaskedArray.hardmask` property. `harden_mask` sets
+ `~ma.MaskedArray.hardmask` to ``True``.
See Also
--------
- hardmask
+ ma.MaskedArray.hardmask
"""
self._hardmask = True
@@ -3553,11 +3554,12 @@ class MaskedArray(ndarray):
Force the mask to soft.
Whether the mask of a masked array is hard or soft is determined by
- its `hardmask` property. `soften_mask` sets `hardmask` to False.
+ its `~ma.MaskedArray.hardmask` property. `soften_mask` sets
+ `~ma.MaskedArray.hardmask` to ``False``.
See Also
--------
- hardmask
+ ma.MaskedArray.hardmask
"""
self._hardmask = False
@@ -3832,7 +3834,7 @@ class MaskedArray(ndarray):
"""
Return `a` where condition is ``True``.
- If condition is a `MaskedArray`, missing values are considered
+ If condition is a `~ma.MaskedArray`, missing values are considered
as ``False``.
Parameters
@@ -3851,7 +3853,7 @@ class MaskedArray(ndarray):
Returns
-------
result : MaskedArray
- A :class:`MaskedArray` object.
+ A :class:`~ma.MaskedArray` object.
Notes
-----
@@ -4461,7 +4463,7 @@ class MaskedArray(ndarray):
See Also
--------
- count_masked : Count masked elements in array or along a given axis.
+ ma.count_masked : Count masked elements in array or along a given axis.
Examples
--------
@@ -5103,7 +5105,7 @@ class MaskedArray(ndarray):
Notes
-----
- The mask is lost if `out` is not a valid :class:`MaskedArray` !
+ The mask is lost if `out` is not a valid :class:`ma.MaskedArray` !
Arithmetic is modular when using integer types, and no error is
raised on overflow.
@@ -5389,7 +5391,7 @@ class MaskedArray(ndarray):
See Also
--------
- numpy.ndarray.around : corresponding function for ndarrays
+ numpy.ndarray.round : corresponding function for ndarrays
numpy.around : equivalent function
"""
result = self._data.round(decimals=decimals, out=out).view(type(self))
@@ -5437,7 +5439,7 @@ class MaskedArray(ndarray):
When the array contains unmasked values at the same extremes of the
datatype, the ordering of these values and the masked values is
undefined.
- fill_value : {var}, optional
+ fill_value : scalar or None, optional
Value used internally for the masked values.
If ``fill_value`` is not None, it supersedes ``endwith``.
@@ -5449,7 +5451,7 @@ class MaskedArray(ndarray):
See Also
--------
- MaskedArray.sort : Describes sorting algorithms used.
+ ma.MaskedArray.sort : Describes sorting algorithms used.
lexsort : Indirect stable sort with multiple keys.
numpy.ndarray.sort : Inplace sort.
@@ -5495,7 +5497,7 @@ class MaskedArray(ndarray):
axis : {None, integer}
If None, the index is into the flattened array, otherwise along
the specified axis
- fill_value : {var}, optional
+ fill_value : scalar or None, optional
Value used to fill in the masked values. If None, the output of
minimum_fill_value(self._data) is used instead.
out : {None, array}, optional
@@ -5541,7 +5543,7 @@ class MaskedArray(ndarray):
axis : {None, integer}
If None, the index is into the flattened array, otherwise along
the specified axis
- fill_value : {var}, optional
+ fill_value : scalar or None, optional
Value used to fill in the masked values. If None, the output of
maximum_fill_value(self._data) is used instead.
out : {None, array}, optional
@@ -5592,7 +5594,7 @@ class MaskedArray(ndarray):
When the array contains unmasked values sorting at the same extremes of the
datatype, the ordering of these values and the masked values is
undefined.
- fill_value : {var}, optional
+ fill_value : scalar or None, optional
Value used internally for the masked values.
If ``fill_value`` is not None, it supersedes ``endwith``.
@@ -5663,7 +5665,7 @@ class MaskedArray(ndarray):
out : array_like, optional
Alternative output array in which to place the result. Must be of
the same shape and buffer length as the expected output.
- fill_value : {var}, optional
+ fill_value : scalar or None, optional
Value used to fill in the masked values.
If None, use the output of `minimum_fill_value`.
keepdims : bool, optional
@@ -5679,7 +5681,7 @@ class MaskedArray(ndarray):
See Also
--------
- minimum_fill_value
+ ma.minimum_fill_value
Returns the minimum filling value for a given datatype.
"""
@@ -5797,7 +5799,7 @@ class MaskedArray(ndarray):
out : array_like, optional
Alternative output array in which to place the result. Must
be of the same shape and buffer length as the expected output.
- fill_value : {var}, optional
+ fill_value : scalar or None, optional
Value used to fill in the masked values.
If None, use the output of maximum_fill_value().
keepdims : bool, optional
@@ -5813,7 +5815,7 @@ class MaskedArray(ndarray):
See Also
--------
- maximum_fill_value
+ ma.maximum_fill_value
Returns the maximum filling value for a given datatype.
"""
@@ -5874,7 +5876,7 @@ class MaskedArray(ndarray):
Alternative output array in which to place the result. It must
have the same shape and buffer length as the expected output
but the type will be cast if necessary.
- fill_value : {var}, optional
+ fill_value : scalar or None, optional
Value used to fill in the masked values.
keepdims : bool, optional
If this is set to True, the axes which are reduced are left
@@ -6907,11 +6909,11 @@ def compressed(x):
Return all the non-masked data as a 1-D array.
This function is equivalent to calling the "compressed" method of a
- `MaskedArray`, see `MaskedArray.compressed` for details.
+ `ma.MaskedArray`, see `ma.MaskedArray.compressed` for details.
See Also
--------
- MaskedArray.compressed
+ ma.MaskedArray.compressed
Equivalent method.
"""
diff --git a/numpy/ma/extras.py b/numpy/ma/extras.py
index 8ede29da1..1bf03e966 100644
--- a/numpy/ma/extras.py
+++ b/numpy/ma/extras.py
@@ -901,11 +901,11 @@ def compress_rows(a):
Suppress whole rows of a 2-D array that contain masked values.
This is equivalent to ``np.ma.compress_rowcols(a, 0)``, see
- `extras.compress_rowcols` for details.
+ `compress_rowcols` for details.
See Also
--------
- extras.compress_rowcols
+ compress_rowcols
"""
a = asarray(a)
@@ -918,11 +918,11 @@ def compress_cols(a):
Suppress whole columns of a 2-D array that contain masked values.
This is equivalent to ``np.ma.compress_rowcols(a, 1)``, see
- `extras.compress_rowcols` for details.
+ `compress_rowcols` for details.
See Also
--------
- extras.compress_rowcols
+ compress_rowcols
"""
a = asarray(a)
@@ -1641,7 +1641,7 @@ def flatnotmasked_contiguous(a):
slice_list : list
A sorted sequence of `slice` objects (start index, end index).
- ..versionchanged:: 1.15.0
+ .. versionchanged:: 1.15.0
Now returns an empty list instead of None for a fully masked array
See Also
diff --git a/numpy/ma/mrecords.py b/numpy/ma/mrecords.py
index cd93a9a14..70087632e 100644
--- a/numpy/ma/mrecords.py
+++ b/numpy/ma/mrecords.py
@@ -60,7 +60,7 @@ def _checknames(descr, names=None):
elif isinstance(names, str):
new_names = names.split(',')
else:
- raise NameError("illegal input names %s" % repr(names))
+ raise NameError(f'illegal input names {names!r}')
nnames = len(new_names)
if nnames < ndescr:
new_names += default_names[nnames:]
@@ -198,8 +198,8 @@ class MaskedRecords(MaskedArray):
fielddict = ndarray.__getattribute__(self, 'dtype').fields
try:
res = fielddict[attr][:2]
- except (TypeError, KeyError):
- raise AttributeError("record array has no attribute %s" % attr)
+ except (TypeError, KeyError) as e:
+ raise AttributeError(f'record array has no attribute {attr}') from e
# So far, so good
_localdict = ndarray.__getattribute__(self, '__dict__')
_data = ndarray.view(self, _localdict['_baseclass'])
@@ -274,7 +274,7 @@ class MaskedRecords(MaskedArray):
try:
res = fielddict[attr][:2]
except (TypeError, KeyError):
- raise AttributeError("record array has no attribute %s" % attr)
+ raise AttributeError(f'record array has no attribute {attr}')
if val is masked:
_fill_value = _localdict['_fill_value']
@@ -337,13 +337,13 @@ class MaskedRecords(MaskedArray):
"""
if self.size > 1:
- mstr = ["(%s)" % ",".join([str(i) for i in s])
+ mstr = [f"({','.join([str(i) for i in s])})"
for s in zip(*[getattr(self, f) for f in self.dtype.names])]
- return "[%s]" % ", ".join(mstr)
+ return f"[{', '.join(mstr)}]"
else:
- mstr = ["%s" % ",".join([str(i) for i in s])
+ mstr = [f"{','.join([str(i) for i in s])}"
for s in zip([getattr(self, f) for f in self.dtype.names])]
- return "(%s)" % ", ".join(mstr)
+ return f"({', '.join(mstr)})"
def __repr__(self):
"""
@@ -657,7 +657,7 @@ def openfile(fname):
try:
f = open(fname)
except IOError:
- raise IOError("No such file: '%s'" % fname)
+ raise IOError(f"No such file: '{fname}'")
if f.readline()[:2] != "\\x":
f.seek(0, 0)
return f
diff --git a/numpy/ma/setup.py b/numpy/ma/setup.py
index d3f34c874..018d38cdd 100644
--- a/numpy/ma/setup.py
+++ b/numpy/ma/setup.py
@@ -3,6 +3,7 @@ def configuration(parent_package='',top_path=None):
from numpy.distutils.misc_util import Configuration
config = Configuration('ma', parent_package, top_path)
config.add_subpackage('tests')
+ config.add_data_files('*.pyi')
return config
if __name__ == "__main__":
diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py
index 27f14a5e7..0b2e7303c 100644
--- a/numpy/ma/tests/test_core.py
+++ b/numpy/ma/tests/test_core.py
@@ -244,6 +244,10 @@ class TestMaskedArray:
'offsets':[0,8]})
array(x) # used to fail due to 'V' padding field in x.dtype.descr
+ def test_unknown_keyword_parameter(self):
+ with pytest.raises(TypeError, match="unexpected keyword argument"):
+ MaskedArray([1, 2, 3], maks=[0, 1, 0]) # `mask` is misspelled.
+
def test_asarray(self):
(x, y, a10, m1, m2, xm, ym, z, zm, xf) = self.d
xm.fill_value = -9999
@@ -2748,7 +2752,7 @@ class TestMaskedArrayInPlaceArithmetics:
xm += t(1)
assert_equal(xm, y + t(1))
- assert_equal(len(w), 0, "Failed on type=%s." % t)
+ assert_equal(len(w), 0, f'Failed on type={t}.')
def test_inplace_addition_array_type(self):
# Test of inplace additions
@@ -2765,7 +2769,7 @@ class TestMaskedArrayInPlaceArithmetics:
assert_equal(xm, y + a)
assert_equal(xm.mask, mask_or(m, a.mask))
- assert_equal(len(w), 0, "Failed on type=%s." % t)
+ assert_equal(len(w), 0, f'Failed on type={t}.')
def test_inplace_subtraction_scalar_type(self):
# Test of inplace subtractions
@@ -2778,7 +2782,7 @@ class TestMaskedArrayInPlaceArithmetics:
xm -= t(1)
assert_equal(xm, y - t(1))
- assert_equal(len(w), 0, "Failed on type=%s." % t)
+ assert_equal(len(w), 0, f'Failed on type={t}.')
def test_inplace_subtraction_array_type(self):
# Test of inplace subtractions
@@ -2795,7 +2799,7 @@ class TestMaskedArrayInPlaceArithmetics:
assert_equal(xm, y - a)
assert_equal(xm.mask, mask_or(m, a.mask))
- assert_equal(len(w), 0, "Failed on type=%s." % t)
+ assert_equal(len(w), 0, f'Failed on type={t}.')
def test_inplace_multiplication_scalar_type(self):
# Test of inplace multiplication
@@ -2808,7 +2812,7 @@ class TestMaskedArrayInPlaceArithmetics:
xm *= t(2)
assert_equal(xm, y * t(2))
- assert_equal(len(w), 0, "Failed on type=%s." % t)
+ assert_equal(len(w), 0, f'Failed on type={t}.')
def test_inplace_multiplication_array_type(self):
# Test of inplace multiplication
@@ -2825,7 +2829,7 @@ class TestMaskedArrayInPlaceArithmetics:
assert_equal(xm, y * a)
assert_equal(xm.mask, mask_or(m, a.mask))
- assert_equal(len(w), 0, "Failed on type=%s." % t)
+ assert_equal(len(w), 0, f'Failed on type={t}.')
def test_inplace_floor_division_scalar_type(self):
# Test of inplace division
@@ -2861,7 +2865,7 @@ class TestMaskedArrayInPlaceArithmetics:
mask_or(mask_or(m, a.mask), (a == t(0)))
)
- assert_equal(len(w), 0, "Failed on type=%s." % t)
+ assert_equal(len(w), 0, f'Failed on type={t}.')
def test_inplace_division_scalar_type(self):
# Test of inplace division
@@ -2895,9 +2899,9 @@ class TestMaskedArrayInPlaceArithmetics:
warnings.warn(str(e), stacklevel=1)
if issubclass(t, np.integer):
- assert_equal(len(sup.log), 2, "Failed on type=%s." % t)
+ assert_equal(len(sup.log), 2, f'Failed on type={t}.')
else:
- assert_equal(len(sup.log), 0, "Failed on type=%s." % t)
+ assert_equal(len(sup.log), 0, f'Failed on type={t}.')
def test_inplace_division_array_type(self):
# Test of inplace division
@@ -2934,9 +2938,9 @@ class TestMaskedArrayInPlaceArithmetics:
warnings.warn(str(e), stacklevel=1)
if issubclass(t, np.integer):
- assert_equal(len(sup.log), 2, "Failed on type=%s." % t)
+ assert_equal(len(sup.log), 2, f'Failed on type={t}.')
else:
- assert_equal(len(sup.log), 0, "Failed on type=%s." % t)
+ assert_equal(len(sup.log), 0, f'Failed on type={t}.')
def test_inplace_pow_type(self):
# Test keeping data w/ (inplace) power
@@ -2954,7 +2958,7 @@ class TestMaskedArrayInPlaceArithmetics:
assert_equal(x.data, xx_r.data)
assert_equal(x.mask, xx_r.mask)
- assert_equal(len(w), 0, "Failed on type=%s." % t)
+ assert_equal(len(w), 0, f'Failed on type={t}.')
class TestMaskedArrayMethods:
@@ -3858,8 +3862,6 @@ class TestMaskedArrayMathMethods:
assert_almost_equal(np.sqrt(mXvar0[k]),
mX[:, k].compressed().std())
- @pytest.mark.skipif(sys.platform=='win32' and sys.version_info < (3, 6),
- reason='Fails on Python < 3.6 on Windows, gh-9671')
@suppress_copy_mask_on_assignment
def test_varstd_specialcases(self):
# Test a special case for var
@@ -4603,7 +4605,7 @@ class TestMaskedArrayFunctions:
class M(MaskedArray):
pass
- test = np.ma.compressed(M(shape=(0,1,2)))
+ test = np.ma.compressed(M([[[]], [[]]]))
assert_equal(test.ndim, 1)
# with .compressed() overridden
@@ -4611,7 +4613,7 @@ class TestMaskedArrayFunctions:
def compressed(self):
return 42
- test = np.ma.compressed(M(shape=(0,1,2)))
+ test = np.ma.compressed(M([[[]], [[]]]))
assert_equal(test, 42)
def test_convolve(self):
diff --git a/numpy/ma/tests/test_old_ma.py b/numpy/ma/tests/test_old_ma.py
index 96c7e3609..ab003b94e 100644
--- a/numpy/ma/tests/test_old_ma.py
+++ b/numpy/ma/tests/test_old_ma.py
@@ -27,7 +27,7 @@ pi = np.pi
def eq(v, w, msg=''):
result = allclose(v, w)
if not result:
- print("Not eq:%s\n%s\n----%s" % (msg, str(v), str(w)))
+ print(f'Not eq:{msg}\n{v}\n----{w}')
return result
diff --git a/numpy/ma/tests/test_subclassing.py b/numpy/ma/tests/test_subclassing.py
index caa746740..f2623406d 100644
--- a/numpy/ma/tests/test_subclassing.py
+++ b/numpy/ma/tests/test_subclassing.py
@@ -109,11 +109,11 @@ class CSAIterator:
class ComplicatedSubArray(SubArray):
def __str__(self):
- return 'myprefix {0} mypostfix'.format(self.view(SubArray))
+ return f'myprefix {self.view(SubArray)} mypostfix'
def __repr__(self):
# Return a repr that does not start with 'name('
- return '<{0} {1}>'.format(self.__class__.__name__, self)
+ return f'<{self.__class__.__name__} {self}>'
def _validate_input(self, value):
if not isinstance(value, ComplicatedSubArray):
@@ -317,8 +317,8 @@ class TestSubclassing:
assert_startswith(repr(mx), 'masked_array')
xsub = SubArray(x)
mxsub = masked_array(xsub, mask=[True, False, True, False, False])
- assert_startswith(repr(mxsub),
- 'masked_{0}(data=[--, 1, --, 3, 4]'.format(SubArray.__name__))
+ assert_startswith(repr(mxsub),
+ f'masked_{SubArray.__name__}(data=[--, 1, --, 3, 4]')
def test_subclass_str(self):
"""test str with subclass that has overridden str, setitem"""
diff --git a/numpy/ma/testutils.py b/numpy/ma/testutils.py
index 51ab03948..8d55e1763 100644
--- a/numpy/ma/testutils.py
+++ b/numpy/ma/testutils.py
@@ -86,7 +86,7 @@ def _assert_equal_on_sequences(actual, desired, err_msg=''):
"""
assert_equal(len(actual), len(desired), err_msg)
for k in range(len(desired)):
- assert_equal(actual[k], desired[k], 'item=%r\n%s' % (k, err_msg))
+ assert_equal(actual[k], desired[k], f'item={k!r}\n{err_msg}')
return
@@ -117,8 +117,8 @@ def assert_equal(actual, desired, err_msg=''):
assert_equal(len(actual), len(desired), err_msg)
for k, i in desired.items():
if k not in actual:
- raise AssertionError("%s not in %s" % (k, actual))
- assert_equal(actual[k], desired[k], 'key=%r\n%s' % (k, err_msg))
+ raise AssertionError(f"{k} not in {actual}")
+ assert_equal(actual[k], desired[k], f'key={k!r}\n{err_msg}')
return
# Case #2: lists .....
if isinstance(desired, (list, tuple)) and isinstance(actual, (list, tuple)):
@@ -156,12 +156,12 @@ def fail_if_equal(actual, desired, err_msg='',):
for k, i in desired.items():
if k not in actual:
raise AssertionError(repr(k))
- fail_if_equal(actual[k], desired[k], 'key=%r\n%s' % (k, err_msg))
+ fail_if_equal(actual[k], desired[k], f'key={k!r}\n{err_msg}')
return
if isinstance(desired, (list, tuple)) and isinstance(actual, (list, tuple)):
fail_if_equal(len(actual), len(desired), err_msg)
for k in range(len(desired)):
- fail_if_equal(actual[k], desired[k], 'item=%r\n%s' % (k, err_msg))
+ fail_if_equal(actual[k], desired[k], f'item={k!r}\n{err_msg}')
return
if isinstance(actual, np.ndarray) or isinstance(desired, np.ndarray):
return fail_if_array_equal(actual, desired, err_msg)
diff --git a/numpy/ma/timer_comparison.py b/numpy/ma/timer_comparison.py
index 83bd7852e..9eb1a23cd 100644
--- a/numpy/ma/timer_comparison.py
+++ b/numpy/ma/timer_comparison.py
@@ -7,12 +7,9 @@ import numpy.core.fromnumeric as fromnumeric
from numpy.testing import build_err_msg
-# Fixme: this does not look right.
-np.seterr(all='ignore')
pi = np.pi
-
class ModuleTester:
def __init__(self, module):
self.module = module
@@ -77,8 +74,7 @@ class ModuleTester:
if not cond:
msg = build_err_msg([x, y],
err_msg
- + '\n(shapes %s, %s mismatch)' % (x.shape,
- y.shape),
+ + f'\n(shapes {x.shape}, {y.shape} mismatch)',
header=header,
names=('x', 'y'))
assert cond, msg
@@ -100,9 +96,9 @@ class ModuleTester:
header=header,
names=('x', 'y'))
assert cond, msg
- except ValueError:
+ except ValueError as e:
msg = build_err_msg([x, y], err_msg, header=header, names=('x', 'y'))
- raise ValueError(msg)
+ raise ValueError(msg) from e
def assert_array_equal(self, x, y, err_msg=''):
"""
@@ -112,6 +108,7 @@ class ModuleTester:
self.assert_array_compare(self.equal, x, y, err_msg=err_msg,
header='Arrays are not equal')
+ @np.errstate(all='ignore')
def test_0(self):
"""
Tests creation
@@ -122,6 +119,7 @@ class ModuleTester:
xm = self.masked_array(x, mask=m)
xm[0]
+ @np.errstate(all='ignore')
def test_1(self):
"""
Tests creation
@@ -149,6 +147,7 @@ class ModuleTester:
xf.shape = s
assert(self.count(xm) == len(m1) - reduce(lambda x, y:x+y, m1))
+ @np.errstate(all='ignore')
def test_2(self):
"""
Tests conversions and indexing.
@@ -191,6 +190,7 @@ class ModuleTester:
m3 = self.make_mask(m, copy=1)
assert(m is not m3)
+ @np.errstate(all='ignore')
def test_3(self):
"""
Tests resize/repeat
@@ -210,6 +210,7 @@ class ModuleTester:
y8 = x4.repeat(2, 0)
assert self.allequal(y5, y8)
+ @np.errstate(all='ignore')
def test_4(self):
"""
Test of take, transpose, inner, outer products.
@@ -233,6 +234,7 @@ class ModuleTester:
assert t[1] == 2
assert t[2] == 3
+ @np.errstate(all='ignore')
def test_5(self):
"""
Tests inplace w/ scalar
@@ -285,6 +287,7 @@ class ModuleTester:
x += 1.
assert self.allequal(x, y + 1.)
+ @np.errstate(all='ignore')
def test_6(self):
"""
Tests inplace w/ array
@@ -336,6 +339,7 @@ class ModuleTester:
x /= a
xm /= a
+ @np.errstate(all='ignore')
def test_7(self):
"Tests ufunc"
d = (self.array([1.0, 0, -1, pi/2]*2, mask=[0, 1]+[0]*6),
@@ -370,6 +374,7 @@ class ModuleTester:
self.assert_array_equal(ur.filled(0), mr.filled(0), f)
self.assert_array_equal(ur._mask, mr._mask)
+ @np.errstate(all='ignore')
def test_99(self):
# test average
ott = self.array([0., 1., 2., 3.], mask=[1, 0, 0, 0])
@@ -415,6 +420,7 @@ class ModuleTester:
self.assert_array_equal(self.average(z, axis=1), [2.5, 5.0])
self.assert_array_equal(self.average(z, axis=0, weights=w2), [0., 1., 99., 99., 4.0, 10.0])
+ @np.errstate(all='ignore')
def test_A(self):
x = self.arange(24)
x[5:6] = self.masked
@@ -434,4 +440,4 @@ if __name__ == '__main__':
cur = np.sort(cur)
print("#%i" % i + 50*'.')
print(eval("ModuleTester.test_%i.__doc__" % i))
- print("core_current : %.3f - %.3f" % (cur[0], cur[1]))
+ print(f'core_current : {cur[0]:.3f} - {cur[1]:.3f}')
diff --git a/numpy/matrixlib/__init__.pyi b/numpy/matrixlib/__init__.pyi
new file mode 100644
index 000000000..b240bb327
--- /dev/null
+++ b/numpy/matrixlib/__init__.pyi
@@ -0,0 +1,6 @@
+from typing import Any
+
+matrix: Any
+bmat: Any
+mat: Any
+asmatrix: Any
diff --git a/numpy/matrixlib/setup.py b/numpy/matrixlib/setup.py
index 19b3bb2de..4fed75de1 100644
--- a/numpy/matrixlib/setup.py
+++ b/numpy/matrixlib/setup.py
@@ -3,6 +3,7 @@ def configuration(parent_package='', top_path=None):
from numpy.distutils.misc_util import Configuration
config = Configuration('matrixlib', parent_package, top_path)
config.add_subpackage('tests')
+ config.add_data_files('*.pyi')
return config
if __name__ == "__main__":
diff --git a/numpy/matrixlib/tests/test_masked_matrix.py b/numpy/matrixlib/tests/test_masked_matrix.py
index 45424ecf0..95d3f44b6 100644
--- a/numpy/matrixlib/tests/test_masked_matrix.py
+++ b/numpy/matrixlib/tests/test_masked_matrix.py
@@ -1,4 +1,5 @@
import numpy as np
+from numpy.testing import assert_warns
from numpy.ma.testutils import (assert_, assert_equal, assert_raises,
assert_array_equal)
from numpy.ma.core import (masked_array, masked_values, masked, allequal,
@@ -198,7 +199,8 @@ class TestSubclassing:
# Result should work
assert_equal(add(mx, x), mx+x)
assert_(isinstance(add(mx, mx)._data, np.matrix))
- assert_(isinstance(add.outer(mx, mx), MMatrix))
+ with assert_warns(DeprecationWarning):
+ assert_(isinstance(add.outer(mx, mx), MMatrix))
assert_(isinstance(hypot(mx, mx), MMatrix))
assert_(isinstance(hypot(mx, x), MMatrix))
diff --git a/numpy/polynomial/__init__.py b/numpy/polynomial/__init__.py
index 43b2caba3..c832094e2 100644
--- a/numpy/polynomial/__init__.py
+++ b/numpy/polynomial/__init__.py
@@ -12,6 +12,107 @@ all operations on polynomials, including evaluation at an argument, are
implemented as operations on the coefficients. Additional (module-specific)
information can be found in the docstring for the module of interest.
+This package provides *convenience classes* for each of six different kinds
+of polynomials:
+
+ ============ ================
+ **Name** **Provides**
+ ============ ================
+ Polynomial Power series
+ Chebyshev Chebyshev series
+ Legendre Legendre series
+ Laguerre Laguerre series
+ Hermite Hermite series
+ HermiteE HermiteE series
+ ============ ================
+
+These *convenience classes* provide a consistent interface for creating,
+manipulating, and fitting data with polynomials of different bases.
+The convenience classes are the preferred interface for the `~numpy.polynomial`
+package, and are available from the `numpy.polynomial` namespace.
+This eliminates the need to
+navigate to the corresponding submodules, e.g. ``np.polynomial.Polynomial``
+or ``np.polynomial.Chebyshev`` instead of
+``np.polynomial.polynomial.Polynomial`` or
+``np.polynomial.chebyshev.Chebyshev``, respectively.
+The classes provide a more consistent and concise interface than the
+type-specific functions defined in the submodules for each type of polynomial.
+For example, to fit a Chebyshev polynomial with degree ``1`` to data given
+by arrays ``xdata`` and ``ydata``, the
+`~chebyshev.Chebyshev.fit` class method::
+
+ >>> from numpy.polynomial import Chebyshev
+ >>> c = Chebyshev.fit(xdata, ydata, deg=1)
+
+is preferred over the `chebyshev.chebfit` function from the
+`numpy.polynomial.chebyshev` module::
+
+ >>> from numpy.polynomial.chebyshev import chebfit
+ >>> c = chebfit(xdata, ydata, deg=1)
+
+See :doc:`routines.polynomials.classes` for more details.
+
+Convenience Classes
+===================
+
+The following lists the various constants and methods common to all of
+the classes representing the various kinds of polynomials. In the following,
+the term ``Poly`` represents any one of the convenience classes (e.g.
+``Polynomial``, ``Chebyshev``, ``Hermite``, etc.) while the lowercase ``p``
+represents an **instance** of a polynomial class.
+
+Constants
+---------
+
+- ``Poly.domain`` -- Default domain
+- ``Poly.window`` -- Default window
+- ``Poly.basis_name`` -- String used to represent the basis
+- ``Poly.maxpower`` -- Maximum value ``n`` such that ``p**n`` is allowed
+- ``Poly.nickname`` -- String used in printing
+
+Creation
+--------
+
+Methods for creating polynomial instances.
+
+- ``Poly.basis(degree)`` -- Basis polynomial of given degree
+- ``Poly.identity()`` -- ``p`` where ``p(x) = x`` for all ``x``
+- ``Poly.fit(x, y, deg)`` -- ``p`` of degree ``deg`` with coefficients
+ determined by the least-squares fit to the data ``x``, ``y``
+- ``Poly.fromroots(roots)`` -- ``p`` with specified roots
+- ``p.copy()`` -- Create a copy of ``p``
+
+Conversion
+----------
+
+Methods for converting a polynomial instance of one kind to another.
+
+- ``p.cast(Poly)`` -- Convert ``p`` to instance of kind ``Poly``
+- ``p.convert(Poly)`` -- Convert ``p`` to instance of kind ``Poly`` or map
+ between ``domain`` and ``window``
+
+Calculus
+--------
+- ``p.deriv()`` -- Take the derivative of ``p``
+- ``p.integ()`` -- Integrate ``p``
+
+Validation
+----------
+- ``Poly.has_samecoef(p1, p2)`` -- Check if coefficients match
+- ``Poly.has_samedomain(p1, p2)`` -- Check if domains match
+- ``Poly.has_sametype(p1, p2)`` -- Check if types match
+- ``Poly.has_samewindow(p1, p2)`` -- Check if windows match
+
+Misc
+----
+- ``p.linspace()`` -- Return ``x, p(x)`` at equally-spaced points in ``domain``
+- ``p.mapparms()`` -- Return the parameters for the linear mapping between
+ ``domain`` and ``window``.
+- ``p.roots()`` -- Return the roots of `p`.
+- ``p.trim()`` -- Remove trailing coefficients.
+- ``p.cutdeg(degree)`` -- Truncate p to given degree
+- ``p.truncate(size)`` -- Truncate p to given size
+
"""
from .polynomial import Polynomial
from .chebyshev import Chebyshev
diff --git a/numpy/polynomial/__init__.pyi b/numpy/polynomial/__init__.pyi
new file mode 100644
index 000000000..817ba22ac
--- /dev/null
+++ b/numpy/polynomial/__init__.pyi
@@ -0,0 +1,9 @@
+from typing import Any
+
+Polynomial: Any
+Chebyshev: Any
+Legendre: Any
+Hermite: Any
+HermiteE: Any
+Laguerre: Any
+set_default_printstyle: Any
diff --git a/numpy/polynomial/_polybase.py b/numpy/polynomial/_polybase.py
index f4a67a222..59c380f10 100644
--- a/numpy/polynomial/_polybase.py
+++ b/numpy/polynomial/_polybase.py
@@ -547,8 +547,8 @@ class ABCPolyBase(abc.ABC):
othercoef = self._get_coefficients(other)
try:
quo, rem = self._div(self.coef, othercoef)
- except ZeroDivisionError as e:
- raise e
+ except ZeroDivisionError:
+ raise
except Exception:
return NotImplemented
quo = self.__class__(quo, self.domain, self.window)
@@ -605,8 +605,8 @@ class ABCPolyBase(abc.ABC):
def __rdivmod__(self, other):
try:
quo, rem = self._div(other, self.coef)
- except ZeroDivisionError as e:
- raise e
+ except ZeroDivisionError:
+ raise
except Exception:
return NotImplemented
quo = self.__class__(quo, self.domain, self.window)
diff --git a/numpy/polynomial/chebyshev.py b/numpy/polynomial/chebyshev.py
index d99fd98f5..6745c9371 100644
--- a/numpy/polynomial/chebyshev.py
+++ b/numpy/polynomial/chebyshev.py
@@ -477,8 +477,6 @@ def chebline(off, scl):
"""
Chebyshev series whose graph is a straight line.
-
-
Parameters
----------
off, scl : scalars
@@ -492,7 +490,11 @@ def chebline(off, scl):
See Also
--------
- polyline
+ numpy.polynomial.polynomial.polyline
+ numpy.polynomial.legendre.legline
+ numpy.polynomial.laguerre.lagline
+ numpy.polynomial.hermite.hermline
+ numpy.polynomial.hermite_e.hermeline
Examples
--------
@@ -545,7 +547,11 @@ def chebfromroots(roots):
See Also
--------
- polyfromroots, legfromroots, lagfromroots, hermfromroots, hermefromroots
+ numpy.polynomial.polynomial.polyfromroots
+ numpy.polynomial.legendre.legfromroots
+ numpy.polynomial.laguerre.lagfromroots
+ numpy.polynomial.hermite.hermfromroots
+ numpy.polynomial.hermite_e.hermefromroots
Examples
--------
@@ -764,7 +770,7 @@ def chebdiv(c1, c2):
See Also
--------
- chebadd, chebsub, chemulx, chebmul, chebpow
+ chebadd, chebsub, chebmulx, chebmul, chebpow
Notes
-----
@@ -1601,7 +1607,7 @@ def chebfit(x, y, deg, rcond=None, full=False, w=None):
sv -- singular values of the scaled Vandermonde matrix
rcond -- value of `rcond`.
- For more details, see `linalg.lstsq`.
+ For more details, see `numpy.linalg.lstsq`.
Warns
-----
@@ -1615,11 +1621,15 @@ def chebfit(x, y, deg, rcond=None, full=False, w=None):
See Also
--------
- polyfit, legfit, lagfit, hermfit, hermefit
+ numpy.polynomial.polynomial.polyfit
+ numpy.polynomial.legendre.legfit
+ numpy.polynomial.laguerre.lagfit
+ numpy.polynomial.hermite.hermfit
+ numpy.polynomial.hermite_e.hermefit
chebval : Evaluates a Chebyshev series.
chebvander : Vandermonde matrix of Chebyshev series.
chebweight : Chebyshev weight function.
- linalg.lstsq : Computes a least-squares fit from the matrix.
+ numpy.linalg.lstsq : Computes a least-squares fit from the matrix.
scipy.interpolate.UnivariateSpline : Computes spline fits.
Notes
@@ -1729,7 +1739,11 @@ def chebroots(c):
See Also
--------
- polyroots, legroots, lagroots, hermroots, hermeroots
+ numpy.polynomial.polynomial.polyroots
+ numpy.polynomial.legendre.legroots
+ numpy.polynomial.laguerre.lagroots
+ numpy.polynomial.hermite.hermroots
+ numpy.polynomial.hermite_e.hermeroots
Notes
-----
diff --git a/numpy/polynomial/hermite.py b/numpy/polynomial/hermite.py
index 280cad39e..c679c5298 100644
--- a/numpy/polynomial/hermite.py
+++ b/numpy/polynomial/hermite.py
@@ -233,7 +233,11 @@ def hermline(off, scl):
See Also
--------
- polyline, chebline
+ numpy.polynomial.polynomial.polyline
+ numpy.polynomial.chebyshev.chebline
+ numpy.polynomial.legendre.legline
+ numpy.polynomial.laguerre.lagline
+ numpy.polynomial.hermite_e.hermeline
Examples
--------
@@ -286,7 +290,11 @@ def hermfromroots(roots):
See Also
--------
- polyfromroots, legfromroots, lagfromroots, chebfromroots, hermefromroots
+ numpy.polynomial.polynomial.polyfromroots
+ numpy.polynomial.legendre.legfromroots
+ numpy.polynomial.laguerre.lagfromroots
+ numpy.polynomial.chebyshev.chebfromroots
+ numpy.polynomial.hermite_e.hermefromroots
Examples
--------
@@ -1322,7 +1330,7 @@ def hermfit(x, y, deg, rcond=None, full=False, w=None):
sv -- singular values of the scaled Vandermonde matrix
rcond -- value of `rcond`.
- For more details, see `linalg.lstsq`.
+ For more details, see `numpy.linalg.lstsq`.
Warns
-----
@@ -1336,11 +1344,15 @@ def hermfit(x, y, deg, rcond=None, full=False, w=None):
See Also
--------
- chebfit, legfit, lagfit, polyfit, hermefit
+ numpy.polynomial.chebyshev.chebfit
+ numpy.polynomial.legendre.legfit
+ numpy.polynomial.laguerre.lagfit
+ numpy.polynomial.polynomial.polyfit
+ numpy.polynomial.hermite_e.hermefit
hermval : Evaluates a Hermite series.
hermvander : Vandermonde matrix of Hermite series.
hermweight : Hermite weight function
- linalg.lstsq : Computes a least-squares fit from the matrix.
+ numpy.linalg.lstsq : Computes a least-squares fit from the matrix.
scipy.interpolate.UnivariateSpline : Computes spline fits.
Notes
@@ -1457,7 +1469,11 @@ def hermroots(c):
See Also
--------
- polyroots, legroots, lagroots, chebroots, hermeroots
+ numpy.polynomial.polynomial.polyroots
+ numpy.polynomial.legendre.legroots
+ numpy.polynomial.laguerre.lagroots
+ numpy.polynomial.chebyshev.chebroots
+ numpy.polynomial.hermite_e.hermeroots
Notes
-----
diff --git a/numpy/polynomial/hermite_e.py b/numpy/polynomial/hermite_e.py
index 9b3b25105..1ce8ebe04 100644
--- a/numpy/polynomial/hermite_e.py
+++ b/numpy/polynomial/hermite_e.py
@@ -218,8 +218,6 @@ def hermeline(off, scl):
"""
Hermite series whose graph is a straight line.
-
-
Parameters
----------
off, scl : scalars
@@ -233,7 +231,11 @@ def hermeline(off, scl):
See Also
--------
- polyline, chebline
+ numpy.polynomial.polynomial.polyline
+ numpy.polynomial.chebyshev.chebline
+ numpy.polynomial.legendre.legline
+ numpy.polynomial.laguerre.lagline
+ numpy.polynomial.hermite.hermline
Examples
--------
@@ -287,7 +289,11 @@ def hermefromroots(roots):
See Also
--------
- polyfromroots, legfromroots, lagfromroots, hermfromroots, chebfromroots
+ numpy.polynomial.polynomial.polyfromroots
+ numpy.polynomial.legendre.legfromroots
+ numpy.polynomial.laguerre.lagfromroots
+ numpy.polynomial.hermite.hermfromroots
+ numpy.polynomial.chebyshev.chebfromroots
Examples
--------
@@ -1315,7 +1321,7 @@ def hermefit(x, y, deg, rcond=None, full=False, w=None):
sv -- singular values of the scaled Vandermonde matrix
rcond -- value of `rcond`.
- For more details, see `linalg.lstsq`.
+ For more details, see `numpy.linalg.lstsq`.
Warns
-----
@@ -1329,11 +1335,15 @@ def hermefit(x, y, deg, rcond=None, full=False, w=None):
See Also
--------
- chebfit, legfit, polyfit, hermfit, polyfit
+ numpy.polynomial.chebyshev.chebfit
+ numpy.polynomial.legendre.legfit
+ numpy.polynomial.polynomial.polyfit
+ numpy.polynomial.hermite.hermfit
+ numpy.polynomial.laguerre.lagfit
hermeval : Evaluates a Hermite series.
hermevander : pseudo Vandermonde matrix of Hermite series.
hermeweight : HermiteE weight function.
- linalg.lstsq : Computes a least-squares fit from the matrix.
+ numpy.linalg.lstsq : Computes a least-squares fit from the matrix.
scipy.interpolate.UnivariateSpline : Computes spline fits.
Notes
@@ -1452,7 +1462,11 @@ def hermeroots(c):
See Also
--------
- polyroots, legroots, lagroots, hermroots, chebroots
+ numpy.polynomial.polynomial.polyroots
+ numpy.polynomial.legendre.legroots
+ numpy.polynomial.laguerre.lagroots
+ numpy.polynomial.hermite.hermroots
+ numpy.polynomial.chebyshev.chebroots
Notes
-----
diff --git a/numpy/polynomial/laguerre.py b/numpy/polynomial/laguerre.py
index c1db13215..9cff0b71c 100644
--- a/numpy/polynomial/laguerre.py
+++ b/numpy/polynomial/laguerre.py
@@ -214,8 +214,6 @@ def lagline(off, scl):
"""
Laguerre series whose graph is a straight line.
-
-
Parameters
----------
off, scl : scalars
@@ -229,7 +227,11 @@ def lagline(off, scl):
See Also
--------
- polyline, chebline
+ numpy.polynomial.polynomial.polyline
+ numpy.polynomial.chebyshev.chebline
+ numpy.polynomial.legendre.legline
+ numpy.polynomial.hermite.hermline
+ numpy.polynomial.hermite_e.hermeline
Examples
--------
@@ -282,7 +284,11 @@ def lagfromroots(roots):
See Also
--------
- polyfromroots, legfromroots, chebfromroots, hermfromroots, hermefromroots
+ numpy.polynomial.polynomial.polyfromroots
+ numpy.polynomial.legendre.legfromroots
+ numpy.polynomial.chebyshev.chebfromroots
+ numpy.polynomial.hermite.hermfromroots
+ numpy.polynomial.hermite_e.hermefromroots
Examples
--------
@@ -1321,7 +1327,7 @@ def lagfit(x, y, deg, rcond=None, full=False, w=None):
sv -- singular values of the scaled Vandermonde matrix
rcond -- value of `rcond`.
- For more details, see `linalg.lstsq`.
+ For more details, see `numpy.linalg.lstsq`.
Warns
-----
@@ -1335,11 +1341,15 @@ def lagfit(x, y, deg, rcond=None, full=False, w=None):
See Also
--------
- chebfit, legfit, polyfit, hermfit, hermefit
+ numpy.polynomial.polynomial.polyfit
+ numpy.polynomial.legendre.legfit
+ numpy.polynomial.chebyshev.chebfit
+ numpy.polynomial.hermite.hermfit
+ numpy.polynomial.hermite_e.hermefit
lagval : Evaluates a Laguerre series.
lagvander : pseudo Vandermonde matrix of Laguerre series.
lagweight : Laguerre weight function.
- linalg.lstsq : Computes a least-squares fit from the matrix.
+ numpy.linalg.lstsq : Computes a least-squares fit from the matrix.
scipy.interpolate.UnivariateSpline : Computes spline fits.
Notes
@@ -1455,7 +1465,11 @@ def lagroots(c):
See Also
--------
- polyroots, legroots, chebroots, hermroots, hermeroots
+ numpy.polynomial.polynomial.polyroots
+ numpy.polynomial.legendre.legroots
+ numpy.polynomial.chebyshev.chebroots
+ numpy.polynomial.hermite.hermroots
+ numpy.polynomial.hermite_e.hermeroots
Notes
-----
diff --git a/numpy/polynomial/legendre.py b/numpy/polynomial/legendre.py
index 7b5b665f2..427f9f82f 100644
--- a/numpy/polynomial/legendre.py
+++ b/numpy/polynomial/legendre.py
@@ -243,7 +243,11 @@ def legline(off, scl):
See Also
--------
- polyline, chebline
+ numpy.polynomial.polynomial.polyline
+ numpy.polynomial.chebyshev.chebline
+ numpy.polynomial.laguerre.lagline
+ numpy.polynomial.hermite.hermline
+ numpy.polynomial.hermite_e.hermeline
Examples
--------
@@ -296,7 +300,11 @@ def legfromroots(roots):
See Also
--------
- polyfromroots, chebfromroots, lagfromroots, hermfromroots, hermefromroots
+ numpy.polynomial.polynomial.polyfromroots
+ numpy.polynomial.chebyshev.chebfromroots
+ numpy.polynomial.laguerre.lagfromroots
+ numpy.polynomial.hermite.hermfromroots
+ numpy.polynomial.hermite_e.hermefromroots
Examples
--------
@@ -1343,7 +1351,7 @@ def legfit(x, y, deg, rcond=None, full=False, w=None):
sv -- singular values of the scaled Vandermonde matrix
rcond -- value of `rcond`.
- For more details, see `linalg.lstsq`.
+ For more details, see `numpy.linalg.lstsq`.
Warns
-----
@@ -1357,11 +1365,15 @@ def legfit(x, y, deg, rcond=None, full=False, w=None):
See Also
--------
- chebfit, polyfit, lagfit, hermfit, hermefit
+ numpy.polynomial.polynomial.polyfit
+ numpy.polynomial.chebyshev.chebfit
+ numpy.polynomial.laguerre.lagfit
+ numpy.polynomial.hermite.hermfit
+ numpy.polynomial.hermite_e.hermefit
legval : Evaluates a Legendre series.
legvander : Vandermonde matrix of Legendre series.
legweight : Legendre weight function (= 1).
- linalg.lstsq : Computes a least-squares fit from the matrix.
+ numpy.linalg.lstsq : Computes a least-squares fit from the matrix.
scipy.interpolate.UnivariateSpline : Computes spline fits.
Notes
@@ -1470,7 +1482,11 @@ def legroots(c):
See Also
--------
- polyroots, chebroots, lagroots, hermroots, hermeroots
+ numpy.polynomial.polynomial.polyroots
+ numpy.polynomial.chebyshev.chebroots
+ numpy.polynomial.laguerre.lagroots
+ numpy.polynomial.hermite.hermroots
+ numpy.polynomial.hermite_e.hermeroots
Notes
-----
diff --git a/numpy/polynomial/polynomial.py b/numpy/polynomial/polynomial.py
index 83693441f..1baa7d870 100644
--- a/numpy/polynomial/polynomial.py
+++ b/numpy/polynomial/polynomial.py
@@ -127,7 +127,11 @@ def polyline(off, scl):
See Also
--------
- chebline
+ numpy.polynomial.chebyshev.chebline
+ numpy.polynomial.legendre.legline
+ numpy.polynomial.laguerre.lagline
+ numpy.polynomial.hermite.hermline
+ numpy.polynomial.hermite_e.hermeline
Examples
--------
@@ -179,8 +183,11 @@ def polyfromroots(roots):
See Also
--------
- chebfromroots, legfromroots, lagfromroots, hermfromroots
- hermefromroots
+ numpy.polynomial.chebyshev.chebfromroots
+ numpy.polynomial.legendre.legfromroots
+ numpy.polynomial.laguerre.lagfromroots
+ numpy.polynomial.hermite.hermfromroots
+ numpy.polynomial.hermite_e.hermefromroots
Notes
-----
@@ -1267,7 +1274,7 @@ def polyfit(x, y, deg, rcond=None, full=False, w=None):
sv -- singular values of the scaled Vandermonde matrix
rcond -- value of `rcond`.
- For more details, see `linalg.lstsq`.
+ For more details, see `numpy.linalg.lstsq`.
Raises
------
@@ -1281,10 +1288,14 @@ def polyfit(x, y, deg, rcond=None, full=False, w=None):
See Also
--------
- chebfit, legfit, lagfit, hermfit, hermefit
+ numpy.polynomial.chebyshev.chebfit
+ numpy.polynomial.legendre.legfit
+ numpy.polynomial.laguerre.lagfit
+ numpy.polynomial.hermite.hermfit
+ numpy.polynomial.hermite_e.hermefit
polyval : Evaluates a polynomial.
polyvander : Vandermonde matrix for powers.
- linalg.lstsq : Computes a least-squares fit from the matrix.
+ numpy.linalg.lstsq : Computes a least-squares fit from the matrix.
scipy.interpolate.UnivariateSpline : Computes spline fits.
Notes
@@ -1411,7 +1422,11 @@ def polyroots(c):
See Also
--------
- chebroots
+ numpy.polynomial.chebyshev.chebroots
+ numpy.polynomial.legendre.legroots
+ numpy.polynomial.laguerre.lagroots
+ numpy.polynomial.hermite.hermroots
+ numpy.polynomial.hermite_e.hermeroots
Notes
-----
diff --git a/numpy/polynomial/setup.py b/numpy/polynomial/setup.py
index 641464518..b58e867a1 100644
--- a/numpy/polynomial/setup.py
+++ b/numpy/polynomial/setup.py
@@ -2,6 +2,7 @@ def configuration(parent_package='',top_path=None):
from numpy.distutils.misc_util import Configuration
config = Configuration('polynomial', parent_package, top_path)
config.add_subpackage('tests')
+ config.add_data_files('*.pyi')
return config
if __name__ == '__main__':
diff --git a/numpy/polynomial/tests/test_printing.py b/numpy/polynomial/tests/test_printing.py
index 27a8ab9f2..4e9902a69 100644
--- a/numpy/polynomial/tests/test_printing.py
+++ b/numpy/polynomial/tests/test_printing.py
@@ -85,7 +85,7 @@ class TestStrUnicodeSuperSubscripts:
class TestStrAscii:
@pytest.fixture(scope='class', autouse=True)
- def use_unicode(self):
+ def use_ascii(self):
poly.set_default_printstyle('ascii')
@pytest.mark.parametrize(('inp', 'tgt'), (
@@ -161,7 +161,10 @@ class TestStrAscii:
class TestLinebreaking:
- poly.set_default_printstyle('ascii')
+
+ @pytest.fixture(scope='class', autouse=True)
+ def use_ascii(self):
+ poly.set_default_printstyle('ascii')
def test_single_line_one_less(self):
# With 'ascii' style, len(str(p)) is default linewidth - 1 (i.e. 74)
@@ -283,19 +286,19 @@ def test_nonnumeric_object_coefficients(coefs, tgt):
class TestFormat:
def test_format_unicode(self):
- poly.Polynomial._use_unicode = False
+ poly.set_default_printstyle('ascii')
p = poly.Polynomial([1, 2, 0, -1])
assert_equal(format(p, 'unicode'), "1.0 + 2.0·x¹ + 0.0·x² - 1.0·x³")
def test_format_ascii(self):
- poly.Polynomial._use_unicode = True
+ poly.set_default_printstyle('unicode')
p = poly.Polynomial([1, 2, 0, -1])
assert_equal(
format(p, 'ascii'), "1.0 + 2.0 x**1 + 0.0 x**2 - 1.0 x**3"
)
def test_empty_formatstr(self):
- poly.Polynomial._use_unicode = False
+ poly.set_default_printstyle('ascii')
p = poly.Polynomial([1, 2, 3])
assert_equal(format(p), "1.0 + 2.0 x**1 + 3.0 x**2")
assert_equal(f"{p}", "1.0 + 2.0 x**1 + 3.0 x**2")
diff --git a/numpy/random/__init__.pyi b/numpy/random/__init__.pyi
new file mode 100644
index 000000000..f7c3cfafe
--- /dev/null
+++ b/numpy/random/__init__.pyi
@@ -0,0 +1,61 @@
+from typing import Any
+
+beta: Any
+binomial: Any
+bytes: Any
+chisquare: Any
+choice: Any
+dirichlet: Any
+exponential: Any
+f: Any
+gamma: Any
+geometric: Any
+get_state: Any
+gumbel: Any
+hypergeometric: Any
+laplace: Any
+logistic: Any
+lognormal: Any
+logseries: Any
+multinomial: Any
+multivariate_normal: Any
+negative_binomial: Any
+noncentral_chisquare: Any
+noncentral_f: Any
+normal: Any
+pareto: Any
+permutation: Any
+poisson: Any
+power: Any
+rand: Any
+randint: Any
+randn: Any
+random: Any
+random_integers: Any
+random_sample: Any
+ranf: Any
+rayleigh: Any
+sample: Any
+seed: Any
+set_state: Any
+shuffle: Any
+standard_cauchy: Any
+standard_exponential: Any
+standard_gamma: Any
+standard_normal: Any
+standard_t: Any
+triangular: Any
+uniform: Any
+vonmises: Any
+wald: Any
+weibull: Any
+zipf: Any
+Generator: Any
+RandomState: Any
+SeedSequence: Any
+MT19937: Any
+Philox: Any
+PCG64: Any
+SFC64: Any
+default_rng: Any
+BitGenerator: Any
diff --git a/numpy/random/_common.pxd b/numpy/random/_common.pxd
index 588f613ae..4f404b7a1 100644
--- a/numpy/random/_common.pxd
+++ b/numpy/random/_common.pxd
@@ -77,6 +77,8 @@ cdef object wrap_int(object val, object bits)
cdef np.ndarray int_to_array(object value, object name, object bits, object uint_size)
+cdef validate_output_shape(iter_shape, np.ndarray output)
+
cdef object cont(void *func, void *state, object size, object lock, int narg,
object a, object a_name, constraint_type a_constraint,
object b, object b_name, constraint_type b_constraint,
diff --git a/numpy/random/_examples/cython/setup.py b/numpy/random/_examples/cython/setup.py
index 42425c2c1..83f06fde8 100644
--- a/numpy/random/_examples/cython/setup.py
+++ b/numpy/random/_examples/cython/setup.py
@@ -19,7 +19,7 @@ inc_path = np.get_include()
lib_path = join(np.get_include(), '..', '..', 'random', 'lib')
extending = Extension("extending",
- sources=[join(path, 'extending.pyx')],
+ sources=[join('.', 'extending.pyx')],
include_dirs=[
np.get_include(),
join(path, '..', '..')
@@ -27,7 +27,7 @@ extending = Extension("extending",
define_macros=defs,
)
distributions = Extension("extending_distributions",
- sources=[join(path, 'extending_distributions.pyx')],
+ sources=[join('.', 'extending_distributions.pyx')],
include_dirs=[inc_path],
library_dirs=[lib_path],
libraries=['npyrandom'],
diff --git a/numpy/random/_examples/numba/extending.py b/numpy/random/_examples/numba/extending.py
index 0d240596b..f387db695 100644
--- a/numpy/random/_examples/numba/extending.py
+++ b/numpy/random/_examples/numba/extending.py
@@ -44,9 +44,9 @@ assert r1.shape == (n,)
assert r1.shape == r2.shape
t1 = timeit(numbacall, number=1000)
-print('{:.2f} secs for {} PCG64 (Numba/PCG64) gaussian randoms'.format(t1, n))
+print(f'{t1:.2f} secs for {n} PCG64 (Numba/PCG64) gaussian randoms')
t2 = timeit(numpycall, number=1000)
-print('{:.2f} secs for {} PCG64 (NumPy/PCG64) gaussian randoms'.format(t2, n))
+print(f'{t2:.2f} secs for {n} PCG64 (NumPy/PCG64) gaussian randoms')
# example 2
diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx
index cc2852da7..7ffa36775 100644
--- a/numpy/random/_generator.pyx
+++ b/numpy/random/_generator.pyx
@@ -5,6 +5,7 @@ import warnings
from cpython.pycapsule cimport PyCapsule_IsValid, PyCapsule_GetPointer
from cpython cimport (Py_INCREF, PyFloat_AsDouble)
+from cpython.mem cimport PyMem_Malloc, PyMem_Free
cimport cython
import numpy as np
@@ -25,8 +26,16 @@ from ._common cimport (POISSON_LAM_MAX, CONS_POSITIVE, CONS_NONE,
CONS_GT_1, CONS_POSITIVE_NOT_NAN, CONS_POISSON,
double_fill, cont, kahan_sum, cont_broadcast_3, float_fill, cont_f,
check_array_constraint, check_constraint, disc, discrete_broadcast_iii,
+ validate_output_shape
)
+cdef extern from "numpy/arrayobject.h":
+ int PyArray_ResolveWritebackIfCopy(np.ndarray)
+ object PyArray_FromArray(np.PyArrayObject *, np.PyArray_Descr *, int)
+
+ enum:
+ NPY_ARRAY_WRITEBACKIFCOPY
+
np.import_array()
cdef int64_t _safe_sum_nonneg_int64(size_t num_colors, int64_t *colors):
@@ -47,6 +56,77 @@ cdef int64_t _safe_sum_nonneg_int64(size_t num_colors, int64_t *colors):
return sum
+cdef inline void _shuffle_raw_wrap(bitgen_t *bitgen, np.npy_intp n,
+ np.npy_intp first, np.npy_intp itemsize,
+ np.npy_intp stride,
+ char* data, char* buf) nogil:
+ # We trick gcc into providing a specialized implementation for
+ # the most common case, yielding a ~33% performance improvement.
+ # Note that apparently, only one branch can ever be specialized.
+ if itemsize == sizeof(np.npy_intp):
+ _shuffle_raw(bitgen, n, first, sizeof(np.npy_intp), stride, data, buf)
+ else:
+ _shuffle_raw(bitgen, n, first, itemsize, stride, data, buf)
+
+
+cdef inline void _shuffle_raw(bitgen_t *bitgen, np.npy_intp n,
+ np.npy_intp first, np.npy_intp itemsize,
+ np.npy_intp stride,
+ char* data, char* buf) nogil:
+ """
+ Parameters
+ ----------
+ bitgen
+ Pointer to a bitgen_t instance.
+ n
+ Number of elements in data
+ first
+ First observation to shuffle. Shuffles n-1,
+ n-2, ..., first, so that when first=1 the entire
+ array is shuffled
+ itemsize
+ Size in bytes of item
+ stride
+ Array stride
+ data
+ Location of data
+ buf
+ Location of buffer (itemsize)
+ """
+ cdef np.npy_intp i, j
+
+ for i in reversed(range(first, n)):
+ j = random_interval(bitgen, i)
+ string.memcpy(buf, data + j * stride, itemsize)
+ string.memcpy(data + j * stride, data + i * stride, itemsize)
+ string.memcpy(data + i * stride, buf, itemsize)
+
+
+cdef inline void _shuffle_int(bitgen_t *bitgen, np.npy_intp n,
+ np.npy_intp first, int64_t* data) nogil:
+ """
+ Parameters
+ ----------
+ bitgen
+ Pointer to a bitgen_t instance.
+ n
+ Number of elements in data
+ first
+ First observation to shuffle. Shuffles n-1,
+ n-2, ..., first, so that when first=1 the entire
+ array is shuffled
+ data
+ Location of data
+ """
+ cdef np.npy_intp i, j
+ cdef int64_t temp
+ for i in reversed(range(first, n)):
+ j = random_bounded_uint64(bitgen, 0, i, 0, 0)
+ temp = data[j]
+ data[j] = data[i]
+ data[i] = temp
+
+
cdef bint _check_bit_generator(object bitgen):
"""Check if an object satisfies the BitGenerator interface.
"""
@@ -707,8 +787,8 @@ cdef class Generator:
idx = np.PyArray_Arange(0, pop_size_i, 1, np.NPY_INT64)
idx_data = <int64_t*>(<np.ndarray>idx).data
with self.lock, nogil:
- self._shuffle_int(pop_size_i, max(pop_size_i - size_i, 1),
- idx_data)
+ _shuffle_int(&self._bitgen, pop_size_i,
+ max(pop_size_i - size_i, 1), idx_data)
# Copy to allow potentially large array backing idx to be gc
idx = idx[(pop_size - size):].copy()
else:
@@ -736,7 +816,7 @@ cdef class Generator:
hash_set[loc] = j
idx_data[j - pop_size_i + size_i] = j
if shuffle:
- self._shuffle_int(size_i, 1, idx_data)
+ _shuffle_int(&self._bitgen, size_i, 1, idx_data)
idx.shape = shape
if is_scalar and isinstance(idx, np.ndarray):
@@ -2809,6 +2889,7 @@ cdef class Generator:
cnt = np.PyArray_SIZE(randoms)
it = np.PyArray_MultiIterNew3(randoms, p_arr, n_arr)
+ validate_output_shape(it.shape, randoms)
with self.lock, nogil:
for i in range(cnt):
_dp = (<double*>np.PyArray_MultiIter_DATA(it, 1))[0]
@@ -3606,7 +3687,7 @@ cdef class Generator:
Now, do one experiment throwing the dice 10 time, and 10 times again,
and another throwing the dice 20 times, and 20 times again:
- >>> rng.multinomial([[10], [20]], [1/6.]*6, size=2)
+ >>> rng.multinomial([[10], [20]], [1/6.]*6, size=(2, 2))
array([[[2, 4, 0, 1, 2, 1],
[1, 3, 0, 3, 1, 2]],
[[1, 4, 4, 4, 4, 3],
@@ -3661,6 +3742,7 @@ cdef class Generator:
temp = np.empty(size, dtype=np.int8)
temp_arr = <np.ndarray>temp
it = np.PyArray_MultiIterNew2(on, temp_arr)
+ validate_output_shape(it.shape, temp_arr)
shape = it.shape + (d,)
multin = np.zeros(shape, dtype=np.int64)
mnarr = <np.ndarray>multin
@@ -3941,7 +4023,7 @@ cdef class Generator:
The drawn samples, of shape ``(size, k)``.
Raises
- -------
+ ------
ValueError
If any value in ``alpha`` is less than or equal to zero
@@ -4111,7 +4193,159 @@ cdef class Generator:
return diric
- # Shuffling and permutations:
+ def permuted(self, object x, *, axis=None, out=None):
+ """
+ permuted(x, axis=None, out=None)
+
+ Randomly permute `x` along axis `axis`.
+
+ Unlike `shuffle`, each slice along the given axis is shuffled
+ independently of the others.
+
+ Parameters
+ ----------
+ x : array_like, at least one-dimensional
+ Array to be shuffled.
+ axis : int, optional
+ Slices of `x` in this axis are shuffled. Each slice
+ is shuffled independently of the others. If `axis` is
+ None, the flattened array is shuffled.
+ out : ndarray, optional
+        If given, this is the destination of the shuffled array.
+ If `out` is None, a shuffled copy of the array is returned.
+
+ Returns
+ -------
+ ndarray
+ If `out` is None, a shuffled copy of `x` is returned.
+ Otherwise, the shuffled array is stored in `out`,
+ and `out` is returned
+
+ See Also
+ --------
+ shuffle
+ permutation
+
+ Examples
+ --------
+ Create a `numpy.random.Generator` instance:
+
+ >>> rng = np.random.default_rng()
+
+ Create a test array:
+
+ >>> x = np.arange(24).reshape(3, 8)
+ >>> x
+ array([[ 0, 1, 2, 3, 4, 5, 6, 7],
+ [ 8, 9, 10, 11, 12, 13, 14, 15],
+ [16, 17, 18, 19, 20, 21, 22, 23]])
+
+ Shuffle the rows of `x`:
+
+ >>> y = rng.permuted(x, axis=1)
+ >>> y
+ array([[ 4, 3, 6, 7, 1, 2, 5, 0], # random
+ [15, 10, 14, 9, 12, 11, 8, 13],
+ [17, 16, 20, 21, 18, 22, 23, 19]])
+
+ `x` has not been modified:
+
+ >>> x
+ array([[ 0, 1, 2, 3, 4, 5, 6, 7],
+ [ 8, 9, 10, 11, 12, 13, 14, 15],
+ [16, 17, 18, 19, 20, 21, 22, 23]])
+
+ To shuffle the rows of `x` in-place, pass `x` as the `out`
+ parameter:
+
+ >>> y = rng.permuted(x, axis=1, out=x)
+ >>> x
+ array([[ 3, 0, 4, 7, 1, 6, 2, 5], # random
+ [ 8, 14, 13, 9, 12, 11, 15, 10],
+ [17, 18, 16, 22, 19, 23, 20, 21]])
+
+ Note that when the ``out`` parameter is given, the return
+ value is ``out``:
+
+ >>> y is x
+ True
+ """
+
+ cdef int ax
+ cdef np.npy_intp axlen, axstride, itemsize
+ cdef void *buf
+ cdef np.flatiter it
+ cdef np.ndarray to_shuffle
+ cdef int status
+ cdef int flags
+
+ x = np.asarray(x)
+
+ if out is None:
+ out = x.copy(order='K')
+ else:
+ if type(out) is not np.ndarray:
+ raise TypeError('out must be a numpy array')
+ if out.shape != x.shape:
+ raise ValueError('out must have the same shape as x')
+ np.copyto(out, x, casting='safe')
+
+ if axis is None:
+ if x.ndim > 1:
+ if not (np.PyArray_FLAGS(out) & (np.NPY_ARRAY_C_CONTIGUOUS |
+ np.NPY_ARRAY_F_CONTIGUOUS)):
+ flags = (np.NPY_ARRAY_C_CONTIGUOUS |
+ NPY_ARRAY_WRITEBACKIFCOPY)
+ to_shuffle = PyArray_FromArray(<np.PyArrayObject *>out,
+ <np.PyArray_Descr *>NULL, flags)
+ self.shuffle(to_shuffle.ravel(order='K'))
+ # Because we only execute this block if out is not
+ # contiguous, we know this call will always result in a
+ # copy of to_shuffle back to out. I.e. status will be 1.
+ status = PyArray_ResolveWritebackIfCopy(to_shuffle)
+ assert status == 1
+ else:
+ # out is n-d with n > 1, but is either C- or F-contiguous,
+ # so we know out.ravel(order='A') is a view.
+ self.shuffle(out.ravel(order='A'))
+ else:
+ # out is 1-d
+ self.shuffle(out)
+ return out
+
+ ax = normalize_axis_index(axis, np.ndim(out))
+ itemsize = out.itemsize
+ axlen = out.shape[ax]
+ axstride = out.strides[ax]
+
+ it = np.PyArray_IterAllButAxis(out, &ax)
+
+ buf = PyMem_Malloc(itemsize)
+ if buf == NULL:
+ raise MemoryError('memory allocation failed in permuted')
+
+ if out.dtype.hasobject:
+ # Keep the GIL when shuffling an object array.
+ with self.lock:
+ while np.PyArray_ITER_NOTDONE(it):
+ _shuffle_raw_wrap(&self._bitgen, axlen, 0, itemsize,
+ axstride,
+ <char *>np.PyArray_ITER_DATA(it),
+ <char *>buf)
+ np.PyArray_ITER_NEXT(it)
+ else:
+ # out is not an object array, so we can release the GIL.
+ with self.lock, nogil:
+ while np.PyArray_ITER_NOTDONE(it):
+ _shuffle_raw_wrap(&self._bitgen, axlen, 0, itemsize,
+ axstride,
+ <char *>np.PyArray_ITER_DATA(it),
+ <char *>buf)
+ np.PyArray_ITER_NEXT(it)
+
+ PyMem_Free(buf)
+ return out
+
def shuffle(self, object x, axis=0):
"""
shuffle(x, axis=0)
@@ -4174,14 +4408,15 @@ cdef class Generator:
# when the function exits.
buf = np.empty(itemsize, dtype=np.int8) # GC'd at function exit
buf_ptr = <char*><size_t>np.PyArray_DATA(buf)
- with self.lock:
- # We trick gcc into providing a specialized implementation for
- # the most common case, yielding a ~33% performance improvement.
- # Note that apparently, only one branch can ever be specialized.
- if itemsize == sizeof(np.npy_intp):
- self._shuffle_raw(n, 1, sizeof(np.npy_intp), stride, x_ptr, buf_ptr)
- else:
- self._shuffle_raw(n, 1, itemsize, stride, x_ptr, buf_ptr)
+ if x.dtype.hasobject:
+ with self.lock:
+ _shuffle_raw_wrap(&self._bitgen, n, 1, itemsize, stride,
+ x_ptr, buf_ptr)
+ else:
+ # Same as above, but the GIL is released.
+ with self.lock, nogil:
+ _shuffle_raw_wrap(&self._bitgen, n, 1, itemsize, stride,
+ x_ptr, buf_ptr)
elif isinstance(x, np.ndarray) and x.ndim and x.size:
x = np.swapaxes(x, 0, axis)
buf = np.empty_like(x[0, ...])
@@ -4204,56 +4439,6 @@ cdef class Generator:
j = random_interval(&self._bitgen, i)
x[i], x[j] = x[j], x[i]
- cdef inline _shuffle_raw(self, np.npy_intp n, np.npy_intp first,
- np.npy_intp itemsize, np.npy_intp stride,
- char* data, char* buf):
- """
- Parameters
- ----------
- n
- Number of elements in data
- first
- First observation to shuffle. Shuffles n-1,
- n-2, ..., first, so that when first=1 the entire
- array is shuffled
- itemsize
- Size in bytes of item
- stride
- Array stride
- data
- Location of data
- buf
- Location of buffer (itemsize)
- """
- cdef np.npy_intp i, j
- for i in reversed(range(first, n)):
- j = random_interval(&self._bitgen, i)
- string.memcpy(buf, data + j * stride, itemsize)
- string.memcpy(data + j * stride, data + i * stride, itemsize)
- string.memcpy(data + i * stride, buf, itemsize)
-
- cdef inline void _shuffle_int(self, np.npy_intp n, np.npy_intp first,
- int64_t* data) nogil:
- """
- Parameters
- ----------
- n
- Number of elements in data
- first
- First observation to shuffle. Shuffles n-1,
- n-2, ..., first, so that when first=1 the entire
- array is shuffled
- data
- Location of data
- """
- cdef np.npy_intp i, j
- cdef int64_t temp
- for i in reversed(range(first, n)):
- j = random_bounded_uint64(&self._bitgen, 0, i, 0, 0)
- temp = data[j]
- data[j] = data[i]
- data[i] = temp
-
def permutation(self, object x, axis=0):
"""
permutation(x, axis=0)
@@ -4336,7 +4521,7 @@ def default_rng(seed=None):
unpredictable entropy will be pulled from the OS. If an ``int`` or
``array_like[ints]`` is passed, then it will be passed to
`SeedSequence` to derive the initial `BitGenerator` state. One may also
- pass in a`SeedSequence` instance
+ pass in a `SeedSequence` instance.
Additionally, when passed a `BitGenerator`, it will be wrapped by
`Generator`. If passed a `Generator`, it will be returned unaltered.
diff --git a/numpy/random/bit_generator.pxd b/numpy/random/bit_generator.pxd
index bd5e47a20..dfa7d0a71 100644
--- a/numpy/random/bit_generator.pxd
+++ b/numpy/random/bit_generator.pxd
@@ -23,7 +23,7 @@ cdef class BitGenerator():
cdef class SeedSequence():
cdef readonly object entropy
cdef readonly tuple spawn_key
- cdef readonly uint32_t pool_size
+ cdef readonly Py_ssize_t pool_size
cdef readonly object pool
cdef readonly uint32_t n_children_spawned
diff --git a/numpy/random/mtrand.pyx b/numpy/random/mtrand.pyx
index 8820a6e09..d43e7f5aa 100644
--- a/numpy/random/mtrand.pyx
+++ b/numpy/random/mtrand.pyx
@@ -22,6 +22,7 @@ from ._common cimport (POISSON_LAM_MAX, CONS_POSITIVE, CONS_NONE,
CONS_GT_1, LEGACY_CONS_POISSON,
double_fill, cont, kahan_sum, cont_broadcast_3,
check_array_constraint, check_constraint, disc, discrete_broadcast_iii,
+ validate_output_shape
)
cdef extern from "numpy/random/distributions.h":
@@ -382,7 +383,7 @@ cdef class RandomState:
.. note::
New code should use the ``random`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -452,7 +453,7 @@ cdef class RandomState:
.. note::
New code should use the ``beta`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -502,7 +503,7 @@ cdef class RandomState:
.. note::
New code should use the ``exponential`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -551,7 +552,7 @@ cdef class RandomState:
.. note::
New code should use the ``standard_exponential`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -652,7 +653,7 @@ cdef class RandomState:
.. note::
New code should use the ``integers`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -773,7 +774,7 @@ cdef class RandomState:
.. note::
New code should use the ``bytes`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -811,7 +812,7 @@ cdef class RandomState:
.. note::
New code should use the ``choice`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -1016,7 +1017,7 @@ cdef class RandomState:
.. note::
New code should use the ``uniform`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -1184,7 +1185,7 @@ cdef class RandomState:
.. note::
New code should use the ``standard_normal`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
If positive int_like arguments are provided, `randn` generates an array
of shape ``(d0, d1, ..., dn)``, filled
@@ -1338,7 +1339,7 @@ cdef class RandomState:
.. note::
New code should use the ``standard_normal`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -1413,7 +1414,7 @@ cdef class RandomState:
.. note::
New code should use the ``normal`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -1513,7 +1514,7 @@ cdef class RandomState:
.. note::
New code should use the ``standard_gamma`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -1594,7 +1595,7 @@ cdef class RandomState:
.. note::
New code should use the ``gamma`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -1683,7 +1684,7 @@ cdef class RandomState:
.. note::
New code should use the ``f`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -1771,7 +1772,7 @@ cdef class RandomState:
.. note::
New code should use the ``noncentral_f`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -1856,7 +1857,7 @@ cdef class RandomState:
.. note::
New code should use the ``chisquare`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -1929,7 +1930,7 @@ cdef class RandomState:
.. note::
New code should use the ``noncentral_chisquare`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -2015,7 +2016,7 @@ cdef class RandomState:
.. note::
New code should use the ``standard_cauchy`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -2091,7 +2092,7 @@ cdef class RandomState:
.. note::
New code should use the ``standard_t`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -2196,7 +2197,7 @@ cdef class RandomState:
.. note::
New code should use the ``vonmises`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -2294,7 +2295,7 @@ cdef class RandomState:
.. note::
New code should use the ``pareto`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -2388,7 +2389,7 @@ cdef class RandomState:
.. note::
New code should use the ``weibull`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -2484,7 +2485,7 @@ cdef class RandomState:
.. note::
New code should use the ``power`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -2595,7 +2596,7 @@ cdef class RandomState:
.. note::
New code should use the ``laplace`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -2686,7 +2687,7 @@ cdef class RandomState:
.. note::
New code should use the ``gumbel`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -2808,7 +2809,7 @@ cdef class RandomState:
.. note::
New code should use the ``logistic`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -2895,7 +2896,7 @@ cdef class RandomState:
.. note::
New code should use the ``lognormal`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -3008,7 +3009,7 @@ cdef class RandomState:
.. note::
New code should use the ``rayleigh`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -3090,7 +3091,7 @@ cdef class RandomState:
.. note::
New code should use the ``wald`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -3163,7 +3164,7 @@ cdef class RandomState:
.. note::
New code should use the ``triangular`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -3270,7 +3271,7 @@ cdef class RandomState:
.. note::
New code should use the ``binomial`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -3374,6 +3375,7 @@ cdef class RandomState:
cnt = np.PyArray_SIZE(randoms)
it = np.PyArray_MultiIterNew3(randoms, p_arr, n_arr)
+ validate_output_shape(it.shape, randoms)
with self.lock, nogil:
for i in range(cnt):
_dp = (<double*>np.PyArray_MultiIter_DATA(it, 1))[0]
@@ -3419,7 +3421,7 @@ cdef class RandomState:
.. note::
New code should use the ``negative_binomial`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -3504,7 +3506,7 @@ cdef class RandomState:
.. note::
New code should use the ``poisson`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -3590,7 +3592,7 @@ cdef class RandomState:
.. note::
New code should use the ``zipf`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -3680,7 +3682,7 @@ cdef class RandomState:
.. note::
New code should use the ``geometric`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -3734,7 +3736,7 @@ cdef class RandomState:
.. note::
New code should use the ``hypergeometric`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -3865,7 +3867,7 @@ cdef class RandomState:
.. note::
New code should use the ``logseries`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -3958,7 +3960,7 @@ cdef class RandomState:
.. note::
New code should use the ``multivariate_normal`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -4132,7 +4134,7 @@ cdef class RandomState:
.. note::
New code should use the ``multinomial`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -4250,7 +4252,7 @@ cdef class RandomState:
.. note::
New code should use the ``dirichlet`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -4396,7 +4398,7 @@ cdef class RandomState:
.. note::
New code should use the ``shuffle`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
@@ -4491,7 +4493,7 @@ cdef class RandomState:
.. note::
New code should use the ``permutation`` method of a ``default_rng()``
- instance instead; see `random-quick-start`.
+ instance instead; please see the :ref:`random-quick-start`.
Parameters
----------
diff --git a/numpy/random/setup.py b/numpy/random/setup.py
index 88ddb1268..bfd08e469 100644
--- a/numpy/random/setup.py
+++ b/numpy/random/setup.py
@@ -137,6 +137,7 @@ def configuration(parent_package='', top_path=None):
define_macros=defs + LEGACY_DEFS,
)
config.add_data_files(*depends)
+ config.add_data_files('*.pyi')
return config
diff --git a/numpy/random/src/pcg64/pcg64.h b/numpy/random/src/pcg64/pcg64.h
index 2a7217dd9..31899cbc1 100644
--- a/numpy/random/src/pcg64/pcg64.h
+++ b/numpy/random/src/pcg64/pcg64.h
@@ -57,11 +57,11 @@
#define inline __forceinline
#endif
-#if __GNUC_GNU_INLINE__ && !defined(__cplusplus)
+#if defined(__GNUC_GNU_INLINE__) && !defined(__cplusplus)
#error Nonstandard GNU inlining semantics. Compile with -std=c99 or better.
#endif
-#if __cplusplus
+#ifdef __cplusplus
extern "C" {
#endif
@@ -255,7 +255,7 @@ typedef pcg_state_setseq_128 pcg64_random_t;
#define pcg64_advance_r pcg_setseq_128_advance_r
#define PCG64_INITIALIZER PCG_STATE_SETSEQ_128_INITIALIZER
-#if __cplusplus
+#ifdef __cplusplus
}
#endif
diff --git a/numpy/random/tests/test_direct.py b/numpy/random/tests/test_direct.py
index dad12c8a8..d602b36b4 100644
--- a/numpy/random/tests/test_direct.py
+++ b/numpy/random/tests/test_direct.py
@@ -230,13 +230,13 @@ class Base:
def test_repr(self):
rs = Generator(self.bit_generator(*self.data1['seed']))
assert 'Generator' in repr(rs)
- assert '{:#x}'.format(id(rs)).upper().replace('X', 'x') in repr(rs)
+ assert f'{id(rs):#x}'.upper().replace('X', 'x') in repr(rs)
def test_str(self):
rs = Generator(self.bit_generator(*self.data1['seed']))
assert 'Generator' in str(rs)
assert str(self.bit_generator.__name__) in str(rs)
- assert '{:#x}'.format(id(rs)).upper().replace('X', 'x') not in str(rs)
+ assert f'{id(rs):#x}'.upper().replace('X', 'x') not in str(rs)
def test_pickle(self):
import pickle
diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py
index 332b63198..b69cd38d4 100644
--- a/numpy/random/tests/test_generator_mt19937.py
+++ b/numpy/random/tests/test_generator_mt19937.py
@@ -18,20 +18,20 @@ JUMP_TEST_DATA = [
{
"seed": 0,
"steps": 10,
- "initial": {"key_md5": "64eaf265d2203179fb5ffb73380cd589", "pos": 9},
- "jumped": {"key_md5": "8cb7b061136efceef5217a9ce2cc9a5a", "pos": 598},
+ "initial": {"key_sha256": "bb1636883c2707b51c5b7fc26c6927af4430f2e0785a8c7bc886337f919f9edf", "pos": 9},
+ "jumped": {"key_sha256": "ff682ac12bb140f2d72fba8d3506cf4e46817a0db27aae1683867629031d8d55", "pos": 598},
},
{
"seed":384908324,
"steps":312,
- "initial": {"key_md5": "e99708a47b82ff51a2c7b0625b81afb5", "pos": 311},
- "jumped": {"key_md5": "2ecdbfc47a895b253e6e19ccb2e74b90", "pos": 276},
+ "initial": {"key_sha256": "16b791a1e04886ccbbb4d448d6ff791267dc458ae599475d08d5cced29d11614", "pos": 311},
+ "jumped": {"key_sha256": "a0110a2cf23b56be0feaed8f787a7fc84bef0cb5623003d75b26bdfa1c18002c", "pos": 276},
},
{
"seed": [839438204, 980239840, 859048019, 821],
"steps": 511,
- "initial": {"key_md5": "9fcd6280df9199785e17e93162ce283c", "pos": 510},
- "jumped": {"key_md5": "433b85229f2ed853cde06cd872818305", "pos": 475},
+ "initial": {"key_sha256": "d306cf01314d51bd37892d874308200951a35265ede54d200f1e065004c3e9ea", "pos": 510},
+ "jumped": {"key_sha256": "0e00ab449f01a5195a83b4aee0dfbc2ce8d46466a640b92e33977d2e42f777f8", "pos": 475},
},
]
@@ -483,18 +483,18 @@ class TestIntegers:
assert_array_equal(scalar, array)
def test_repeatability(self, endpoint):
- # We use a md5 hash of generated sequences of 1000 samples
+ # We use a sha256 hash of generated sequences of 1000 samples
# in the range [0, 6) for all but bool, where the range
# is [0, 2). Hashes are for little endian numbers.
- tgt = {'bool': 'b3300e66d2bb59e493d255d47c3a6cbe',
- 'int16': '39624ead49ad67e37545744024d2648b',
- 'int32': '5c4810373f979336c6c0c999996e47a1',
- 'int64': 'ab126c15edff26f55c50d2b7e37391ac',
- 'int8': 'ba71ccaffeeeb9eeb1860f8075020b9c',
- 'uint16': '39624ead49ad67e37545744024d2648b',
- 'uint32': '5c4810373f979336c6c0c999996e47a1',
- 'uint64': 'ab126c15edff26f55c50d2b7e37391ac',
- 'uint8': 'ba71ccaffeeeb9eeb1860f8075020b9c'}
+ tgt = {'bool': '053594a9b82d656f967c54869bc6970aa0358cf94ad469c81478459c6a90eee3',
+ 'int16': '54de9072b6ee9ff7f20b58329556a46a447a8a29d67db51201bf88baa6e4e5d4',
+ 'int32': 'd3a0d5efb04542b25ac712e50d21f39ac30f312a5052e9bbb1ad3baa791ac84b',
+ 'int64': '14e224389ac4580bfbdccb5697d6190b496f91227cf67df60989de3d546389b1',
+ 'int8': '0e203226ff3fbbd1580f15da4621e5f7164d0d8d6b51696dd42d004ece2cbec1',
+ 'uint16': '54de9072b6ee9ff7f20b58329556a46a447a8a29d67db51201bf88baa6e4e5d4',
+ 'uint32': 'd3a0d5efb04542b25ac712e50d21f39ac30f312a5052e9bbb1ad3baa791ac84b',
+ 'uint64': '14e224389ac4580bfbdccb5697d6190b496f91227cf67df60989de3d546389b1',
+ 'uint8': '0e203226ff3fbbd1580f15da4621e5f7164d0d8d6b51696dd42d004ece2cbec1'}
for dt in self.itype[1:]:
random = Generator(MT19937(1234))
@@ -507,14 +507,14 @@ class TestIntegers:
val = random.integers(0, 6 - endpoint, size=1000, endpoint=endpoint,
dtype=dt).byteswap()
- res = hashlib.md5(val).hexdigest()
+ res = hashlib.sha256(val).hexdigest()
assert_(tgt[np.dtype(dt).name] == res)
# bools do not depend on endianness
random = Generator(MT19937(1234))
val = random.integers(0, 2 - endpoint, size=1000, endpoint=endpoint,
dtype=bool).view(np.int8)
- res = hashlib.md5(val).hexdigest()
+ res = hashlib.sha256(val).hexdigest()
assert_(tgt[np.dtype(bool).name] == res)
def test_repeatability_broadcasting(self, endpoint):
@@ -905,12 +905,12 @@ class TestRandomDist:
assert actual.dtype == np.int64
def test_choice_large_sample(self):
- choice_hash = 'd44962a0b1e92f4a3373c23222244e21'
+ choice_hash = '4266599d12bfcfb815213303432341c06b4349f5455890446578877bb322e222'
random = Generator(MT19937(self.seed))
actual = random.choice(10000, 5000, replace=False)
if sys.byteorder != 'little':
actual = actual.byteswap()
- res = hashlib.md5(actual.view(np.int8)).hexdigest()
+ res = hashlib.sha256(actual.view(np.int8)).hexdigest()
assert_(choice_hash == res)
def test_bytes(self):
@@ -1039,6 +1039,56 @@ class TestRandomDist:
assert_raises(np.AxisError, random.permutation, arr, 3)
assert_raises(TypeError, random.permutation, arr, slice(1, 2, None))
+ @pytest.mark.parametrize("dtype", [int, object])
+ @pytest.mark.parametrize("axis, expected",
+ [(None, np.array([[3, 7, 0, 9, 10, 11],
+ [8, 4, 2, 5, 1, 6]])),
+ (0, np.array([[6, 1, 2, 9, 10, 11],
+ [0, 7, 8, 3, 4, 5]])),
+ (1, np.array([[ 5, 3, 4, 0, 2, 1],
+ [11, 9, 10, 6, 8, 7]]))])
+ def test_permuted(self, dtype, axis, expected):
+ random = Generator(MT19937(self.seed))
+ x = np.arange(12).reshape(2, 6).astype(dtype)
+ random.permuted(x, axis=axis, out=x)
+ assert_array_equal(x, expected)
+
+ random = Generator(MT19937(self.seed))
+ x = np.arange(12).reshape(2, 6).astype(dtype)
+ y = random.permuted(x, axis=axis)
+ assert y.dtype == dtype
+ assert_array_equal(y, expected)
+
+ def test_permuted_with_strides(self):
+ random = Generator(MT19937(self.seed))
+ x0 = np.arange(22).reshape(2, 11)
+ x1 = x0.copy()
+ x = x0[:, ::3]
+ y = random.permuted(x, axis=1, out=x)
+ expected = np.array([[0, 9, 3, 6],
+ [14, 20, 11, 17]])
+ assert_array_equal(y, expected)
+ x1[:, ::3] = expected
+ # Verify that the original x0 was modified in-place as expected.
+ assert_array_equal(x1, x0)
+
+ def test_permuted_empty(self):
+ y = random.permuted([])
+ assert_array_equal(y, [])
+
+ @pytest.mark.parametrize('outshape', [(2, 3), 5])
+ def test_permuted_out_with_wrong_shape(self, outshape):
+ a = np.array([1, 2, 3])
+ out = np.zeros(outshape, dtype=a.dtype)
+ with pytest.raises(ValueError, match='same shape'):
+ random.permuted(a, out=out)
+
+ def test_permuted_out_with_wrong_type(self):
+ out = np.zeros((3, 5), dtype=np.int32)
+ x = np.ones((3, 5))
+ with pytest.raises(TypeError, match='Cannot cast'):
+ random.permuted(x, axis=1, out=out)
+
def test_beta(self):
random = Generator(MT19937(self.seed))
actual = random.beta(.1, .9, size=(3, 2))
@@ -2374,7 +2424,7 @@ class TestSingleEltArrayInput:
@pytest.mark.parametrize("config", JUMP_TEST_DATA)
def test_jumped(config):
# Each config contains the initial seed, a number of raw steps
- # the md5 hashes of the initial and the final states' keys and
+ # the sha256 hashes of the initial and the final states' keys and
    # the position of the initial and the final state.
# These were produced using the original C implementation.
seed = config["seed"]
@@ -2386,17 +2436,17 @@ def test_jumped(config):
key = mt19937.state["state"]["key"]
if sys.byteorder == 'big':
key = key.byteswap()
- md5 = hashlib.md5(key)
+ sha256 = hashlib.sha256(key)
assert mt19937.state["state"]["pos"] == config["initial"]["pos"]
- assert md5.hexdigest() == config["initial"]["key_md5"]
+ assert sha256.hexdigest() == config["initial"]["key_sha256"]
jumped = mt19937.jumped()
key = jumped.state["state"]["key"]
if sys.byteorder == 'big':
key = key.byteswap()
- md5 = hashlib.md5(key)
+ sha256 = hashlib.sha256(key)
assert jumped.state["state"]["pos"] == config["jumped"]["pos"]
- assert md5.hexdigest() == config["jumped"]["key_md5"]
+ assert sha256.hexdigest() == config["jumped"]["key_sha256"]
def test_broadcast_size_error():
@@ -2423,6 +2473,16 @@ def test_broadcast_size_error():
with pytest.raises(ValueError):
random.standard_gamma(shape, out=out)
+ # 2 arg
+ with pytest.raises(ValueError):
+ random.binomial(1, [0.3, 0.7], size=(2, 1))
+ with pytest.raises(ValueError):
+ random.binomial([1, 2], 0.3, size=(2, 1))
+ with pytest.raises(ValueError):
+ random.binomial([1, 2], [0.3, 0.7], size=(2, 1))
+ with pytest.raises(ValueError):
+ random.multinomial([2, 2], [.3, .7], size=(2, 1))
+
# 3 arg
a = random.chisquare(5, size=3)
b = random.chisquare(5, size=(4, 3))
diff --git a/numpy/random/tests/test_generator_mt19937_regressions.py b/numpy/random/tests/test_generator_mt19937_regressions.py
index 456c932d4..2ef6b0631 100644
--- a/numpy/random/tests/test_generator_mt19937_regressions.py
+++ b/numpy/random/tests/test_generator_mt19937_regressions.py
@@ -33,11 +33,11 @@ class TestRegression:
# numbers with this large sample
# theoretical large N result is 0.49706795
freq = np.sum(rvsn == 1) / float(N)
- msg = "Frequency was %f, should be > 0.45" % freq
+ msg = f'Frequency was {freq:f}, should be > 0.45'
assert_(freq > 0.45, msg)
# theoretical large N result is 0.19882718
freq = np.sum(rvsn == 2) / float(N)
- msg = "Frequency was %f, should be < 0.23" % freq
+ msg = f'Frequency was {freq:f}, should be < 0.23'
assert_(freq < 0.23, msg)
def test_shuffle_mixed_dimension(self):
diff --git a/numpy/random/tests/test_random.py b/numpy/random/tests/test_random.py
index 276b5bc81..c13fc39e3 100644
--- a/numpy/random/tests/test_random.py
+++ b/numpy/random/tests/test_random.py
@@ -211,18 +211,18 @@ class TestRandint:
def test_repeatability(self):
import hashlib
- # We use a md5 hash of generated sequences of 1000 samples
+ # We use a sha256 hash of generated sequences of 1000 samples
# in the range [0, 6) for all but bool, where the range
# is [0, 2). Hashes are for little endian numbers.
- tgt = {'bool': '7dd3170d7aa461d201a65f8bcf3944b0',
- 'int16': '1b7741b80964bb190c50d541dca1cac1',
- 'int32': '4dc9fcc2b395577ebb51793e58ed1a05',
- 'int64': '17db902806f448331b5a758d7d2ee672',
- 'int8': '27dd30c4e08a797063dffac2490b0be6',
- 'uint16': '1b7741b80964bb190c50d541dca1cac1',
- 'uint32': '4dc9fcc2b395577ebb51793e58ed1a05',
- 'uint64': '17db902806f448331b5a758d7d2ee672',
- 'uint8': '27dd30c4e08a797063dffac2490b0be6'}
+ tgt = {'bool': '509aea74d792fb931784c4b0135392c65aec64beee12b0cc167548a2c3d31e71',
+ 'int16': '7b07f1a920e46f6d0fe02314155a2330bcfd7635e708da50e536c5ebb631a7d4',
+ 'int32': 'e577bfed6c935de944424667e3da285012e741892dcb7051a8f1ce68ab05c92f',
+ 'int64': '0fbead0b06759df2cfb55e43148822d4a1ff953c7eb19a5b08445a63bb64fa9e',
+ 'int8': '001aac3a5acb935a9b186cbe14a1ca064b8bb2dd0b045d48abeacf74d0203404',
+ 'uint16': '7b07f1a920e46f6d0fe02314155a2330bcfd7635e708da50e536c5ebb631a7d4',
+ 'uint32': 'e577bfed6c935de944424667e3da285012e741892dcb7051a8f1ce68ab05c92f',
+ 'uint64': '0fbead0b06759df2cfb55e43148822d4a1ff953c7eb19a5b08445a63bb64fa9e',
+ 'uint8': '001aac3a5acb935a9b186cbe14a1ca064b8bb2dd0b045d48abeacf74d0203404'}
for dt in self.itype[1:]:
np.random.seed(1234)
@@ -233,13 +233,13 @@ class TestRandint:
else:
val = self.rfunc(0, 6, size=1000, dtype=dt).byteswap()
- res = hashlib.md5(val.view(np.int8)).hexdigest()
+ res = hashlib.sha256(val.view(np.int8)).hexdigest()
assert_(tgt[np.dtype(dt).name] == res)
# bools do not depend on endianness
np.random.seed(1234)
val = self.rfunc(0, 2, size=1000, dtype=bool).view(np.int8)
- res = hashlib.md5(val).hexdigest()
+ res = hashlib.sha256(val).hexdigest()
assert_(tgt[np.dtype(bool).name] == res)
def test_int64_uint64_corner_case(self):
diff --git a/numpy/random/tests/test_randomstate.py b/numpy/random/tests/test_randomstate.py
index edd7811bf..b70a04347 100644
--- a/numpy/random/tests/test_randomstate.py
+++ b/numpy/random/tests/test_randomstate.py
@@ -26,24 +26,24 @@ INT_FUNCS = {'binomial': (100.0, 0.6),
if np.iinfo(int).max < 2**32:
# Windows and some 32-bit platforms, e.g., ARM
- INT_FUNC_HASHES = {'binomial': '670e1c04223ffdbab27e08fbbad7bdba',
- 'logseries': '6bd0183d2f8030c61b0d6e11aaa60caf',
- 'geometric': '6e9df886f3e1e15a643168568d5280c0',
- 'hypergeometric': '7964aa611b046aecd33063b90f4dec06',
- 'multinomial': '68a0b049c16411ed0aa4aff3572431e4',
- 'negative_binomial': 'dc265219eec62b4338d39f849cd36d09',
- 'poisson': '7b4dce8e43552fc82701c2fa8e94dc6e',
- 'zipf': 'fcd2a2095f34578723ac45e43aca48c5',
+ INT_FUNC_HASHES = {'binomial': '2fbead005fc63942decb5326d36a1f32fe2c9d32c904ee61e46866b88447c263',
+ 'logseries': '23ead5dcde35d4cfd4ef2c105e4c3d43304b45dc1b1444b7823b9ee4fa144ebb',
+ 'geometric': '0d764db64f5c3bad48c8c33551c13b4d07a1e7b470f77629bef6c985cac76fcf',
+ 'hypergeometric': '7b59bf2f1691626c5815cdcd9a49e1dd68697251d4521575219e4d2a1b8b2c67',
+ 'multinomial': 'd754fa5b92943a38ec07630de92362dd2e02c43577fc147417dc5b9db94ccdd3',
+ 'negative_binomial': '8eb216f7cb2a63cf55605422845caaff002fddc64a7dc8b2d45acd477a49e824',
+ 'poisson': '70c891d76104013ebd6f6bcf30d403a9074b886ff62e4e6b8eb605bf1a4673b7',
+ 'zipf': '01f074f97517cd5d21747148ac6ca4074dde7fcb7acbaec0a936606fecacd93f',
}
else:
- INT_FUNC_HASHES = {'binomial': 'b5f8dcd74f172836536deb3547257b14',
- 'geometric': '8814571f45c87c59699d62ccd3d6c350',
- 'hypergeometric': 'bc64ae5976eac452115a16dad2dcf642',
- 'logseries': '84be924b37485a27c4a98797bc88a7a4',
- 'multinomial': 'ec3c7f9cf9664044bb0c6fb106934200',
- 'negative_binomial': '210533b2234943591364d0117a552969',
- 'poisson': '0536a8850c79da0c78defd742dccc3e0',
- 'zipf': 'f2841f504dd2525cd67cdcad7561e532',
+ INT_FUNC_HASHES = {'binomial': '8626dd9d052cb608e93d8868de0a7b347258b199493871a1dc56e2a26cacb112',
+ 'geometric': '8edd53d272e49c4fc8fbbe6c7d08d563d62e482921f3131d0a0e068af30f0db9',
+ 'hypergeometric': '83496cc4281c77b786c9b7ad88b74d42e01603a55c60577ebab81c3ba8d45657',
+ 'logseries': '65878a38747c176bc00e930ebafebb69d4e1e16cd3a704e264ea8f5e24f548db',
+ 'multinomial': '7a984ae6dca26fd25374479e118b22f55db0aedccd5a0f2584ceada33db98605',
+ 'negative_binomial': 'd636d968e6a24ae92ab52fe11c46ac45b0897e98714426764e820a7d77602a61',
+ 'poisson': '956552176f77e7c9cb20d0118fc9cf690be488d790ed4b4c4747b965e61b0bb4',
+ 'zipf': 'f84ba7feffda41e606e20b28dfc0f1ea9964a74574513d4a4cbc98433a8bfa45',
}
@@ -319,18 +319,18 @@ class TestRandint:
assert_(vals.min() >= 0)
def test_repeatability(self):
- # We use a md5 hash of generated sequences of 1000 samples
+ # We use a sha256 hash of generated sequences of 1000 samples
# in the range [0, 6) for all but bool, where the range
# is [0, 2). Hashes are for little endian numbers.
- tgt = {'bool': '7dd3170d7aa461d201a65f8bcf3944b0',
- 'int16': '1b7741b80964bb190c50d541dca1cac1',
- 'int32': '4dc9fcc2b395577ebb51793e58ed1a05',
- 'int64': '17db902806f448331b5a758d7d2ee672',
- 'int8': '27dd30c4e08a797063dffac2490b0be6',
- 'uint16': '1b7741b80964bb190c50d541dca1cac1',
- 'uint32': '4dc9fcc2b395577ebb51793e58ed1a05',
- 'uint64': '17db902806f448331b5a758d7d2ee672',
- 'uint8': '27dd30c4e08a797063dffac2490b0be6'}
+ tgt = {'bool': '509aea74d792fb931784c4b0135392c65aec64beee12b0cc167548a2c3d31e71',
+ 'int16': '7b07f1a920e46f6d0fe02314155a2330bcfd7635e708da50e536c5ebb631a7d4',
+ 'int32': 'e577bfed6c935de944424667e3da285012e741892dcb7051a8f1ce68ab05c92f',
+ 'int64': '0fbead0b06759df2cfb55e43148822d4a1ff953c7eb19a5b08445a63bb64fa9e',
+ 'int8': '001aac3a5acb935a9b186cbe14a1ca064b8bb2dd0b045d48abeacf74d0203404',
+ 'uint16': '7b07f1a920e46f6d0fe02314155a2330bcfd7635e708da50e536c5ebb631a7d4',
+ 'uint32': 'e577bfed6c935de944424667e3da285012e741892dcb7051a8f1ce68ab05c92f',
+ 'uint64': '0fbead0b06759df2cfb55e43148822d4a1ff953c7eb19a5b08445a63bb64fa9e',
+ 'uint8': '001aac3a5acb935a9b186cbe14a1ca064b8bb2dd0b045d48abeacf74d0203404'}
for dt in self.itype[1:]:
random.seed(1234)
@@ -341,13 +341,13 @@ class TestRandint:
else:
val = self.rfunc(0, 6, size=1000, dtype=dt).byteswap()
- res = hashlib.md5(val.view(np.int8)).hexdigest()
+ res = hashlib.sha256(val.view(np.int8)).hexdigest()
assert_(tgt[np.dtype(dt).name] == res)
# bools do not depend on endianness
random.seed(1234)
val = self.rfunc(0, 2, size=1000, dtype=bool).view(np.int8)
- res = hashlib.md5(val).hexdigest()
+ res = hashlib.sha256(val).hexdigest()
assert_(tgt[np.dtype(bool).name] == res)
@pytest.mark.skipif(np.iinfo('l').max < 2**32,
@@ -1974,7 +1974,7 @@ class TestSingleEltArrayInput:
# Ensure returned array dtype is correct for platform
def test_integer_dtype(int_func):
random.seed(123456789)
- fname, args, md5 = int_func
+ fname, args, sha256 = int_func
f = getattr(random, fname)
actual = f(*args, size=2)
assert_(actual.dtype == np.dtype('l'))
@@ -1982,10 +1982,20 @@ def test_integer_dtype(int_func):
def test_integer_repeat(int_func):
random.seed(123456789)
- fname, args, md5 = int_func
+ fname, args, sha256 = int_func
f = getattr(random, fname)
val = f(*args, size=1000000)
if sys.byteorder != 'little':
val = val.byteswap()
- res = hashlib.md5(val.view(np.int8)).hexdigest()
- assert_(res == md5)
+ res = hashlib.sha256(val.view(np.int8)).hexdigest()
+ assert_(res == sha256)
+
+
+def test_broadcast_size_error():
+ # GH-16833
+ with pytest.raises(ValueError):
+ random.binomial(1, [0.3, 0.7], size=(2, 1))
+ with pytest.raises(ValueError):
+ random.binomial([1, 2], 0.3, size=(2, 1))
+ with pytest.raises(ValueError):
+ random.binomial([1, 2], [0.3, 0.7], size=(2, 1))
diff --git a/numpy/random/tests/test_randomstate_regression.py b/numpy/random/tests/test_randomstate_regression.py
index 4eb82fc4c..0bf361e5e 100644
--- a/numpy/random/tests/test_randomstate_regression.py
+++ b/numpy/random/tests/test_randomstate_regression.py
@@ -44,11 +44,11 @@ class TestRegression:
# numbers with this large sample
# theoretical large N result is 0.49706795
freq = np.sum(rvsn == 1) / float(N)
- msg = "Frequency was %f, should be > 0.45" % freq
+ msg = f'Frequency was {freq:f}, should be > 0.45'
assert_(freq > 0.45, msg)
# theoretical large N result is 0.19882718
freq = np.sum(rvsn == 2) / float(N)
- msg = "Frequency was %f, should be < 0.23" % freq
+ msg = f'Frequency was {freq:f}, should be < 0.23'
assert_(freq < 0.23, msg)
def test_shuffle_mixed_dimension(self):
diff --git a/numpy/random/tests/test_regression.py b/numpy/random/tests/test_regression.py
index 278622287..54d5a3efb 100644
--- a/numpy/random/tests/test_regression.py
+++ b/numpy/random/tests/test_regression.py
@@ -40,11 +40,11 @@ class TestRegression:
# numbers with this large sample
# theoretical large N result is 0.49706795
freq = np.sum(rvsn == 1) / float(N)
- msg = "Frequency was %f, should be > 0.45" % freq
+ msg = f'Frequency was {freq:f}, should be > 0.45'
assert_(freq > 0.45, msg)
# theoretical large N result is 0.19882718
freq = np.sum(rvsn == 2) / float(N)
- msg = "Frequency was %f, should be < 0.23" % freq
+ msg = f'Frequency was {freq:f}, should be < 0.23'
assert_(freq < 0.23, msg)
def test_shuffle_mixed_dimension(self):
diff --git a/numpy/random/tests/test_smoke.py b/numpy/random/tests/test_smoke.py
index ebfc6825e..909bfaa8d 100644
--- a/numpy/random/tests/test_smoke.py
+++ b/numpy/random/tests/test_smoke.py
@@ -129,7 +129,7 @@ class RNG:
assert_(not comp_state(state, self.rg.bit_generator.state))
else:
bitgen_name = self.rg.bit_generator.__class__.__name__
- pytest.skip('Advance is not supported by {0}'.format(bitgen_name))
+ pytest.skip(f'Advance is not supported by {bitgen_name}')
def test_jump(self):
state = self.rg.bit_generator.state
@@ -145,8 +145,8 @@ class RNG:
else:
bitgen_name = self.rg.bit_generator.__class__.__name__
if bitgen_name not in ('SFC64',):
- raise AttributeError('no "jumped" in %s' % bitgen_name)
- pytest.skip('Jump is not supported by {0}'.format(bitgen_name))
+ raise AttributeError(f'no "jumped" in {bitgen_name}')
+ pytest.skip(f'Jump is not supported by {bitgen_name}')
def test_uniform(self):
r = self.rg.uniform(-1.0, 0.0, size=10)
@@ -447,8 +447,7 @@ class RNG:
def test_seed_array(self):
if self.seed_vector_bits is None:
bitgen_name = self.bit_generator.__name__
- pytest.skip('Vector seeding is not supported by '
- '{0}'.format(bitgen_name))
+ pytest.skip(f'Vector seeding is not supported by {bitgen_name}')
if self.seed_vector_bits == 32:
dtype = np.uint32
diff --git a/numpy/rec.pyi b/numpy/rec.pyi
new file mode 100644
index 000000000..c70ee5374
--- /dev/null
+++ b/numpy/rec.pyi
@@ -0,0 +1,5 @@
+from typing import Any
+
+record: Any
+recarray: Any
+format_parser: Any
diff --git a/numpy/testing/__init__.pyi b/numpy/testing/__init__.pyi
new file mode 100644
index 000000000..c394a387d
--- /dev/null
+++ b/numpy/testing/__init__.pyi
@@ -0,0 +1,44 @@
+from typing import Any
+
+assert_equal: Any
+assert_almost_equal: Any
+assert_approx_equal: Any
+assert_array_equal: Any
+assert_array_less: Any
+assert_string_equal: Any
+assert_array_almost_equal: Any
+assert_raises: Any
+build_err_msg: Any
+decorate_methods: Any
+jiffies: Any
+memusage: Any
+print_assert_equal: Any
+raises: Any
+rundocs: Any
+runstring: Any
+verbose: Any
+measure: Any
+assert_: Any
+assert_array_almost_equal_nulp: Any
+assert_raises_regex: Any
+assert_array_max_ulp: Any
+assert_warns: Any
+assert_no_warnings: Any
+assert_allclose: Any
+IgnoreException: Any
+clear_and_catch_warnings: Any
+SkipTest: Any
+KnownFailureException: Any
+temppath: Any
+tempdir: Any
+IS_PYPY: Any
+HAS_REFCOUNT: Any
+suppress_warnings: Any
+assert_array_compare: Any
+_assert_valid_refcount: Any
+_gen_alignment_data: Any
+assert_no_gc_cycles: Any
+break_cycles: Any
+HAS_LAPACK64: Any
+TestCase: Any
+run_module_suite: Any
diff --git a/numpy/testing/_private/decorators.py b/numpy/testing/_private/decorators.py
index b4b6259a0..4c87d1a49 100644
--- a/numpy/testing/_private/decorators.py
+++ b/numpy/testing/_private/decorators.py
@@ -136,7 +136,7 @@ def skipif(skip_condition, msg=None):
else:
out = msg
- return "Skipping test: %s: %s" % (func.__name__, out)
+ return f'Skipping test: {func.__name__}: {out}'
# We need to define *two* skippers because Python doesn't allow both
# return with value and yield inside the same function.
diff --git a/numpy/testing/_private/noseclasses.py b/numpy/testing/_private/noseclasses.py
index 69e19e959..48fa4dc1f 100644
--- a/numpy/testing/_private/noseclasses.py
+++ b/numpy/testing/_private/noseclasses.py
@@ -76,7 +76,7 @@ class NumpyDocTestFinder(doctest.DocTestFinder):
# Look for tests in a module's contained objects.
if ismodule(obj) and self._recurse:
for valname, val in obj.__dict__.items():
- valname1 = '%s.%s' % (name, valname)
+ valname1 = f'{name}.{valname}'
if ( (isroutine(val) or isclass(val))
and self._from_module(module, val)):
@@ -96,7 +96,7 @@ class NumpyDocTestFinder(doctest.DocTestFinder):
if ((isfunction(val) or isclass(val) or
ismethod(val) or isinstance(val, property)) and
self._from_module(module, val)):
- valname = '%s.%s' % (name, valname)
+ valname = f'{name}.{valname}'
self._find(tests, val, valname, module, source_lines,
globs, seen)
diff --git a/numpy/testing/_private/nosetester.py b/numpy/testing/_private/nosetester.py
index 57691a448..bccec8236 100644
--- a/numpy/testing/_private/nosetester.py
+++ b/numpy/testing/_private/nosetester.py
@@ -233,20 +233,20 @@ class NoseTester:
nose = import_nose()
import numpy
- print("NumPy version %s" % numpy.__version__)
+ print(f'NumPy version {numpy.__version__}')
relaxed_strides = numpy.ones((10, 1), order="C").flags.f_contiguous
print("NumPy relaxed strides checking option:", relaxed_strides)
npdir = os.path.dirname(numpy.__file__)
- print("NumPy is installed in %s" % npdir)
+ print(f'NumPy is installed in {npdir}')
if 'scipy' in self.package_name:
import scipy
- print("SciPy version %s" % scipy.__version__)
+ print(f'SciPy version {scipy.__version__}')
spdir = os.path.dirname(scipy.__file__)
- print("SciPy is installed in %s" % spdir)
+ print(f'SciPy is installed in {spdir}')
pyversion = sys.version.replace('\n', '')
- print("Python version %s" % pyversion)
+ print(f'Python version {pyversion}')
print("nose version %d.%d.%d" % nose.__versioninfo__)
def _get_custom_doctester(self):
@@ -278,7 +278,7 @@ class NoseTester:
argv = self._test_argv(label, verbose, extra_argv)
# our way of doing coverage
if coverage:
- argv += ['--cover-package=%s' % self.package_name, '--with-coverage',
+ argv += [f'--cover-package={self.package_name}', '--with-coverage',
'--cover-tests', '--cover-erase']
if timer:
@@ -403,9 +403,9 @@ class NoseTester:
label, verbose, extra_argv, doctests, coverage, timer)
if doctests:
- print("Running unit tests and doctests for %s" % self.package_name)
+ print(f'Running unit tests and doctests for {self.package_name}')
else:
- print("Running unit tests for %s" % self.package_name)
+ print(f'Running unit tests for {self.package_name}')
self._show_system_info()
@@ -520,7 +520,7 @@ class NoseTester:
"""
- print("Running benchmarks for %s" % self.package_name)
+ print(f'Running benchmarks for {self.package_name}')
self._show_system_info()
argv = self._test_argv(label, verbose, extra_argv)
diff --git a/numpy/testing/_private/parameterized.py b/numpy/testing/_private/parameterized.py
index 3bd8ede91..ac7db6c40 100644
--- a/numpy/testing/_private/parameterized.py
+++ b/numpy/testing/_private/parameterized.py
@@ -205,7 +205,7 @@ def default_doc_func(func, num, p):
all_args_with_values = parameterized_argument_value_pairs(func, p)
# Assumes that the function passed is a bound method.
- descs = ["%s=%s" %(n, short_repr(v)) for n, v in all_args_with_values]
+ descs = [f'{n}={short_repr(v)}' for n, v in all_args_with_values]
# The documentation might be a multiline string, so split it
# and just work with the first string, ignoring the period
diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py
index 3827b7505..e974bbd09 100644
--- a/numpy/testing/_private/utils.py
+++ b/numpy/testing/_private/utils.py
@@ -113,14 +113,14 @@ def gisnan(x):
def gisfinite(x):
- """like isfinite, but always raise an error if type not supported instead of
- returning a TypeError object.
+ """like isfinite, but always raise an error if type not supported instead
+ of returning a TypeError object.
Notes
-----
- isfinite and other ufunc sometimes return a NotImplementedType object instead
- of raising any exception. This function is a wrapper to make sure an
- exception is always raised.
+ isfinite and other ufunc sometimes return a NotImplementedType object
+ instead of raising any exception. This function is a wrapper to make sure
+ an exception is always raised.
This should be removed once this problem is solved at the Ufunc level."""
from numpy.core import isfinite, errstate
@@ -160,12 +160,13 @@ if os.name == 'nt':
# you should copy this function, but keep the counter open, and call
# CollectQueryData() each time you need to know.
# See http://msdn.microsoft.com/library/en-us/dnperfmo/html/perfmonpt2.asp (dead link)
- # My older explanation for this was that the "AddCounter" process forced
- # the CPU to 100%, but the above makes more sense :)
+ # My older explanation for this was that the "AddCounter" process
+ # forced the CPU to 100%, but the above makes more sense :)
import win32pdh
if format is None:
format = win32pdh.PDH_FMT_LONG
- path = win32pdh.MakeCounterPath( (machine, object, instance, None, inum, counter))
+ path = win32pdh.MakeCounterPath( (machine, object, instance, None,
+ inum, counter))
hq = win32pdh.OpenQuery()
try:
hc = win32pdh.AddCounter(hq, path)
@@ -186,7 +187,7 @@ if os.name == 'nt':
win32pdh.PDH_FMT_LONG, None)
elif sys.platform[:5] == 'linux':
- def memusage(_proc_pid_stat='/proc/%s/stat' % (os.getpid())):
+ def memusage(_proc_pid_stat=f'/proc/{os.getpid()}/stat'):
"""
Return virtual memory size in bytes of the running python.
@@ -207,8 +208,7 @@ else:
if sys.platform[:5] == 'linux':
- def jiffies(_proc_pid_stat='/proc/%s/stat' % (os.getpid()),
- _load_time=[]):
+ def jiffies(_proc_pid_stat=f'/proc/{os.getpid()}/stat', _load_time=[]):
"""
Return number of jiffies elapsed.
@@ -263,11 +263,11 @@ def build_err_msg(arrays, err_msg, header='Items are not equal:',
try:
r = r_func(a)
except Exception as exc:
- r = '[repr failed for <{}>: {}]'.format(type(a).__name__, exc)
+ r = f'[repr failed for <{type(a).__name__}>: {exc}]'
if r.count('\n') > 3:
r = '\n'.join(r.splitlines()[:3])
r += '...'
- msg.append(' %s: %s' % (names[i], r))
+ msg.append(f' {names[i]}: {r}')
return '\n'.join(msg)
@@ -329,12 +329,14 @@ def assert_equal(actual, desired, err_msg='', verbose=True):
for k, i in desired.items():
if k not in actual:
raise AssertionError(repr(k))
- assert_equal(actual[k], desired[k], 'key=%r\n%s' % (k, err_msg), verbose)
+ assert_equal(actual[k], desired[k], f'key={k!r}\n{err_msg}',
+ verbose)
return
if isinstance(desired, (list, tuple)) and isinstance(actual, (list, tuple)):
assert_equal(len(actual), len(desired), err_msg, verbose)
for k in range(len(desired)):
- assert_equal(actual[k], desired[k], 'item=%r\n%s' % (k, err_msg), verbose)
+ assert_equal(actual[k], desired[k], f'item={k!r}\n{err_msg}',
+ verbose)
return
from numpy.core import ndarray, isscalar, signbit
from numpy.lib import iscomplexobj, real, imag
@@ -694,9 +696,8 @@ def assert_approx_equal(actual,desired,significant=7,err_msg='',verbose=True):
raise AssertionError(msg)
-def assert_array_compare(comparison, x, y, err_msg='', verbose=True,
- header='', precision=6, equal_nan=True,
- equal_inf=True):
+def assert_array_compare(comparison, x, y, err_msg='', verbose=True, header='',
+ precision=6, equal_nan=True, equal_inf=True):
__tracebackhide__ = True # Hide traceback for py.test
from numpy.core import array, array2string, isnan, inf, bool_, errstate, all, max, object_
@@ -744,7 +745,7 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True,
# flag as it everywhere, so we should return the scalar flag.
if isinstance(x_id, bool) or x_id.ndim == 0:
return bool_(x_id)
- elif isinstance(x_id, bool) or y_id.ndim == 0:
+ elif isinstance(y_id, bool) or y_id.ndim == 0:
return bool_(y_id)
else:
return y_id
@@ -754,8 +755,7 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True,
if not cond:
msg = build_err_msg([x, y],
err_msg
- + '\n(shapes %s, %s mismatch)' % (x.shape,
- y.shape),
+ + f'\n(shapes {x.shape}, {y.shape} mismatch)',
verbose=verbose, header=header,
names=('x', 'y'), precision=precision)
raise AssertionError(msg)
@@ -843,7 +843,7 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True,
except ValueError:
import traceback
efmt = traceback.format_exc()
- header = 'error during assertion:\n\n%s\n\n%s' % (efmt, header)
+ header = f'error during assertion:\n\n{efmt}\n\n{header}'
msg = build_err_msg([x, y], err_msg, verbose=verbose, header=header,
names=('x', 'y'), precision=precision)
@@ -1170,7 +1170,8 @@ def assert_string_equal(actual, desired):
if desired == actual:
return
- diff = list(difflib.Differ().compare(actual.splitlines(True), desired.splitlines(True)))
+ diff = list(difflib.Differ().compare(actual.splitlines(True),
+ desired.splitlines(True)))
diff_list = []
while diff:
d1 = diff.pop(0)
@@ -1198,7 +1199,7 @@ def assert_string_equal(actual, desired):
raise AssertionError(repr(d1))
if not diff_list:
return
- msg = 'Differences in strings:\n%s' % (''.join(diff_list)).rstrip()
+ msg = f"Differences in strings:\n{''.join(diff_list).rstrip()}"
if actual != desired:
raise AssertionError(msg)
@@ -1434,9 +1435,7 @@ def measure(code_str, times=1, label=None):
frame = sys._getframe(1)
locs, globs = frame.f_locals, frame.f_globals
- code = compile(code_str,
- 'Test name: %s ' % label,
- 'exec')
+ code = compile(code_str, f'Test name: {label} ', 'exec')
i = 0
elapsed = jiffies()
while i < times:
@@ -1525,7 +1524,7 @@ def assert_allclose(actual, desired, rtol=1e-7, atol=0, equal_nan=True,
equal_nan=equal_nan)
actual, desired = np.asanyarray(actual), np.asanyarray(desired)
- header = 'Not equal to tolerance rtol=%g, atol=%g' % (rtol, atol)
+ header = f'Not equal to tolerance rtol={rtol:g}, atol={atol:g}'
assert_array_compare(compare, actual, desired, err_msg=str(err_msg),
verbose=verbose, header=header, equal_nan=equal_nan)
@@ -1724,8 +1723,8 @@ def _integer_repr(x, vdt, comp):
def integer_repr(x):
- """Return the signed-magnitude interpretation of the binary representation of
- x."""
+ """Return the signed-magnitude interpretation of the binary representation
+ of x."""
import numpy as np
if x.dtype == np.float16:
return _integer_repr(x, np.int16, np.int16(-2**15))
@@ -1734,7 +1733,7 @@ def integer_repr(x):
elif x.dtype == np.float64:
return _integer_repr(x, np.int64, np.int64(-2**63))
else:
- raise ValueError("Unsupported dtype %s" % x.dtype)
+ raise ValueError(f'Unsupported dtype {x.dtype}')
@contextlib.contextmanager
@@ -1744,7 +1743,7 @@ def _assert_warns_context(warning_class, name=None):
l = sup.record(warning_class)
yield
if not len(l) > 0:
- name_str = " when calling %s" % name if name is not None else ""
+ name_str = f' when calling {name}' if name is not None else ''
raise AssertionError("No warning raised" + name_str)
@@ -1809,8 +1808,8 @@ def _assert_no_warnings_context(name=None):
warnings.simplefilter('always')
yield
if len(l) > 0:
- name_str = " when calling %s" % name if name is not None else ""
- raise AssertionError("Got warnings%s: %s" % (name_str, l))
+ name_str = f' when calling {name}' if name is not None else ''
+ raise AssertionError(f'Got warnings{name_str}: {l}')
def assert_no_warnings(*args, **kwargs):
@@ -2322,8 +2321,8 @@ def _assert_no_gc_cycles_context(name=None):
break
else:
raise RuntimeError(
- "Unable to fully collect garbage - perhaps a __del__ method is "
- "creating more reference cycles?")
+ "Unable to fully collect garbage - perhaps a __del__ method "
+ "is creating more reference cycles?")
gc.set_debug(gc.DEBUG_SAVEALL)
yield
@@ -2337,7 +2336,7 @@ def _assert_no_gc_cycles_context(name=None):
gc.enable()
if n_objects_in_cycles:
- name_str = " when calling %s" % name if name is not None else ""
+ name_str = f' when calling {name}' if name is not None else ''
raise AssertionError(
"Reference cycles were found{}: {} objects were collected, "
"of which {} are shown below:{}"
@@ -2403,7 +2402,8 @@ def break_cycles():
if IS_PYPY:
# interpreter runs now, to call deleted objects' __del__ methods
gc.collect()
- # one more, just to make sure
+ # two more, just to make sure
+ gc.collect()
gc.collect()
@@ -2440,12 +2440,10 @@ def check_free_memory(free_bytes):
try:
mem_free = _parse_size(env_value)
except ValueError as exc:
- raise ValueError('Invalid environment variable {}: {!s}'.format(
- env_var, exc))
+ raise ValueError(f'Invalid environment variable {env_var}: {exc}')
- msg = ('{0} GB memory required, but environment variable '
- 'NPY_AVAILABLE_MEM={1} set'.format(
- free_bytes/1e9, env_value))
+ msg = (f'{free_bytes/1e9} GB memory required, but environment variable '
+ f'NPY_AVAILABLE_MEM={env_value} set')
else:
mem_free = _get_mem_available()
@@ -2455,8 +2453,7 @@ def check_free_memory(free_bytes):
"the test.")
mem_free = -1
else:
- msg = '{0} GB memory required, but {1} GB available'.format(
- free_bytes/1e9, mem_free/1e9)
+ msg = f'{free_bytes/1e9} GB memory required, but {mem_free/1e9} GB available'
return msg if mem_free < free_bytes else None
@@ -2473,7 +2470,7 @@ def _parse_size(size_str):
m = size_re.match(size_str.lower())
if not m or m.group(2) not in suffixes:
- raise ValueError("value {!r} not a valid size".format(size_str))
+ raise ValueError(f'value {size_str!r} not a valid size')
return int(float(m.group(1)) * suffixes[m.group(2)])
diff --git a/numpy/testing/print_coercion_tables.py b/numpy/testing/print_coercion_tables.py
index 8024df128..3a447cd2d 100755
--- a/numpy/testing/print_coercion_tables.py
+++ b/numpy/testing/print_coercion_tables.py
@@ -3,6 +3,7 @@
"""
import numpy as np
+from collections import namedtuple
# Generic object that can be added, but doesn't do anything else
class GenericObject:
@@ -25,7 +26,17 @@ def print_cancast_table(ntypes):
for row in ntypes:
print(row, end=' ')
for col in ntypes:
- print(int(np.can_cast(row, col)), end=' ')
+ if np.can_cast(row, col, "equiv"):
+ cast = "#"
+ elif np.can_cast(row, col, "safe"):
+ cast = "="
+ elif np.can_cast(row, col, "same_kind"):
+ cast = "~"
+ elif np.can_cast(row, col, "unsafe"):
+ cast = "."
+ else:
+ cast = " "
+ print(cast, end=' ')
print()
def print_coercion_table(ntypes, inputfirstvalue, inputsecondvalue, firstarray, use_promote_types=False):
@@ -69,6 +80,101 @@ def print_coercion_table(ntypes, inputfirstvalue, inputsecondvalue, firstarray,
print()
+def print_new_cast_table(*, can_cast=True, legacy=False, flags=False):
+ """Prints new casts, the values given are default "can-cast" values, not
+ actual ones.
+ """
+ from numpy.core._multiarray_tests import get_all_cast_information
+
+ cast_table = {
+ 0 : "#", # No cast (classify as equivalent here)
+ 1 : "#", # equivalent casting
+ 2 : "=", # safe casting
+ 3 : "~", # same-kind casting
+ 4 : ".", # unsafe casting
+ }
+ flags_table = {
+ 0 : "▗", 7: "█",
+ 1: "▚", 2: "▐", 4: "▄",
+ 3: "▜", 5: "▙",
+ 6: "▟",
+ }
+
+ cast_info = namedtuple("cast_info", ["can_cast", "legacy", "flags"])
+ no_cast_info = cast_info(" ", " ", " ")
+
+ casts = get_all_cast_information()
+ table = {}
+ dtypes = set()
+ for cast in casts:
+ dtypes.add(cast["from"])
+ dtypes.add(cast["to"])
+
+ if cast["from"] not in table:
+ table[cast["from"]] = {}
+ to_dict = table[cast["from"]]
+
+ can_cast = cast_table[cast["casting"]]
+ legacy = "L" if cast["legacy"] else "."
+ flags = 0
+ if cast["requires_pyapi"]:
+ flags |= 1
+ if cast["supports_unaligned"]:
+ flags |= 2
+ if cast["no_floatingpoint_errors"]:
+ flags |= 4
+
+ flags = flags_table[flags]
+ to_dict[cast["to"]] = cast_info(can_cast=can_cast, legacy=legacy, flags=flags)
+
+ # The np.dtype(x.type) is a bit strange, because dtype classes do
+ # not expose much yet.
+ types = np.typecodes["All"]
+ def sorter(x):
+ # This is a bit weird hack, to get a table as close as possible to
+ # the one printing all typecodes (but expecting user-dtypes).
+ dtype = np.dtype(x.type)
+ try:
+ indx = types.index(dtype.char)
+ except ValueError:
+ indx = np.inf
+ return (indx, dtype.char)
+
+ dtypes = sorted(dtypes, key=sorter)
+
+ def print_table(field="can_cast"):
+ print('X', end=' ')
+ for dt in dtypes:
+ print(np.dtype(dt.type).char, end=' ')
+ print()
+ for from_dt in dtypes:
+ print(np.dtype(from_dt.type).char, end=' ')
+ row = table.get(from_dt, {})
+ for to_dt in dtypes:
+ print(getattr(row.get(to_dt, no_cast_info), field), end=' ')
+ print()
+
+ if can_cast:
+ # Print the actual table:
+ print()
+ print("Casting: # is equivalent, = is safe, ~ is same-kind, and . is unsafe")
+ print()
+ print_table("can_cast")
+
+ if legacy:
+ print()
+ print("L denotes a legacy cast . a non-legacy one.")
+ print()
+ print_table("legacy")
+
+ if flags:
+ print()
+ print(f"{flags_table[0]}: no flags, {flags_table[1]}: PyAPI, "
+ f"{flags_table[2]}: supports unaligned, {flags_table[4]}: no-float-errors")
+ print()
+ print_table("flags")
+
+
if __name__ == '__main__':
print("can cast")
print_cancast_table(np.typecodes['All'])
@@ -89,3 +195,5 @@ if __name__ == '__main__':
print()
print("promote_types")
print_coercion_table(np.typecodes['All'], 0, 0, False, True)
+ print("New casting type promotion:")
+ print_new_cast_table(can_cast=True, legacy=True, flags=True)
diff --git a/numpy/testing/setup.py b/numpy/testing/setup.py
index 13191f13f..7652a94a2 100755
--- a/numpy/testing/setup.py
+++ b/numpy/testing/setup.py
@@ -6,6 +6,7 @@ def configuration(parent_package='',top_path=None):
config.add_subpackage('_private')
config.add_subpackage('tests')
+ config.add_data_files('*.pyi')
return config
if __name__ == '__main__':
diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py
index 6a6cc664a..c3b9e04b6 100644
--- a/numpy/testing/tests/test_utils.py
+++ b/numpy/testing/tests/test_utils.py
@@ -1240,7 +1240,7 @@ def assert_warn_len_equal(mod, n_in_context, py34=None, py37=None):
if sys.version_info[:2] >= (3, 7):
if py37 is not None:
n_in_context = py37
- elif sys.version_info[:2] >= (3, 4):
+ else:
if py34 is not None:
n_in_context = py34
assert_equal(num_warns, n_in_context)
diff --git a/numpy/tests/test_numpy_version.py b/numpy/tests/test_numpy_version.py
index 916ab9383..7fd566815 100644
--- a/numpy/tests/test_numpy_version.py
+++ b/numpy/tests/test_numpy_version.py
@@ -8,7 +8,7 @@ def test_valid_numpy_version():
# Verify that the numpy version is a valid one (no .post suffix or other
# nonsense). See gh-6431 for an issue caused by an invalid version.
version_pattern = r"^[0-9]+\.[0-9]+\.[0-9]+(|a[0-9]|b[0-9]|rc[0-9])"
- dev_suffix = r"(\.dev0\+([0-9a-f]{7}|Unknown))"
+ dev_suffix = r"\.dev0\+[0-9]*\.g[0-9a-f]+"
if np.version.release:
res = re.match(version_pattern, np.__version__)
else:
diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py
index a9d6da01c..1382e1c4b 100644
--- a/numpy/tests/test_public_api.py
+++ b/numpy/tests/test_public_api.py
@@ -145,18 +145,7 @@ PUBLIC_MODULES = ['numpy.' + s for s in [
"distutils.log",
"distutils.system_info",
"doc",
- "doc.basics",
- "doc.broadcasting",
- "doc.byteswapping",
"doc.constants",
- "doc.creation",
- "doc.dispatch",
- "doc.glossary",
- "doc.indexing",
- "doc.internals",
- "doc.misc",
- "doc.structured_arrays",
- "doc.subclassing",
"doc.ufuncs",
"f2py",
"fft",
@@ -251,8 +240,10 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [
"distutils.fcompiler.none",
"distutils.fcompiler.pathf95",
"distutils.fcompiler.pg",
+ "distutils.fcompiler.nv",
"distutils.fcompiler.sun",
"distutils.fcompiler.vast",
+ "distutils.fcompiler.fujitsu",
"distutils.from_template",
"distutils.intelccompiler",
"distutils.lib2def",
@@ -281,7 +272,6 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [
"lib.arraypad",
"lib.arraysetops",
"lib.arrayterator",
- "lib.financial",
"lib.function_base",
"lib.histograms",
"lib.index_tricks",
@@ -360,7 +350,7 @@ def test_all_modules_are_expected():
modnames.append(modname)
if modnames:
- raise AssertionError("Found unexpected modules: {}".format(modnames))
+ raise AssertionError(f'Found unexpected modules: {modnames}')
# Stuff that clearly shouldn't be in the API and is detected by the next test
@@ -368,18 +358,6 @@ def test_all_modules_are_expected():
SKIP_LIST_2 = [
'numpy.math',
'numpy.distutils.log.sys',
- 'numpy.distutils.system_info.copy',
- 'numpy.distutils.system_info.distutils',
- 'numpy.distutils.system_info.log',
- 'numpy.distutils.system_info.os',
- 'numpy.distutils.system_info.platform',
- 'numpy.distutils.system_info.re',
- 'numpy.distutils.system_info.shutil',
- 'numpy.distutils.system_info.subprocess',
- 'numpy.distutils.system_info.sys',
- 'numpy.distutils.system_info.tempfile',
- 'numpy.distutils.system_info.textwrap',
- 'numpy.distutils.system_info.warnings',
'numpy.doc.constants.re',
'numpy.doc.constants.textwrap',
'numpy.lib.emath',
diff --git a/numpy/tests/test_scripts.py b/numpy/tests/test_scripts.py
index a0f2ba70a..e67a82947 100644
--- a/numpy/tests/test_scripts.py
+++ b/numpy/tests/test_scripts.py
@@ -38,9 +38,9 @@ def find_f2py_commands():
def test_f2py(f2py_cmd):
# test that we can run f2py script
stdout = subprocess.check_output([f2py_cmd, '-v'])
- assert_equal(stdout.strip(), b'2')
+ assert_equal(stdout.strip(), np.__version__.encode('ascii'))
def test_pep338():
stdout = subprocess.check_output([sys.executable, '-mnumpy.f2py', '-v'])
- assert_equal(stdout.strip(), b'2')
+ assert_equal(stdout.strip(), np.__version__.encode('ascii'))
diff --git a/numpy/tests/typing/fail/simple.py b/numpy/tests/typing/fail/simple.py
deleted file mode 100644
index b5e9d1b13..000000000
--- a/numpy/tests/typing/fail/simple.py
+++ /dev/null
@@ -1,10 +0,0 @@
-"""Simple expression that should fail with mypy."""
-
-import numpy as np
-
-# Array creation routines checks
-np.zeros("test") # E: incompatible type
-np.zeros() # E: Too few arguments
-
-np.ones("test") # E: incompatible type
-np.ones() # E: Too few arguments
diff --git a/numpy/tests/typing/pass/dtype.py b/numpy/tests/typing/pass/dtype.py
deleted file mode 100644
index f954fdd44..000000000
--- a/numpy/tests/typing/pass/dtype.py
+++ /dev/null
@@ -1,3 +0,0 @@
-import numpy as np
-
-np.dtype(dtype=np.int64)
diff --git a/numpy/tests/typing/pass/scalars.py b/numpy/tests/typing/pass/scalars.py
deleted file mode 100644
index 1c7ace282..000000000
--- a/numpy/tests/typing/pass/scalars.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import numpy as np
-
-
-# Construction
-class C:
- def __complex__(self):
- return 3j
-
-
-class B:
- def __int__(self):
- return 4
-
-
-class A:
- def __float__(self):
- return 4.0
-
-
-np.complex64(3j)
-np.complex64(C())
-np.complex128(3j)
-np.complex128(C())
-np.complex128(None)
-
-np.int8(4)
-np.int16(3.4)
-np.int32(4)
-np.int64(-1)
-np.uint8(B())
-np.uint32()
-
-np.float16(A())
-np.float32(16)
-np.float64(3.0)
-np.float64(None)
-
-np.bytes_(b"hello")
-np.bytes_("hello", 'utf-8')
-np.bytes_("hello", encoding='utf-8')
-np.str_("hello")
-np.str_(b"hello", 'utf-8')
-np.str_(b"hello", encoding='utf-8')
-
-# Protocols
-float(np.int8(4))
-int(np.int16(5))
-np.int8(np.float32(6))
-
-# TODO(alan): test after https://github.com/python/typeshed/pull/2004
-# complex(np.int32(8))
-
-abs(np.int8(4))
-
-# Array-ish semantics
-np.int8().real
-np.int16().imag
-np.int32().data
-np.int64().flags
-
-np.uint8().itemsize * 2
-np.uint16().ndim + 1
-np.uint32().strides
-np.uint64().shape
-
-# Time structures
-np.datetime64()
-np.datetime64(0, "D")
-np.datetime64("2019")
-np.datetime64("2019", "D")
-np.datetime64(None)
-np.datetime64(None, "D")
-
-np.timedelta64()
-np.timedelta64(0)
-np.timedelta64(0, "D")
-np.timedelta64(None)
-np.timedelta64(None, "D")
-
-dt_64 = np.datetime64(0, "D")
-td_64 = np.timedelta64(1, "h")
-
-dt_64 + td_64
-dt_64 - dt_64
-dt_64 - td_64
-
-td_64 + td_64
-td_64 - td_64
-td_64 / 1.0
-td_64 / td_64
-td_64 % td_64
-
-np.void(1)
-np.void(np.int64(1))
-np.void(True)
-np.void(np.bool_(True))
-np.void(b"test")
-np.void(np.bytes_("test"))
diff --git a/numpy/tests/typing/reveal/fromnumeric.py b/numpy/tests/typing/reveal/fromnumeric.py
deleted file mode 100644
index f5feb3f5f..000000000
--- a/numpy/tests/typing/reveal/fromnumeric.py
+++ /dev/null
@@ -1,205 +0,0 @@
-"""Tests for :mod:`numpy.core.fromnumeric`."""
-
-import numpy as np
-
-A = np.array(True, ndmin=2, dtype=bool)
-B = np.array(1.0, ndmin=2, dtype=np.float32)
-A.setflags(write=False)
-B.setflags(write=False)
-
-a = np.bool_(True)
-b = np.float32(1.0)
-c = 1.0
-
-reveal_type(np.take(a, 0)) # E: numpy.bool_
-reveal_type(np.take(b, 0)) # E: numpy.float32
-reveal_type(
- np.take(c, 0) # E: Union[numpy.generic, datetime.datetime, datetime.timedelta]
-)
-reveal_type(
- np.take(A, 0) # E: Union[numpy.generic, datetime.datetime, datetime.timedelta]
-)
-reveal_type(
- np.take(B, 0) # E: Union[numpy.generic, datetime.datetime, datetime.timedelta]
-)
-reveal_type(
- np.take( # E: Union[Union[numpy.generic, datetime.datetime, datetime.timedelta], numpy.ndarray]
- A, [0]
- )
-)
-reveal_type(
- np.take( # E: Union[Union[numpy.generic, datetime.datetime, datetime.timedelta], numpy.ndarray]
- B, [0]
- )
-)
-
-reveal_type(np.reshape(a, 1)) # E: numpy.ndarray
-reveal_type(np.reshape(b, 1)) # E: numpy.ndarray
-reveal_type(np.reshape(c, 1)) # E: numpy.ndarray
-reveal_type(np.reshape(A, 1)) # E: numpy.ndarray
-reveal_type(np.reshape(B, 1)) # E: numpy.ndarray
-
-reveal_type(np.choose(a, [True, True])) # E: numpy.bool_
-reveal_type(np.choose(A, [True, True])) # E: numpy.ndarray
-
-reveal_type(np.repeat(a, 1)) # E: numpy.ndarray
-reveal_type(np.repeat(b, 1)) # E: numpy.ndarray
-reveal_type(np.repeat(c, 1)) # E: numpy.ndarray
-reveal_type(np.repeat(A, 1)) # E: numpy.ndarray
-reveal_type(np.repeat(B, 1)) # E: numpy.ndarray
-
-# TODO: Add tests for np.put()
-
-reveal_type(np.swapaxes(A, 0, 0)) # E: numpy.ndarray
-reveal_type(np.swapaxes(B, 0, 0)) # E: numpy.ndarray
-
-reveal_type(np.transpose(a)) # E: numpy.ndarray
-reveal_type(np.transpose(b)) # E: numpy.ndarray
-reveal_type(np.transpose(c)) # E: numpy.ndarray
-reveal_type(np.transpose(A)) # E: numpy.ndarray
-reveal_type(np.transpose(B)) # E: numpy.ndarray
-
-reveal_type(np.partition(a, 0, axis=None)) # E: numpy.ndarray
-reveal_type(np.partition(b, 0, axis=None)) # E: numpy.ndarray
-reveal_type(np.partition(c, 0, axis=None)) # E: numpy.ndarray
-reveal_type(np.partition(A, 0)) # E: numpy.ndarray
-reveal_type(np.partition(B, 0)) # E: numpy.ndarray
-
-reveal_type(np.argpartition(a, 0)) # E: numpy.integer
-reveal_type(np.argpartition(b, 0)) # E: numpy.integer
-reveal_type(np.argpartition(c, 0)) # E: numpy.ndarray
-reveal_type(np.argpartition(A, 0)) # E: numpy.ndarray
-reveal_type(np.argpartition(B, 0)) # E: numpy.ndarray
-
-reveal_type(np.sort(A, 0)) # E: numpy.ndarray
-reveal_type(np.sort(B, 0)) # E: numpy.ndarray
-
-reveal_type(np.argsort(A, 0)) # E: numpy.ndarray
-reveal_type(np.argsort(B, 0)) # E: numpy.ndarray
-
-reveal_type(np.argmax(A)) # E: numpy.integer
-reveal_type(np.argmax(B)) # E: numpy.integer
-reveal_type(np.argmax(A, axis=0)) # E: Union[numpy.integer, numpy.ndarray]
-reveal_type(np.argmax(B, axis=0)) # E: Union[numpy.integer, numpy.ndarray]
-
-reveal_type(np.argmin(A)) # E: numpy.integer
-reveal_type(np.argmin(B)) # E: numpy.integer
-reveal_type(np.argmin(A, axis=0)) # E: Union[numpy.integer, numpy.ndarray]
-reveal_type(np.argmin(B, axis=0)) # E: Union[numpy.integer, numpy.ndarray]
-
-reveal_type(np.searchsorted(A[0], 0)) # E: numpy.integer
-reveal_type(np.searchsorted(B[0], 0)) # E: numpy.integer
-reveal_type(np.searchsorted(A[0], [0])) # E: numpy.ndarray
-reveal_type(np.searchsorted(B[0], [0])) # E: numpy.ndarray
-
-reveal_type(np.resize(a, (5, 5))) # E: numpy.ndarray
-reveal_type(np.resize(b, (5, 5))) # E: numpy.ndarray
-reveal_type(np.resize(c, (5, 5))) # E: numpy.ndarray
-reveal_type(np.resize(A, (5, 5))) # E: numpy.ndarray
-reveal_type(np.resize(B, (5, 5))) # E: numpy.ndarray
-
-reveal_type(np.squeeze(a)) # E: numpy.bool_
-reveal_type(np.squeeze(b)) # E: numpy.float32
-reveal_type(np.squeeze(c)) # E: numpy.ndarray
-reveal_type(np.squeeze(A)) # E: numpy.ndarray
-reveal_type(np.squeeze(B)) # E: numpy.ndarray
-
-reveal_type(np.diagonal(A)) # E: numpy.ndarray
-reveal_type(np.diagonal(B)) # E: numpy.ndarray
-
-reveal_type(np.trace(A)) # E: Union[numpy.number, numpy.ndarray]
-reveal_type(np.trace(B)) # E: Union[numpy.number, numpy.ndarray]
-
-reveal_type(np.ravel(a)) # E: numpy.ndarray
-reveal_type(np.ravel(b)) # E: numpy.ndarray
-reveal_type(np.ravel(c)) # E: numpy.ndarray
-reveal_type(np.ravel(A)) # E: numpy.ndarray
-reveal_type(np.ravel(B)) # E: numpy.ndarray
-
-reveal_type(np.nonzero(a)) # E: tuple[numpy.ndarray]
-reveal_type(np.nonzero(b)) # E: tuple[numpy.ndarray]
-reveal_type(np.nonzero(c)) # E: tuple[numpy.ndarray]
-reveal_type(np.nonzero(A)) # E: tuple[numpy.ndarray]
-reveal_type(np.nonzero(B)) # E: tuple[numpy.ndarray]
-
-reveal_type(np.shape(a)) # E: tuple[builtins.int]
-reveal_type(np.shape(b)) # E: tuple[builtins.int]
-reveal_type(np.shape(c)) # E: tuple[builtins.int]
-reveal_type(np.shape(A)) # E: tuple[builtins.int]
-reveal_type(np.shape(B)) # E: tuple[builtins.int]
-
-reveal_type(np.compress([True], a)) # E: numpy.ndarray
-reveal_type(np.compress([True], b)) # E: numpy.ndarray
-reveal_type(np.compress([True], c)) # E: numpy.ndarray
-reveal_type(np.compress([True], A)) # E: numpy.ndarray
-reveal_type(np.compress([True], B)) # E: numpy.ndarray
-
-reveal_type(np.clip(a, 0, 1.0)) # E: numpy.number
-reveal_type(np.clip(b, -1, 1)) # E: numpy.float32
-reveal_type(np.clip(c, 0, 1)) # E: numpy.number
-reveal_type(np.clip(A, 0, 1)) # E: Union[numpy.number, numpy.ndarray]
-reveal_type(np.clip(B, 0, 1)) # E: Union[numpy.number, numpy.ndarray]
-
-reveal_type(np.sum(a)) # E: numpy.number
-reveal_type(np.sum(b)) # E: numpy.float32
-reveal_type(np.sum(c)) # E: numpy.number
-reveal_type(np.sum(A)) # E: numpy.number
-reveal_type(np.sum(B)) # E: numpy.number
-reveal_type(np.sum(A, axis=0)) # E: Union[numpy.number, numpy.ndarray]
-reveal_type(np.sum(B, axis=0)) # E: Union[numpy.number, numpy.ndarray]
-
-reveal_type(np.all(a)) # E: numpy.bool_
-reveal_type(np.all(b)) # E: numpy.bool_
-reveal_type(np.all(c)) # E: numpy.bool_
-reveal_type(np.all(A)) # E: numpy.bool_
-reveal_type(np.all(B)) # E: numpy.bool_
-reveal_type(np.all(A, axis=0)) # E: Union[numpy.bool_, numpy.ndarray]
-reveal_type(np.all(B, axis=0)) # E: Union[numpy.bool_, numpy.ndarray]
-reveal_type(np.all(A, keepdims=True)) # E: Union[numpy.bool_, numpy.ndarray]
-reveal_type(np.all(B, keepdims=True)) # E: Union[numpy.bool_, numpy.ndarray]
-
-reveal_type(np.any(a)) # E: numpy.bool_
-reveal_type(np.any(b)) # E: numpy.bool_
-reveal_type(np.any(c)) # E: numpy.bool_
-reveal_type(np.any(A)) # E: numpy.bool_
-reveal_type(np.any(B)) # E: numpy.bool_
-reveal_type(np.any(A, axis=0)) # E: Union[numpy.bool_, numpy.ndarray]
-reveal_type(np.any(B, axis=0)) # E: Union[numpy.bool_, numpy.ndarray]
-reveal_type(np.any(A, keepdims=True)) # E: Union[numpy.bool_, numpy.ndarray]
-reveal_type(np.any(B, keepdims=True)) # E: Union[numpy.bool_, numpy.ndarray]
-
-reveal_type(np.cumsum(a)) # E: numpy.ndarray
-reveal_type(np.cumsum(b)) # E: numpy.ndarray
-reveal_type(np.cumsum(c)) # E: numpy.ndarray
-reveal_type(np.cumsum(A)) # E: numpy.ndarray
-reveal_type(np.cumsum(B)) # E: numpy.ndarray
-
-reveal_type(np.ptp(a)) # E: numpy.number
-reveal_type(np.ptp(b)) # E: numpy.float32
-reveal_type(np.ptp(c)) # E: numpy.number
-reveal_type(np.ptp(A)) # E: numpy.number
-reveal_type(np.ptp(B)) # E: numpy.number
-reveal_type(np.ptp(A, axis=0)) # E: Union[numpy.number, numpy.ndarray]
-reveal_type(np.ptp(B, axis=0)) # E: Union[numpy.number, numpy.ndarray]
-reveal_type(np.ptp(A, keepdims=True)) # E: Union[numpy.number, numpy.ndarray]
-reveal_type(np.ptp(B, keepdims=True)) # E: Union[numpy.number, numpy.ndarray]
-
-reveal_type(np.amax(a)) # E: numpy.number
-reveal_type(np.amax(b)) # E: numpy.float32
-reveal_type(np.amax(c)) # E: numpy.number
-reveal_type(np.amax(A)) # E: numpy.number
-reveal_type(np.amax(B)) # E: numpy.number
-reveal_type(np.amax(A, axis=0)) # E: Union[numpy.number, numpy.ndarray]
-reveal_type(np.amax(B, axis=0)) # E: Union[numpy.number, numpy.ndarray]
-reveal_type(np.amax(A, keepdims=True)) # E: Union[numpy.number, numpy.ndarray]
-reveal_type(np.amax(B, keepdims=True)) # E: Union[numpy.number, numpy.ndarray]
-
-reveal_type(np.amin(a)) # E: numpy.number
-reveal_type(np.amin(b)) # E: numpy.float32
-reveal_type(np.amin(c)) # E: numpy.number
-reveal_type(np.amin(A)) # E: numpy.number
-reveal_type(np.amin(B)) # E: numpy.number
-reveal_type(np.amin(A, axis=0)) # E: Union[numpy.number, numpy.ndarray]
-reveal_type(np.amin(B, axis=0)) # E: Union[numpy.number, numpy.ndarray]
-reveal_type(np.amin(A, keepdims=True)) # E: Union[numpy.number, numpy.ndarray]
-reveal_type(np.amin(B, keepdims=True)) # E: Union[numpy.number, numpy.ndarray]
diff --git a/numpy/tests/typing/reveal/scalars.py b/numpy/tests/typing/reveal/scalars.py
deleted file mode 100644
index 8a9555fc3..000000000
--- a/numpy/tests/typing/reveal/scalars.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import numpy as np
-
-x = np.complex64(3 + 2j)
-
-reveal_type(x.real) # E: numpy.float32
-reveal_type(x.imag) # E: numpy.float32
-
-reveal_type(x.real.real) # E: numpy.float32
-reveal_type(x.real.imag) # E: numpy.float32
-
-reveal_type(x.itemsize) # E: int
-reveal_type(x.shape) # E: tuple[builtins.int]
-reveal_type(x.strides) # E: tuple[builtins.int]
-
-# Time structures
-dt = np.datetime64(0, "D")
-td = np.timedelta64(0, "D")
-
-reveal_type(dt + td) # E: numpy.datetime64
-reveal_type(dt + 1) # E: numpy.datetime64
-reveal_type(dt - dt) # E: numpy.timedelta64
-reveal_type(dt - 1) # E: numpy.timedelta64
-
-reveal_type(td + td) # E: numpy.timedelta64
-reveal_type(td + 1) # E: numpy.timedelta64
-reveal_type(td - td) # E: numpy.timedelta64
-reveal_type(td - 1) # E: numpy.timedelta64
-reveal_type(td / 1.0) # E: numpy.timedelta64
-reveal_type(td / td) # E: float
-reveal_type(td % td) # E: numpy.timedelta64
diff --git a/numpy/typing/__init__.py b/numpy/typing/__init__.py
index f2000823f..d9d9557bf 100644
--- a/numpy/typing/__init__.py
+++ b/numpy/typing/__init__.py
@@ -11,14 +11,11 @@ Typing (:mod:`numpy.typing`)
typing-extensions_ package.
Large parts of the NumPy API have PEP-484-style type annotations. In
-addition, the following type aliases are available for users.
+addition a number of type aliases are available to users, most prominently
+the two below:
-- ``typing.ArrayLike``: objects that can be converted to arrays
-- ``typing.DtypeLike``: objects that can be converted to dtypes
-
-Roughly speaking, ``typing.ArrayLike`` is "objects that can be used as
-inputs to ``np.array``" and ``typing.DtypeLike`` is "objects that can
-be used as inputs to ``np.dtype``".
+- `ArrayLike`: objects that can be converted to arrays
+- `DTypeLike`: objects that can be converted to dtypes
.. _typing-extensions: https://pypi.org/project/typing-extensions/
@@ -34,13 +31,13 @@ differences.
ArrayLike
~~~~~~~~~
-The ``ArrayLike`` type tries to avoid creating object arrays. For
+The `ArrayLike` type tries to avoid creating object arrays. For
example,
.. code-block:: python
>>> np.array(x**2 for x in range(10))
- array(<generator object <genexpr> at 0x10c004cd0>, dtype=object)
+ array(<generator object <genexpr> at ...>, dtype=object)
is valid NumPy code which will create a 0-dimensional object
array. Type checkers will complain about the above example when using
@@ -51,14 +48,14 @@ you can either use a ``# type: ignore`` comment:
>>> np.array(x**2 for x in range(10)) # type: ignore
-or explicitly type the array like object as ``Any``:
+or explicitly type the array like object as `~typing.Any`:
.. code-block:: python
>>> from typing import Any
>>> array_like: Any = (x**2 for x in range(10))
>>> np.array(array_like)
- array(<generator object <genexpr> at 0x1192741d0>, dtype=object)
+ array(<generator object <genexpr> at ...>, dtype=object)
ndarray
~~~~~~~
@@ -68,14 +65,170 @@ the following code is valid:
.. code-block:: python
- x = np.array([1, 2])
- x.dtype = np.bool_
+ >>> x = np.array([1, 2])
+ >>> x.dtype = np.bool_
This sort of mutation is not allowed by the types. Users who want to
write statically typed code should insted use the `numpy.ndarray.view`
method to create a view of the array with a different dtype.
+DTypeLike
+~~~~~~~~~
+
+The `DTypeLike` type tries to avoid creation of dtype objects using
+dictionary of fields like below:
+
+.. code-block:: python
+
+ >>> x = np.dtype({"field1": (float, 1), "field2": (int, 3)})
+
+Although this is valid NumPy code, the type checker will complain about it,
+since its usage is discouraged.
+Please see: :ref:`Data type objects <arrays.dtypes>`
+
+Number Precision
+~~~~~~~~~~~~~~~~
+
+The precision of `numpy.number` subclasses is treated as a covariant generic
+parameter (see :class:`~NBitBase`), simplifying the annotating of processes
+involving precision-based casting.
+
+.. code-block:: python
+
+ >>> from typing import TypeVar
+ >>> import numpy as np
+ >>> import numpy.typing as npt
+
+ >>> T = TypeVar("T", bound=npt.NBitBase)
+ >>> def func(a: "np.floating[T]", b: "np.floating[T]") -> "np.floating[T]":
+ ... ...
+
+Consequently, the likes of `~numpy.float16`, `~numpy.float32` and
+`~numpy.float64` are still sub-types of `~numpy.floating`, but, contrary to
+runtime, they're not necessarily considered as sub-classes.
+
+Timedelta64
+~~~~~~~~~~~
+
+The `~numpy.timedelta64` class is not considered a subclass of `~numpy.signedinteger`,
+the former only inheriting from `~numpy.generic` during static type checking.
+
+API
+---
+
"""
+# NOTE: The API section will be appended with additional entries
+# further down in this file
+
+from typing import TYPE_CHECKING, List
+
+if TYPE_CHECKING:
+ import sys
+ if sys.version_info >= (3, 8):
+ from typing import final
+ else:
+ from typing_extensions import final
+else:
+ def final(f): return f
+
+if not TYPE_CHECKING:
+ __all__ = ["ArrayLike", "DTypeLike", "NBitBase"]
+else:
+ # Ensure that all objects within this module are accessible while
+ # static type checking. This includes private ones, as we need them
+ # for internal use.
+ #
+ # Declare to mypy that `__all__` is a list of strings without assigning
+ # an explicit value
+ __all__: List[str]
+
+
+@final # Disallow the creation of arbitrary `NBitBase` subclasses
+class NBitBase:
+ """
+ An object representing `numpy.number` precision during static type checking.
+
+ Used exclusively for the purpose of static type checking, `NBitBase`
+ represents the base of a hierarchical set of subclasses.
+ Each subsequent subclass is herein used for representing a lower level
+ of precision, *e.g.* ``64Bit > 32Bit > 16Bit``.
+
+ Examples
+ --------
+ Below is a typical usage example: `NBitBase` is herein used for annotating a
+ function that takes a float and integer of arbitrary precision as arguments
+ and returns a new float of whichever precision is largest
+ (*e.g.* ``np.float16 + np.int64 -> np.float64``).
+
+ .. code-block:: python
+
+ >>> from typing import TypeVar, TYPE_CHECKING
+ >>> import numpy as np
+ >>> import numpy.typing as npt
+
+ >>> T = TypeVar("T", bound=npt.NBitBase)
+
+ >>> def add(a: "np.floating[T]", b: "np.integer[T]") -> "np.floating[T]":
+ ... return a + b
+
+ >>> a = np.float16()
+ >>> b = np.int64()
+ >>> out = add(a, b)
+
+ >>> if TYPE_CHECKING:
+ ... reveal_locals()
+ ... # note: Revealed local types are:
+ ... # note: a: numpy.floating[numpy.typing._16Bit*]
+ ... # note: b: numpy.signedinteger[numpy.typing._64Bit*]
+ ... # note: out: numpy.floating[numpy.typing._64Bit*]
+
+ """
+
+ def __init_subclass__(cls) -> None:
+ allowed_names = {
+ "NBitBase", "_256Bit", "_128Bit", "_96Bit", "_80Bit",
+ "_64Bit", "_32Bit", "_16Bit", "_8Bit",
+ }
+ if cls.__name__ not in allowed_names:
+ raise TypeError('cannot inherit from final class "NBitBase"')
+ super().__init_subclass__()
+
+
+# Silence errors about subclassing a `@final`-decorated class
+class _256Bit(NBitBase): ... # type: ignore[misc]
+class _128Bit(_256Bit): ... # type: ignore[misc]
+class _96Bit(_128Bit): ... # type: ignore[misc]
+class _80Bit(_96Bit): ... # type: ignore[misc]
+class _64Bit(_80Bit): ... # type: ignore[misc]
+class _32Bit(_64Bit): ... # type: ignore[misc]
+class _16Bit(_32Bit): ... # type: ignore[misc]
+class _8Bit(_16Bit): ... # type: ignore[misc]
+
+# Clean up the namespace
+del TYPE_CHECKING, final, List
+
+from ._scalars import (
+ _CharLike,
+ _BoolLike,
+ _UIntLike,
+ _IntLike,
+ _FloatLike,
+ _ComplexLike,
+ _TD64Like,
+ _NumberLike,
+ _ScalarLike,
+ _VoidLike,
+)
from ._array_like import _SupportsArray, ArrayLike
from ._shape import _Shape, _ShapeLike
-from ._dtype_like import DtypeLike
+from ._dtype_like import _SupportsDType, _VoidDTypeLike, DTypeLike
+
+if __doc__ is not None:
+ from ._add_docstring import _docstrings
+ __doc__ += _docstrings
+ __doc__ += '\n.. autoclass:: numpy.typing.NBitBase\n'
+ del _docstrings
+
+from numpy._pytesttester import PytestTester
+test = PytestTester(__name__)
+del PytestTester
diff --git a/numpy/typing/_add_docstring.py b/numpy/typing/_add_docstring.py
new file mode 100644
index 000000000..8e39fe2c6
--- /dev/null
+++ b/numpy/typing/_add_docstring.py
@@ -0,0 +1,96 @@
+"""A module for creating docstrings for sphinx ``data`` domains."""
+
+import re
+import textwrap
+
+_docstrings_list = []
+
+
+def add_newdoc(name, value, doc):
+ _docstrings_list.append((name, value, doc))
+
+
+def _parse_docstrings():
+ type_list_ret = []
+ for name, value, doc in _docstrings_list:
+ s = textwrap.dedent(doc).replace("\n", "\n ")
+
+ # Replace sections by rubrics
+ lines = s.split("\n")
+ new_lines = []
+ indent = ""
+ for line in lines:
+ m = re.match(r'^(\s+)[-=]+\s*$', line)
+ if m and new_lines:
+ prev = textwrap.dedent(new_lines.pop())
+ if prev == "Examples":
+ indent = ""
+ new_lines.append(f'{m.group(1)}.. rubric:: {prev}')
+ else:
+ indent = 4 * " "
+ new_lines.append(f'{m.group(1)}.. admonition:: {prev}')
+ new_lines.append("")
+ else:
+ new_lines.append(f"{indent}{line}")
+ s = "\n".join(new_lines)
+
+ # Done.
+ type_list_ret.append(f""".. data:: {name}\n :value: {value}\n {s}""")
+ return "\n".join(type_list_ret)
+
+
+add_newdoc('ArrayLike', 'typing.Union[...]',
+ """
+ A `~typing.Union` representing objects that can be coerced into an `~numpy.ndarray`.
+
+ Among others this includes the likes of:
+
+ * Scalars.
+ * (Nested) sequences.
+ * Objects implementing the `~class.__array__` protocol.
+
+ See Also
+ --------
+ :term:`array_like`:
+ Any scalar or sequence that can be interpreted as an ndarray.
+
+ Examples
+ --------
+ .. code-block:: python
+
+ >>> import numpy as np
+ >>> import numpy.typing as npt
+
+ >>> def as_array(a: npt.ArrayLike) -> np.ndarray:
+ ... return np.array(a)
+
+ """)
+
+add_newdoc('DTypeLike', 'typing.Union[...]',
+ """
+ A `~typing.Union` representing objects that can be coerced into a `~numpy.dtype`.
+
+ Among others this includes the likes of:
+
+ * :class:`type` objects.
+ * Character codes or the names of :class:`type` objects.
+ * Objects with the ``.dtype`` attribute.
+
+ See Also
+ --------
+ :ref:`Specifying and constructing data types <arrays.dtypes.constructing>`
+ A comprehensive overview of all objects that can be coerced into data types.
+
+ Examples
+ --------
+ .. code-block:: python
+
+ >>> import numpy as np
+ >>> import numpy.typing as npt
+
+ >>> def as_dtype(d: npt.DTypeLike) -> np.dtype:
+ ... return np.dtype(d)
+
+ """)
+
+_docstrings = _parse_docstrings()
diff --git a/numpy/typing/_array_like.py b/numpy/typing/_array_like.py
index 76c0c839c..63b67b33a 100644
--- a/numpy/typing/_array_like.py
+++ b/numpy/typing/_array_like.py
@@ -1,8 +1,11 @@
+from __future__ import annotations
+
import sys
-from typing import Any, overload, Sequence, TYPE_CHECKING, Union
+from typing import Any, overload, Sequence, TYPE_CHECKING, Union, TypeVar
-from numpy import ndarray
-from ._dtype_like import DtypeLike
+from numpy import ndarray, dtype
+from ._scalars import _ScalarLike
+from ._dtype_like import DTypeLike
if sys.version_info >= (3, 8):
from typing import Protocol
@@ -15,12 +18,15 @@ else:
else:
HAVE_PROTOCOL = True
+_DType = TypeVar("_DType", bound="dtype[Any]")
+
if TYPE_CHECKING or HAVE_PROTOCOL:
- class _SupportsArray(Protocol):
- @overload
- def __array__(self, __dtype: DtypeLike = ...) -> ndarray: ...
- @overload
- def __array__(self, dtype: DtypeLike = ...) -> ndarray: ...
+ # The `_SupportsArray` protocol only cares about the default dtype
+ # (i.e. `dtype=None`) of the to-be returned array.
+ # Concrete implementations of the protocol are responsible for adding
+ # any and all remaining overloads
+ class _SupportsArray(Protocol[_DType]):
+ def __array__(self, dtype: None = ...) -> ndarray[Any, _DType]: ...
else:
_SupportsArray = Any
@@ -31,4 +37,9 @@ else:
# is resolved. See also the mypy issue:
#
# https://github.com/python/typing/issues/593
-ArrayLike = Union[bool, int, float, complex, _SupportsArray, Sequence]
+ArrayLike = Union[
+ _ScalarLike,
+ Sequence[_ScalarLike],
+ Sequence[Sequence[Any]], # TODO: Wait for support for recursive types
+ "_SupportsArray[Any]",
+]
diff --git a/numpy/typing/_callable.py b/numpy/typing/_callable.py
new file mode 100644
index 000000000..831921fd7
--- /dev/null
+++ b/numpy/typing/_callable.py
@@ -0,0 +1,345 @@
+"""
+A module with various ``typing.Protocol`` subclasses that implement
+the ``__call__`` magic method.
+
+See the `Mypy documentation`_ on protocols for more details.
+
+.. _`Mypy documentation`: https://mypy.readthedocs.io/en/stable/protocols.html#callback-protocols
+
+"""
+
+import sys
+from typing import (
+ Union,
+ TypeVar,
+ overload,
+ Any,
+ Tuple,
+ NoReturn,
+ TYPE_CHECKING,
+)
+
+from numpy import (
+ ndarray,
+ generic,
+ bool_,
+ timedelta64,
+ number,
+ integer,
+ unsignedinteger,
+ signedinteger,
+ int8,
+ floating,
+ float64,
+ complexfloating,
+ complex128,
+)
+from ._scalars import (
+ _BoolLike,
+ _IntLike,
+ _FloatLike,
+ _ComplexLike,
+ _NumberLike,
+)
+from . import NBitBase
+from ._array_like import ArrayLike
+
+if sys.version_info >= (3, 8):
+ from typing import Protocol
+ HAVE_PROTOCOL = True
+else:
+ try:
+ from typing_extensions import Protocol
+ except ImportError:
+ HAVE_PROTOCOL = False
+ else:
+ HAVE_PROTOCOL = True
+
+if TYPE_CHECKING or HAVE_PROTOCOL:
+ _T = TypeVar("_T")
+ _2Tuple = Tuple[_T, _T]
+
+ _NBit_co = TypeVar("_NBit_co", covariant=True, bound=NBitBase)
+ _NBit = TypeVar("_NBit", bound=NBitBase)
+
+ _IntType = TypeVar("_IntType", bound=integer)
+ _FloatType = TypeVar("_FloatType", bound=floating)
+ _NumberType = TypeVar("_NumberType", bound=number)
+ _NumberType_co = TypeVar("_NumberType_co", covariant=True, bound=number)
+ _GenericType_co = TypeVar("_GenericType_co", covariant=True, bound=generic)
+
+ class _BoolOp(Protocol[_GenericType_co]):
+ @overload
+ def __call__(self, __other: _BoolLike) -> _GenericType_co: ...
+ @overload # platform dependent
+ def __call__(self, __other: int) -> signedinteger[Any]: ...
+ @overload
+ def __call__(self, __other: float) -> float64: ...
+ @overload
+ def __call__(self, __other: complex) -> complex128: ...
+ @overload
+ def __call__(self, __other: _NumberType) -> _NumberType: ...
+
+ class _BoolBitOp(Protocol[_GenericType_co]):
+ @overload
+ def __call__(self, __other: _BoolLike) -> _GenericType_co: ...
+ @overload # platform dependent
+ def __call__(self, __other: int) -> signedinteger[Any]: ...
+ @overload
+ def __call__(self, __other: _IntType) -> _IntType: ...
+
+ class _BoolSub(Protocol):
+ # Note that `__other: bool_` is absent here
+ @overload
+ def __call__(self, __other: bool) -> NoReturn: ...
+ @overload # platform dependent
+ def __call__(self, __other: int) -> signedinteger[Any]: ...
+ @overload
+ def __call__(self, __other: float) -> float64: ...
+ @overload
+ def __call__(self, __other: complex) -> complex128: ...
+ @overload
+ def __call__(self, __other: _NumberType) -> _NumberType: ...
+
+ class _BoolTrueDiv(Protocol):
+ @overload
+ def __call__(self, __other: Union[float, _IntLike]) -> float64: ...
+ @overload
+ def __call__(self, __other: complex) -> complex128: ...
+ @overload
+ def __call__(self, __other: _NumberType) -> _NumberType: ...
+
+ class _BoolMod(Protocol):
+ @overload
+ def __call__(self, __other: _BoolLike) -> int8: ...
+ @overload # platform dependent
+ def __call__(self, __other: int) -> signedinteger[Any]: ...
+ @overload
+ def __call__(self, __other: float) -> float64: ...
+ @overload
+ def __call__(self, __other: _IntType) -> _IntType: ...
+ @overload
+ def __call__(self, __other: _FloatType) -> _FloatType: ...
+
+ class _BoolDivMod(Protocol):
+ @overload
+ def __call__(self, __other: _BoolLike) -> _2Tuple[int8]: ...
+ @overload # platform dependent
+ def __call__(self, __other: int) -> _2Tuple[signedinteger[Any]]: ...
+ @overload
+ def __call__(self, __other: float) -> _2Tuple[float64]: ...
+ @overload
+ def __call__(self, __other: _IntType) -> _2Tuple[_IntType]: ...
+ @overload
+ def __call__(self, __other: _FloatType) -> _2Tuple[_FloatType]: ...
+
+ class _TD64Div(Protocol[_NumberType_co]):
+ @overload
+ def __call__(self, __other: timedelta64) -> _NumberType_co: ...
+ @overload
+ def __call__(self, __other: _FloatLike) -> timedelta64: ...
+
+ class _IntTrueDiv(Protocol[_NBit_co]):
+ @overload
+ def __call__(self, __other: bool) -> floating[_NBit_co]: ...
+ @overload
+ def __call__(self, __other: int) -> floating[Any]: ...
+ @overload
+ def __call__(self, __other: float) -> float64: ...
+ @overload
+ def __call__(self, __other: complex) -> complex128: ...
+ @overload
+ def __call__(self, __other: integer[_NBit]) -> floating[Union[_NBit_co, _NBit]]: ...
+
+ class _UnsignedIntOp(Protocol[_NBit_co]):
+ # NOTE: `uint64 + signedinteger -> float64`
+ @overload
+ def __call__(self, __other: bool) -> unsignedinteger[_NBit_co]: ...
+ @overload
+ def __call__(
+ self, __other: Union[int, signedinteger[Any]]
+ ) -> Union[signedinteger[Any], float64]: ...
+ @overload
+ def __call__(self, __other: float) -> float64: ...
+ @overload
+ def __call__(self, __other: complex) -> complex128: ...
+ @overload
+ def __call__(
+ self, __other: unsignedinteger[_NBit]
+ ) -> unsignedinteger[Union[_NBit_co, _NBit]]: ...
+
+ class _UnsignedIntBitOp(Protocol[_NBit_co]):
+ @overload
+ def __call__(self, __other: bool) -> unsignedinteger[_NBit_co]: ...
+ @overload
+ def __call__(self, __other: int) -> signedinteger[Any]: ...
+ @overload
+ def __call__(self, __other: signedinteger[Any]) -> signedinteger[Any]: ...
+ @overload
+ def __call__(
+ self, __other: unsignedinteger[_NBit]
+ ) -> unsignedinteger[Union[_NBit_co, _NBit]]: ...
+
+ class _UnsignedIntMod(Protocol[_NBit_co]):
+ @overload
+ def __call__(self, __other: bool) -> unsignedinteger[_NBit_co]: ...
+ @overload
+ def __call__(
+ self, __other: Union[int, signedinteger[Any]]
+ ) -> Union[signedinteger[Any], float64]: ...
+ @overload
+ def __call__(self, __other: float) -> float64: ...
+ @overload
+ def __call__(
+ self, __other: unsignedinteger[_NBit]
+ ) -> unsignedinteger[Union[_NBit_co, _NBit]]: ...
+
+ class _UnsignedIntDivMod(Protocol[_NBit_co]):
+ @overload
+ def __call__(self, __other: bool) -> _2Tuple[signedinteger[_NBit_co]]: ...
+ @overload
+ def __call__(
+ self, __other: Union[int, signedinteger[Any]]
+ ) -> Union[_2Tuple[signedinteger[Any]], _2Tuple[float64]]: ...
+ @overload
+ def __call__(self, __other: float) -> _2Tuple[float64]: ...
+ @overload
+ def __call__(
+ self, __other: unsignedinteger[_NBit]
+ ) -> _2Tuple[unsignedinteger[Union[_NBit_co, _NBit]]]: ...
+
+ class _SignedIntOp(Protocol[_NBit_co]):
+ @overload
+ def __call__(self, __other: bool) -> signedinteger[_NBit_co]: ...
+ @overload
+ def __call__(self, __other: int) -> signedinteger[Any]: ...
+ @overload
+ def __call__(self, __other: float) -> float64: ...
+ @overload
+ def __call__(self, __other: complex) -> complex128: ...
+ @overload
+ def __call__(
+ self, __other: signedinteger[_NBit]
+ ) -> signedinteger[Union[_NBit_co, _NBit]]: ...
+
+ class _SignedIntBitOp(Protocol[_NBit_co]):
+ @overload
+ def __call__(self, __other: bool) -> signedinteger[_NBit_co]: ...
+ @overload
+ def __call__(self, __other: int) -> signedinteger[Any]: ...
+ @overload
+ def __call__(
+ self, __other: signedinteger[_NBit]
+ ) -> signedinteger[Union[_NBit_co, _NBit]]: ...
+
+ class _SignedIntMod(Protocol[_NBit_co]):
+ @overload
+ def __call__(self, __other: bool) -> signedinteger[_NBit_co]: ...
+ @overload
+ def __call__(self, __other: int) -> signedinteger[Any]: ...
+ @overload
+ def __call__(self, __other: float) -> float64: ...
+ @overload
+ def __call__(
+ self, __other: signedinteger[_NBit]
+ ) -> signedinteger[Union[_NBit_co, _NBit]]: ...
+
+ class _SignedIntDivMod(Protocol[_NBit_co]):
+ @overload
+ def __call__(self, __other: bool) -> _2Tuple[signedinteger[_NBit_co]]: ...
+ @overload
+ def __call__(self, __other: int) -> _2Tuple[signedinteger[Any]]: ...
+ @overload
+ def __call__(self, __other: float) -> _2Tuple[float64]: ...
+ @overload
+ def __call__(
+ self, __other: signedinteger[_NBit]
+ ) -> _2Tuple[signedinteger[Union[_NBit_co, _NBit]]]: ...
+
+ class _FloatOp(Protocol[_NBit_co]):
+ @overload
+ def __call__(self, __other: bool) -> floating[_NBit_co]: ...
+ @overload
+ def __call__(self, __other: int) -> floating[Any]: ...
+ @overload
+ def __call__(self, __other: float) -> float64: ...
+ @overload
+ def __call__(self, __other: complex) -> complex128: ...
+ @overload
+ def __call__(
+ self, __other: Union[integer[_NBit], floating[_NBit]]
+ ) -> floating[Union[_NBit_co, _NBit]]: ...
+
+ class _FloatMod(Protocol[_NBit_co]):
+ @overload
+ def __call__(self, __other: bool) -> floating[_NBit_co]: ...
+ @overload
+ def __call__(self, __other: int) -> floating[Any]: ...
+ @overload
+ def __call__(self, __other: float) -> float64: ...
+ @overload
+ def __call__(
+ self, __other: Union[integer[_NBit], floating[_NBit]]
+ ) -> floating[Union[_NBit_co, _NBit]]: ...
+
+ class _FloatDivMod(Protocol[_NBit_co]):
+ @overload
+ def __call__(self, __other: bool) -> _2Tuple[floating[_NBit_co]]: ...
+ @overload
+ def __call__(self, __other: int) -> _2Tuple[floating[Any]]: ...
+ @overload
+ def __call__(self, __other: float) -> _2Tuple[float64]: ...
+ @overload
+ def __call__(
+ self, __other: Union[integer[_NBit], floating[_NBit]]
+ ) -> _2Tuple[floating[Union[_NBit_co, _NBit]]]: ...
+
+ class _ComplexOp(Protocol[_NBit_co]):
+ @overload
+ def __call__(self, __other: bool) -> complexfloating[_NBit_co, _NBit_co]: ...
+ @overload
+ def __call__(self, __other: int) -> complexfloating[Any, Any]: ...
+ @overload
+ def __call__(self, __other: Union[float, complex]) -> complex128: ...
+ @overload
+ def __call__(
+ self,
+ __other: Union[
+ integer[_NBit],
+ floating[_NBit],
+ complexfloating[_NBit, _NBit],
+ ]
+ ) -> complexfloating[Union[_NBit_co, _NBit], Union[_NBit_co, _NBit]]: ...
+
+ class _NumberOp(Protocol):
+ def __call__(self, __other: _NumberLike) -> number: ...
+
+ class _ComparisonOp(Protocol[_T]):
+ @overload
+ def __call__(self, __other: _T) -> bool_: ...
+ @overload
+ def __call__(self, __other: ArrayLike) -> Union[ndarray, bool_]: ...
+
+else:
+ _BoolOp = Any
+ _BoolBitOp = Any
+ _BoolSub = Any
+ _BoolTrueDiv = Any
+ _BoolMod = Any
+ _BoolDivMod = Any
+ _TD64Div = Any
+ _IntTrueDiv = Any
+ _UnsignedIntOp = Any
+ _UnsignedIntBitOp = Any
+ _UnsignedIntMod = Any
+ _UnsignedIntDivMod = Any
+ _SignedIntOp = Any
+ _SignedIntBitOp = Any
+ _SignedIntMod = Any
+ _SignedIntDivMod = Any
+ _FloatOp = Any
+ _FloatMod = Any
+ _FloatDivMod = Any
+ _ComplexOp = Any
+ _NumberOp = Any
+ _ComparisonOp = Any
diff --git a/numpy/typing/_dtype_like.py b/numpy/typing/_dtype_like.py
index b9df0af04..1953bd5fc 100644
--- a/numpy/typing/_dtype_like.py
+++ b/numpy/typing/_dtype_like.py
@@ -1,25 +1,50 @@
-from typing import Any, Dict, List, Sequence, Tuple, Union
+import sys
+from typing import Any, List, Sequence, Tuple, Union, TYPE_CHECKING
from numpy import dtype
from ._shape import _ShapeLike
-_DtypeLikeNested = Any # TODO: wait for support for recursive types
+if sys.version_info >= (3, 8):
+ from typing import Protocol, TypedDict
+ HAVE_PROTOCOL = True
+else:
+ try:
+ from typing_extensions import Protocol, TypedDict
+ except ImportError:
+ HAVE_PROTOCOL = False
+ else:
+ HAVE_PROTOCOL = True
-# Anything that can be coerced into numpy.dtype.
-# Reference: https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
-DtypeLike = Union[
- dtype,
- # default data type (float64)
- None,
- # array-scalar types and generic types
- type, # TODO: enumerate these when we add type hints for numpy scalars
- # TODO: add a protocol for anything with a dtype attribute
- # character codes, type strings or comma-separated fields, e.g., 'float64'
- str,
+_DTypeLikeNested = Any # TODO: wait for support for recursive types
+
+if TYPE_CHECKING or HAVE_PROTOCOL:
+ # Mandatory keys
+ class _DTypeDictBase(TypedDict):
+ names: Sequence[str]
+ formats: Sequence[_DTypeLikeNested]
+
+ # Mandatory + optional keys
+ class _DTypeDict(_DTypeDictBase, total=False):
+ offsets: Sequence[int]
+ titles: Sequence[Any] # Only `str` elements are usable as indexing aliases, but all objects are legal
+ itemsize: int
+ aligned: bool
+
+ # A protocol for anything with the dtype attribute
+ class _SupportsDType(Protocol):
+ dtype: _DTypeLikeNested
+
+else:
+ _DTypeDict = Any
+ _SupportsDType = Any
+
+
+# Would create a dtype[np.void]
+_VoidDTypeLike = Union[
# (flexible_dtype, itemsize)
- Tuple[_DtypeLikeNested, int],
+ Tuple[_DTypeLikeNested, int],
# (fixed_dtype, shape)
- Tuple[_DtypeLikeNested, _ShapeLike],
+ Tuple[_DTypeLikeNested, _ShapeLike],
# [(field_name, field_dtype, field_shape), ...]
#
# The type here is quite broad because NumPy accepts quite a wide
@@ -28,19 +53,29 @@ DtypeLike = Union[
List[Any],
# {'names': ..., 'formats': ..., 'offsets': ..., 'titles': ...,
# 'itemsize': ...}
- # TODO: use TypedDict when/if it's officially supported
- Dict[
- str,
- Union[
- Sequence[str], # names
- Sequence[_DtypeLikeNested], # formats
- Sequence[int], # offsets
- Sequence[Union[bytes, str, None]], # titles
- int, # itemsize
- ],
- ],
- # {'field1': ..., 'field2': ..., ...}
- Dict[str, Tuple[_DtypeLikeNested, int]],
+ _DTypeDict,
# (base_dtype, new_dtype)
- Tuple[_DtypeLikeNested, _DtypeLikeNested],
+ Tuple[_DTypeLikeNested, _DTypeLikeNested],
]
+
+# Anything that can be coerced into numpy.dtype.
+# Reference: https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
+DTypeLike = Union[
+ dtype,
+ # default data type (float64)
+ None,
+ # array-scalar types and generic types
+ type, # TODO: enumerate these when we add type hints for numpy scalars
+ # anything with a dtype attribute
+ _SupportsDType,
+ # character codes, type strings or comma-separated fields, e.g., 'float64'
+ str,
+ _VoidDTypeLike,
+]
+
+# NOTE: while it is possible to provide the dtype as a dict of
+# dtype-like objects (e.g. `{'field1': ..., 'field2': ..., ...}`),
+# this syntax is officially discouraged and
+# therefore not included in the Union defining `DTypeLike`.
+#
+# See https://github.com/numpy/numpy/issues/16891 for more details.
diff --git a/numpy/typing/_scalars.py b/numpy/typing/_scalars.py
new file mode 100644
index 000000000..90b2eff7b
--- /dev/null
+++ b/numpy/typing/_scalars.py
@@ -0,0 +1,30 @@
+from typing import Union, Tuple, Any
+
+import numpy as np
+
+# NOTE: `_StrLike` and `_BytesLike` are pointless, as `np.str_` and `np.bytes_`
+# are already subclasses of their builtin counterpart
+
+_CharLike = Union[str, bytes]
+
+# The 6 `<X>Like` type-aliases below represent all scalars that can be
+# coerced into `<X>` (with the casting rule `same_kind`)
+_BoolLike = Union[bool, np.bool_]
+_UIntLike = Union[_BoolLike, np.unsignedinteger]
+_IntLike = Union[_BoolLike, int, np.integer]
+_FloatLike = Union[_IntLike, float, np.floating]
+_ComplexLike = Union[_FloatLike, complex, np.complexfloating]
+_TD64Like = Union[_IntLike, np.timedelta64]
+
+_NumberLike = Union[int, float, complex, np.number, np.bool_]
+_ScalarLike = Union[
+ int,
+ float,
+ complex,
+ str,
+ bytes,
+ np.generic,
+]
+
+# `_VoidLike` is technically not a scalar, but it's close enough
+_VoidLike = Union[Tuple[Any, ...], np.void]
diff --git a/numpy/tests/setup.py b/numpy/typing/setup.py
index f034cdf95..c444e769f 100644
--- a/numpy/tests/setup.py
+++ b/numpy/typing/setup.py
@@ -1,7 +1,8 @@
def configuration(parent_package='', top_path=None):
from numpy.distutils.misc_util import Configuration
- config = Configuration('tests', parent_package, top_path)
- config.add_data_dir('typing')
+ config = Configuration('typing', parent_package, top_path)
+ config.add_subpackage('tests')
+ config.add_data_dir('tests/data')
return config
diff --git a/numpy/typing/tests/__init__.py b/numpy/typing/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/numpy/typing/tests/__init__.py
diff --git a/numpy/typing/tests/data/fail/arithmetic.py b/numpy/typing/tests/data/fail/arithmetic.py
new file mode 100644
index 000000000..f32eddc4b
--- /dev/null
+++ b/numpy/typing/tests/data/fail/arithmetic.py
@@ -0,0 +1,16 @@
+import numpy as np
+
+b_ = np.bool_()
+dt = np.datetime64(0, "D")
+td = np.timedelta64(0, "D")
+
+b_ - b_ # E: No overload variant
+
+dt + dt # E: Unsupported operand types
+td - dt # E: Unsupported operand types
+td % 1 # E: Unsupported operand types
+td / dt # E: No overload
+td % dt # E: Unsupported operand types
+
+-b_ # E: Unsupported operand type
++b_ # E: Unsupported operand type
diff --git a/numpy/typing/tests/data/fail/array_constructors.py b/numpy/typing/tests/data/fail/array_constructors.py
new file mode 100644
index 000000000..9cb59fe5f
--- /dev/null
+++ b/numpy/typing/tests/data/fail/array_constructors.py
@@ -0,0 +1,31 @@
+import numpy as np
+
+a: np.ndarray
+generator = (i for i in range(10))
+
+np.require(a, requirements=1) # E: No overload variant
+np.require(a, requirements="TEST") # E: incompatible type
+
+np.zeros("test") # E: incompatible type
+np.zeros() # E: Too few arguments
+
+np.ones("test") # E: incompatible type
+np.ones() # E: Too few arguments
+
+np.array(0, float, True) # E: Too many positional
+
+np.linspace(None, 'bob') # E: No overload variant
+np.linspace(0, 2, num=10.0) # E: No overload variant
+np.linspace(0, 2, endpoint='True') # E: No overload variant
+np.linspace(0, 2, retstep=b'False') # E: No overload variant
+np.linspace(0, 2, dtype=0) # E: No overload variant
+np.linspace(0, 2, axis=None) # E: No overload variant
+
+np.logspace(None, 'bob') # E: Argument 1
+np.logspace(0, 2, base=None) # E: Argument "base"
+
+np.geomspace(None, 'bob') # E: Argument 1
+
+np.stack(generator) # E: No overload variant
+np.hstack({1, 2}) # E: incompatible type
+np.vstack(1) # E: incompatible type
diff --git a/numpy/tests/typing/fail/array_like.py b/numpy/typing/tests/data/fail/array_like.py
index a97e72dc7..3bbd29061 100644
--- a/numpy/tests/typing/fail/array_like.py
+++ b/numpy/typing/tests/data/fail/array_like.py
@@ -11,6 +11,6 @@ x2: ArrayLike = A() # E: Incompatible types in assignment
x3: ArrayLike = {1: "foo", 2: "bar"} # E: Incompatible types in assignment
scalar = np.int64(1)
-scalar.__array__(dtype=np.float64) # E: Unexpected keyword argument
+scalar.__array__(dtype=np.float64) # E: No overload variant
array = np.array([1])
-array.__array__(dtype=np.float64) # E: Unexpected keyword argument
+array.__array__(dtype=np.float64) # E: No overload variant
diff --git a/numpy/typing/tests/data/fail/bitwise_ops.py b/numpy/typing/tests/data/fail/bitwise_ops.py
new file mode 100644
index 000000000..8a8f89755
--- /dev/null
+++ b/numpy/typing/tests/data/fail/bitwise_ops.py
@@ -0,0 +1,20 @@
+import numpy as np
+
+i8 = np.int64()
+i4 = np.int32()
+u8 = np.uint64()
+b_ = np.bool_()
+i = int()
+
+f8 = np.float64()
+
+b_ >> f8 # E: No overload variant
+i8 << f8 # E: No overload variant
+i | f8 # E: Unsupported operand types
+i8 ^ f8 # E: No overload variant
+u8 & f8 # E: No overload variant
+~f8 # E: Unsupported operand type
+
+# mypys' error message for `NoReturn` is unfortunately pretty bad
+# TODO: Reenable this once we add support for numerical precision for `number`s
+# a = u8 | 0 # E: Need type annotation
diff --git a/numpy/typing/tests/data/fail/constants.py b/numpy/typing/tests/data/fail/constants.py
new file mode 100644
index 000000000..67ee0e0bc
--- /dev/null
+++ b/numpy/typing/tests/data/fail/constants.py
@@ -0,0 +1,6 @@
+import numpy as np
+
+np.Inf = np.Inf # E: Cannot assign to final
+np.ALLOW_THREADS = np.ALLOW_THREADS # E: Cannot assign to final
+np.little_endian = np.little_endian # E: Cannot assign to final
+np.UFUNC_PYVALS_NAME = np.UFUNC_PYVALS_NAME # E: Cannot assign to final
diff --git a/numpy/typing/tests/data/fail/dtype.py b/numpy/typing/tests/data/fail/dtype.py
new file mode 100644
index 000000000..7d4783d8f
--- /dev/null
+++ b/numpy/typing/tests/data/fail/dtype.py
@@ -0,0 +1,16 @@
+import numpy as np
+
+class Test:
+ not_dtype = float
+
+
+np.dtype(Test()) # E: No overload variant of "dtype" matches
+
+np.dtype( # E: No overload variant of "dtype" matches
+ {
+ "field1": (float, 1),
+ "field2": (int, 3),
+ }
+)
+
+np.dtype[np.float64](np.int64) # E: Argument 1 to "dtype" has incompatible type
diff --git a/numpy/typing/tests/data/fail/flatiter.py b/numpy/typing/tests/data/fail/flatiter.py
new file mode 100644
index 000000000..544ffbe4a
--- /dev/null
+++ b/numpy/typing/tests/data/fail/flatiter.py
@@ -0,0 +1,25 @@
+from typing import Any
+
+import numpy as np
+from numpy.typing import _SupportsArray
+
+
+class Index:
+ def __index__(self) -> int:
+ ...
+
+
+a: "np.flatiter[np.ndarray]"
+supports_array: _SupportsArray
+
+a.base = Any # E: Property "base" defined in "flatiter" is read-only
+a.coords = Any # E: Property "coords" defined in "flatiter" is read-only
+a.index = Any # E: Property "index" defined in "flatiter" is read-only
+a.copy(order='C') # E: Unexpected keyword argument
+
+# NOTE: Contrary to `ndarray.__getitem__` its counterpart in `flatiter`
+# does not accept objects with the `__array__` or `__index__` protocols;
+# boolean indexing is just plain broken (gh-17175)
+a[np.bool_()] # E: No overload variant of "__getitem__"
+a[Index()] # E: No overload variant of "__getitem__"
+a[supports_array] # E: No overload variant of "__getitem__"
diff --git a/numpy/tests/typing/fail/fromnumeric.py b/numpy/typing/tests/data/fail/fromnumeric.py
index 66f8a89d0..c9156895d 100644
--- a/numpy/tests/typing/fail/fromnumeric.py
+++ b/numpy/typing/tests/data/fail/fromnumeric.py
@@ -124,3 +124,31 @@ np.amin(a, keepdims=1.0) # E: No overload variant of "amin" matches argument ty
np.amin(a, out=1.0) # E: No overload variant of "amin" matches argument type
np.amin(a, initial=[1.0]) # E: No overload variant of "amin" matches argument type
np.amin(a, where=[1.0]) # E: List item 0 has incompatible type
+
+np.prod(a, axis=1.0) # E: No overload variant of "prod" matches argument type
+np.prod(a, out=False) # E: No overload variant of "prod" matches argument type
+np.prod(a, keepdims=1.0) # E: No overload variant of "prod" matches argument type
+np.prod(a, initial=int) # E: No overload variant of "prod" matches argument type
+np.prod(a, where=1.0) # E: No overload variant of "prod" matches argument type
+
+np.cumprod(a, axis=1.0) # E: Argument "axis" to "cumprod" has incompatible type
+np.cumprod(a, out=False) # E: Argument "out" to "cumprod" has incompatible type
+
+np.size(a, axis=1.0) # E: Argument "axis" to "size" has incompatible type
+
+np.around(a, decimals=1.0) # E: No overload variant of "around" matches argument type
+np.around(a, out=type) # E: No overload variant of "around" matches argument type
+
+np.mean(a, axis=1.0) # E: No overload variant of "mean" matches argument type
+np.mean(a, out=False) # E: No overload variant of "mean" matches argument type
+np.mean(a, keepdims=1.0) # E: No overload variant of "mean" matches argument type
+
+np.std(a, axis=1.0) # E: No overload variant of "std" matches argument type
+np.std(a, out=False) # E: No overload variant of "std" matches argument type
+np.std(a, ddof='test') # E: No overload variant of "std" matches argument type
+np.std(a, keepdims=1.0) # E: No overload variant of "std" matches argument type
+
+np.var(a, axis=1.0) # E: No overload variant of "var" matches argument type
+np.var(a, out=False) # E: No overload variant of "var" matches argument type
+np.var(a, ddof='test') # E: No overload variant of "var" matches argument type
+np.var(a, keepdims=1.0) # E: No overload variant of "var" matches argument type
diff --git a/numpy/typing/tests/data/fail/modules.py b/numpy/typing/tests/data/fail/modules.py
new file mode 100644
index 000000000..5e2d820ab
--- /dev/null
+++ b/numpy/typing/tests/data/fail/modules.py
@@ -0,0 +1,10 @@
+import numpy as np
+
+np.testing.bob # E: Module has no attribute
+np.bob # E: Module has no attribute
+
+# Stdlib modules in the namespace by accident
+np.warnings # E: Module has no attribute
+np.sys # E: Module has no attribute
+np.os # E: Module has no attribute
+np.math # E: Module has no attribute
diff --git a/numpy/tests/typing/fail/ndarray.py b/numpy/typing/tests/data/fail/ndarray.py
index 5a5130d40..5a5130d40 100644
--- a/numpy/tests/typing/fail/ndarray.py
+++ b/numpy/typing/tests/data/fail/ndarray.py
diff --git a/numpy/typing/tests/data/fail/ndarray_misc.py b/numpy/typing/tests/data/fail/ndarray_misc.py
new file mode 100644
index 000000000..1e1496bfe
--- /dev/null
+++ b/numpy/typing/tests/data/fail/ndarray_misc.py
@@ -0,0 +1,21 @@
+"""
+Tests for miscellaneous (non-magic) ``np.ndarray``/``np.generic`` methods.
+
+More extensive tests are performed for the methods'
+function-based counterpart in `../fromnumeric.py`.
+
+"""
+
+import numpy as np
+
+f8: np.float64
+
+f8.argpartition(0) # E: has no attribute
+f8.diagonal() # E: has no attribute
+f8.dot(1) # E: has no attribute
+f8.nonzero() # E: has no attribute
+f8.partition(0) # E: has no attribute
+f8.put(0, 2) # E: has no attribute
+f8.setfield(2, np.float64) # E: has no attribute
+f8.sort() # E: has no attribute
+f8.trace() # E: has no attribute
diff --git a/numpy/tests/typing/fail/numerictypes.py b/numpy/typing/tests/data/fail/numerictypes.py
index dd03eacc1..94537a23b 100644
--- a/numpy/tests/typing/fail/numerictypes.py
+++ b/numpy/typing/tests/data/fail/numerictypes.py
@@ -10,4 +10,4 @@ np.issubsctype(1, np.int64) # E: incompatible type "int"
np.issubdtype(1, np.int64) # E: incompatible type "int"
-np.find_common_type(np.int64, np.int64) # E: incompatible type "Type[int64]"
+np.find_common_type(np.int64, np.int64) # E: incompatible type "Type[signedinteger[Any]]"
diff --git a/numpy/tests/typing/fail/scalars.py b/numpy/typing/tests/data/fail/scalars.py
index 5d7221895..f09740875 100644
--- a/numpy/tests/typing/fail/scalars.py
+++ b/numpy/typing/tests/data/fail/scalars.py
@@ -1,5 +1,7 @@
import numpy as np
+f8: np.float64
+
# Construction
np.float32(3j) # E: incompatible type
@@ -28,17 +30,6 @@ np.complex64(1, 2) # E: Too many arguments
np.datetime64(0) # E: non-matching overload
-dt_64 = np.datetime64(0, "D")
-td_64 = np.timedelta64(1, "h")
-
-dt_64 + dt_64 # E: Unsupported operand types
-
-td_64 - dt_64 # E: Unsupported operand types
-td_64 / dt_64 # E: No overload
-td_64 % 1 # E: Unsupported operand types
-td_64 % dt_64 # E: Unsupported operand types
-
-
class A:
def __float__(self):
return 1.0
@@ -58,11 +49,7 @@ np.void("test") # E: incompatible type
np.generic(1) # E: Cannot instantiate abstract class
np.number(1) # E: Cannot instantiate abstract class
np.integer(1) # E: Cannot instantiate abstract class
-np.signedinteger(1) # E: Cannot instantiate abstract class
-np.unsignedinteger(1) # E: Cannot instantiate abstract class
np.inexact(1) # E: Cannot instantiate abstract class
-np.floating(1) # E: Cannot instantiate abstract class
-np.complexfloating(1) # E: Cannot instantiate abstract class
np.character("test") # E: Cannot instantiate abstract class
np.flexible(b"test") # E: Cannot instantiate abstract class
@@ -79,3 +66,11 @@ np.timedelta64(value=0) # E: Unexpected keyword argument
np.bytes_(b"hello", encoding='utf-8') # E: No overload variant
np.str_("hello", encoding='utf-8') # E: No overload variant
+
+complex(np.bytes_("1")) # E: No overload variant
+
+f8.item(1) # E: incompatible type
+f8.item((0, 1)) # E: incompatible type
+f8.squeeze(axis=1) # E: incompatible type
+f8.squeeze(axis=(0, 1)) # E: incompatible type
+f8.transpose(1) # E: incompatible type
diff --git a/numpy/typing/tests/data/fail/ufunc_config.py b/numpy/typing/tests/data/fail/ufunc_config.py
new file mode 100644
index 000000000..f547fbb46
--- /dev/null
+++ b/numpy/typing/tests/data/fail/ufunc_config.py
@@ -0,0 +1,21 @@
+"""Typing tests for `numpy.core._ufunc_config`."""
+
+import numpy as np
+
+def func1(a: str, b: int, c: float) -> None: ...
+def func2(a: str, *, b: int) -> None: ...
+
+class Write1:
+ def write1(self, a: str) -> None: ...
+
+class Write2:
+ def write(self, a: str, b: str) -> None: ...
+
+class Write3:
+ def write(self, *, a: str) -> None: ...
+
+np.seterrcall(func1) # E: Argument 1 to "seterrcall" has incompatible type
+np.seterrcall(func2) # E: Argument 1 to "seterrcall" has incompatible type
+np.seterrcall(Write1()) # E: Argument 1 to "seterrcall" has incompatible type
+np.seterrcall(Write2()) # E: Argument 1 to "seterrcall" has incompatible type
+np.seterrcall(Write3()) # E: Argument 1 to "seterrcall" has incompatible type
diff --git a/numpy/tests/typing/fail/ufuncs.py b/numpy/typing/tests/data/fail/ufuncs.py
index 4da9d08ba..4da9d08ba 100644
--- a/numpy/tests/typing/fail/ufuncs.py
+++ b/numpy/typing/tests/data/fail/ufuncs.py
diff --git a/numpy/tests/typing/fail/warnings_and_errors.py b/numpy/typing/tests/data/fail/warnings_and_errors.py
index 7390cc45f..7390cc45f 100644
--- a/numpy/tests/typing/fail/warnings_and_errors.py
+++ b/numpy/typing/tests/data/fail/warnings_and_errors.py
diff --git a/numpy/tests/typing/mypy.ini b/numpy/typing/tests/data/mypy.ini
index 91d93588a..91d93588a 100644
--- a/numpy/tests/typing/mypy.ini
+++ b/numpy/typing/tests/data/mypy.ini
diff --git a/numpy/typing/tests/data/pass/arithmetic.py b/numpy/typing/tests/data/pass/arithmetic.py
new file mode 100644
index 000000000..ffbaf2975
--- /dev/null
+++ b/numpy/typing/tests/data/pass/arithmetic.py
@@ -0,0 +1,293 @@
+import numpy as np
+
+c16 = np.complex128(1)
+f8 = np.float64(1)
+i8 = np.int64(1)
+u8 = np.uint64(1)
+
+c8 = np.complex64(1)
+f4 = np.float32(1)
+i4 = np.int32(1)
+u4 = np.uint32(1)
+
+dt = np.datetime64(1, "D")
+td = np.timedelta64(1, "D")
+
+b_ = np.bool_(1)
+
+b = bool(1)
+c = complex(1)
+f = float(1)
+i = int(1)
+
+AR = np.ones(1, dtype=np.float64)
+AR.setflags(write=False)
+
+# unary ops
+
+-c16
+-c8
+-f8
+-f4
+-i8
+-i4
+-u8
+-u4
+-td
+-AR
+
++c16
++c8
++f8
++f4
++i8
++i4
++u8
++u4
++td
++AR
+
+abs(c16)
+abs(c8)
+abs(f8)
+abs(f4)
+abs(i8)
+abs(i4)
+abs(u8)
+abs(u4)
+abs(td)
+abs(b_)
+abs(AR)
+
+# Time structures
+
+dt + td
+dt + i
+dt + i4
+dt + i8
+dt - dt
+dt - i
+dt - i4
+dt - i8
+
+td + td
+td + i
+td + i4
+td + i8
+td - td
+td - i
+td - i4
+td - i8
+td / f
+td / f4
+td / f8
+td / td
+td // td
+td % td
+
+
+# boolean
+
+b_ / b
+b_ / b_
+b_ / i
+b_ / i8
+b_ / i4
+b_ / u8
+b_ / u4
+b_ / f
+b_ / f8
+b_ / f4
+b_ / c
+b_ / c16
+b_ / c8
+
+b / b_
+b_ / b_
+i / b_
+i8 / b_
+i4 / b_
+u8 / b_
+u4 / b_
+f / b_
+f8 / b_
+f4 / b_
+c / b_
+c16 / b_
+c8 / b_
+
+# Complex
+
+c16 + c16
+c16 + f8
+c16 + i8
+c16 + c8
+c16 + f4
+c16 + i4
+c16 + b_
+c16 + b
+c16 + c
+c16 + f
+c16 + i
+c16 + AR
+
+c16 + c16
+f8 + c16
+i8 + c16
+c8 + c16
+f4 + c16
+i4 + c16
+b_ + c16
+b + c16
+c + c16
+f + c16
+i + c16
+AR + c16
+
+c8 + c16
+c8 + f8
+c8 + i8
+c8 + c8
+c8 + f4
+c8 + i4
+c8 + b_
+c8 + b
+c8 + c
+c8 + f
+c8 + i
+c8 + AR
+
+c16 + c8
+f8 + c8
+i8 + c8
+c8 + c8
+f4 + c8
+i4 + c8
+b_ + c8
+b + c8
+c + c8
+f + c8
+i + c8
+AR + c8
+
+# Float
+
+f8 + f8
+f8 + i8
+f8 + f4
+f8 + i4
+f8 + b_
+f8 + b
+f8 + c
+f8 + f
+f8 + i
+f8 + AR
+
+f8 + f8
+i8 + f8
+f4 + f8
+i4 + f8
+b_ + f8
+b + f8
+c + f8
+f + f8
+i + f8
+AR + f8
+
+f4 + f8
+f4 + i8
+f4 + f4
+f4 + i4
+f4 + b_
+f4 + b
+f4 + c
+f4 + f
+f4 + i
+f4 + AR
+
+f8 + f4
+i8 + f4
+f4 + f4
+i4 + f4
+b_ + f4
+b + f4
+c + f4
+f + f4
+i + f4
+AR + f4
+
+# Int
+
+i8 + i8
+i8 + u8
+i8 + i4
+i8 + u4
+i8 + b_
+i8 + b
+i8 + c
+i8 + f
+i8 + i
+i8 + AR
+
+u8 + u8
+u8 + i4
+u8 + u4
+u8 + b_
+u8 + b
+u8 + c
+u8 + f
+u8 + i
+u8 + AR
+
+i8 + i8
+u8 + i8
+i4 + i8
+u4 + i8
+b_ + i8
+b + i8
+c + i8
+f + i8
+i + i8
+AR + i8
+
+u8 + u8
+i4 + u8
+u4 + u8
+b_ + u8
+b + u8
+c + u8
+f + u8
+i + u8
+AR + u8
+
+i4 + i8
+i4 + i4
+i4 + i
+i4 + b_
+i4 + b
+i4 + AR
+
+u4 + i8
+u4 + i4
+u4 + u8
+u4 + u4
+u4 + i
+u4 + b_
+u4 + b
+u4 + AR
+
+i8 + i4
+i4 + i4
+i + i4
+b_ + i4
+b + i4
+AR + i4
+
+i8 + u4
+i4 + u4
+u8 + u4
+u4 + u4
+b_ + u4
+b + u4
+i + u4
+AR + u4
diff --git a/numpy/typing/tests/data/pass/array_constructors.py b/numpy/typing/tests/data/pass/array_constructors.py
new file mode 100644
index 000000000..63208f139
--- /dev/null
+++ b/numpy/typing/tests/data/pass/array_constructors.py
@@ -0,0 +1,128 @@
+from typing import List, Any
+import numpy as np
+
+class Index:
+ def __index__(self) -> int:
+ return 0
+
+class SubClass(np.ndarray): ...
+
+i8 = np.int64(1)
+
+A = np.array([1])
+B = A.view(SubClass).copy()
+B_stack = np.array([[1], [1]]).view(SubClass)
+C = [1]
+
+def func(i: int, j: int, **kwargs: Any) -> SubClass:
+ return B
+
+np.array(1, dtype=float)
+np.array(1, copy=False)
+np.array(1, order='F')
+np.array(1, order=None)
+np.array(1, subok=True)
+np.array(1, ndmin=3)
+np.array(1, str, copy=True, order='C', subok=False, ndmin=2)
+
+np.asarray(A)
+np.asarray(B)
+np.asarray(C)
+
+np.asanyarray(A)
+np.asanyarray(B)
+np.asanyarray(B, dtype=int)
+np.asanyarray(C)
+
+np.ascontiguousarray(A)
+np.ascontiguousarray(B)
+np.ascontiguousarray(C)
+
+np.asfortranarray(A)
+np.asfortranarray(B)
+np.asfortranarray(C)
+
+np.require(A)
+np.require(B)
+np.require(B, dtype=int)
+np.require(B, requirements=None)
+np.require(B, requirements="E")
+np.require(B, requirements=["ENSUREARRAY"])
+np.require(B, requirements={"F", "E"})
+np.require(B, requirements=["C", "OWNDATA"])
+np.require(B, requirements="W")
+np.require(B, requirements="A")
+np.require(C)
+
+np.linspace(0, 2)
+np.linspace(0.5, [0, 1, 2])
+np.linspace([0, 1, 2], 3)
+np.linspace(0j, 2)
+np.linspace(0, 2, num=10)
+np.linspace(0, 2, endpoint=True)
+np.linspace(0, 2, retstep=True)
+np.linspace(0j, 2j, retstep=True)
+np.linspace(0, 2, dtype=bool)
+np.linspace([0, 1], [2, 3], axis=Index())
+
+np.logspace(0, 2, base=2)
+np.logspace(0, 2, base=2)
+np.logspace(0, 2, base=[1j, 2j], num=2)
+
+np.geomspace(1, 2)
+
+np.zeros_like(A)
+np.zeros_like(C)
+np.zeros_like(B)
+np.zeros_like(B, dtype=np.int64)
+
+np.ones_like(A)
+np.ones_like(C)
+np.ones_like(B)
+np.ones_like(B, dtype=np.int64)
+
+np.empty_like(A)
+np.empty_like(C)
+np.empty_like(B)
+np.empty_like(B, dtype=np.int64)
+
+np.full_like(A, i8)
+np.full_like(C, i8)
+np.full_like(B, i8)
+np.full_like(B, i8, dtype=np.int64)
+
+np.ones(1)
+np.ones([1, 1, 1])
+
+np.full(1, i8)
+np.full([1, 1, 1], i8)
+
+np.indices([1, 2, 3])
+np.indices([1, 2, 3], sparse=True)
+
+np.fromfunction(func, (3, 5))
+
+np.identity(10)
+
+np.atleast_1d(C)
+np.atleast_1d(A)
+np.atleast_1d(C, C)
+np.atleast_1d(C, A)
+np.atleast_1d(A, A)
+
+np.atleast_2d(C)
+
+np.atleast_3d(C)
+
+np.vstack([C, C])
+np.vstack([C, A])
+np.vstack([A, A])
+
+np.hstack([C, C])
+
+np.stack([C, C])
+np.stack([C, C], axis=0)
+np.stack([C, C], out=B_stack)
+
+np.block([[C, C], [C, C]])
+np.block(A)
diff --git a/numpy/tests/typing/pass/array_like.py b/numpy/typing/tests/data/pass/array_like.py
index 6b823ca7e..563fc08c7 100644
--- a/numpy/tests/typing/pass/array_like.py
+++ b/numpy/typing/tests/data/pass/array_like.py
@@ -1,7 +1,7 @@
from typing import Any, List, Optional
import numpy as np
-from numpy.typing import ArrayLike, DtypeLike, _SupportsArray
+from numpy.typing import ArrayLike, DTypeLike, _SupportsArray
x1: ArrayLike = True
x2: ArrayLike = 5
@@ -18,20 +18,20 @@ x12: ArrayLike = memoryview(b'foo')
class A:
- def __array__(self, dtype: DtypeLike = None) -> np.ndarray:
+ def __array__(self, dtype: DTypeLike = None) -> np.ndarray:
return np.array([1, 2, 3])
x13: ArrayLike = A()
scalar: _SupportsArray = np.int64(1)
-scalar.__array__(np.float64)
+scalar.__array__(None)
array: _SupportsArray = np.array(1)
-array.__array__(np.float64)
+array.__array__(None)
a: _SupportsArray = A()
-a.__array__(np.int64)
-a.__array__(dtype=np.int64)
+a.__array__(None)
+a.__array__(dtype=None)
# Escape hatch for when you mean to make something like an object
# array.
diff --git a/numpy/typing/tests/data/pass/bitwise_ops.py b/numpy/typing/tests/data/pass/bitwise_ops.py
new file mode 100644
index 000000000..67449e2c2
--- /dev/null
+++ b/numpy/typing/tests/data/pass/bitwise_ops.py
@@ -0,0 +1,131 @@
+import numpy as np
+
+i8 = np.int64(1)
+u8 = np.uint64(1)
+
+i4 = np.int32(1)
+u4 = np.uint32(1)
+
+b_ = np.bool_(1)
+
+b = bool(1)
+i = int(1)
+
+AR = np.array([0, 1, 2], dtype=np.int32)
+AR.setflags(write=False)
+
+
+i8 << i8
+i8 >> i8
+i8 | i8
+i8 ^ i8
+i8 & i8
+
+i8 << AR
+i8 >> AR
+i8 | AR
+i8 ^ AR
+i8 & AR
+
+i4 << i4
+i4 >> i4
+i4 | i4
+i4 ^ i4
+i4 & i4
+
+i8 << i4
+i8 >> i4
+i8 | i4
+i8 ^ i4
+i8 & i4
+
+i8 << i
+i8 >> i
+i8 | i
+i8 ^ i
+i8 & i
+
+i8 << b_
+i8 >> b_
+i8 | b_
+i8 ^ b_
+i8 & b_
+
+i8 << b
+i8 >> b
+i8 | b
+i8 ^ b
+i8 & b
+
+u8 << u8
+u8 >> u8
+u8 | u8
+u8 ^ u8
+u8 & u8
+
+u8 << AR
+u8 >> AR
+u8 | AR
+u8 ^ AR
+u8 & AR
+
+u4 << u4
+u4 >> u4
+u4 | u4
+u4 ^ u4
+u4 & u4
+
+u4 << i4
+u4 >> i4
+u4 | i4
+u4 ^ i4
+u4 & i4
+
+u4 << i
+u4 >> i
+u4 | i
+u4 ^ i
+u4 & i
+
+u8 << b_
+u8 >> b_
+u8 | b_
+u8 ^ b_
+u8 & b_
+
+u8 << b
+u8 >> b
+u8 | b
+u8 ^ b
+u8 & b
+
+b_ << b_
+b_ >> b_
+b_ | b_
+b_ ^ b_
+b_ & b_
+
+b_ << AR
+b_ >> AR
+b_ | AR
+b_ ^ AR
+b_ & AR
+
+b_ << b
+b_ >> b
+b_ | b
+b_ ^ b
+b_ & b
+
+b_ << i
+b_ >> i
+b_ | i
+b_ ^ i
+b_ & i
+
+~i8
+~i4
+~u8
+~u4
+~b_
+~AR
diff --git a/numpy/typing/tests/data/pass/comparisons.py b/numpy/typing/tests/data/pass/comparisons.py
new file mode 100644
index 000000000..b298117a6
--- /dev/null
+++ b/numpy/typing/tests/data/pass/comparisons.py
@@ -0,0 +1,247 @@
+import numpy as np
+
+c16 = np.complex128()
+f8 = np.float64()
+i8 = np.int64()
+u8 = np.uint64()
+
+c8 = np.complex64()
+f4 = np.float32()
+i4 = np.int32()
+u4 = np.uint32()
+
+dt = np.datetime64(0, "D")
+td = np.timedelta64(0, "D")
+
+b_ = np.bool_()
+
+b = bool()
+c = complex()
+f = float()
+i = int()
+
+AR = np.array([0], dtype=np.int64)
+AR.setflags(write=False)
+
+SEQ = (0, 1, 2, 3, 4)
+
+# Time structures
+
+dt > dt
+
+td > td
+td > i
+td > i4
+td > i8
+td > AR
+td > SEQ
+
+# boolean
+
+b_ > b
+b_ > b_
+b_ > i
+b_ > i8
+b_ > i4
+b_ > u8
+b_ > u4
+b_ > f
+b_ > f8
+b_ > f4
+b_ > c
+b_ > c16
+b_ > c8
+b_ > AR
+b_ > SEQ
+
+# Complex
+
+c16 > c16
+c16 > f8
+c16 > i8
+c16 > c8
+c16 > f4
+c16 > i4
+c16 > b_
+c16 > b
+c16 > c
+c16 > f
+c16 > i
+c16 > AR
+c16 > SEQ
+
+c16 > c16
+f8 > c16
+i8 > c16
+c8 > c16
+f4 > c16
+i4 > c16
+b_ > c16
+b > c16
+c > c16
+f > c16
+i > c16
+AR > c16
+SEQ > c16
+
+c8 > c16
+c8 > f8
+c8 > i8
+c8 > c8
+c8 > f4
+c8 > i4
+c8 > b_
+c8 > b
+c8 > c
+c8 > f
+c8 > i
+c8 > AR
+c8 > SEQ
+
+c16 > c8
+f8 > c8
+i8 > c8
+c8 > c8
+f4 > c8
+i4 > c8
+b_ > c8
+b > c8
+c > c8
+f > c8
+i > c8
+AR > c8
+SEQ > c8
+
+# Float
+
+f8 > f8
+f8 > i8
+f8 > f4
+f8 > i4
+f8 > b_
+f8 > b
+f8 > c
+f8 > f
+f8 > i
+f8 > AR
+f8 > SEQ
+
+f8 > f8
+i8 > f8
+f4 > f8
+i4 > f8
+b_ > f8
+b > f8
+c > f8
+f > f8
+i > f8
+AR > f8
+SEQ > f8
+
+f4 > f8
+f4 > i8
+f4 > f4
+f4 > i4
+f4 > b_
+f4 > b
+f4 > c
+f4 > f
+f4 > i
+f4 > AR
+f4 > SEQ
+
+f8 > f4
+i8 > f4
+f4 > f4
+i4 > f4
+b_ > f4
+b > f4
+c > f4
+f > f4
+i > f4
+AR > f4
+SEQ > f4
+
+# Int
+
+i8 > i8
+i8 > u8
+i8 > i4
+i8 > u4
+i8 > b_
+i8 > b
+i8 > c
+i8 > f
+i8 > i
+i8 > AR
+i8 > SEQ
+
+u8 > u8
+u8 > i4
+u8 > u4
+u8 > b_
+u8 > b
+u8 > c
+u8 > f
+u8 > i
+u8 > AR
+u8 > SEQ
+
+i8 > i8
+u8 > i8
+i4 > i8
+u4 > i8
+b_ > i8
+b > i8
+c > i8
+f > i8
+i > i8
+AR > i8
+SEQ > i8
+
+u8 > u8
+i4 > u8
+u4 > u8
+b_ > u8
+b > u8
+c > u8
+f > u8
+i > u8
+AR > u8
+SEQ > u8
+
+i4 > i8
+i4 > i4
+i4 > i
+i4 > b_
+i4 > b
+i4 > AR
+i4 > SEQ
+
+u4 > i8
+u4 > i4
+u4 > u8
+u4 > u4
+u4 > i
+u4 > b_
+u4 > b
+u4 > AR
+u4 > SEQ
+
+i8 > i4
+i4 > i4
+i > i4
+b_ > i4
+b > i4
+AR > i4
+SEQ > i4
+
+i8 > u4
+i4 > u4
+u8 > u4
+u4 > u4
+b_ > u4
+b > u4
+i > u4
+AR > u4
+SEQ > u4
diff --git a/numpy/typing/tests/data/pass/dtype.py b/numpy/typing/tests/data/pass/dtype.py
new file mode 100644
index 000000000..a97edc302
--- /dev/null
+++ b/numpy/typing/tests/data/pass/dtype.py
@@ -0,0 +1,43 @@
+import numpy as np
+
+dtype_obj = np.dtype(np.str_)
+
+np.dtype(dtype=np.int64)
+np.dtype(int)
+np.dtype("int")
+np.dtype(None)
+
+np.dtype((int, 2))
+np.dtype((int, (1,)))
+
+np.dtype({"names": ["a", "b"], "formats": [int, float]})
+np.dtype({"names": ["a"], "formats": [int], "titles": [object]})
+np.dtype({"names": ["a"], "formats": [int], "titles": [object()]})
+
+np.dtype([("name", np.unicode_, 16), ("grades", np.float64, (2,)), ("age", "int32")])
+
+np.dtype(
+ {
+ "names": ["a", "b"],
+ "formats": [int, float],
+ "itemsize": 9,
+ "aligned": False,
+ "titles": ["x", "y"],
+ "offsets": [0, 1],
+ }
+)
+
+np.dtype((np.float_, float))
+
+
+class Test:
+ dtype = float
+
+
+np.dtype(Test())
+
+# Methods and attributes
+dtype_obj.base
+dtype_obj.subdtype
+dtype_obj.newbyteorder()
+dtype_obj.type
diff --git a/numpy/typing/tests/data/pass/flatiter.py b/numpy/typing/tests/data/pass/flatiter.py
new file mode 100644
index 000000000..4fdf25299
--- /dev/null
+++ b/numpy/typing/tests/data/pass/flatiter.py
@@ -0,0 +1,16 @@
+import numpy as np
+
+a = np.empty((2, 2)).flat
+
+a.base
+a.copy()
+a.coords
+a.index
+iter(a)
+next(a)
+a[0]
+a[[0, 1, 2]]
+a[...]
+a[:]
+a.__array__()
+a.__array__(np.float64)
diff --git a/numpy/tests/typing/pass/fromnumeric.py b/numpy/typing/tests/data/pass/fromnumeric.py
index d9dd45c54..9e936e684 100644
--- a/numpy/tests/typing/pass/fromnumeric.py
+++ b/numpy/typing/tests/data/pass/fromnumeric.py
@@ -10,6 +10,7 @@ B.setflags(write=False)
a = np.bool_(True)
b = np.float32(1.0)
c = 1.0
+d = np.array(1.0, dtype=np.float32) # writeable
np.take(a, 0)
np.take(b, 0)
@@ -183,3 +184,77 @@ np.amin(A, axis=0)
np.amin(B, axis=0)
np.amin(A, keepdims=True)
np.amin(B, keepdims=True)
+
+np.prod(a)
+np.prod(b)
+np.prod(c)
+np.prod(A)
+np.prod(B)
+np.prod(a, dtype=None)
+np.prod(A, dtype=None)
+np.prod(A, axis=0)
+np.prod(B, axis=0)
+np.prod(A, keepdims=True)
+np.prod(B, keepdims=True)
+np.prod(b, out=d)
+np.prod(B, out=d)
+
+np.cumprod(a)
+np.cumprod(b)
+np.cumprod(c)
+np.cumprod(A)
+np.cumprod(B)
+
+np.ndim(a)
+np.ndim(b)
+np.ndim(c)
+np.ndim(A)
+np.ndim(B)
+
+np.size(a)
+np.size(b)
+np.size(c)
+np.size(A)
+np.size(B)
+
+np.around(a)
+np.around(b)
+np.around(c)
+np.around(A)
+np.around(B)
+
+np.mean(a)
+np.mean(b)
+np.mean(c)
+np.mean(A)
+np.mean(B)
+np.mean(A, axis=0)
+np.mean(B, axis=0)
+np.mean(A, keepdims=True)
+np.mean(B, keepdims=True)
+np.mean(b, out=d)
+np.mean(B, out=d)
+
+np.std(a)
+np.std(b)
+np.std(c)
+np.std(A)
+np.std(B)
+np.std(A, axis=0)
+np.std(B, axis=0)
+np.std(A, keepdims=True)
+np.std(B, keepdims=True)
+np.std(b, out=d)
+np.std(B, out=d)
+
+np.var(a)
+np.var(b)
+np.var(c)
+np.var(A)
+np.var(B)
+np.var(A, axis=0)
+np.var(B, axis=0)
+np.var(A, keepdims=True)
+np.var(B, keepdims=True)
+np.var(b, out=d)
+np.var(B, out=d)
diff --git a/numpy/typing/tests/data/pass/literal.py b/numpy/typing/tests/data/pass/literal.py
new file mode 100644
index 000000000..8eaeb6afb
--- /dev/null
+++ b/numpy/typing/tests/data/pass/literal.py
@@ -0,0 +1,45 @@
+from functools import partial
+from typing import Callable, List, Tuple
+
+import pytest # type: ignore
+import numpy as np
+
+AR = np.array(0)
+AR.setflags(write=False)
+
+KACF = frozenset({None, "K", "A", "C", "F"})
+ACF = frozenset({None, "A", "C", "F"})
+CF = frozenset({None, "C", "F"})
+
+order_list: List[Tuple[frozenset, Callable]] = [
+ (KACF, partial(np.ndarray, 1)),
+ (KACF, AR.tobytes),
+ (KACF, partial(AR.astype, int)),
+ (KACF, AR.copy),
+ (ACF, partial(AR.reshape, 1)),
+ (KACF, AR.flatten),
+ (KACF, AR.ravel),
+ (KACF, partial(np.array, 1)),
+ (CF, partial(np.zeros, 1)),
+ (CF, partial(np.ones, 1)),
+ (CF, partial(np.empty, 1)),
+ (CF, partial(np.full, 1, 1)),
+ (KACF, partial(np.zeros_like, AR)),
+ (KACF, partial(np.ones_like, AR)),
+ (KACF, partial(np.empty_like, AR)),
+ (KACF, partial(np.full_like, AR, 1)),
+ (KACF, partial(np.add, 1, 1)), # i.e. np.ufunc.__call__
+ (ACF, partial(np.reshape, AR, 1)),
+ (KACF, partial(np.ravel, AR)),
+ (KACF, partial(np.asarray, 1)),
+ (KACF, partial(np.asanyarray, 1)),
+]
+
+for order_set, func in order_list:
+ for order in order_set:
+ func(order=order)
+
+ invalid_orders = KACF - order_set
+ for order in invalid_orders:
+ with pytest.raises(ValueError):
+ func(order=order)
diff --git a/numpy/typing/tests/data/pass/mod.py b/numpy/typing/tests/data/pass/mod.py
new file mode 100644
index 000000000..b5b9afb2a
--- /dev/null
+++ b/numpy/typing/tests/data/pass/mod.py
@@ -0,0 +1,149 @@
+import numpy as np
+
+f8 = np.float64(1)
+i8 = np.int64(1)
+u8 = np.uint64(1)
+
+f4 = np.float32(1)
+i4 = np.int32(1)
+u4 = np.uint32(1)
+
+td = np.timedelta64(1, "D")
+b_ = np.bool_(1)
+
+b = bool(1)
+f = float(1)
+i = int(1)
+
+AR = np.array([1], dtype=np.bool_)
+AR.setflags(write=False)
+
+AR2 = np.array([1], dtype=np.timedelta64)
+AR2.setflags(write=False)
+
+# Time structures
+
+td % td
+td % AR2
+AR2 % td
+
+divmod(td, td)
+divmod(td, AR2)
+divmod(AR2, td)
+
+# Bool
+
+b_ % b
+b_ % i
+b_ % f
+b_ % b_
+b_ % i8
+b_ % u8
+b_ % f8
+b_ % AR
+
+divmod(b_, b)
+divmod(b_, i)
+divmod(b_, f)
+divmod(b_, b_)
+divmod(b_, i8)
+divmod(b_, u8)
+divmod(b_, f8)
+divmod(b_, AR)
+
+b % b_
+i % b_
+f % b_
+b_ % b_
+i8 % b_
+u8 % b_
+f8 % b_
+AR % b_
+
+divmod(b, b_)
+divmod(i, b_)
+divmod(f, b_)
+divmod(b_, b_)
+divmod(i8, b_)
+divmod(u8, b_)
+divmod(f8, b_)
+divmod(AR, b_)
+
+# int
+
+i8 % b
+i8 % i
+i8 % f
+i8 % i8
+i8 % f8
+i4 % i8
+i4 % f8
+i4 % i4
+i4 % f4
+i8 % AR
+
+divmod(i8, b)
+divmod(i8, i)
+divmod(i8, f)
+divmod(i8, i8)
+divmod(i8, f8)
+divmod(i8, i4)
+divmod(i8, f4)
+divmod(i4, i4)
+divmod(i4, f4)
+divmod(i8, AR)
+
+b % i8
+i % i8
+f % i8
+i8 % i8
+f8 % i8
+i8 % i4
+f8 % i4
+i4 % i4
+f4 % i4
+AR % i8
+
+divmod(b, i8)
+divmod(i, i8)
+divmod(f, i8)
+divmod(i8, i8)
+divmod(f8, i8)
+divmod(i4, i8)
+divmod(f4, i8)
+divmod(i4, i4)
+divmod(f4, i4)
+divmod(AR, i8)
+
+# float
+
+f8 % b
+f8 % i
+f8 % f
+i8 % f4
+f4 % f4
+f8 % AR
+
+divmod(f8, b)
+divmod(f8, i)
+divmod(f8, f)
+divmod(f8, f8)
+divmod(f8, f4)
+divmod(f4, f4)
+divmod(f8, AR)
+
+b % f8
+i % f8
+f % f8
+f8 % f8
+f8 % f8
+f4 % f4
+AR % f8
+
+divmod(b, f8)
+divmod(i, f8)
+divmod(f, f8)
+divmod(f8, f8)
+divmod(f4, f8)
+divmod(f4, f4)
+divmod(AR, f8)
diff --git a/numpy/tests/typing/pass/ndarray_conversion.py b/numpy/typing/tests/data/pass/ndarray_conversion.py
index 303cf53e4..303cf53e4 100644
--- a/numpy/tests/typing/pass/ndarray_conversion.py
+++ b/numpy/typing/tests/data/pass/ndarray_conversion.py
diff --git a/numpy/typing/tests/data/pass/ndarray_misc.py b/numpy/typing/tests/data/pass/ndarray_misc.py
new file mode 100644
index 000000000..6c6f5d50b
--- /dev/null
+++ b/numpy/typing/tests/data/pass/ndarray_misc.py
@@ -0,0 +1,159 @@
+"""
+Tests for miscellaneous (non-magic) ``np.ndarray``/``np.generic`` methods.
+
+More extensive tests are performed for the methods'
+function-based counterpart in `../fromnumeric.py`.
+
+"""
+
+from typing import cast
+import numpy as np
+
+class SubClass(np.ndarray): ...
+
+i4 = np.int32(1)
+A = np.array([[1]], dtype=np.int32)
+B0 = np.empty((), dtype=np.int32).view(SubClass)
+B1 = np.empty((1,), dtype=np.int32).view(SubClass)
+B2 = np.empty((1, 1), dtype=np.int32).view(SubClass)
+C = np.array([0, 1, 2], dtype=np.int32)
+D = np.empty(3).view(SubClass)
+
+i4.all()
+A.all()
+A.all(axis=0)
+A.all(keepdims=True)
+A.all(out=B0)
+
+i4.any()
+A.any()
+A.any(axis=0)
+A.any(keepdims=True)
+A.any(out=B0)
+
+i4.argmax()
+A.argmax()
+A.argmax(axis=0)
+A.argmax(out=B0)
+
+i4.argmin()
+A.argmin()
+A.argmin(axis=0)
+A.argmin(out=B0)
+
+i4.argsort()
+A.argsort()
+
+i4.choose([()])
+_choices = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]], dtype=np.int32)
+C.choose(_choices)
+C.choose(_choices, out=D)
+
+i4.clip(1)
+A.clip(1)
+A.clip(None, 1)
+A.clip(1, out=B2)
+A.clip(None, 1, out=B2)
+
+i4.compress([1])
+A.compress([1])
+A.compress([1], out=B1)
+
+i4.conj()
+A.conj()
+B0.conj()
+
+i4.conjugate()
+A.conjugate()
+B0.conjugate()
+
+i4.cumprod()
+A.cumprod()
+A.cumprod(out=B1)
+
+i4.cumsum()
+A.cumsum()
+A.cumsum(out=B1)
+
+i4.max()
+A.max()
+A.max(axis=0)
+A.max(keepdims=True)
+A.max(out=B0)
+
+i4.mean()
+A.mean()
+A.mean(axis=0)
+A.mean(keepdims=True)
+A.mean(out=B0)
+
+i4.min()
+A.min()
+A.min(axis=0)
+A.min(keepdims=True)
+A.min(out=B0)
+
+i4.newbyteorder()
+A.newbyteorder()
+B0.newbyteorder('|')
+
+i4.prod()
+A.prod()
+A.prod(axis=0)
+A.prod(keepdims=True)
+A.prod(out=B0)
+
+i4.ptp()
+A.ptp()
+A.ptp(axis=0)
+A.ptp(keepdims=True)
+A.astype(int).ptp(out=B0)
+
+i4.round()
+A.round()
+A.round(out=B2)
+
+i4.repeat(1)
+A.repeat(1)
+B0.repeat(1)
+
+i4.std()
+A.std()
+A.std(axis=0)
+A.std(keepdims=True)
+A.std(out=B0.astype(np.float64))
+
+i4.sum()
+A.sum()
+A.sum(axis=0)
+A.sum(keepdims=True)
+A.sum(out=B0)
+
+i4.take(0)
+A.take(0)
+A.take([0])
+A.take(0, out=B0)
+A.take([0], out=B1)
+
+i4.var()
+A.var()
+A.var(axis=0)
+A.var(keepdims=True)
+A.var(out=B0)
+
+A.argpartition([0])
+
+A.diagonal()
+
+A.dot(1)
+A.dot(1, out=B0)
+
+A.nonzero()
+
+C.searchsorted(1)
+
+A.trace()
+A.trace(out=B0)
+
+void = cast(np.void, np.array(1, dtype=[("f", np.float64)]).take(0))
+void.setfield(10, np.float64)
diff --git a/numpy/tests/typing/pass/ndarray_shape_manipulation.py b/numpy/typing/tests/data/pass/ndarray_shape_manipulation.py
index 0ca3dff39..0ca3dff39 100644
--- a/numpy/tests/typing/pass/ndarray_shape_manipulation.py
+++ b/numpy/typing/tests/data/pass/ndarray_shape_manipulation.py
diff --git a/numpy/typing/tests/data/pass/numeric.py b/numpy/typing/tests/data/pass/numeric.py
new file mode 100644
index 000000000..34fef7270
--- /dev/null
+++ b/numpy/typing/tests/data/pass/numeric.py
@@ -0,0 +1,89 @@
+"""
+Tests for :mod:`numpy.core.numeric`.
+
+Does not include tests which fall under ``array_constructors``.
+
+"""
+
+from typing import List
+import numpy as np
+
+class SubClass(np.ndarray):
+ ...
+
+i8 = np.int64(1)
+
+A = np.arange(27).reshape(3, 3, 3)
+B: List[List[List[int]]] = A.tolist()
+C = np.empty((27, 27)).view(SubClass)
+
+np.count_nonzero(i8)
+np.count_nonzero(A)
+np.count_nonzero(B)
+np.count_nonzero(A, keepdims=True)
+np.count_nonzero(A, axis=0)
+
+np.isfortran(i8)
+np.isfortran(A)
+
+np.argwhere(i8)
+np.argwhere(A)
+
+np.flatnonzero(i8)
+np.flatnonzero(A)
+
+np.correlate(B[0][0], A.ravel(), mode="valid")
+np.correlate(A.ravel(), A.ravel(), mode="same")
+
+np.convolve(B[0][0], A.ravel(), mode="valid")
+np.convolve(A.ravel(), A.ravel(), mode="same")
+
+np.outer(i8, A)
+np.outer(B, A)
+np.outer(A, A)
+np.outer(A, A, out=C)
+
+np.tensordot(B, A)
+np.tensordot(A, A)
+np.tensordot(A, A, axes=0)
+np.tensordot(A, A, axes=(0, 1))
+
+np.isscalar(i8)
+np.isscalar(A)
+np.isscalar(B)
+
+np.roll(A, 1)
+np.roll(A, (1, 2))
+np.roll(B, 1)
+
+np.rollaxis(A, 0, 1)
+
+np.moveaxis(A, 0, 1)
+np.moveaxis(A, (0, 1), (1, 2))
+
+np.cross(B, A)
+np.cross(A, A)
+
+np.indices([0, 1, 2])
+np.indices([0, 1, 2], sparse=False)
+np.indices([0, 1, 2], sparse=True)
+
+np.binary_repr(1)
+
+np.base_repr(1)
+
+np.allclose(i8, A)
+np.allclose(B, A)
+np.allclose(A, A)
+
+np.isclose(i8, A)
+np.isclose(B, A)
+np.isclose(A, A)
+
+np.array_equal(i8, A)
+np.array_equal(B, A)
+np.array_equal(A, A)
+
+np.array_equiv(i8, A)
+np.array_equiv(B, A)
+np.array_equiv(A, A)
diff --git a/numpy/tests/typing/pass/numerictypes.py b/numpy/typing/tests/data/pass/numerictypes.py
index 4f205cabc..4f205cabc 100644
--- a/numpy/tests/typing/pass/numerictypes.py
+++ b/numpy/typing/tests/data/pass/numerictypes.py
diff --git a/numpy/typing/tests/data/pass/scalars.py b/numpy/typing/tests/data/pass/scalars.py
new file mode 100644
index 000000000..b7f7880e4
--- /dev/null
+++ b/numpy/typing/tests/data/pass/scalars.py
@@ -0,0 +1,165 @@
+import sys
+import datetime as dt
+
+import pytest
+import numpy as np
+
+
+# Construction
+class D:
+ def __index__(self) -> int:
+ return 0
+
+
+class C:
+ def __complex__(self) -> complex:
+ return 3j
+
+
+class B:
+ def __int__(self) -> int:
+ return 4
+
+
+class A:
+ def __float__(self) -> float:
+ return 4.0
+
+
+np.complex64(3j)
+np.complex64(A())
+np.complex64(C())
+np.complex128(3j)
+np.complex128(C())
+np.complex128(None)
+np.complex64("1.2")
+np.complex128(b"2j")
+
+np.int8(4)
+np.int16(3.4)
+np.int32(4)
+np.int64(-1)
+np.uint8(B())
+np.uint32()
+np.int32("1")
+np.int64(b"2")
+
+np.float16(A())
+np.float32(16)
+np.float64(3.0)
+np.float64(None)
+np.float32("1")
+np.float16(b"2.5")
+
+if sys.version_info >= (3, 8):
+ np.uint64(D())
+ np.float32(D())
+ np.complex64(D())
+
+np.bytes_(b"hello")
+np.bytes_("hello", 'utf-8')
+np.bytes_("hello", encoding='utf-8')
+np.str_("hello")
+np.str_(b"hello", 'utf-8')
+np.str_(b"hello", encoding='utf-8')
+
+# Array-ish semantics
+np.int8().real
+np.int16().imag
+np.int32().data
+np.int64().flags
+
+np.uint8().itemsize * 2
+np.uint16().ndim + 1
+np.uint32().strides
+np.uint64().shape
+
+# Time structures
+np.datetime64()
+np.datetime64(0, "D")
+np.datetime64(0, b"D")
+np.datetime64(0, ('ms', 3))
+np.datetime64("2019")
+np.datetime64(b"2019")
+np.datetime64("2019", "D")
+np.datetime64(np.datetime64())
+np.datetime64(dt.datetime(2000, 5, 3))
+np.datetime64(None)
+np.datetime64(None, "D")
+
+np.timedelta64()
+np.timedelta64(0)
+np.timedelta64(0, "D")
+np.timedelta64(0, ('ms', 3))
+np.timedelta64(0, b"D")
+np.timedelta64("3")
+np.timedelta64(b"5")
+np.timedelta64(np.timedelta64(2))
+np.timedelta64(dt.timedelta(2))
+np.timedelta64(None)
+np.timedelta64(None, "D")
+
+np.void(1)
+np.void(np.int64(1))
+np.void(True)
+np.void(np.bool_(True))
+np.void(b"test")
+np.void(np.bytes_("test"))
+
+# Protocols
+i8 = np.int64()
+u8 = np.uint64()
+f8 = np.float64()
+c16 = np.complex128()
+b_ = np.bool_()
+td = np.timedelta64()
+U = np.str_("1")
+S = np.bytes_("1")
+AR = np.array(1, dtype=np.float64)
+
+int(i8)
+int(u8)
+int(f8)
+int(b_)
+int(td)
+int(U)
+int(S)
+int(AR)
+with pytest.warns(np.ComplexWarning):
+ int(c16)
+
+float(i8)
+float(u8)
+float(f8)
+float(b_)
+float(td)
+float(U)
+float(S)
+float(AR)
+with pytest.warns(np.ComplexWarning):
+ float(c16)
+
+complex(i8)
+complex(u8)
+complex(f8)
+complex(c16)
+complex(b_)
+complex(td)
+complex(U)
+complex(AR)
+
+
+# Misc
+c16.dtype
+c16.real
+c16.imag
+c16.real.real
+c16.real.imag
+c16.ndim
+c16.size
+c16.itemsize
+c16.shape
+c16.strides
+c16.squeeze()
+c16.byteswap()
+c16.transpose()
diff --git a/numpy/tests/typing/pass/simple.py b/numpy/typing/tests/data/pass/simple.py
index e0157ab81..243caf229 100644
--- a/numpy/tests/typing/pass/simple.py
+++ b/numpy/typing/tests/data/pass/simple.py
@@ -17,15 +17,6 @@ ndarray_func(np.array([1, 2]))
array == 1
array.dtype == float
-# Array creation routines checks
-ndarray_func(np.zeros([1, 2]))
-ndarray_func(np.ones([1, 2]))
-ndarray_func(np.empty([1, 2]))
-
-ndarray_func(np.zeros_like(array))
-ndarray_func(np.ones_like(array))
-ndarray_func(np.empty_like(array))
-
# Dtype construction
np.dtype(float)
np.dtype(np.float64)
@@ -54,8 +45,6 @@ np.dtype(shape_like_dtype)
object_dtype = [("field1", object)]
np.dtype(object_dtype)
-np.dtype({"col1": ("U10", 0), "col2": ("float32", 10)})
-np.dtype((np.int32, {"real": (np.int16, 0), "imag": (np.int16, 2)}))
np.dtype((np.int32, (np.int8, 4)))
# Dtype comparision
diff --git a/numpy/tests/typing/pass/simple_py3.py b/numpy/typing/tests/data/pass/simple_py3.py
index c05a1ce61..c05a1ce61 100644
--- a/numpy/tests/typing/pass/simple_py3.py
+++ b/numpy/typing/tests/data/pass/simple_py3.py
diff --git a/numpy/typing/tests/data/pass/ufunc_config.py b/numpy/typing/tests/data/pass/ufunc_config.py
new file mode 100644
index 000000000..2d1314245
--- /dev/null
+++ b/numpy/typing/tests/data/pass/ufunc_config.py
@@ -0,0 +1,50 @@
+"""Typing tests for `numpy.core._ufunc_config`."""
+
+import numpy as np
+
+def func1(a: str, b: int) -> None: ...
+def func2(a: str, b: int, c: float = ...) -> None: ...
+def func3(a: str, b: int) -> int: ...
+
+class Write1:
+ def write(self, a: str) -> None: ...
+
+class Write2:
+ def write(self, a: str, b: int = ...) -> None: ...
+
+class Write3:
+ def write(self, a: str) -> int: ...
+
+
+_err_default = np.geterr()
+_bufsize_default = np.getbufsize()
+_errcall_default = np.geterrcall()
+
+try:
+ np.seterr(all=None)
+ np.seterr(divide="ignore")
+ np.seterr(over="warn")
+ np.seterr(under="call")
+ np.seterr(invalid="raise")
+ np.geterr()
+
+ np.setbufsize(4096)
+ np.getbufsize()
+
+ np.seterrcall(func1)
+ np.seterrcall(func2)
+ np.seterrcall(func3)
+ np.seterrcall(Write1())
+ np.seterrcall(Write2())
+ np.seterrcall(Write3())
+ np.geterrcall()
+
+ with np.errstate(call=func1, all="call"):
+ pass
+ with np.errstate(call=Write1(), divide="log", over="log"):
+ pass
+
+finally:
+ np.seterr(**_err_default)
+ np.setbufsize(_bufsize_default)
+ np.seterrcall(_errcall_default)
diff --git a/numpy/tests/typing/pass/ufuncs.py b/numpy/typing/tests/data/pass/ufuncs.py
index 82172952a..ad4d483d4 100644
--- a/numpy/tests/typing/pass/ufuncs.py
+++ b/numpy/typing/tests/data/pass/ufuncs.py
@@ -6,7 +6,10 @@ np.sin(1, out=np.empty(1))
np.matmul(np.ones((2, 2, 2)), np.ones((2, 2, 2)), axes=[(0, 1), (0, 1), (0, 1)])
np.sin(1, signature="D")
np.sin(1, extobj=[16, 1, lambda: None])
-np.sin(1) + np.sin(1)
+# NOTE: `np.generic` subclasses are not guaranteed to support addition;
+# re-enable this once we can infer the exact return type of `np.sin(...)`.
+#
+# np.sin(1) + np.sin(1)
np.sin.types[0]
np.sin.__name__
diff --git a/numpy/tests/typing/pass/warnings_and_errors.py b/numpy/typing/tests/data/pass/warnings_and_errors.py
index 5b6ec2626..5b6ec2626 100644
--- a/numpy/tests/typing/pass/warnings_and_errors.py
+++ b/numpy/typing/tests/data/pass/warnings_and_errors.py
diff --git a/numpy/typing/tests/data/reveal/arithmetic.py b/numpy/typing/tests/data/reveal/arithmetic.py
new file mode 100644
index 000000000..20310e691
--- /dev/null
+++ b/numpy/typing/tests/data/reveal/arithmetic.py
@@ -0,0 +1,291 @@
+import numpy as np
+
+c16 = np.complex128()
+f8 = np.float64()
+i8 = np.int64()
+u8 = np.uint64()
+
+c8 = np.complex64()
+f4 = np.float32()
+i4 = np.int32()
+u4 = np.uint32()
+
+dt = np.datetime64(0, "D")
+td = np.timedelta64(0, "D")
+
+b_ = np.bool_()
+
+b = bool()
+c = complex()
+f = float()
+i = int()
+
+AR = np.array([0], dtype=np.float64)
+AR.setflags(write=False)
+
+# unary ops
+
+reveal_type(-c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(-c8) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+reveal_type(-f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(-f4) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(-i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(-i4) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(-u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(-u4) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+reveal_type(-td) # E: numpy.timedelta64
+reveal_type(-AR) # E: Any
+
+reveal_type(+c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(+c8) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+reveal_type(+f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(+f4) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(+i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(+i4) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(+u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(+u4) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+reveal_type(+td) # E: numpy.timedelta64
+reveal_type(+AR) # E: Any
+
+reveal_type(abs(c16)) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(abs(c8)) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(abs(f8)) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(abs(f4)) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(abs(i8)) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(abs(i4)) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(abs(u8)) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(abs(u4)) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+reveal_type(abs(td)) # E: numpy.timedelta64
+reveal_type(abs(b_)) # E: numpy.bool_
+reveal_type(abs(AR)) # E: Any
+
+# Time structures
+
+reveal_type(dt + td) # E: numpy.datetime64
+reveal_type(dt + i) # E: numpy.datetime64
+reveal_type(dt + i4) # E: numpy.datetime64
+reveal_type(dt + i8) # E: numpy.datetime64
+reveal_type(dt - dt) # E: numpy.timedelta64
+reveal_type(dt - i) # E: numpy.datetime64
+reveal_type(dt - i4) # E: numpy.datetime64
+reveal_type(dt - i8) # E: numpy.datetime64
+
+reveal_type(td + td) # E: numpy.timedelta64
+reveal_type(td + i) # E: numpy.timedelta64
+reveal_type(td + i4) # E: numpy.timedelta64
+reveal_type(td + i8) # E: numpy.timedelta64
+reveal_type(td - td) # E: numpy.timedelta64
+reveal_type(td - i) # E: numpy.timedelta64
+reveal_type(td - i4) # E: numpy.timedelta64
+reveal_type(td - i8) # E: numpy.timedelta64
+reveal_type(td / f) # E: numpy.timedelta64
+reveal_type(td / f4) # E: numpy.timedelta64
+reveal_type(td / f8) # E: numpy.timedelta64
+reveal_type(td / td) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(td // td) # E: numpy.signedinteger[numpy.typing._64Bit]
+
+# boolean
+
+reveal_type(b_ / b) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b_ / b_) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b_ / i) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b_ / i8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b_ / i4) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b_ / u8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b_ / u4) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b_ / f) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b_ / f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b_ / f4) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(b_ / c) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(b_ / c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(b_ / c8) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+
+reveal_type(b / b_) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b_ / b_) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i / b_) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i8 / b_) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i4 / b_) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(u8 / b_) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(u4 / b_) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f / b_) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f8 / b_) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f4 / b_) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(c / b_) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c16 / b_) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c8 / b_) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+
+# Complex
+
+reveal_type(c16 + c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c16 + f8) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c16 + i8) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c16 + c8) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c16 + f4) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c16 + i4) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c16 + b_) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c16 + b) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c16 + c) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c16 + f) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c16 + i) # E: numpy.complexfloating[Any, Any]
+reveal_type(c16 + AR) # E: Any
+
+reveal_type(c16 + c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(f8 + c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(i8 + c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c8 + c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(f4 + c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(i4 + c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(b_ + c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(b + c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c + c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(f + c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(i + c16) # E: numpy.complexfloating[Any, Any]
+reveal_type(AR + c16) # E: Any
+
+reveal_type(c8 + c16) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c8 + f8) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c8 + i8) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c8 + c8) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+reveal_type(c8 + f4) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+reveal_type(c8 + i4) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+reveal_type(c8 + b_) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+reveal_type(c8 + b) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+reveal_type(c8 + c) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c8 + f) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c8 + i) # E: numpy.complexfloating[Any, Any]
+reveal_type(c8 + AR) # E: Any
+
+reveal_type(c16 + c8) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(f8 + c8) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(i8 + c8) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(c8 + c8) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+reveal_type(f4 + c8) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+reveal_type(i4 + c8) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+reveal_type(b_ + c8) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+reveal_type(b + c8) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+reveal_type(c + c8) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(f + c8) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(i + c8) # E: numpy.complexfloating[Any, Any]
+reveal_type(AR + c8) # E: Any
+
+# Float
+
+reveal_type(f8 + f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f8 + i8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f8 + f4) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f8 + i4) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f8 + b_) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f8 + b) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f8 + c) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(f8 + f) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f8 + i) # E: numpy.floating[Any]
+reveal_type(f8 + AR) # E: Any
+
+reveal_type(f8 + f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i8 + f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f4 + f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i4 + f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b_ + f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b + f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(c + f8) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(f + f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i + f8) # E: numpy.floating[Any]
+reveal_type(AR + f8) # E: Any
+
+reveal_type(f4 + f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f4 + i8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f4 + f4) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(f4 + i4) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(f4 + b_) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(f4 + b) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(f4 + c) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(f4 + f) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f4 + i) # E: numpy.floating[Any]
+reveal_type(f4 + AR) # E: Any
+
+reveal_type(f8 + f4) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i8 + f4) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f4 + f4)  # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(i4 + f4)  # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(b_ + f4)  # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(b + f4)  # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(c + f4) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(f + f4) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i + f4) # E: numpy.floating[Any]
+reveal_type(AR + f4) # E: Any
+
+# Int
+
+reveal_type(i8 + i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 + u8) # E: Union[numpy.signedinteger[Any], numpy.floating[numpy.typing._64Bit]]
+reveal_type(i8 + i4) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 + u4) # E: Union[numpy.signedinteger[Any], numpy.floating[numpy.typing._64Bit]]
+reveal_type(i8 + b_) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 + b) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 + c) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(i8 + f) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i8 + i) # E: numpy.signedinteger[Any]
+reveal_type(i8 + AR) # E: Any
+
+reveal_type(u8 + u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 + i4) # E: Union[numpy.signedinteger[Any], numpy.floating[numpy.typing._64Bit]]
+reveal_type(u8 + u4) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 + b_) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 + b) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 + c) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(u8 + f) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(u8 + i) # E: Union[numpy.signedinteger[Any], numpy.floating[numpy.typing._64Bit]]
+reveal_type(u8 + AR) # E: Any
+
+reveal_type(i8 + i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(u8 + i8) # E: Union[numpy.signedinteger[Any], numpy.floating[numpy.typing._64Bit]]
+reveal_type(i4 + i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(u4 + i8) # E: Union[numpy.signedinteger[Any], numpy.floating[numpy.typing._64Bit]]
+reveal_type(b_ + i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(b + i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(c + i8) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(f + i8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i + i8) # E: numpy.signedinteger[Any]
+reveal_type(AR + i8) # E: Any
+
+reveal_type(u8 + u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(i4 + u8) # E: Union[numpy.signedinteger[Any], numpy.floating[numpy.typing._64Bit]]
+reveal_type(u4 + u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(b_ + u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(b + u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(c + u8) # E: numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]
+reveal_type(f + u8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i + u8) # E: Union[numpy.signedinteger[Any], numpy.floating[numpy.typing._64Bit]]
+reveal_type(AR + u8) # E: Any
+
+reveal_type(i4 + i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i4 + i4) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(i4 + i) # E: numpy.signedinteger[Any]
+reveal_type(i4 + b_) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(i4 + b) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(i4 + AR) # E: Any
+
+reveal_type(u4 + i8) # E: Union[numpy.signedinteger[Any], numpy.floating[numpy.typing._64Bit]]
+reveal_type(u4 + i4) # E: Union[numpy.signedinteger[Any], numpy.floating[numpy.typing._64Bit]]
+reveal_type(u4 + u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u4 + u4) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+reveal_type(u4 + i) # E: Union[numpy.signedinteger[Any], numpy.floating[numpy.typing._64Bit]]
+reveal_type(u4 + b_) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+reveal_type(u4 + b) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+reveal_type(u4 + AR) # E: Any
+
+reveal_type(i8 + i4) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i4 + i4) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(i + i4) # E: numpy.signedinteger[Any]
+reveal_type(b_ + i4) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(b + i4) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(AR + i4) # E: Any
+
+reveal_type(i8 + u4) # E: Union[numpy.signedinteger[Any], numpy.floating[numpy.typing._64Bit]]
+reveal_type(i4 + u4) # E: Union[numpy.signedinteger[Any], numpy.floating[numpy.typing._64Bit]]
+reveal_type(u8 + u4) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u4 + u4) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+reveal_type(b_ + u4) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+reveal_type(b + u4) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+reveal_type(i + u4) # E: Union[numpy.signedinteger[Any], numpy.floating[numpy.typing._64Bit]]
+reveal_type(AR + u4) # E: Any
diff --git a/numpy/typing/tests/data/reveal/array_constructors.py b/numpy/typing/tests/data/reveal/array_constructors.py
new file mode 100644
index 000000000..04d5cd229
--- /dev/null
+++ b/numpy/typing/tests/data/reveal/array_constructors.py
@@ -0,0 +1,102 @@
+from typing import List, Any
+import numpy as np
+
+class SubClass(np.ndarray): ...
+
+i8: np.int64
+
+A: np.ndarray
+B: SubClass
+C: List[int]
+
+def func(i: int, j: int, **kwargs: Any) -> SubClass: ...
+
+reveal_type(np.asarray(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.asarray(B)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.asarray(C)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.asanyarray(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.asanyarray(B)) # E: SubClass
+reveal_type(np.asanyarray(B, dtype=int)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.asanyarray(C)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.ascontiguousarray(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.ascontiguousarray(B)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.ascontiguousarray(C)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.asfortranarray(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.asfortranarray(B)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.asfortranarray(C)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.require(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.require(B)) # E: SubClass
+reveal_type(np.require(B, requirements=None)) # E: SubClass
+reveal_type(np.require(B, dtype=int)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.require(B, requirements="E")) # E: numpy.ndarray[Any, Any]
+reveal_type(np.require(B, requirements=["ENSUREARRAY"])) # E: numpy.ndarray[Any, Any]
+reveal_type(np.require(B, requirements={"F", "E"})) # E: numpy.ndarray[Any, Any]
+reveal_type(np.require(B, requirements=["C", "OWNDATA"])) # E: SubClass
+reveal_type(np.require(B, requirements="W")) # E: SubClass
+reveal_type(np.require(B, requirements="A")) # E: SubClass
+reveal_type(np.require(C)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.linspace(0, 10)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.linspace(0, 10, retstep=True)) # E: Tuple[numpy.ndarray[Any, Any], numpy.inexact[Any]]
+reveal_type(np.logspace(0, 10)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.geomspace(1, 10)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.zeros_like(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.zeros_like(C)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.zeros_like(B)) # E: SubClass
+reveal_type(np.zeros_like(B, dtype=np.int64)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.ones_like(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.ones_like(C)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.ones_like(B)) # E: SubClass
+reveal_type(np.ones_like(B, dtype=np.int64)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.empty_like(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.empty_like(C)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.empty_like(B)) # E: SubClass
+reveal_type(np.empty_like(B, dtype=np.int64)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.full_like(A, i8)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.full_like(C, i8)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.full_like(B, i8)) # E: SubClass
+reveal_type(np.full_like(B, i8, dtype=np.int64)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.ones(1)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.ones([1, 1, 1])) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.full(1, i8)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.full([1, 1, 1], i8)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.indices([1, 2, 3])) # E: numpy.ndarray[Any, Any]
+reveal_type(np.indices([1, 2, 3], sparse=True)) # E: tuple[numpy.ndarray[Any, Any]]
+
+reveal_type(np.fromfunction(func, (3, 5))) # E: SubClass
+
+reveal_type(np.identity(10)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.atleast_1d(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.atleast_1d(C)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.atleast_1d(A, A)) # E: list[numpy.ndarray[Any, Any]]
+reveal_type(np.atleast_1d(A, C)) # E: list[numpy.ndarray[Any, Any]]
+reveal_type(np.atleast_1d(C, C)) # E: list[numpy.ndarray[Any, Any]]
+
+reveal_type(np.atleast_2d(A)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.atleast_3d(A)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.vstack([A, A])) # E: numpy.ndarray[Any, Any]
+reveal_type(np.vstack([A, C])) # E: numpy.ndarray[Any, Any]
+reveal_type(np.vstack([C, C])) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.hstack([A, A])) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.stack([A, A])) # E: numpy.ndarray[Any, Any]
+reveal_type(np.stack([A, A], axis=0)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.stack([A, A], out=B)) # E: SubClass
+
+reveal_type(np.block([[A, A], [A, A]])) # E: numpy.ndarray[Any, Any]
+reveal_type(np.block(C)) # E: numpy.ndarray[Any, Any]
diff --git a/numpy/typing/tests/data/reveal/bitwise_ops.py b/numpy/typing/tests/data/reveal/bitwise_ops.py
new file mode 100644
index 000000000..cb9131a96
--- /dev/null
+++ b/numpy/typing/tests/data/reveal/bitwise_ops.py
@@ -0,0 +1,131 @@
+import numpy as np
+
+i8 = np.int64(1)
+u8 = np.uint64(1)
+
+i4 = np.int32(1)
+u4 = np.uint32(1)
+
+b_ = np.bool_(1)
+
+b = bool(1)
+i = int(1)
+
+AR = np.array([0, 1, 2], dtype=np.int32)
+AR.setflags(write=False)
+
+
+reveal_type(i8 << i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 >> i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 | i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 ^ i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 & i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+
+reveal_type(i8 << AR) # E: Any
+reveal_type(i8 >> AR) # E: Any
+reveal_type(i8 | AR) # E: Any
+reveal_type(i8 ^ AR) # E: Any
+reveal_type(i8 & AR) # E: Any
+
+reveal_type(i4 << i4) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(i4 >> i4) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(i4 | i4) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(i4 ^ i4) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(i4 & i4) # E: numpy.signedinteger[numpy.typing._32Bit]
+
+reveal_type(i8 << i4) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 >> i4) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 | i4) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 ^ i4) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 & i4) # E: numpy.signedinteger[numpy.typing._64Bit]
+
+reveal_type(i8 << i) # E: numpy.signedinteger[Any]
+reveal_type(i8 >> i) # E: numpy.signedinteger[Any]
+reveal_type(i8 | i) # E: numpy.signedinteger[Any]
+reveal_type(i8 ^ i) # E: numpy.signedinteger[Any]
+reveal_type(i8 & i) # E: numpy.signedinteger[Any]
+
+reveal_type(i8 << b_) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 >> b_) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 | b_) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 ^ b_) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 & b_) # E: numpy.signedinteger[numpy.typing._64Bit]
+
+reveal_type(i8 << b) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 >> b) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 | b) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 ^ b) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 & b) # E: numpy.signedinteger[numpy.typing._64Bit]
+
+reveal_type(u8 << u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 >> u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 | u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 ^ u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 & u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+
+reveal_type(u8 << AR) # E: Any
+reveal_type(u8 >> AR) # E: Any
+reveal_type(u8 | AR) # E: Any
+reveal_type(u8 ^ AR) # E: Any
+reveal_type(u8 & AR) # E: Any
+
+reveal_type(u4 << u4) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+reveal_type(u4 >> u4) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+reveal_type(u4 | u4) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+reveal_type(u4 ^ u4) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+reveal_type(u4 & u4) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+
+reveal_type(u4 << i4) # E: numpy.signedinteger[Any]
+reveal_type(u4 >> i4) # E: numpy.signedinteger[Any]
+reveal_type(u4 | i4) # E: numpy.signedinteger[Any]
+reveal_type(u4 ^ i4) # E: numpy.signedinteger[Any]
+reveal_type(u4 & i4) # E: numpy.signedinteger[Any]
+
+reveal_type(u4 << i) # E: numpy.signedinteger[Any]
+reveal_type(u4 >> i) # E: numpy.signedinteger[Any]
+reveal_type(u4 | i) # E: numpy.signedinteger[Any]
+reveal_type(u4 ^ i) # E: numpy.signedinteger[Any]
+reveal_type(u4 & i) # E: numpy.signedinteger[Any]
+
+reveal_type(u8 << b_) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 >> b_) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 | b_) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 ^ b_) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 & b_) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+
+reveal_type(u8 << b) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 >> b) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 | b) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 ^ b) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(u8 & b) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+
+reveal_type(b_ << b_) # E: numpy.signedinteger[numpy.typing._8Bit]
+reveal_type(b_ >> b_) # E: numpy.signedinteger[numpy.typing._8Bit]
+reveal_type(b_ | b_) # E: numpy.bool_
+reveal_type(b_ ^ b_) # E: numpy.bool_
+reveal_type(b_ & b_) # E: numpy.bool_
+
+reveal_type(b_ << AR) # E: Any
+reveal_type(b_ >> AR) # E: Any
+reveal_type(b_ | AR) # E: Any
+reveal_type(b_ ^ AR) # E: Any
+reveal_type(b_ & AR) # E: Any
+
+reveal_type(b_ << b) # E: numpy.signedinteger[numpy.typing._8Bit]
+reveal_type(b_ >> b) # E: numpy.signedinteger[numpy.typing._8Bit]
+reveal_type(b_ | b) # E: numpy.bool_
+reveal_type(b_ ^ b) # E: numpy.bool_
+reveal_type(b_ & b) # E: numpy.bool_
+
+reveal_type(b_ << i) # E: numpy.signedinteger[Any]
+reveal_type(b_ >> i) # E: numpy.signedinteger[Any]
+reveal_type(b_ | i) # E: numpy.signedinteger[Any]
+reveal_type(b_ ^ i) # E: numpy.signedinteger[Any]
+reveal_type(b_ & i) # E: numpy.signedinteger[Any]
+
+reveal_type(~i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(~i4) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(~u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(~u4) # E: numpy.unsignedinteger[numpy.typing._32Bit]
+reveal_type(~b_) # E: numpy.bool_
+reveal_type(~AR) # E: Any
diff --git a/numpy/typing/tests/data/reveal/comparisons.py b/numpy/typing/tests/data/reveal/comparisons.py
new file mode 100644
index 000000000..507f713c7
--- /dev/null
+++ b/numpy/typing/tests/data/reveal/comparisons.py
@@ -0,0 +1,247 @@
+import numpy as np
+
+c16 = np.complex128()
+f8 = np.float64()
+i8 = np.int64()
+u8 = np.uint64()
+
+c8 = np.complex64()
+f4 = np.float32()
+i4 = np.int32()
+u4 = np.uint32()
+
+dt = np.datetime64(0, "D")
+td = np.timedelta64(0, "D")
+
+b_ = np.bool_()
+
+b = bool()
+c = complex()
+f = float()
+i = int()
+
+AR = np.array([0], dtype=np.int64)
+AR.setflags(write=False)
+
+SEQ = (0, 1, 2, 3, 4)
+
+# Time structures
+
+reveal_type(dt > dt) # E: numpy.bool_
+
+reveal_type(td > td) # E: numpy.bool_
+reveal_type(td > i) # E: numpy.bool_
+reveal_type(td > i4) # E: numpy.bool_
+reveal_type(td > i8) # E: numpy.bool_
+reveal_type(td > AR) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(td > SEQ) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+# boolean
+
+reveal_type(b_ > b) # E: numpy.bool_
+reveal_type(b_ > b_) # E: numpy.bool_
+reveal_type(b_ > i) # E: numpy.bool_
+reveal_type(b_ > i8) # E: numpy.bool_
+reveal_type(b_ > i4) # E: numpy.bool_
+reveal_type(b_ > u8) # E: numpy.bool_
+reveal_type(b_ > u4) # E: numpy.bool_
+reveal_type(b_ > f) # E: numpy.bool_
+reveal_type(b_ > f8) # E: numpy.bool_
+reveal_type(b_ > f4) # E: numpy.bool_
+reveal_type(b_ > c) # E: numpy.bool_
+reveal_type(b_ > c16) # E: numpy.bool_
+reveal_type(b_ > c8) # E: numpy.bool_
+reveal_type(b_ > AR) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(b_ > SEQ) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+# Complex
+
+reveal_type(c16 > c16) # E: numpy.bool_
+reveal_type(c16 > f8) # E: numpy.bool_
+reveal_type(c16 > i8) # E: numpy.bool_
+reveal_type(c16 > c8) # E: numpy.bool_
+reveal_type(c16 > f4) # E: numpy.bool_
+reveal_type(c16 > i4) # E: numpy.bool_
+reveal_type(c16 > b_) # E: numpy.bool_
+reveal_type(c16 > b) # E: numpy.bool_
+reveal_type(c16 > c) # E: numpy.bool_
+reveal_type(c16 > f) # E: numpy.bool_
+reveal_type(c16 > i) # E: numpy.bool_
+reveal_type(c16 > AR) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(c16 > SEQ) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+reveal_type(c16 > c16) # E: numpy.bool_
+reveal_type(f8 > c16) # E: numpy.bool_
+reveal_type(i8 > c16) # E: numpy.bool_
+reveal_type(c8 > c16) # E: numpy.bool_
+reveal_type(f4 > c16) # E: numpy.bool_
+reveal_type(i4 > c16) # E: numpy.bool_
+reveal_type(b_ > c16) # E: numpy.bool_
+reveal_type(b > c16) # E: numpy.bool_
+reveal_type(c > c16) # E: numpy.bool_
+reveal_type(f > c16) # E: numpy.bool_
+reveal_type(i > c16) # E: numpy.bool_
+reveal_type(AR > c16) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(SEQ > c16) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+reveal_type(c8 > c16) # E: numpy.bool_
+reveal_type(c8 > f8) # E: numpy.bool_
+reveal_type(c8 > i8) # E: numpy.bool_
+reveal_type(c8 > c8) # E: numpy.bool_
+reveal_type(c8 > f4) # E: numpy.bool_
+reveal_type(c8 > i4) # E: numpy.bool_
+reveal_type(c8 > b_) # E: numpy.bool_
+reveal_type(c8 > b) # E: numpy.bool_
+reveal_type(c8 > c) # E: numpy.bool_
+reveal_type(c8 > f) # E: numpy.bool_
+reveal_type(c8 > i) # E: numpy.bool_
+reveal_type(c8 > AR) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(c8 > SEQ) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+reveal_type(c16 > c8) # E: numpy.bool_
+reveal_type(f8 > c8) # E: numpy.bool_
+reveal_type(i8 > c8) # E: numpy.bool_
+reveal_type(c8 > c8) # E: numpy.bool_
+reveal_type(f4 > c8) # E: numpy.bool_
+reveal_type(i4 > c8) # E: numpy.bool_
+reveal_type(b_ > c8) # E: numpy.bool_
+reveal_type(b > c8) # E: numpy.bool_
+reveal_type(c > c8) # E: numpy.bool_
+reveal_type(f > c8) # E: numpy.bool_
+reveal_type(i > c8) # E: numpy.bool_
+reveal_type(AR > c8) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(SEQ > c8) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+# Float
+
+reveal_type(f8 > f8) # E: numpy.bool_
+reveal_type(f8 > i8) # E: numpy.bool_
+reveal_type(f8 > f4) # E: numpy.bool_
+reveal_type(f8 > i4) # E: numpy.bool_
+reveal_type(f8 > b_) # E: numpy.bool_
+reveal_type(f8 > b) # E: numpy.bool_
+reveal_type(f8 > c) # E: numpy.bool_
+reveal_type(f8 > f) # E: numpy.bool_
+reveal_type(f8 > i) # E: numpy.bool_
+reveal_type(f8 > AR) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(f8 > SEQ) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+reveal_type(f8 > f8) # E: numpy.bool_
+reveal_type(i8 > f8) # E: numpy.bool_
+reveal_type(f4 > f8) # E: numpy.bool_
+reveal_type(i4 > f8) # E: numpy.bool_
+reveal_type(b_ > f8) # E: numpy.bool_
+reveal_type(b > f8) # E: numpy.bool_
+reveal_type(c > f8) # E: numpy.bool_
+reveal_type(f > f8) # E: numpy.bool_
+reveal_type(i > f8) # E: numpy.bool_
+reveal_type(AR > f8) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(SEQ > f8) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+reveal_type(f4 > f8) # E: numpy.bool_
+reveal_type(f4 > i8) # E: numpy.bool_
+reveal_type(f4 > f4) # E: numpy.bool_
+reveal_type(f4 > i4) # E: numpy.bool_
+reveal_type(f4 > b_) # E: numpy.bool_
+reveal_type(f4 > b) # E: numpy.bool_
+reveal_type(f4 > c) # E: numpy.bool_
+reveal_type(f4 > f) # E: numpy.bool_
+reveal_type(f4 > i) # E: numpy.bool_
+reveal_type(f4 > AR) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(f4 > SEQ) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+reveal_type(f8 > f4) # E: numpy.bool_
+reveal_type(i8 > f4) # E: numpy.bool_
+reveal_type(f4 > f4) # E: numpy.bool_
+reveal_type(i4 > f4) # E: numpy.bool_
+reveal_type(b_ > f4) # E: numpy.bool_
+reveal_type(b > f4) # E: numpy.bool_
+reveal_type(c > f4) # E: numpy.bool_
+reveal_type(f > f4) # E: numpy.bool_
+reveal_type(i > f4) # E: numpy.bool_
+reveal_type(AR > f4) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(SEQ > f4) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+# Int
+
+reveal_type(i8 > i8) # E: numpy.bool_
+reveal_type(i8 > u8) # E: numpy.bool_
+reveal_type(i8 > i4) # E: numpy.bool_
+reveal_type(i8 > u4) # E: numpy.bool_
+reveal_type(i8 > b_) # E: numpy.bool_
+reveal_type(i8 > b) # E: numpy.bool_
+reveal_type(i8 > c) # E: numpy.bool_
+reveal_type(i8 > f) # E: numpy.bool_
+reveal_type(i8 > i) # E: numpy.bool_
+reveal_type(i8 > AR) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(i8 > SEQ) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+reveal_type(u8 > u8) # E: numpy.bool_
+reveal_type(u8 > i4) # E: numpy.bool_
+reveal_type(u8 > u4) # E: numpy.bool_
+reveal_type(u8 > b_) # E: numpy.bool_
+reveal_type(u8 > b) # E: numpy.bool_
+reveal_type(u8 > c) # E: numpy.bool_
+reveal_type(u8 > f) # E: numpy.bool_
+reveal_type(u8 > i) # E: numpy.bool_
+reveal_type(u8 > AR) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(u8 > SEQ) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+reveal_type(i8 > i8) # E: numpy.bool_
+reveal_type(u8 > i8) # E: numpy.bool_
+reveal_type(i4 > i8) # E: numpy.bool_
+reveal_type(u4 > i8) # E: numpy.bool_
+reveal_type(b_ > i8) # E: numpy.bool_
+reveal_type(b > i8) # E: numpy.bool_
+reveal_type(c > i8) # E: numpy.bool_
+reveal_type(f > i8) # E: numpy.bool_
+reveal_type(i > i8) # E: numpy.bool_
+reveal_type(AR > i8) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(SEQ > i8) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+reveal_type(u8 > u8) # E: numpy.bool_
+reveal_type(i4 > u8) # E: numpy.bool_
+reveal_type(u4 > u8) # E: numpy.bool_
+reveal_type(b_ > u8) # E: numpy.bool_
+reveal_type(b > u8) # E: numpy.bool_
+reveal_type(c > u8) # E: numpy.bool_
+reveal_type(f > u8) # E: numpy.bool_
+reveal_type(i > u8) # E: numpy.bool_
+reveal_type(AR > u8) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(SEQ > u8) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+reveal_type(i4 > i8) # E: numpy.bool_
+reveal_type(i4 > i4) # E: numpy.bool_
+reveal_type(i4 > i) # E: numpy.bool_
+reveal_type(i4 > b_) # E: numpy.bool_
+reveal_type(i4 > b) # E: numpy.bool_
+reveal_type(i4 > AR) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(i4 > SEQ) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+reveal_type(u4 > i8) # E: numpy.bool_
+reveal_type(u4 > i4) # E: numpy.bool_
+reveal_type(u4 > u8) # E: numpy.bool_
+reveal_type(u4 > u4) # E: numpy.bool_
+reveal_type(u4 > i) # E: numpy.bool_
+reveal_type(u4 > b_) # E: numpy.bool_
+reveal_type(u4 > b) # E: numpy.bool_
+reveal_type(u4 > AR) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(u4 > SEQ) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+reveal_type(i8 > i4) # E: numpy.bool_
+reveal_type(i4 > i4) # E: numpy.bool_
+reveal_type(i > i4) # E: numpy.bool_
+reveal_type(b_ > i4) # E: numpy.bool_
+reveal_type(b > i4) # E: numpy.bool_
+reveal_type(AR > i4) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(SEQ > i4) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+
+reveal_type(i8 > u4) # E: numpy.bool_
+reveal_type(i4 > u4) # E: numpy.bool_
+reveal_type(u8 > u4) # E: numpy.bool_
+reveal_type(u4 > u4) # E: numpy.bool_
+reveal_type(b_ > u4) # E: numpy.bool_
+reveal_type(b > u4) # E: numpy.bool_
+reveal_type(i > u4) # E: numpy.bool_
+reveal_type(AR > u4) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
+reveal_type(SEQ > u4) # E: Union[numpy.ndarray[Any, Any], numpy.bool_]
diff --git a/numpy/tests/typing/reveal/constants.py b/numpy/typing/tests/data/reveal/constants.py
index 8e00810bd..b2382e861 100644
--- a/numpy/tests/typing/reveal/constants.py
+++ b/numpy/typing/tests/data/reveal/constants.py
@@ -40,5 +40,13 @@ reveal_type(np.SHIFT_OVERFLOW) # E: int
reveal_type(np.SHIFT_UNDERFLOW) # E: int
reveal_type(np.UFUNC_BUFSIZE_DEFAULT) # E: int
reveal_type(np.WRAP) # E: int
-reveal_type(np.little_endian) # E: int
reveal_type(np.tracemalloc_domain) # E: int
+
+reveal_type(np.little_endian) # E: bool
+reveal_type(np.True_) # E: numpy.bool_
+reveal_type(np.False_) # E: numpy.bool_
+
+reveal_type(np.UFUNC_PYVALS_NAME) # E: str
+
+reveal_type(np.sctypeDict) # E: dict
+reveal_type(np.sctypes) # E: TypedDict
diff --git a/numpy/typing/tests/data/reveal/dtype.py b/numpy/typing/tests/data/reveal/dtype.py
new file mode 100644
index 000000000..626a15270
--- /dev/null
+++ b/numpy/typing/tests/data/reveal/dtype.py
@@ -0,0 +1,41 @@
+import numpy as np
+
+dtype_obj: np.dtype[np.str_]
+
+reveal_type(np.dtype(np.float64)) # E: numpy.dtype[numpy.floating[numpy.typing._64Bit]]
+reveal_type(np.dtype(np.int64)) # E: numpy.dtype[numpy.signedinteger[numpy.typing._64Bit]]
+
+# String aliases
+reveal_type(np.dtype("float64")) # E: numpy.dtype[numpy.floating[numpy.typing._64Bit]]
+reveal_type(np.dtype("float32")) # E: numpy.dtype[numpy.floating[numpy.typing._32Bit]]
+reveal_type(np.dtype("int64")) # E: numpy.dtype[numpy.signedinteger[numpy.typing._64Bit]]
+reveal_type(np.dtype("int32")) # E: numpy.dtype[numpy.signedinteger[numpy.typing._32Bit]]
+reveal_type(np.dtype("bool")) # E: numpy.dtype[numpy.bool_]
+reveal_type(np.dtype("bytes")) # E: numpy.dtype[numpy.bytes_]
+reveal_type(np.dtype("str")) # E: numpy.dtype[numpy.str_]
+
+# Python types
+reveal_type(np.dtype(complex)) # E: numpy.dtype[numpy.complexfloating[numpy.typing._64Bit, numpy.typing._64Bit]]
+reveal_type(np.dtype(float)) # E: numpy.dtype[numpy.floating[numpy.typing._64Bit]]
+reveal_type(np.dtype(int)) # E: numpy.dtype
+reveal_type(np.dtype(bool)) # E: numpy.dtype[numpy.bool_]
+reveal_type(np.dtype(str)) # E: numpy.dtype[numpy.str_]
+reveal_type(np.dtype(bytes)) # E: numpy.dtype[numpy.bytes_]
+
+# Special case for None
+reveal_type(np.dtype(None)) # E: numpy.dtype[numpy.floating[numpy.typing._64Bit]]
+
+# Dtypes of dtypes
+reveal_type(np.dtype(np.dtype(np.float64))) # E: numpy.dtype[numpy.floating[numpy.typing._64Bit]]
+
+# Parameterized dtypes
+reveal_type(np.dtype("S8")) # E: numpy.dtype
+
+# Void
+reveal_type(np.dtype(("U", 10))) # E: numpy.dtype[numpy.void]
+
+# Methods and attributes
+reveal_type(dtype_obj.base) # E: numpy.dtype[numpy.str_]
+reveal_type(dtype_obj.subdtype) # E: Union[Tuple[numpy.dtype[numpy.str_], builtins.tuple[builtins.int]], None]
+reveal_type(dtype_obj.newbyteorder()) # E: numpy.dtype[numpy.str_]
+reveal_type(dtype_obj.type) # E: Type[numpy.str_]
diff --git a/numpy/typing/tests/data/reveal/flatiter.py b/numpy/typing/tests/data/reveal/flatiter.py
new file mode 100644
index 000000000..221101ebb
--- /dev/null
+++ b/numpy/typing/tests/data/reveal/flatiter.py
@@ -0,0 +1,17 @@
+from typing import Any
+import numpy as np
+
+a: np.flatiter[np.ndarray[Any, np.dtype[np.str_]]]
+
+reveal_type(a.base) # E: numpy.ndarray[Any, numpy.dtype[numpy.str_]]
+reveal_type(a.copy()) # E: numpy.ndarray[Any, numpy.dtype[numpy.str_]]
+reveal_type(a.coords) # E: tuple[builtins.int]
+reveal_type(a.index) # E: int
+reveal_type(iter(a)) # E: Iterator[numpy.str_]
+reveal_type(next(a)) # E: numpy.str_
+reveal_type(a[0]) # E: numpy.str_
+reveal_type(a[[0, 1, 2]]) # E: numpy.ndarray[Any, numpy.dtype[numpy.str_]]
+reveal_type(a[...]) # E: numpy.ndarray[Any, numpy.dtype[numpy.str_]]
+reveal_type(a[:]) # E: numpy.ndarray[Any, numpy.dtype[numpy.str_]]
+reveal_type(a.__array__()) # E: numpy.ndarray[Any, numpy.dtype[numpy.str_]]
+reveal_type(a.__array__(np.float64)) # E: numpy.ndarray[Any, numpy.dtype[Any]]
diff --git a/numpy/typing/tests/data/reveal/fromnumeric.py b/numpy/typing/tests/data/reveal/fromnumeric.py
new file mode 100644
index 000000000..2972fa1af
--- /dev/null
+++ b/numpy/typing/tests/data/reveal/fromnumeric.py
@@ -0,0 +1,278 @@
+"""Tests for :mod:`numpy.core.fromnumeric`."""
+
+import numpy as np
+
+A = np.array(True, ndmin=2, dtype=bool)
+B = np.array(1.0, ndmin=2, dtype=np.float32)
+A.setflags(write=False)
+B.setflags(write=False)
+
+a = np.bool_(True)
+b = np.float32(1.0)
+c = 1.0
+d = np.array(1.0, dtype=np.float32) # writeable
+
+reveal_type(np.take(a, 0)) # E: numpy.bool_
+reveal_type(np.take(b, 0)) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(
+ np.take(c, 0) # E: Union[numpy.generic, datetime.datetime, datetime.timedelta]
+)
+reveal_type(
+ np.take(A, 0) # E: Union[numpy.generic, datetime.datetime, datetime.timedelta]
+)
+reveal_type(
+ np.take(B, 0) # E: Union[numpy.generic, datetime.datetime, datetime.timedelta]
+)
+reveal_type(
+ np.take( # E: Union[Union[numpy.generic, datetime.datetime, datetime.timedelta], numpy.ndarray[Any, Any]]
+ A, [0]
+ )
+)
+reveal_type(
+ np.take( # E: Union[Union[numpy.generic, datetime.datetime, datetime.timedelta], numpy.ndarray[Any, Any]]
+ B, [0]
+ )
+)
+
+reveal_type(np.reshape(a, 1)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.reshape(b, 1)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.reshape(c, 1)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.reshape(A, 1)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.reshape(B, 1)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.choose(a, [True, True])) # E: numpy.bool_
+reveal_type(np.choose(A, [True, True])) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.repeat(a, 1)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.repeat(b, 1)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.repeat(c, 1)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.repeat(A, 1)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.repeat(B, 1)) # E: numpy.ndarray[Any, Any]
+
+# TODO: Add tests for np.put()
+
+reveal_type(np.swapaxes(A, 0, 0)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.swapaxes(B, 0, 0)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.transpose(a)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.transpose(b)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.transpose(c)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.transpose(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.transpose(B)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.partition(a, 0, axis=None)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.partition(b, 0, axis=None)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.partition(c, 0, axis=None)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.partition(A, 0)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.partition(B, 0)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.argpartition(a, 0)) # E: numpy.integer[Any]
+reveal_type(np.argpartition(b, 0)) # E: numpy.integer[Any]
+reveal_type(np.argpartition(c, 0)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.argpartition(A, 0)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.argpartition(B, 0)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.sort(A, 0)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.sort(B, 0)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.argsort(A, 0)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.argsort(B, 0)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.argmax(A)) # E: numpy.integer[Any]
+reveal_type(np.argmax(B)) # E: numpy.integer[Any]
+reveal_type(np.argmax(A, axis=0)) # E: Union[numpy.integer[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.argmax(B, axis=0)) # E: Union[numpy.integer[Any], numpy.ndarray[Any, Any]]
+
+reveal_type(np.argmin(A)) # E: numpy.integer[Any]
+reveal_type(np.argmin(B)) # E: numpy.integer[Any]
+reveal_type(np.argmin(A, axis=0)) # E: Union[numpy.integer[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.argmin(B, axis=0)) # E: Union[numpy.integer[Any], numpy.ndarray[Any, Any]]
+
+reveal_type(np.searchsorted(A[0], 0)) # E: numpy.integer[Any]
+reveal_type(np.searchsorted(B[0], 0)) # E: numpy.integer[Any]
+reveal_type(np.searchsorted(A[0], [0])) # E: numpy.ndarray[Any, Any]
+reveal_type(np.searchsorted(B[0], [0])) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.resize(a, (5, 5))) # E: numpy.ndarray[Any, Any]
+reveal_type(np.resize(b, (5, 5))) # E: numpy.ndarray[Any, Any]
+reveal_type(np.resize(c, (5, 5))) # E: numpy.ndarray[Any, Any]
+reveal_type(np.resize(A, (5, 5))) # E: numpy.ndarray[Any, Any]
+reveal_type(np.resize(B, (5, 5))) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.squeeze(a)) # E: numpy.bool_
+reveal_type(np.squeeze(b)) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(np.squeeze(c)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.squeeze(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.squeeze(B)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.diagonal(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.diagonal(B)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.trace(A)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.trace(B)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+
+reveal_type(np.ravel(a)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.ravel(b)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.ravel(c)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.ravel(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.ravel(B)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.nonzero(a)) # E: tuple[numpy.ndarray[Any, Any]]
+reveal_type(np.nonzero(b)) # E: tuple[numpy.ndarray[Any, Any]]
+reveal_type(np.nonzero(c)) # E: tuple[numpy.ndarray[Any, Any]]
+reveal_type(np.nonzero(A)) # E: tuple[numpy.ndarray[Any, Any]]
+reveal_type(np.nonzero(B)) # E: tuple[numpy.ndarray[Any, Any]]
+
+reveal_type(np.shape(a)) # E: tuple[builtins.int]
+reveal_type(np.shape(b)) # E: tuple[builtins.int]
+reveal_type(np.shape(c)) # E: tuple[builtins.int]
+reveal_type(np.shape(A)) # E: tuple[builtins.int]
+reveal_type(np.shape(B)) # E: tuple[builtins.int]
+
+reveal_type(np.compress([True], a)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.compress([True], b)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.compress([True], c)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.compress([True], A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.compress([True], B)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.clip(a, 0, 1.0)) # E: numpy.number[Any]
+reveal_type(np.clip(b, -1, 1)) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(np.clip(c, 0, 1)) # E: numpy.number[Any]
+reveal_type(np.clip(A, 0, 1)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.clip(B, 0, 1)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+
+reveal_type(np.sum(a)) # E: numpy.number[Any]
+reveal_type(np.sum(b)) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(np.sum(c)) # E: numpy.number[Any]
+reveal_type(np.sum(A)) # E: numpy.number[Any]
+reveal_type(np.sum(B)) # E: numpy.number[Any]
+reveal_type(np.sum(A, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.sum(B, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+
+reveal_type(np.all(a)) # E: numpy.bool_
+reveal_type(np.all(b)) # E: numpy.bool_
+reveal_type(np.all(c)) # E: numpy.bool_
+reveal_type(np.all(A)) # E: numpy.bool_
+reveal_type(np.all(B)) # E: numpy.bool_
+reveal_type(np.all(A, axis=0)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+reveal_type(np.all(B, axis=0)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+reveal_type(np.all(A, keepdims=True)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+reveal_type(np.all(B, keepdims=True)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+
+reveal_type(np.any(a)) # E: numpy.bool_
+reveal_type(np.any(b)) # E: numpy.bool_
+reveal_type(np.any(c)) # E: numpy.bool_
+reveal_type(np.any(A)) # E: numpy.bool_
+reveal_type(np.any(B)) # E: numpy.bool_
+reveal_type(np.any(A, axis=0)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+reveal_type(np.any(B, axis=0)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+reveal_type(np.any(A, keepdims=True)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+reveal_type(np.any(B, keepdims=True)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+
+reveal_type(np.cumsum(a)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.cumsum(b)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.cumsum(c)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.cumsum(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.cumsum(B)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.ptp(a)) # E: numpy.number[Any]
+reveal_type(np.ptp(b)) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(np.ptp(c)) # E: numpy.number[Any]
+reveal_type(np.ptp(A)) # E: numpy.number[Any]
+reveal_type(np.ptp(B)) # E: numpy.number[Any]
+reveal_type(np.ptp(A, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.ptp(B, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.ptp(A, keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.ptp(B, keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+
+reveal_type(np.amax(a)) # E: numpy.number[Any]
+reveal_type(np.amax(b)) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(np.amax(c)) # E: numpy.number[Any]
+reveal_type(np.amax(A)) # E: numpy.number[Any]
+reveal_type(np.amax(B)) # E: numpy.number[Any]
+reveal_type(np.amax(A, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.amax(B, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.amax(A, keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.amax(B, keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+
+reveal_type(np.amin(a)) # E: numpy.number[Any]
+reveal_type(np.amin(b)) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(np.amin(c)) # E: numpy.number[Any]
+reveal_type(np.amin(A)) # E: numpy.number[Any]
+reveal_type(np.amin(B)) # E: numpy.number[Any]
+reveal_type(np.amin(A, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.amin(B, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.amin(A, keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.amin(B, keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+
+reveal_type(np.prod(a)) # E: numpy.number[Any]
+reveal_type(np.prod(b)) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(np.prod(c)) # E: numpy.number[Any]
+reveal_type(np.prod(A)) # E: numpy.number[Any]
+reveal_type(np.prod(B)) # E: numpy.number[Any]
+reveal_type(np.prod(A, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.prod(B, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.prod(A, keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.prod(B, keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.prod(b, out=d)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.prod(B, out=d)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.cumprod(a)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.cumprod(b)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.cumprod(c)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.cumprod(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.cumprod(B)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.ndim(a)) # E: int
+reveal_type(np.ndim(b)) # E: int
+reveal_type(np.ndim(c)) # E: int
+reveal_type(np.ndim(A)) # E: int
+reveal_type(np.ndim(B)) # E: int
+
+reveal_type(np.size(a)) # E: int
+reveal_type(np.size(b)) # E: int
+reveal_type(np.size(c)) # E: int
+reveal_type(np.size(A)) # E: int
+reveal_type(np.size(B)) # E: int
+
+reveal_type(np.around(a)) # E: numpy.number[Any]
+reveal_type(np.around(b)) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(np.around(c)) # E: numpy.number[Any]
+reveal_type(np.around(A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.around(B)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.mean(a)) # E: numpy.number[Any]
+reveal_type(np.mean(b)) # E: numpy.number[Any]
+reveal_type(np.mean(c)) # E: numpy.number[Any]
+reveal_type(np.mean(A)) # E: numpy.number[Any]
+reveal_type(np.mean(B)) # E: numpy.number[Any]
+reveal_type(np.mean(A, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.mean(B, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.mean(A, keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.mean(B, keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.mean(b, out=d)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.mean(B, out=d)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.std(a)) # E: numpy.number[Any]
+reveal_type(np.std(b)) # E: numpy.number[Any]
+reveal_type(np.std(c)) # E: numpy.number[Any]
+reveal_type(np.std(A)) # E: numpy.number[Any]
+reveal_type(np.std(B)) # E: numpy.number[Any]
+reveal_type(np.std(A, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.std(B, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.std(A, keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.std(B, keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.std(b, out=d)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.std(B, out=d)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.var(a)) # E: numpy.number[Any]
+reveal_type(np.var(b)) # E: numpy.number[Any]
+reveal_type(np.var(c)) # E: numpy.number[Any]
+reveal_type(np.var(A)) # E: numpy.number[Any]
+reveal_type(np.var(B)) # E: numpy.number[Any]
+reveal_type(np.var(A, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.var(B, axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.var(A, keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.var(B, keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.var(b, out=d)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.var(B, out=d)) # E: numpy.ndarray[Any, Any]
diff --git a/numpy/typing/tests/data/reveal/mod.py b/numpy/typing/tests/data/reveal/mod.py
new file mode 100644
index 000000000..4292041f8
--- /dev/null
+++ b/numpy/typing/tests/data/reveal/mod.py
@@ -0,0 +1,149 @@
+import numpy as np
+
+f8 = np.float64()
+i8 = np.int64()
+u8 = np.uint64()
+
+f4 = np.float32()
+i4 = np.int32()
+u4 = np.uint32()
+
+td = np.timedelta64(0, "D")
+b_ = np.bool_()
+
+b = bool()
+f = float()
+i = int()
+
+AR = np.array([1], dtype=np.bool_)
+AR.setflags(write=False)
+
+AR2 = np.array([1], dtype=np.timedelta64)
+AR2.setflags(write=False)
+
+# Time structures
+
+reveal_type(td % td) # E: numpy.timedelta64
+reveal_type(AR2 % td) # E: Any
+reveal_type(td % AR2) # E: Any
+
+reveal_type(divmod(td, td)) # E: Tuple[numpy.signedinteger[numpy.typing._64Bit], numpy.timedelta64]
+reveal_type(divmod(AR2, td)) # E: Tuple[Any, Any]
+reveal_type(divmod(td, AR2)) # E: Tuple[Any, Any]
+
+# Bool
+
+reveal_type(b_ % b) # E: numpy.signedinteger[numpy.typing._8Bit]
+reveal_type(b_ % i) # E: numpy.signedinteger[Any]
+reveal_type(b_ % f) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b_ % b_) # E: numpy.signedinteger[numpy.typing._8Bit]
+reveal_type(b_ % i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(b_ % u8) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(b_ % f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b_ % AR) # E: Any
+
+reveal_type(divmod(b_, b)) # E: Tuple[numpy.signedinteger[numpy.typing._8Bit], numpy.signedinteger[numpy.typing._8Bit]]
+reveal_type(divmod(b_, i)) # E: Tuple[numpy.signedinteger[Any], numpy.signedinteger[Any]]
+reveal_type(divmod(b_, f)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(b_, b_)) # E: Tuple[numpy.signedinteger[numpy.typing._8Bit], numpy.signedinteger[numpy.typing._8Bit]]
+reveal_type(divmod(b_, i8)) # E: Tuple[numpy.signedinteger[numpy.typing._64Bit], numpy.signedinteger[numpy.typing._64Bit]]
+reveal_type(divmod(b_, u8)) # E: Tuple[numpy.unsignedinteger[numpy.typing._64Bit], numpy.unsignedinteger[numpy.typing._64Bit]]
+reveal_type(divmod(b_, f8)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(b_, AR)) # E: Tuple[Any, Any]
+
+reveal_type(b % b_) # E: numpy.signedinteger[numpy.typing._8Bit]
+reveal_type(i % b_) # E: numpy.signedinteger[Any]
+reveal_type(f % b_) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(b_ % b_) # E: numpy.signedinteger[numpy.typing._8Bit]
+reveal_type(i8 % b_) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(u8 % b_) # E: numpy.unsignedinteger[numpy.typing._64Bit]
+reveal_type(f8 % b_) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(AR % b_) # E: Any
+
+reveal_type(divmod(b, b_)) # E: Tuple[numpy.signedinteger[numpy.typing._8Bit], numpy.signedinteger[numpy.typing._8Bit]]
+reveal_type(divmod(i, b_)) # E: Tuple[numpy.signedinteger[Any], numpy.signedinteger[Any]]
+reveal_type(divmod(f, b_)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(b_, b_)) # E: Tuple[numpy.signedinteger[numpy.typing._8Bit], numpy.signedinteger[numpy.typing._8Bit]]
+reveal_type(divmod(i8, b_)) # E: Tuple[numpy.signedinteger[numpy.typing._64Bit], numpy.signedinteger[numpy.typing._64Bit]]
+reveal_type(divmod(u8, b_)) # E: Tuple[numpy.unsignedinteger[numpy.typing._64Bit], numpy.unsignedinteger[numpy.typing._64Bit]]
+reveal_type(divmod(f8, b_)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(AR, b_)) # E: Tuple[Any, Any]
+
+# int
+
+reveal_type(i8 % b) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 % i) # E: numpy.signedinteger[Any]
+reveal_type(i8 % f) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i8 % i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i8 % f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i4 % i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i4 % f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i4 % i4) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(i4 % f4) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(i8 % AR) # E: Any
+
+reveal_type(divmod(i8, b)) # E: Tuple[numpy.signedinteger[numpy.typing._64Bit], numpy.signedinteger[numpy.typing._64Bit]]
+reveal_type(divmod(i8, i)) # E: Tuple[numpy.signedinteger[Any], numpy.signedinteger[Any]]
+reveal_type(divmod(i8, f)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(i8, i8)) # E: Tuple[numpy.signedinteger[numpy.typing._64Bit], numpy.signedinteger[numpy.typing._64Bit]]
+reveal_type(divmod(i8, f8)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(i8, i4)) # E: Tuple[numpy.signedinteger[numpy.typing._64Bit], numpy.signedinteger[numpy.typing._64Bit]]
+reveal_type(divmod(i8, f4)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(i4, i4)) # E: Tuple[numpy.signedinteger[numpy.typing._32Bit], numpy.signedinteger[numpy.typing._32Bit]]
+reveal_type(divmod(i4, f4)) # E: Tuple[numpy.floating[numpy.typing._32Bit], numpy.floating[numpy.typing._32Bit]]
+reveal_type(divmod(i8, AR)) # E: Tuple[Any, Any]
+
+reveal_type(b % i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(i % i8) # E: numpy.signedinteger[Any]
+reveal_type(f % i8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i8 % i8) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(f8 % i8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i8 % i4) # E: numpy.signedinteger[numpy.typing._64Bit]
+reveal_type(f8 % i4) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i4 % i4) # E: numpy.signedinteger[numpy.typing._32Bit]
+reveal_type(f4 % i4) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(AR % i8) # E: Any
+
+reveal_type(divmod(b, i8)) # E: Tuple[numpy.signedinteger[numpy.typing._64Bit], numpy.signedinteger[numpy.typing._64Bit]]
+reveal_type(divmod(i, i8)) # E: Tuple[numpy.signedinteger[Any], numpy.signedinteger[Any]]
+reveal_type(divmod(f, i8)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(i8, i8)) # E: Tuple[numpy.signedinteger[numpy.typing._64Bit], numpy.signedinteger[numpy.typing._64Bit]]
+reveal_type(divmod(f8, i8)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(i4, i8)) # E: Tuple[numpy.signedinteger[numpy.typing._64Bit], numpy.signedinteger[numpy.typing._64Bit]]
+reveal_type(divmod(f4, i8)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(i4, i4)) # E: Tuple[numpy.signedinteger[numpy.typing._32Bit], numpy.signedinteger[numpy.typing._32Bit]]
+reveal_type(divmod(f4, i4)) # E: Tuple[numpy.floating[numpy.typing._32Bit], numpy.floating[numpy.typing._32Bit]]
+reveal_type(divmod(AR, i8)) # E: Tuple[Any, Any]
+
+# float
+
+reveal_type(f8 % b) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f8 % i) # E: numpy.floating[Any]
+reveal_type(f8 % f) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f8 % f4)  # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f4 % f4) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(f8 % AR) # E: Any
+
+reveal_type(divmod(f8, b)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(f8, i)) # E: Tuple[numpy.floating[Any], numpy.floating[Any]]
+reveal_type(divmod(f8, f)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(f8, f8)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(f8, f4)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(f4, f4)) # E: Tuple[numpy.floating[numpy.typing._32Bit], numpy.floating[numpy.typing._32Bit]]
+reveal_type(divmod(f8, AR)) # E: Tuple[Any, Any]
+
+reveal_type(b % f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(i % f8) # E: numpy.floating[Any]
+reveal_type(f % f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f8 % f8) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f4 % f8)  # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(f4 % f4) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(AR % f8) # E: Any
+
+reveal_type(divmod(b, f8)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(i, f8)) # E: Tuple[numpy.floating[Any], numpy.floating[Any]]
+reveal_type(divmod(f, f8)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(f8, f8)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(f4, f8)) # E: Tuple[numpy.floating[numpy.typing._64Bit], numpy.floating[numpy.typing._64Bit]]
+reveal_type(divmod(f4, f4)) # E: Tuple[numpy.floating[numpy.typing._32Bit], numpy.floating[numpy.typing._32Bit]]
+reveal_type(divmod(AR, f8)) # E: Tuple[Any, Any]
diff --git a/numpy/typing/tests/data/reveal/modules.py b/numpy/typing/tests/data/reveal/modules.py
new file mode 100644
index 000000000..406463152
--- /dev/null
+++ b/numpy/typing/tests/data/reveal/modules.py
@@ -0,0 +1,20 @@
+import numpy as np
+
+reveal_type(np) # E: ModuleType
+
+reveal_type(np.char) # E: ModuleType
+reveal_type(np.ctypeslib) # E: ModuleType
+reveal_type(np.emath) # E: ModuleType
+reveal_type(np.fft) # E: ModuleType
+reveal_type(np.lib) # E: ModuleType
+reveal_type(np.linalg) # E: ModuleType
+reveal_type(np.ma) # E: ModuleType
+reveal_type(np.matrixlib) # E: ModuleType
+reveal_type(np.polynomial) # E: ModuleType
+reveal_type(np.random) # E: ModuleType
+reveal_type(np.rec) # E: ModuleType
+reveal_type(np.testing) # E: ModuleType
+reveal_type(np.version) # E: ModuleType
+
+# TODO: Remove when annotations have been added to `np.testing.assert_equal`
+reveal_type(np.testing.assert_equal) # E: Any
diff --git a/numpy/typing/tests/data/reveal/nbit_base_example.py b/numpy/typing/tests/data/reveal/nbit_base_example.py
new file mode 100644
index 000000000..0c4c53f9b
--- /dev/null
+++ b/numpy/typing/tests/data/reveal/nbit_base_example.py
@@ -0,0 +1,18 @@
+from typing import TypeVar, Union
+import numpy as np
+import numpy.typing as npt
+
+T = TypeVar("T", bound=npt.NBitBase)
+
+def add(a: np.floating[T], b: np.integer[T]) -> np.floating[T]:
+ return a + b
+
+i8: np.int64
+i4: np.int32
+f8: np.float64
+f4: np.float32
+
+reveal_type(add(f8, i8)) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(add(f4, i8)) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(add(f8, i4)) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(add(f4, i4)) # E: numpy.floating[numpy.typing._32Bit]
diff --git a/numpy/tests/typing/reveal/ndarray_conversion.py b/numpy/typing/tests/data/reveal/ndarray_conversion.py
index 411adcf63..4ee637b75 100644
--- a/numpy/tests/typing/reveal/ndarray_conversion.py
+++ b/numpy/typing/tests/data/reveal/ndarray_conversion.py
@@ -9,7 +9,7 @@ reveal_type(nd.item(0, 1)) # E: Any
reveal_type(nd.item((0, 1))) # E: Any
# tolist
-reveal_type(nd.tolist()) # E: builtins.list[Any]
+reveal_type(nd.tolist()) # E: Any
# itemset does not return a value
# tostring is pretty simple
diff --git a/numpy/typing/tests/data/reveal/ndarray_misc.py b/numpy/typing/tests/data/reveal/ndarray_misc.py
new file mode 100644
index 000000000..3e640b3ba
--- /dev/null
+++ b/numpy/typing/tests/data/reveal/ndarray_misc.py
@@ -0,0 +1,150 @@
+"""
+Tests for miscellaneous (non-magic) ``np.ndarray``/``np.generic`` methods.
+
+More extensive tests are performed for the methods'
+function-based counterpart in `../from_numeric.py`.
+
+"""
+
+import numpy as np
+
+class SubClass(np.ndarray): ...
+
+f8: np.float64
+A: np.ndarray
+B: SubClass
+
+reveal_type(f8.all()) # E: numpy.bool_
+reveal_type(A.all()) # E: numpy.bool_
+reveal_type(A.all(axis=0)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+reveal_type(A.all(keepdims=True)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+reveal_type(A.all(out=B)) # E: SubClass
+
+reveal_type(f8.any()) # E: numpy.bool_
+reveal_type(A.any()) # E: numpy.bool_
+reveal_type(A.any(axis=0)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+reveal_type(A.any(keepdims=True)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+reveal_type(A.any(out=B)) # E: SubClass
+
+reveal_type(f8.argmax()) # E: numpy.signedinteger[Any]
+reveal_type(A.argmax()) # E: numpy.signedinteger[Any]
+reveal_type(A.argmax(axis=0)) # E: Union[numpy.signedinteger[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.argmax(out=B)) # E: SubClass
+
+reveal_type(f8.argmin()) # E: numpy.signedinteger[Any]
+reveal_type(A.argmin()) # E: numpy.signedinteger[Any]
+reveal_type(A.argmin(axis=0)) # E: Union[numpy.signedinteger[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.argmin(out=B)) # E: SubClass
+
+reveal_type(f8.argsort()) # E: numpy.ndarray[Any, Any]
+reveal_type(A.argsort()) # E: numpy.ndarray[Any, Any]
+
+reveal_type(f8.astype(np.int64).choose([()])) # E: numpy.ndarray[Any, Any]
+reveal_type(A.choose([0])) # E: numpy.ndarray[Any, Any]
+reveal_type(A.choose([0], out=B)) # E: SubClass
+
+reveal_type(f8.clip(1)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.clip(1)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.clip(None, 1)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.clip(1, out=B)) # E: SubClass
+reveal_type(A.clip(None, 1, out=B)) # E: SubClass
+
+reveal_type(f8.compress([0])) # E: numpy.ndarray[Any, Any]
+reveal_type(A.compress([0])) # E: numpy.ndarray[Any, Any]
+reveal_type(A.compress([0], out=B)) # E: SubClass
+
+reveal_type(f8.conj()) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(A.conj()) # E: numpy.ndarray[Any, Any]
+reveal_type(B.conj()) # E: SubClass
+
+reveal_type(f8.conjugate()) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(A.conjugate()) # E: numpy.ndarray[Any, Any]
+reveal_type(B.conjugate()) # E: SubClass
+
+reveal_type(f8.cumprod()) # E: numpy.ndarray[Any, Any]
+reveal_type(A.cumprod()) # E: numpy.ndarray[Any, Any]
+reveal_type(A.cumprod(out=B)) # E: SubClass
+
+reveal_type(f8.cumsum()) # E: numpy.ndarray[Any, Any]
+reveal_type(A.cumsum()) # E: numpy.ndarray[Any, Any]
+reveal_type(A.cumsum(out=B)) # E: SubClass
+
+reveal_type(f8.max()) # E: numpy.number[Any]
+reveal_type(A.max()) # E: numpy.number[Any]
+reveal_type(A.max(axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.max(keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.max(out=B)) # E: SubClass
+
+reveal_type(f8.mean()) # E: numpy.number[Any]
+reveal_type(A.mean()) # E: numpy.number[Any]
+reveal_type(A.mean(axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.mean(keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.mean(out=B)) # E: SubClass
+
+reveal_type(f8.min()) # E: numpy.number[Any]
+reveal_type(A.min()) # E: numpy.number[Any]
+reveal_type(A.min(axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.min(keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.min(out=B)) # E: SubClass
+
+reveal_type(f8.newbyteorder()) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(A.newbyteorder()) # E: numpy.ndarray[Any, Any]
+reveal_type(B.newbyteorder('|')) # E: SubClass
+
+reveal_type(f8.prod()) # E: numpy.number[Any]
+reveal_type(A.prod()) # E: numpy.number[Any]
+reveal_type(A.prod(axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.prod(keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.prod(out=B)) # E: SubClass
+
+reveal_type(f8.ptp()) # E: numpy.number[Any]
+reveal_type(A.ptp()) # E: numpy.number[Any]
+reveal_type(A.ptp(axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.ptp(keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.ptp(out=B)) # E: SubClass
+
+reveal_type(f8.round()) # E: numpy.floating[numpy.typing._64Bit]
+reveal_type(A.round()) # E: numpy.ndarray[Any, Any]
+reveal_type(A.round(out=B)) # E: SubClass
+
+reveal_type(f8.repeat(1)) # E: numpy.ndarray[Any, Any]
+reveal_type(A.repeat(1)) # E: numpy.ndarray[Any, Any]
+reveal_type(B.repeat(1)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(f8.std()) # E: numpy.number[Any]
+reveal_type(A.std()) # E: numpy.number[Any]
+reveal_type(A.std(axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.std(keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.std(out=B)) # E: SubClass
+
+reveal_type(f8.sum()) # E: numpy.number[Any]
+reveal_type(A.sum()) # E: numpy.number[Any]
+reveal_type(A.sum(axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.sum(keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.sum(out=B)) # E: SubClass
+
+reveal_type(f8.take(0)) # E: numpy.generic
+reveal_type(A.take(0)) # E: numpy.generic
+reveal_type(A.take([0])) # E: numpy.ndarray[Any, Any]
+reveal_type(A.take(0, out=B)) # E: SubClass
+reveal_type(A.take([0], out=B)) # E: SubClass
+
+reveal_type(f8.var()) # E: numpy.number[Any]
+reveal_type(A.var()) # E: numpy.number[Any]
+reveal_type(A.var(axis=0)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.var(keepdims=True)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.var(out=B)) # E: SubClass
+
+reveal_type(A.argpartition([0])) # E: numpy.ndarray[Any, Any]
+
+reveal_type(A.diagonal()) # E: numpy.ndarray[Any, Any]
+
+reveal_type(A.dot(1)) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.dot(1, out=B)) # E: SubClass
+
+reveal_type(A.nonzero()) # E: tuple[numpy.ndarray[Any, Any]]
+
+reveal_type(A.searchsorted([1])) # E: numpy.ndarray[Any, Any]
+
+reveal_type(A.trace()) # E: Union[numpy.number[Any], numpy.ndarray[Any, Any]]
+reveal_type(A.trace(out=B)) # E: SubClass
diff --git a/numpy/tests/typing/reveal/ndarray_shape_manipulation.py b/numpy/typing/tests/data/reveal/ndarray_shape_manipulation.py
index a44e1cfa1..a44e1cfa1 100644
--- a/numpy/tests/typing/reveal/ndarray_shape_manipulation.py
+++ b/numpy/typing/tests/data/reveal/ndarray_shape_manipulation.py
diff --git a/numpy/typing/tests/data/reveal/numeric.py b/numpy/typing/tests/data/reveal/numeric.py
new file mode 100644
index 000000000..78e5c1d61
--- /dev/null
+++ b/numpy/typing/tests/data/reveal/numeric.py
@@ -0,0 +1,89 @@
+"""
+Tests for :mod:`numpy.core.numeric`.
+
+Does not include tests which fall under ``array_constructors``.
+
+"""
+
+from typing import List
+import numpy as np
+
+class SubClass(np.ndarray):
+ ...
+
+i8: np.int64
+
+A: np.ndarray
+B: List[int]
+C: SubClass
+
+reveal_type(np.count_nonzero(i8)) # E: int
+reveal_type(np.count_nonzero(A)) # E: int
+reveal_type(np.count_nonzero(B)) # E: int
+reveal_type(np.count_nonzero(A, keepdims=True)) # E: Union[numpy.signedinteger[Any], numpy.ndarray[Any, Any]]
+reveal_type(np.count_nonzero(A, axis=0)) # E: Union[numpy.signedinteger[Any], numpy.ndarray[Any, Any]]
+
+reveal_type(np.isfortran(i8)) # E: bool
+reveal_type(np.isfortran(A)) # E: bool
+
+reveal_type(np.argwhere(i8)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.argwhere(A)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.flatnonzero(i8)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.flatnonzero(A)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.correlate(B, A, mode="valid")) # E: numpy.ndarray[Any, Any]
+reveal_type(np.correlate(A, A, mode="same")) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.convolve(B, A, mode="valid")) # E: numpy.ndarray[Any, Any]
+reveal_type(np.convolve(A, A, mode="same")) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.outer(i8, A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.outer(B, A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.outer(A, A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.outer(A, A, out=C)) # E: SubClass
+
+reveal_type(np.tensordot(B, A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.tensordot(A, A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.tensordot(A, A, axes=0)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.tensordot(A, A, axes=(0, 1))) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.isscalar(i8)) # E: bool
+reveal_type(np.isscalar(A)) # E: bool
+reveal_type(np.isscalar(B)) # E: bool
+
+reveal_type(np.roll(A, 1)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.roll(A, (1, 2))) # E: numpy.ndarray[Any, Any]
+reveal_type(np.roll(B, 1)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.rollaxis(A, 0, 1)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.moveaxis(A, 0, 1)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.moveaxis(A, (0, 1), (1, 2))) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.cross(B, A)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.cross(A, A)) # E: numpy.ndarray[Any, Any]
+
+reveal_type(np.indices([0, 1, 2])) # E: numpy.ndarray[Any, Any]
+reveal_type(np.indices([0, 1, 2], sparse=False)) # E: numpy.ndarray[Any, Any]
+reveal_type(np.indices([0, 1, 2], sparse=True)) # E: tuple[numpy.ndarray[Any, Any]]
+
+reveal_type(np.binary_repr(1)) # E: str
+
+reveal_type(np.base_repr(1)) # E: str
+
+reveal_type(np.allclose(i8, A)) # E: bool
+reveal_type(np.allclose(B, A)) # E: bool
+reveal_type(np.allclose(A, A)) # E: bool
+
+reveal_type(np.isclose(i8, A)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+reveal_type(np.isclose(B, A)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+reveal_type(np.isclose(A, A)) # E: Union[numpy.bool_, numpy.ndarray[Any, Any]]
+
+reveal_type(np.array_equal(i8, A)) # E: bool
+reveal_type(np.array_equal(B, A)) # E: bool
+reveal_type(np.array_equal(A, A)) # E: bool
+
+reveal_type(np.array_equiv(i8, A)) # E: bool
+reveal_type(np.array_equiv(B, A)) # E: bool
+reveal_type(np.array_equiv(A, A)) # E: bool
diff --git a/numpy/tests/typing/reveal/numerictypes.py b/numpy/typing/tests/data/reveal/numerictypes.py
index e026158cd..e026158cd 100644
--- a/numpy/tests/typing/reveal/numerictypes.py
+++ b/numpy/typing/tests/data/reveal/numerictypes.py
diff --git a/numpy/typing/tests/data/reveal/scalars.py b/numpy/typing/tests/data/reveal/scalars.py
new file mode 100644
index 000000000..e887e302d
--- /dev/null
+++ b/numpy/typing/tests/data/reveal/scalars.py
@@ -0,0 +1,28 @@
+import numpy as np
+
+x = np.complex64(3 + 2j)
+
+reveal_type(x.real) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(x.imag) # E: numpy.floating[numpy.typing._32Bit]
+
+reveal_type(x.real.real) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(x.real.imag) # E: numpy.floating[numpy.typing._32Bit]
+
+reveal_type(x.itemsize) # E: int
+reveal_type(x.shape) # E: Tuple[]
+reveal_type(x.strides) # E: Tuple[]
+
+reveal_type(x.ndim) # E: Literal[0]
+reveal_type(x.size) # E: Literal[1]
+
+reveal_type(x.squeeze()) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+reveal_type(x.byteswap()) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+reveal_type(x.transpose()) # E: numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]
+
+reveal_type(x.dtype) # E: numpy.dtype[numpy.complexfloating[numpy.typing._32Bit, numpy.typing._32Bit]]
+
+reveal_type(np.complex64().real) # E: numpy.floating[numpy.typing._32Bit]
+reveal_type(np.complex128().imag) # E: numpy.floating[numpy.typing._64Bit]
+
+reveal_type(np.unicode_('foo')) # E: numpy.str_
+reveal_type(np.str0('foo')) # E: numpy.str_
diff --git a/numpy/typing/tests/data/reveal/ufunc_config.py b/numpy/typing/tests/data/reveal/ufunc_config.py
new file mode 100644
index 000000000..26be80314
--- /dev/null
+++ b/numpy/typing/tests/data/reveal/ufunc_config.py
@@ -0,0 +1,25 @@
+"""Typing tests for `numpy.core._ufunc_config`."""
+
+import numpy as np
+
+def func(a: str, b: int) -> None: ...
+
+class Write:
+ def write(self, value: str) -> None: ...
+
+reveal_type(np.seterr(all=None)) # E: TypedDict('numpy.core._ufunc_config._ErrDict'
+reveal_type(np.seterr(divide="ignore")) # E: TypedDict('numpy.core._ufunc_config._ErrDict'
+reveal_type(np.seterr(over="warn")) # E: TypedDict('numpy.core._ufunc_config._ErrDict'
+reveal_type(np.seterr(under="call")) # E: TypedDict('numpy.core._ufunc_config._ErrDict'
+reveal_type(np.seterr(invalid="raise")) # E: TypedDict('numpy.core._ufunc_config._ErrDict'
+reveal_type(np.geterr()) # E: TypedDict('numpy.core._ufunc_config._ErrDict'
+
+reveal_type(np.setbufsize(4096)) # E: int
+reveal_type(np.getbufsize()) # E: int
+
+reveal_type(np.seterrcall(func)) # E: Union[None, def (builtins.str, builtins.int) -> Any, numpy.core._ufunc_config._SupportsWrite]
+reveal_type(np.seterrcall(Write())) # E: Union[None, def (builtins.str, builtins.int) -> Any, numpy.core._ufunc_config._SupportsWrite]
+reveal_type(np.geterrcall()) # E: Union[None, def (builtins.str, builtins.int) -> Any, numpy.core._ufunc_config._SupportsWrite]
+
+reveal_type(np.errstate(call=func, all="call")) # E: numpy.errstate[def (a: builtins.str, b: builtins.int)]
+reveal_type(np.errstate(call=Write(), divide="log", over="log")) # E: numpy.errstate[ufunc_config.Write]
diff --git a/numpy/tests/typing/reveal/warnings_and_errors.py b/numpy/typing/tests/data/reveal/warnings_and_errors.py
index c428deb7a..c428deb7a 100644
--- a/numpy/tests/typing/reveal/warnings_and_errors.py
+++ b/numpy/typing/tests/data/reveal/warnings_and_errors.py
diff --git a/numpy/typing/tests/test_isfile.py b/numpy/typing/tests/test_isfile.py
new file mode 100644
index 000000000..569f05435
--- /dev/null
+++ b/numpy/typing/tests/test_isfile.py
@@ -0,0 +1,33 @@
+import os
+from pathlib import Path
+
+import numpy as np
+from numpy.testing import assert_
+
+ROOT = Path(np.__file__).parents[0]
+FILES = [
+ ROOT / "py.typed",
+ ROOT / "__init__.pyi",
+ ROOT / "char.pyi",
+ ROOT / "ctypeslib.pyi",
+ ROOT / "emath.pyi",
+ ROOT / "rec.pyi",
+ ROOT / "core" / "__init__.pyi",
+ ROOT / "distutils" / "__init__.pyi",
+ ROOT / "f2py" / "__init__.pyi",
+ ROOT / "fft" / "__init__.pyi",
+ ROOT / "lib" / "__init__.pyi",
+ ROOT / "linalg" / "__init__.pyi",
+ ROOT / "ma" / "__init__.pyi",
+ ROOT / "matrixlib" / "__init__.pyi",
+ ROOT / "polynomial" / "__init__.pyi",
+ ROOT / "random" / "__init__.pyi",
+ ROOT / "testing" / "__init__.pyi",
+]
+
+
+class TestIsFile:
+ def test_isfile(self):
+ """Test if all ``.pyi`` files are properly installed."""
+ for file in FILES:
+ assert_(os.path.isfile(file))
diff --git a/numpy/tests/test_typing.py b/numpy/typing/tests/test_typing.py
index 04ea3c64d..90de4fd6d 100644
--- a/numpy/tests/test_typing.py
+++ b/numpy/typing/tests/test_typing.py
@@ -3,6 +3,7 @@ import itertools
import os
import re
from collections import defaultdict
+from typing import Optional
import pytest
try:
@@ -12,15 +13,13 @@ except ImportError:
else:
NO_MYPY = False
-TESTS_DIR = os.path.join(
- os.path.dirname(os.path.abspath(__file__)),
- "typing",
-)
-PASS_DIR = os.path.join(TESTS_DIR, "pass")
-FAIL_DIR = os.path.join(TESTS_DIR, "fail")
-REVEAL_DIR = os.path.join(TESTS_DIR, "reveal")
-MYPY_INI = os.path.join(TESTS_DIR, "mypy.ini")
-CACHE_DIR = os.path.join(TESTS_DIR, ".mypy_cache")
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+PASS_DIR = os.path.join(DATA_DIR, "pass")
+FAIL_DIR = os.path.join(DATA_DIR, "fail")
+REVEAL_DIR = os.path.join(DATA_DIR, "reveal")
+MYPY_INI = os.path.join(DATA_DIR, "mypy.ini")
+CACHE_DIR = os.path.join(DATA_DIR, ".mypy_cache")
def get_test_cases(directory):
@@ -38,6 +37,7 @@ def get_test_cases(directory):
)
+@pytest.mark.slow
@pytest.mark.skipif(NO_MYPY, reason="Mypy is not installed")
@pytest.mark.parametrize("path", get_test_cases(PASS_DIR))
def test_success(path):
@@ -52,9 +52,12 @@ def test_success(path):
assert re.match(r"Success: no issues found in \d+ source files?", stdout.strip())
+@pytest.mark.slow
@pytest.mark.skipif(NO_MYPY, reason="Mypy is not installed")
@pytest.mark.parametrize("path", get_test_cases(FAIL_DIR))
def test_fail(path):
+ __tracebackhide__ = True
+
stdout, stderr, exitcode = api.run([
"--config-file",
MYPY_INI,
@@ -89,21 +92,43 @@ def test_fail(path):
for i, line in enumerate(lines):
lineno = i + 1
- if " E:" not in line and lineno not in errors:
+ if line.startswith('#') or (" E:" not in line and lineno not in errors):
continue
target_line = lines[lineno - 1]
if "# E:" in target_line:
marker = target_line.split("# E:")[-1].strip()
- assert lineno in errors, f'Extra error "{marker}"'
- assert marker in errors[lineno]
+ expected_error = errors.get(lineno)
+ _test_fail(path, marker, expected_error, lineno)
else:
pytest.fail(f"Error {repr(errors[lineno])} not found")
+_FAIL_MSG1 = """Extra error at line {}
+
+Extra error: {!r}
+"""
+
+_FAIL_MSG2 = """Error mismatch at line {}
+
+Expected error: {!r}
+Observed error: {!r}
+"""
+
+
+def _test_fail(path: str, error: str, expected_error: Optional[str], lineno: int) -> None:
+ if expected_error is None:
+ raise AssertionError(_FAIL_MSG1.format(lineno, error))
+ elif error not in expected_error:
+ raise AssertionError(_FAIL_MSG2.format(lineno, expected_error, error))
+
+
+@pytest.mark.slow
@pytest.mark.skipif(NO_MYPY, reason="Mypy is not installed")
@pytest.mark.parametrize("path", get_test_cases(REVEAL_DIR))
def test_reveal(path):
+ __tracebackhide__ = True
+
stdout, stderr, exitcode = api.run([
"--config-file",
MYPY_INI,
@@ -113,9 +138,10 @@ def test_reveal(path):
])
with open(path) as fin:
- lines = fin.readlines()
+ lines = fin.read().replace('*', '').split("\n")
- for error_line in stdout.split("\n"):
+ stdout_list = stdout.replace('*', '').split("\n")
+ for error_line in stdout_list:
error_line = error_line.strip()
if not error_line:
continue
@@ -126,12 +152,26 @@ def test_reveal(path):
)
if match is None:
raise ValueError(f"Unexpected reveal line format: {error_line}")
- lineno = int(match.group('lineno'))
+ lineno = int(match.group('lineno')) - 1
assert "Revealed type is" in error_line
- marker = lines[lineno - 1].split("# E:")[-1].strip()
- assert marker in error_line
+
+ marker = lines[lineno].split("# E:")[-1].strip()
+ _test_reveal(path, marker, error_line, 1 + lineno)
+
+
+_REVEAL_MSG = """Reveal mismatch at line {}
+
+Expected reveal: {!r}
+Observed reveal: {!r}
+"""
+
+
+def _test_reveal(path: str, reveal: str, expected_reveal: str, lineno: int) -> None:
+ if reveal not in expected_reveal:
+ raise AssertionError(_REVEAL_MSG.format(lineno, expected_reveal, reveal))
+@pytest.mark.slow
@pytest.mark.skipif(NO_MYPY, reason="Mypy is not installed")
@pytest.mark.parametrize("path", get_test_cases(PASS_DIR))
def test_code_runs(path):
diff --git a/numpy/version.py b/numpy/version.py
new file mode 100644
index 000000000..8a1d05aa4
--- /dev/null
+++ b/numpy/version.py
@@ -0,0 +1,11 @@
+from ._version import get_versions
+
+__all__ = ['version', 'full_version', 'git_revision', 'release']
+
+vinfo = get_versions()
+version: str = vinfo["version"]
+full_version: str = vinfo['version']
+git_revision: str = vinfo['full-revisionid']
+release = 'dev0' not in version
+
+del get_versions, vinfo