author     Charles Harris <charlesr.harris@gmail.com>  2020-12-13 14:14:49 -0700
committer  GitHub <noreply@github.com>                 2020-12-13 14:14:49 -0700
commit     3fe2d9d2627fc0f84aeed293ff8afa7c1f08d899 (patch)
tree       2ea27fe06a19c39e8d7a5fe2f87cb7e05363247d /numpy/core
parent     7d7e446fcbeeff70d905bde2eb0264a797488280 (diff)
parent     eff302e5e8678fa17fb3d8156d49eb585b0876d9 (diff)
Merge branch 'master' into fix-issue-10244
Diffstat (limited to 'numpy/core')
-rw-r--r--  numpy/core/__init__.py  | 19
-rw-r--r--  numpy/core/__init__.pyi  | 0
-rw-r--r--  numpy/core/_add_newdocs.py  | 371
-rw-r--r--  numpy/core/_add_newdocs_scalars.py  | 251
-rw-r--r--  numpy/core/_asarray.py  | 113
-rw-r--r--  numpy/core/_asarray.pyi  | 77
-rw-r--r--  numpy/core/_dtype.py  | 2
-rw-r--r--  numpy/core/_exceptions.py  | 2
-rw-r--r--  numpy/core/_internal.py  | 34
-rw-r--r--  numpy/core/_methods.py  | 91
-rw-r--r--  numpy/core/_type_aliases.pyi  | 19
-rw-r--r--  numpy/core/_ufunc_config.pyi  | 43
-rw-r--r--  numpy/core/arrayprint.py  | 3
-rw-r--r--  numpy/core/code_generators/cversions.txt  | 5
-rw-r--r--  numpy/core/code_generators/genapi.py  | 1
-rw-r--r--  numpy/core/code_generators/generate_umath.py  | 42
-rw-r--r--  numpy/core/code_generators/ufunc_docstrings.py  | 232
-rw-r--r--  numpy/core/defchararray.py  | 24
-rw-r--r--  numpy/core/einsumfunc.py  | 11
-rw-r--r--  numpy/core/fromnumeric.py  | 174
-rw-r--r--  numpy/core/fromnumeric.pyi  | 483
-rw-r--r--  numpy/core/function_base.py  | 12
-rw-r--r--  numpy/core/function_base.pyi  | 56
-rw-r--r--  numpy/core/include/numpy/arrayscalars.h  | 4
-rw-r--r--  numpy/core/include/numpy/libdivide/LICENSE.txt  | 21
-rw-r--r--  numpy/core/include/numpy/libdivide/libdivide.h  | 2079
-rw-r--r--  numpy/core/include/numpy/ndarraytypes.h  | 40
-rw-r--r--  numpy/core/include/numpy/npy_3kcompat.h  | 27
-rw-r--r--  numpy/core/include/numpy/npy_common.h  | 31
-rw-r--r--  numpy/core/include/numpy/npy_cpu.h  | 3
-rw-r--r--  numpy/core/include/numpy/npy_interrupt.h  | 91
-rw-r--r--  numpy/core/include/numpy/npy_math.h  | 12
-rw-r--r--  numpy/core/include/numpy/numpyconfig.h  | 3
-rw-r--r--  numpy/core/memmap.py  | 21
-rw-r--r--  numpy/core/multiarray.py  | 31
-rw-r--r--  numpy/core/numeric.py  | 88
-rw-r--r--  numpy/core/numeric.pyi  | 189
-rw-r--r--  numpy/core/numerictypes.py  | 40
-rw-r--r--  numpy/core/numerictypes.pyi  | 29
-rw-r--r--  numpy/core/overrides.py  | 21
-rw-r--r--  numpy/core/records.py  | 34
-rw-r--r--  numpy/core/setup.py  | 90
-rw-r--r--  numpy/core/setup_common.py  | 11
-rw-r--r--  numpy/core/shape_base.py  | 3
-rw-r--r--  numpy/core/shape_base.pyi  | 41
-rw-r--r--  numpy/core/src/_simd/_simd.c  | 73
-rw-r--r--  numpy/core/src/_simd/_simd.dispatch.c.src  | 612
-rw-r--r--  numpy/core/src/_simd/_simd.h  | 30
-rw-r--r--  numpy/core/src/_simd/_simd_arg.inc  | 85
-rw-r--r--  numpy/core/src/_simd/_simd_convert.inc  | 210
-rw-r--r--  numpy/core/src/_simd/_simd_data.inc.src  | 93
-rw-r--r--  numpy/core/src/_simd/_simd_easyintrin.inc  | 214
-rw-r--r--  numpy/core/src/_simd/_simd_inc.h.src  | 421
-rw-r--r--  numpy/core/src/_simd/_simd_vector.inc  | 178
-rw-r--r--  numpy/core/src/common/array_assign.c  | 28
-rw-r--r--  numpy/core/src/common/lowlevel_strided_loops.h  | 37
-rw-r--r--  numpy/core/src/common/npy_binsearch.h.src  | 8
-rw-r--r--  numpy/core/src/common/npy_cblas.h  | 35
-rw-r--r--  numpy/core/src/common/npy_config.h  | 9
-rw-r--r--  numpy/core/src/common/npy_cpu_dispatch.h  | 45
-rw-r--r--  numpy/core/src/common/npy_cpu_features.c.src  | 76
-rw-r--r--  numpy/core/src/common/npy_cpu_features.h  | 2
-rw-r--r--  numpy/core/src/common/npy_partition.h.src  | 4
-rw-r--r--  numpy/core/src/common/npy_sort.h.src  | 52
-rw-r--r--  numpy/core/src/common/simd/avx2/arithmetic.h  | 67
-rw-r--r--  numpy/core/src/common/simd/avx2/avx2.h  | 3
-rw-r--r--  numpy/core/src/common/simd/avx2/conversion.h  | 22
-rw-r--r--  numpy/core/src/common/simd/avx2/math.h  | 40
-rw-r--r--  numpy/core/src/common/simd/avx2/memory.h  | 286
-rw-r--r--  numpy/core/src/common/simd/avx2/reorder.h  | 32
-rw-r--r--  numpy/core/src/common/simd/avx512/arithmetic.h  | 59
-rw-r--r--  numpy/core/src/common/simd/avx512/avx512.h  | 4
-rw-r--r--  numpy/core/src/common/simd/avx512/conversion.h  | 31
-rw-r--r--  numpy/core/src/common/simd/avx512/math.h  | 49
-rw-r--r--  numpy/core/src/common/simd/avx512/memory.h  | 238
-rw-r--r--  numpy/core/src/common/simd/avx512/reorder.h  | 56
-rw-r--r--  numpy/core/src/common/simd/neon/arithmetic.h  | 78
-rw-r--r--  numpy/core/src/common/simd/neon/conversion.h  | 66
-rw-r--r--  numpy/core/src/common/simd/neon/math.h  | 86
-rw-r--r--  numpy/core/src/common/simd/neon/memory.h  | 287
-rw-r--r--  numpy/core/src/common/simd/neon/neon.h  | 1
-rw-r--r--  numpy/core/src/common/simd/neon/reorder.h  | 9
-rw-r--r--  numpy/core/src/common/simd/simd.h  | 49
-rw-r--r--  numpy/core/src/common/simd/sse/arithmetic.h  | 82
-rw-r--r--  numpy/core/src/common/simd/sse/conversion.h  | 21
-rw-r--r--  numpy/core/src/common/simd/sse/math.h  | 40
-rw-r--r--  numpy/core/src/common/simd/sse/memory.h  | 424
-rw-r--r--  numpy/core/src/common/simd/sse/reorder.h  | 41
-rw-r--r--  numpy/core/src/common/simd/sse/sse.h  | 1
-rw-r--r--  numpy/core/src/common/simd/vsx/arithmetic.h  | 28
-rw-r--r--  numpy/core/src/common/simd/vsx/conversion.h  | 22
-rw-r--r--  numpy/core/src/common/simd/vsx/math.h  | 36
-rw-r--r--  numpy/core/src/common/simd/vsx/memory.h  | 432
-rw-r--r--  numpy/core/src/common/simd/vsx/reorder.h  | 41
-rw-r--r--  numpy/core/src/common/simd/vsx/vsx.h  | 1
-rw-r--r--  numpy/core/src/multiarray/_datetime.h  | 13
-rw-r--r--  numpy/core/src/multiarray/_multiarray_tests.c.src  | 254
-rw-r--r--  numpy/core/src/multiarray/alloc.c  | 5
-rw-r--r--  numpy/core/src/multiarray/array_assign_array.c  | 15
-rw-r--r--  numpy/core/src/multiarray/array_assign_scalar.c  | 15
-rw-r--r--  numpy/core/src/multiarray/array_coercion.c  | 199
-rw-r--r--  numpy/core/src/multiarray/array_method.c  | 614
-rw-r--r--  numpy/core/src/multiarray/array_method.h  | 150
-rw-r--r--  numpy/core/src/multiarray/arrayfunction_override.c  | 187
-rw-r--r--  numpy/core/src/multiarray/arrayfunction_override.h  | 4
-rw-r--r--  numpy/core/src/multiarray/arrayobject.c  | 8
-rw-r--r--  numpy/core/src/multiarray/arraytypes.c.src  | 69
-rw-r--r--  numpy/core/src/multiarray/buffer.c  | 445
-rw-r--r--  numpy/core/src/multiarray/calculation.c  | 4
-rw-r--r--  numpy/core/src/multiarray/common.c  | 59
-rw-r--r--  numpy/core/src/multiarray/common.h  | 38
-rw-r--r--  numpy/core/src/multiarray/compiled_base.c  | 39
-rw-r--r--  numpy/core/src/multiarray/conversion_utils.c  | 3
-rw-r--r--  numpy/core/src/multiarray/convert.c  | 9
-rw-r--r--  numpy/core/src/multiarray/convert_datatype.c  | 2439
-rw-r--r--  numpy/core/src/multiarray/convert_datatype.h  | 64
-rw-r--r--  numpy/core/src/multiarray/ctors.c  | 491
-rw-r--r--  numpy/core/src/multiarray/datetime.c  | 676
-rw-r--r--  numpy/core/src/multiarray/datetime_busday.c  | 21
-rw-r--r--  numpy/core/src/multiarray/datetime_busdaycal.c  | 30
-rw-r--r--  numpy/core/src/multiarray/datetime_strings.c  | 43
-rw-r--r--  numpy/core/src/multiarray/descriptor.c  | 128
-rw-r--r--  numpy/core/src/multiarray/dragon4.c  | 4
-rw-r--r--  numpy/core/src/multiarray/dtype_transfer.c  | 781
-rw-r--r--  numpy/core/src/multiarray/dtypemeta.c  | 231
-rw-r--r--  numpy/core/src/multiarray/dtypemeta.h  | 16
-rw-r--r--  numpy/core/src/multiarray/einsum.c.src  | 1895
-rw-r--r--  numpy/core/src/multiarray/einsum_debug.h  | 28
-rw-r--r--  numpy/core/src/multiarray/einsum_sumprod.c.src  | 1878
-rw-r--r--  numpy/core/src/multiarray/einsum_sumprod.h  | 12
-rw-r--r--  numpy/core/src/multiarray/flagsobject.c  | 4
-rw-r--r--  numpy/core/src/multiarray/getset.c  | 44
-rw-r--r--  numpy/core/src/multiarray/hashdescr.c  | 6
-rw-r--r--  numpy/core/src/multiarray/item_selection.c  | 4
-rw-r--r--  numpy/core/src/multiarray/iterators.c  | 12
-rw-r--r--  numpy/core/src/multiarray/legacy_dtype_implementation.c  | 716
-rw-r--r--  numpy/core/src/multiarray/legacy_dtype_implementation.h  | 40
-rw-r--r--  numpy/core/src/multiarray/lowlevel_strided_loops.c.src  | 173
-rw-r--r--  numpy/core/src/multiarray/mapping.c  | 144
-rw-r--r--  numpy/core/src/multiarray/methods.c  | 125
-rw-r--r--  numpy/core/src/multiarray/multiarraymodule.c  | 535
-rw-r--r--  numpy/core/src/multiarray/multiarraymodule.h  | 2
-rw-r--r--  numpy/core/src/multiarray/nditer_api.c  | 182
-rw-r--r--  numpy/core/src/multiarray/nditer_constr.c  | 277
-rw-r--r--  numpy/core/src/multiarray/nditer_impl.h  | 9
-rw-r--r--  numpy/core/src/multiarray/nditer_pywrap.c  | 48
-rw-r--r--  numpy/core/src/multiarray/nditer_templ.c.src  | 20
-rw-r--r--  numpy/core/src/multiarray/npy_buffer.h  | 4
-rw-r--r--  numpy/core/src/multiarray/number.c  | 19
-rw-r--r--  numpy/core/src/multiarray/refcount.c  | 38
-rw-r--r--  numpy/core/src/multiarray/scalarapi.c  | 74
-rw-r--r--  numpy/core/src/multiarray/scalartypes.c.src  | 258
-rw-r--r--  numpy/core/src/multiarray/shape.c  | 65
-rw-r--r--  numpy/core/src/multiarray/shape.h  | 7
-rw-r--r--  numpy/core/src/multiarray/strfuncs.c  | 186
-rw-r--r--  numpy/core/src/multiarray/temp_elide.c  | 4
-rw-r--r--  numpy/core/src/multiarray/usertypes.c  | 223
-rw-r--r--  numpy/core/src/multiarray/usertypes.h  | 10
-rw-r--r--  numpy/core/src/npymath/npy_math_internal.h.src  | 102
-rw-r--r--  numpy/core/src/npymath/npy_math_private.h  | 1
-rw-r--r--  numpy/core/src/npysort/binsearch.c.src  | 8
-rw-r--r--  numpy/core/src/npysort/heapsort.c.src  | 12
-rw-r--r--  numpy/core/src/npysort/mergesort.c.src  | 12
-rw-r--r--  numpy/core/src/npysort/quicksort.c.src  | 12
-rw-r--r--  numpy/core/src/npysort/radixsort.c.src  | 8
-rw-r--r--  numpy/core/src/npysort/selection.c.src  | 5
-rw-r--r--  numpy/core/src/npysort/timsort.c.src  | 16
-rw-r--r--  numpy/core/src/umath/_rational_tests.c.src  | 39
-rw-r--r--  numpy/core/src/umath/_umath_tests.c.src  | 25
-rw-r--r--  numpy/core/src/umath/extobj.c  | 8
-rw-r--r--  numpy/core/src/umath/fast_loop_macros.h  | 15
-rw-r--r--  numpy/core/src/umath/funcs.inc.src  | 4
-rw-r--r--  numpy/core/src/umath/loops.c.src  | 390
-rw-r--r--  numpy/core/src/umath/loops.h.src  | 37
-rw-r--r--  numpy/core/src/umath/loops_unary_fp.dispatch.c.src  | 219
-rw-r--r--  numpy/core/src/umath/loops_utils.h  | 42
-rw-r--r--  numpy/core/src/umath/matmul.c.src  | 2
-rw-r--r--  numpy/core/src/umath/npy_simd_data.h  | 154
-rw-r--r--  numpy/core/src/umath/override.c  | 2
-rw-r--r--  numpy/core/src/umath/reduction.c  | 8
-rw-r--r--  numpy/core/src/umath/scalarmath.c.src  | 185
-rw-r--r--  numpy/core/src/umath/simd.inc.src  | 544
-rw-r--r--  numpy/core/src/umath/ufunc_object.c  | 171
-rw-r--r--  numpy/core/src/umath/ufunc_type_resolution.c  | 65
-rw-r--r--  numpy/core/src/umath/umathmodule.c  | 74
-rw-r--r--  numpy/core/tests/data/umath-validation-set-log  | 153
-rw-r--r--  numpy/core/tests/examples/checks.pyx  | 4
-rw-r--r--  numpy/core/tests/examples/setup.py  | 3
-rw-r--r--  numpy/core/tests/test__exceptions.py  | 16
-rw-r--r--  numpy/core/tests/test_api.py  | 36
-rw-r--r--  numpy/core/tests/test_array_coercion.py  | 129
-rw-r--r--  numpy/core/tests/test_casting_unittests.py  | 301
-rw-r--r--  numpy/core/tests/test_cython.py  | 16
-rw-r--r--  numpy/core/tests/test_datetime.py  | 26
-rw-r--r--  numpy/core/tests/test_defchararray.py  | 6
-rw-r--r--  numpy/core/tests/test_deprecations.py  | 109
-rw-r--r--  numpy/core/tests/test_dtype.py  | 74
-rw-r--r--  numpy/core/tests/test_function_base.py  | 5
-rw-r--r--  numpy/core/tests/test_indexing.py  | 39
-rw-r--r--  numpy/core/tests/test_memmap.py  | 7
-rw-r--r--  numpy/core/tests/test_multiarray.py  | 323
-rw-r--r--  numpy/core/tests/test_nditer.py  | 65
-rw-r--r--  numpy/core/tests/test_numeric.py  | 147
-rw-r--r--  numpy/core/tests/test_overrides.py  | 154
-rw-r--r--  numpy/core/tests/test_records.py  | 18
-rw-r--r--  numpy/core/tests/test_regression.py  | 47
-rw-r--r--  numpy/core/tests/test_scalarbuffer.py  | 39
-rw-r--r--  numpy/core/tests/test_scalarmath.py  | 7
-rw-r--r--  numpy/core/tests/test_shape_base.py  | 39
-rw-r--r--  numpy/core/tests/test_simd.py  | 671
-rw-r--r--  numpy/core/tests/test_simd_module.py  | 97
-rw-r--r--  numpy/core/tests/test_ufunc.py  | 85
-rw-r--r--  numpy/core/tests/test_umath.py  | 263
213 files changed, 22356 insertions, 6597 deletions
diff --git a/numpy/core/__init__.py b/numpy/core/__init__.py
index c77885954..e8d3a381b 100644
--- a/numpy/core/__init__.py
+++ b/numpy/core/__init__.py
@@ -96,6 +96,7 @@ from .numeric import absolute as abs
# do this after everything else, to minimize the chance of this misleadingly
# appearing in an import-time traceback
from . import _add_newdocs
+from . import _add_newdocs_scalars
# add these for module-freeze analysis (like PyInstaller)
from . import _dtype_ctypes
from . import _internal
@@ -113,10 +114,9 @@ __all__ += getlimits.__all__
__all__ += shape_base.__all__
__all__ += einsumfunc.__all__
-# Make it possible so that ufuncs can be pickled
-# Here are the loading and unloading functions
-# The name numpy.core._ufunc_reconstruct must be
-# available for unpickling to work.
+# We used to use `np.core._ufunc_reconstruct` to unpickle. This is unnecessary,
+# but old pickles saved before 1.20 will be using it, and there is no reason
+# to break loading them.
def _ufunc_reconstruct(module, name):
# The `fromlist` kwarg is required to ensure that `mod` points to the
# inner-most module rather than the parent package when module name is
@@ -126,14 +126,17 @@ def _ufunc_reconstruct(module, name):
return getattr(mod, name)
def _ufunc_reduce(func):
- from pickle import whichmodule
- name = func.__name__
- return _ufunc_reconstruct, (whichmodule(func, name), name)
+ # Report the `__name__`. pickle will try to find the module. Note that
+ # pickle also accepts a `__qualname__` for this `__name__`. It may
+ # make sense to add a `__qualname__` to ufuncs, to allow this more
+ # explicitly (Numba has ufuncs as attributes).
+ # See also: https://github.com/dask/distributed/issues/3450
+ return func.__name__
import copyreg
-copyreg.pickle(ufunc, _ufunc_reduce, _ufunc_reconstruct)
+copyreg.pickle(ufunc, _ufunc_reduce)
# Unclutter namespace (must keep _ufunc_reconstruct for unpickling)
del copyreg
del _ufunc_reduce
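
The hunk above switches ufunc pickling from a custom reconstruct pair to returning ``func.__name__`` from ``_ufunc_reduce``, so pickle looks the ufunc up by name, while ``_ufunc_reconstruct`` is kept for old pickles. A minimal sketch of the resulting behaviour (assumes NumPy >= 1.20):

    import pickle
    import numpy as np

    # Ufuncs round-trip through pickle by name; the loaded object is the
    # same singleton ufunc.
    assert pickle.loads(pickle.dumps(np.add)) is np.add

    # Pickles written by older NumPy versions reference
    # numpy.core._ufunc_reconstruct, which is kept so they still load:
    assert np.core._ufunc_reconstruct('numpy', 'add') is np.add
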
diff --git a/numpy/core/__init__.pyi b/numpy/core/__init__.pyi
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/numpy/core/__init__.pyi
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index a3c404a64..2cbfe52be 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -9,9 +9,8 @@ NOTE: Many of the methods of ndarray have corresponding functions.
"""
-from numpy.core import numerictypes as _numerictypes
-from numpy.core import dtype
from numpy.core.function_base import add_newdoc
+from numpy.core.overrides import array_function_like_doc
###############################################################################
#
@@ -606,6 +605,7 @@ add_newdoc('numpy.core', 'broadcast',
--------
broadcast_arrays
broadcast_to
+ broadcast_shapes
Examples
--------
@@ -786,7 +786,8 @@ add_newdoc('numpy.core', 'broadcast', ('reset',
add_newdoc('numpy.core.multiarray', 'array',
"""
- array(object, dtype=None, *, copy=True, order='K', subok=False, ndmin=0)
+ array(object, dtype=None, *, copy=True, order='K', subok=False, ndmin=0,
+ like=None)
Create an array.
@@ -829,6 +830,9 @@ add_newdoc('numpy.core.multiarray', 'array',
Specifies the minimum number of dimensions that the resulting
array should have. Ones will be pre-pended to the shape as
needed to meet this requirement.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -895,11 +899,14 @@ add_newdoc('numpy.core.multiarray', 'array',
matrix([[1, 2],
[3, 4]])
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core.multiarray', 'empty',
"""
- empty(shape, dtype=float, order='C')
+ empty(shape, dtype=float, order='C', *, like=None)
Return a new array of given shape and type, without initializing entries.
@@ -914,6 +921,9 @@ add_newdoc('numpy.core.multiarray', 'empty',
Whether to store multi-dimensional data in row-major
(C-style) or column-major (Fortran-style) order in
memory.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -946,7 +956,10 @@ add_newdoc('numpy.core.multiarray', 'empty',
array([[-1073741821, -1067949133],
[ 496041986, 19249760]]) #uninitialized
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core.multiarray', 'scalar',
"""
@@ -964,7 +977,7 @@ add_newdoc('numpy.core.multiarray', 'scalar',
add_newdoc('numpy.core.multiarray', 'zeros',
"""
- zeros(shape, dtype=float, order='C')
+ zeros(shape, dtype=float, order='C', *, like=None)
Return a new array of given shape and type, filled with zeros.
@@ -979,6 +992,9 @@ add_newdoc('numpy.core.multiarray', 'zeros',
Whether to store multi-dimensional data in row-major
(C-style) or column-major (Fortran-style) order in
memory.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -1013,7 +1029,10 @@ add_newdoc('numpy.core.multiarray', 'zeros',
array([(0, 0), (0, 0)],
dtype=[('x', '<i4'), ('y', '<i4')])
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core.multiarray', 'set_typeDict',
"""set_typeDict(dict)
@@ -1025,7 +1044,7 @@ add_newdoc('numpy.core.multiarray', 'set_typeDict',
add_newdoc('numpy.core.multiarray', 'fromstring',
"""
- fromstring(string, dtype=float, count=-1, sep='')
+ fromstring(string, dtype=float, count=-1, sep='', *, like=None)
A new 1-D array initialized from text data in a string.
@@ -1058,6 +1077,9 @@ add_newdoc('numpy.core.multiarray', 'fromstring',
text, the binary mode of `fromstring` will first encode it into
bytes using either utf-8 (python 3) or the default encoding
(python 2), neither of which produce sane results.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -1081,7 +1103,10 @@ add_newdoc('numpy.core.multiarray', 'fromstring',
>>> np.fromstring('1, 2', dtype=int, sep=',')
array([1, 2])
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core.multiarray', 'compare_chararrays',
"""
@@ -1122,7 +1147,7 @@ add_newdoc('numpy.core.multiarray', 'compare_chararrays',
add_newdoc('numpy.core.multiarray', 'fromiter',
"""
- fromiter(iter, dtype, count=-1)
+ fromiter(iter, dtype, count=-1, *, like=None)
Create a new 1-dimensional array from an iterable object.
@@ -1135,6 +1160,9 @@ add_newdoc('numpy.core.multiarray', 'fromiter',
count : int, optional
The number of items to read from *iterable*. The default is -1,
which means all data is read.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -1152,11 +1180,14 @@ add_newdoc('numpy.core.multiarray', 'fromiter',
>>> np.fromiter(iterable, float)
array([ 0., 1., 4., 9., 16.])
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core.multiarray', 'fromfile',
"""
- fromfile(file, dtype=float, count=-1, sep='', offset=0)
+ fromfile(file, dtype=float, count=-1, sep='', offset=0, *, like=None)
Construct an array from data in a text or binary file.
@@ -1195,6 +1226,9 @@ add_newdoc('numpy.core.multiarray', 'fromfile',
Only permitted for binary files.
.. versionadded:: 1.17.0
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
See also
--------
@@ -1241,11 +1275,14 @@ add_newdoc('numpy.core.multiarray', 'fromfile',
array([((10, 0), 98.25)],
dtype=[('time', [('min', '<i8'), ('sec', '<i8')]), ('temp', '<f8')])
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core.multiarray', 'frombuffer',
"""
- frombuffer(buffer, dtype=float, count=-1, offset=0)
+ frombuffer(buffer, dtype=float, count=-1, offset=0, *, like=None)
Interpret a buffer as a 1-dimensional array.
@@ -1259,6 +1296,9 @@ add_newdoc('numpy.core.multiarray', 'frombuffer',
Number of items to read. ``-1`` means all data in the buffer.
offset : int, optional
Start reading the buffer from this offset (in bytes); default: 0.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Notes
-----
@@ -1283,7 +1323,10 @@ add_newdoc('numpy.core.multiarray', 'frombuffer',
>>> np.frombuffer(b'\\x01\\x02\\x03\\x04\\x05', dtype=np.uint8, count=3)
array([1, 2, 3], dtype=uint8)
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core', 'fastCopyAndTranspose',
"""_fastCopyAndTranspose(a)""")
@@ -1293,7 +1336,7 @@ add_newdoc('numpy.core.multiarray', 'correlate',
add_newdoc('numpy.core.multiarray', 'arange',
"""
- arange([start,] stop[, step,], dtype=None)
+ arange([start,] stop[, step,], dtype=None, *, like=None)
Return evenly spaced values within a given interval.
@@ -1322,6 +1365,9 @@ add_newdoc('numpy.core.multiarray', 'arange',
dtype : dtype
The type of the output array. If `dtype` is not given, infer the data
type from the other input arguments.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -1350,7 +1396,10 @@ add_newdoc('numpy.core.multiarray', 'arange',
>>> np.arange(3,7,2)
array([3, 5])
- """)
+ """.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ ))
add_newdoc('numpy.core.multiarray', '_get_ndarray_c_version',
"""_get_ndarray_c_version()
@@ -2521,7 +2570,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('__setstate__',
add_newdoc('numpy.core.multiarray', 'ndarray', ('all',
"""
- a.all(axis=None, out=None, keepdims=False)
+ a.all(axis=None, out=None, keepdims=False, *, where=True)
Returns True if all elements evaluate to True.
@@ -2536,7 +2585,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('all',
add_newdoc('numpy.core.multiarray', 'ndarray', ('any',
"""
- a.any(axis=None, out=None, keepdims=False)
+ a.any(axis=None, out=None, keepdims=False, *, where=True)
Returns True if any of the elements of `a` evaluate to True.
@@ -2568,7 +2617,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('argmin',
"""
a.argmin(axis=None, out=None)
- Return indices of the minimum values along the given axis of `a`.
+ Return indices of the minimum values along the given axis.
Refer to `numpy.argmin` for detailed documentation.
@@ -3193,7 +3242,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('max',
add_newdoc('numpy.core.multiarray', 'ndarray', ('mean',
"""
- a.mean(axis=None, dtype=None, out=None, keepdims=False)
+ a.mean(axis=None, dtype=None, out=None, keepdims=False, *, where=True)
Returns the average of the array elements along given axis.
@@ -3223,7 +3272,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('min',
add_newdoc('numpy.core.multiarray', 'ndarray', ('newbyteorder',
"""
- arr.newbyteorder(new_order='S')
+ arr.newbyteorder(new_order='S', /)
Return the array with the same data viewed with a different byte order.
@@ -3764,7 +3813,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('squeeze',
add_newdoc('numpy.core.multiarray', 'ndarray', ('std',
"""
- a.std(axis=None, dtype=None, out=None, ddof=0, keepdims=False)
+ a.std(axis=None, dtype=None, out=None, ddof=0, keepdims=False, *, where=True)
Returns the standard deviation of the array elements along given axis.
@@ -4051,7 +4100,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('transpose',
add_newdoc('numpy.core.multiarray', 'ndarray', ('var',
"""
- a.var(axis=None, dtype=None, out=None, ddof=0, keepdims=False)
+ a.var(axis=None, dtype=None, out=None, ddof=0, keepdims=False, *, where=True)
Returns the variance of the array elements, along given axis.
@@ -4457,10 +4506,8 @@ add_newdoc('numpy.core', 'ufunc',
A detailed explanation of ufuncs can be found in the docs for :ref:`ufuncs`.
- Calling ufuncs:
- ===============
+ **Calling ufuncs:** ``op(*x[, out], where=True, **kwargs)``
- op(*x[, out], where=True, **kwargs)
Apply `op` to the arguments `*x` elementwise, broadcasting the arguments.
The broadcasting rules are:
@@ -4691,14 +4738,14 @@ add_newdoc('numpy.core', 'ufunc', ('signature',
add_newdoc('numpy.core', 'ufunc', ('reduce',
"""
- reduce(a, axis=0, dtype=None, out=None, keepdims=False, initial=<no value>, where=True)
+ reduce(array, axis=0, dtype=None, out=None, keepdims=False, initial=<no value>, where=True)
- Reduces `a`'s dimension by one, by applying ufunc along one axis.
+ Reduces `array`'s dimension by one, by applying ufunc along one axis.
- Let :math:`a.shape = (N_0, ..., N_i, ..., N_{M-1})`. Then
- :math:`ufunc.reduce(a, axis=i)[k_0, ..,k_{i-1}, k_{i+1}, .., k_{M-1}]` =
+ Let :math:`array.shape = (N_0, ..., N_i, ..., N_{M-1})`. Then
+ :math:`ufunc.reduce(array, axis=i)[k_0, ..,k_{i-1}, k_{i+1}, .., k_{M-1}]` =
the result of iterating `j` over :math:`range(N_i)`, cumulatively applying
- ufunc to each :math:`a[k_0, ..,k_{i-1}, j, k_{i+1}, .., k_{M-1}]`.
+ ufunc to each :math:`array[k_0, ..,k_{i-1}, j, k_{i+1}, .., k_{M-1}]`.
For a one-dimensional array, reduce produces results equivalent to:
::
@@ -4711,7 +4758,7 @@ add_newdoc('numpy.core', 'ufunc', ('reduce',
Parameters
----------
- a : array_like
+ array : array_like
The array to act on.
axis : None or int or tuple of ints, optional
Axis or axes along which a reduction is performed.
@@ -4744,7 +4791,7 @@ add_newdoc('numpy.core', 'ufunc', ('reduce',
keepdims : bool, optional
If this is set to True, the axes which are reduced are left
in the result as dimensions with size one. With this option,
- the result will broadcast correctly against the original `arr`.
+ the result will broadcast correctly against the original `array`.
.. versionadded:: 1.7.0
initial : scalar, optional
@@ -4758,7 +4805,7 @@ add_newdoc('numpy.core', 'ufunc', ('reduce',
where : array_like of bool, optional
A boolean array which is broadcasted to match the dimensions
- of `a`, and selects elements to include in the reduction. Note
+ of `array`, and selects elements to include in the reduction. Note
that for ufuncs like ``minimum`` that do not have an identity
defined, one has to pass in also ``initial``.
@@ -4900,28 +4947,28 @@ add_newdoc('numpy.core', 'ufunc', ('accumulate',
add_newdoc('numpy.core', 'ufunc', ('reduceat',
"""
- reduceat(a, indices, axis=0, dtype=None, out=None)
+ reduceat(array, indices, axis=0, dtype=None, out=None)
Performs a (local) reduce with specified slices over a single axis.
For i in ``range(len(indices))``, `reduceat` computes
- ``ufunc.reduce(a[indices[i]:indices[i+1]])``, which becomes the i-th
+ ``ufunc.reduce(array[indices[i]:indices[i+1]])``, which becomes the i-th
generalized "row" parallel to `axis` in the final result (i.e., in a
2-D array, for example, if `axis = 0`, it becomes the i-th row, but if
`axis = 1`, it becomes the i-th column). There are three exceptions to this:
* when ``i = len(indices) - 1`` (so for the last index),
- ``indices[i+1] = a.shape[axis]``.
+ ``indices[i+1] = array.shape[axis]``.
* if ``indices[i] >= indices[i + 1]``, the i-th generalized "row" is
- simply ``a[indices[i]]``.
- * if ``indices[i] >= len(a)`` or ``indices[i] < 0``, an error is raised.
+ simply ``array[indices[i]]``.
+ * if ``indices[i] >= len(array)`` or ``indices[i] < 0``, an error is raised.
The shape of the output depends on the size of `indices`, and may be
- larger than `a` (this happens if ``len(indices) > a.shape[axis]``).
+ larger than `array` (this happens if ``len(indices) > array.shape[axis]``).
Parameters
----------
- a : array_like
+ array : array_like
The array to act on.
indices : array_like
Paired indices, comma separated (not colon), specifying slices to
@@ -4951,14 +4998,15 @@ add_newdoc('numpy.core', 'ufunc', ('reduceat',
-----
A descriptive example:
- If `a` is 1-D, the function `ufunc.accumulate(a)` is the same as
- ``ufunc.reduceat(a, indices)[::2]`` where `indices` is
+ If `array` is 1-D, the function `ufunc.accumulate(array)` is the same as
+ ``ufunc.reduceat(array, indices)[::2]`` where `indices` is
``range(len(array) - 1)`` with a zero placed
in every other element:
- ``indices = zeros(2 * len(a) - 1)``, ``indices[1::2] = range(1, len(a))``.
+ ``indices = zeros(2 * len(array) - 1)``,
+ ``indices[1::2] = range(1, len(array))``.
- Don't be fooled by this attribute's name: `reduceat(a)` is not
- necessarily smaller than `a`.
+ Don't be fooled by this attribute's name: `reduceat(array)` is not
+ necessarily smaller than `array`.
Examples
--------
@@ -5007,7 +5055,7 @@ add_newdoc('numpy.core', 'ufunc', ('reduceat',
add_newdoc('numpy.core', 'ufunc', ('outer',
r"""
- outer(A, B, **kwargs)
+ outer(A, B, /, **kwargs)
Apply the ufunc `op` to all pairs (a, b) with a in `A` and b in `B`.
@@ -5077,7 +5125,7 @@ add_newdoc('numpy.core', 'ufunc', ('outer',
add_newdoc('numpy.core', 'ufunc', ('at',
"""
- at(a, indices, b=None)
+ at(a, indices, b=None, /)
Performs unbuffered in place operation on operand 'a' for elements
specified by 'indices'. For addition ufunc, this method is equivalent to
@@ -5493,6 +5541,45 @@ add_newdoc('numpy.core.multiarray', 'dtype', ('kind',
"""))
+add_newdoc('numpy.core.multiarray', 'dtype', ('metadata',
+ """
+ Either ``None`` or a readonly dictionary of metadata (mappingproxy).
+
+ The metadata field can be set using any dictionary at data-type
+ creation. NumPy currently has no uniform approach to propagating
+ metadata; although some array operations preserve it, there is no
+ guarantee that others will.
+
+ .. warning::
+
+ Although used in certain projects, this feature was long undocumented
+ and is not well supported. Some aspects of metadata propagation
+ are expected to change in the future.
+
+ Examples
+ --------
+
+ >>> dt = np.dtype(float, metadata={"key": "value"})
+ >>> dt.metadata["key"]
+ 'value'
+ >>> arr = np.array([1, 2, 3], dtype=dt)
+ >>> arr.dtype.metadata
+ mappingproxy({'key': 'value'})
+
+ Adding arrays with identical datatypes currently preserves the metadata:
+
+ >>> (arr + arr).dtype.metadata
+ mappingproxy({'key': 'value'})
+
+ But if the arrays have different dtype metadata, the metadata may be
+ dropped:
+
+ >>> dt2 = np.dtype(float, metadata={"key2": "value2"})
+ >>> arr2 = np.array([3, 2, 1], dtype=dt2)
+ >>> (arr + arr2).dtype.metadata is None
+ True # The metadata field is cleared so None is returned
+ """))
+
add_newdoc('numpy.core.multiarray', 'dtype', ('name',
"""
A bit-width name for this data-type.
@@ -5649,7 +5736,7 @@ add_newdoc('numpy.core.multiarray', 'dtype', ('type',
add_newdoc('numpy.core.multiarray', 'dtype', ('newbyteorder',
"""
- newbyteorder(new_order='S')
+ newbyteorder(new_order='S', /)
Return a new dtype with a different byte order.
@@ -6025,7 +6112,7 @@ add_newdoc('numpy.core.numerictypes', 'generic',
add_newdoc('numpy.core.numerictypes', 'generic', ('newbyteorder',
"""
- newbyteorder(new_order='S')
+ newbyteorder(new_order='S', /)
Return a new `dtype` with a different byte order.
@@ -6195,183 +6282,3 @@ add_newdoc('numpy.core.numerictypes', 'character',
Abstract base class of all character string scalar types.
""")
-
-
-##############################################################################
-#
-# Documentation for concrete scalar classes
-#
-##############################################################################
-
-def numeric_type_aliases(aliases):
- def type_aliases_gen():
- for alias, doc in aliases:
- try:
- alias_type = getattr(_numerictypes, alias)
- except AttributeError:
- # The set of aliases that actually exist varies between platforms
- pass
- else:
- yield (alias_type, alias, doc)
- return list(type_aliases_gen())
-
-
-possible_aliases = numeric_type_aliases([
- ('int8', '8-bit signed integer (-128 to 127)'),
- ('int16', '16-bit signed integer (-32768 to 32767)'),
- ('int32', '32-bit signed integer (-2147483648 to 2147483647)'),
- ('int64', '64-bit signed integer (-9223372036854775808 to 9223372036854775807)'),
- ('intp', 'Signed integer large enough to fit pointer, compatible with C ``intptr_t``'),
- ('uint8', '8-bit unsigned integer (0 to 255)'),
- ('uint16', '16-bit unsigned integer (0 to 65535)'),
- ('uint32', '32-bit unsigned integer (0 to 4294967295)'),
- ('uint64', '64-bit unsigned integer (0 to 18446744073709551615)'),
- ('uintp', 'Unsigned integer large enough to fit pointer, compatible with C ``uintptr_t``'),
- ('float16', '16-bit-precision floating-point number type: sign bit, 5 bits exponent, 10 bits mantissa'),
- ('float32', '32-bit-precision floating-point number type: sign bit, 8 bits exponent, 23 bits mantissa'),
- ('float64', '64-bit precision floating-point number type: sign bit, 11 bits exponent, 52 bits mantissa'),
- ('float96', '96-bit extended-precision floating-point number type'),
- ('float128', '128-bit extended-precision floating-point number type'),
- ('complex64', 'Complex number type composed of 2 32-bit-precision floating-point numbers'),
- ('complex128', 'Complex number type composed of 2 64-bit-precision floating-point numbers'),
- ('complex192', 'Complex number type composed of 2 96-bit extended-precision floating-point numbers'),
- ('complex256', 'Complex number type composed of 2 128-bit extended-precision floating-point numbers'),
- ])
-
-
-def add_newdoc_for_scalar_type(obj, fixed_aliases, doc):
- o = getattr(_numerictypes, obj)
-
- character_code = dtype(o).char
- canonical_name_doc = "" if obj == o.__name__ else "Canonical name: ``np.{}``.\n ".format(obj)
- alias_doc = ''.join("Alias: ``np.{}``.\n ".format(alias) for alias in fixed_aliases)
- alias_doc += ''.join("Alias *on this platform*: ``np.{}``: {}.\n ".format(alias, doc)
- for (alias_type, alias, doc) in possible_aliases if alias_type is o)
-
- docstring = """
- {doc}
- Character code: ``'{character_code}'``.
- {canonical_name_doc}{alias_doc}
- """.format(doc=doc.strip(), character_code=character_code,
- canonical_name_doc=canonical_name_doc, alias_doc=alias_doc)
-
- add_newdoc('numpy.core.numerictypes', obj, docstring)
-
-
-add_newdoc_for_scalar_type('bool_', ['bool8'],
- """
- Boolean type (True or False), stored as a byte.
- """)
-
-add_newdoc_for_scalar_type('byte', [],
- """
- Signed integer type, compatible with C ``char``.
- """)
-
-add_newdoc_for_scalar_type('short', [],
- """
- Signed integer type, compatible with C ``short``.
- """)
-
-add_newdoc_for_scalar_type('intc', [],
- """
- Signed integer type, compatible with C ``int``.
- """)
-
-add_newdoc_for_scalar_type('int_', [],
- """
- Signed integer type, compatible with Python `int` anc C ``long``.
- """)
-
-add_newdoc_for_scalar_type('longlong', [],
- """
- Signed integer type, compatible with C ``long long``.
- """)
-
-add_newdoc_for_scalar_type('ubyte', [],
- """
- Unsigned integer type, compatible with C ``unsigned char``.
- """)
-
-add_newdoc_for_scalar_type('ushort', [],
- """
- Unsigned integer type, compatible with C ``unsigned short``.
- """)
-
-add_newdoc_for_scalar_type('uintc', [],
- """
- Unsigned integer type, compatible with C ``unsigned int``.
- """)
-
-add_newdoc_for_scalar_type('uint', [],
- """
- Unsigned integer type, compatible with C ``unsigned long``.
- """)
-
-add_newdoc_for_scalar_type('ulonglong', [],
- """
- Signed integer type, compatible with C ``unsigned long long``.
- """)
-
-add_newdoc_for_scalar_type('half', [],
- """
- Half-precision floating-point number type.
- """)
-
-add_newdoc_for_scalar_type('single', [],
- """
- Single-precision floating-point number type, compatible with C ``float``.
- """)
-
-add_newdoc_for_scalar_type('double', ['float_'],
- """
- Double-precision floating-point number type, compatible with Python `float`
- and C ``double``.
- """)
-
-add_newdoc_for_scalar_type('longdouble', ['longfloat'],
- """
- Extended-precision floating-point number type, compatible with C
- ``long double`` but not necessarily with IEEE 754 quadruple-precision.
- """)
-
-add_newdoc_for_scalar_type('csingle', ['singlecomplex'],
- """
- Complex number type composed of two single-precision floating-point
- numbers.
- """)
-
-add_newdoc_for_scalar_type('cdouble', ['cfloat', 'complex_'],
- """
- Complex number type composed of two double-precision floating-point
- numbers, compatible with Python `complex`.
- """)
-
-add_newdoc_for_scalar_type('clongdouble', ['clongfloat', 'longcomplex'],
- """
- Complex number type composed of two extended-precision floating-point
- numbers.
- """)
-
-add_newdoc_for_scalar_type('object_', [],
- """
- Any Python object.
- """)
-
-# TODO: work out how to put this on the base class, np.floating
-for float_name in ('half', 'single', 'double', 'longdouble'):
- add_newdoc('numpy.core.numerictypes', float_name, ('as_integer_ratio',
- """
- {ftype}.as_integer_ratio() -> (int, int)
-
- Return a pair of integers, whose ratio is exactly equal to the original
- floating point number, and with a positive denominator.
- Raise OverflowError on infinities and a ValueError on NaNs.
-
- >>> np.{ftype}(10.0).as_integer_ratio()
- (10, 1)
- >>> np.{ftype}(0.0).as_integer_ratio()
- (0, 1)
- >>> np.{ftype}(-.25).as_integer_ratio()
- (-1, 4)
- """.format(ftype=float_name)))
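
Several signatures above gain ``/`` and ``*`` markers (e.g. ``newbyteorder(new_order='S', /)`` and ``a.mean(..., *, where=True)``). These are standard Python syntax for positional-only and keyword-only parameters; a small illustrative sketch (the ``demo`` function is made up for this note):

    # Parameters before '/' are positional-only; parameters after '*' are
    # keyword-only.
    def demo(a, indices, b=None, /, *, where=True):
        return a, indices, b, where

    demo(1, [0], None, where=False)   # OK
    # demo(a=1, indices=[0])          # TypeError: 'a' and 'indices' are positional-only
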
diff --git a/numpy/core/_add_newdocs_scalars.py b/numpy/core/_add_newdocs_scalars.py
new file mode 100644
index 000000000..b9b151224
--- /dev/null
+++ b/numpy/core/_add_newdocs_scalars.py
@@ -0,0 +1,251 @@
+"""
+This file is separate from ``_add_newdocs.py`` so that it can be mocked out by
+our sphinx ``conf.py`` during doc builds, where we want to avoid showing
+platform-dependent information.
+"""
+from numpy.core import dtype
+from numpy.core import numerictypes as _numerictypes
+from numpy.core.function_base import add_newdoc
+
+##############################################################################
+#
+# Documentation for concrete scalar classes
+#
+##############################################################################
+
+def numeric_type_aliases(aliases):
+ def type_aliases_gen():
+ for alias, doc in aliases:
+ try:
+ alias_type = getattr(_numerictypes, alias)
+ except AttributeError:
+ # The set of aliases that actually exist varies between platforms
+ pass
+ else:
+ yield (alias_type, alias, doc)
+ return list(type_aliases_gen())
+
+
+possible_aliases = numeric_type_aliases([
+ ('int8', '8-bit signed integer (``-128`` to ``127``)'),
+ ('int16', '16-bit signed integer (``-32_768`` to ``32_767``)'),
+ ('int32', '32-bit signed integer (``-2_147_483_648`` to ``2_147_483_647``)'),
+ ('int64', '64-bit signed integer (``-9_223_372_036_854_775_808`` to ``9_223_372_036_854_775_807``)'),
+ ('intp', 'Signed integer large enough to fit pointer, compatible with C ``intptr_t``'),
+ ('uint8', '8-bit unsigned integer (``0`` to ``255``)'),
+ ('uint16', '16-bit unsigned integer (``0`` to ``65_535``)'),
+ ('uint32', '32-bit unsigned integer (``0`` to ``4_294_967_295``)'),
+ ('uint64', '64-bit unsigned integer (``0`` to ``18_446_744_073_709_551_615``)'),
+ ('uintp', 'Unsigned integer large enough to fit pointer, compatible with C ``uintptr_t``'),
+ ('float16', '16-bit-precision floating-point number type: sign bit, 5 bits exponent, 10 bits mantissa'),
+ ('float32', '32-bit-precision floating-point number type: sign bit, 8 bits exponent, 23 bits mantissa'),
+ ('float64', '64-bit precision floating-point number type: sign bit, 11 bits exponent, 52 bits mantissa'),
+ ('float96', '96-bit extended-precision floating-point number type'),
+ ('float128', '128-bit extended-precision floating-point number type'),
+ ('complex64', 'Complex number type composed of 2 32-bit-precision floating-point numbers'),
+ ('complex128', 'Complex number type composed of 2 64-bit-precision floating-point numbers'),
+ ('complex192', 'Complex number type composed of 2 96-bit extended-precision floating-point numbers'),
+ ('complex256', 'Complex number type composed of 2 128-bit extended-precision floating-point numbers'),
+ ])
+
+
+def add_newdoc_for_scalar_type(obj, fixed_aliases, doc):
+ # note: `:field: value` is rST syntax which renders as field lists.
+ o = getattr(_numerictypes, obj)
+
+ character_code = dtype(o).char
+ canonical_name_doc = "" if obj == o.__name__ else ":Canonical name: `numpy.{}`\n ".format(obj)
+ alias_doc = ''.join(":Alias: `numpy.{}`\n ".format(alias) for alias in fixed_aliases)
+ alias_doc += ''.join(":Alias on this platform: `numpy.{}`: {}.\n ".format(alias, doc)
+ for (alias_type, alias, doc) in possible_aliases if alias_type is o)
+ docstring = """
+ {doc}
+
+ :Character code: ``'{character_code}'``
+ {canonical_name_doc}{alias_doc}
+ """.format(doc=doc.strip(), character_code=character_code,
+ canonical_name_doc=canonical_name_doc, alias_doc=alias_doc)
+
+ add_newdoc('numpy.core.numerictypes', obj, docstring)
+
+
+add_newdoc_for_scalar_type('bool_', ['bool8'],
+ """
+ Boolean type (True or False), stored as a byte.
+
+ .. warning::
+
+ The :class:`bool_` type is not a subclass of the :class:`int_` type
+ (the :class:`bool_` is not even a number type). This is different
+ from Python's default implementation of :class:`bool` as a
+ subclass of :class:`int`.
+ """)
+
+add_newdoc_for_scalar_type('byte', [],
+ """
+ Signed integer type, compatible with C ``char``.
+ """)
+
+add_newdoc_for_scalar_type('short', [],
+ """
+ Signed integer type, compatible with C ``short``.
+ """)
+
+add_newdoc_for_scalar_type('intc', [],
+ """
+ Signed integer type, compatible with C ``int``.
+ """)
+
+add_newdoc_for_scalar_type('int_', [],
+ """
+ Signed integer type, compatible with Python `int` and C ``long``.
+ """)
+
+add_newdoc_for_scalar_type('longlong', [],
+ """
+ Signed integer type, compatible with C ``long long``.
+ """)
+
+add_newdoc_for_scalar_type('ubyte', [],
+ """
+ Unsigned integer type, compatible with C ``unsigned char``.
+ """)
+
+add_newdoc_for_scalar_type('ushort', [],
+ """
+ Unsigned integer type, compatible with C ``unsigned short``.
+ """)
+
+add_newdoc_for_scalar_type('uintc', [],
+ """
+ Unsigned integer type, compatible with C ``unsigned int``.
+ """)
+
+add_newdoc_for_scalar_type('uint', [],
+ """
+ Unsigned integer type, compatible with C ``unsigned long``.
+ """)
+
+add_newdoc_for_scalar_type('ulonglong', [],
+ """
+ Unsigned integer type, compatible with C ``unsigned long long``.
+ """)
+
+add_newdoc_for_scalar_type('half', [],
+ """
+ Half-precision floating-point number type.
+ """)
+
+add_newdoc_for_scalar_type('single', [],
+ """
+ Single-precision floating-point number type, compatible with C ``float``.
+ """)
+
+add_newdoc_for_scalar_type('double', ['float_'],
+ """
+ Double-precision floating-point number type, compatible with Python `float`
+ and C ``double``.
+ """)
+
+add_newdoc_for_scalar_type('longdouble', ['longfloat'],
+ """
+ Extended-precision floating-point number type, compatible with C
+ ``long double`` but not necessarily with IEEE 754 quadruple-precision.
+ """)
+
+add_newdoc_for_scalar_type('csingle', ['singlecomplex'],
+ """
+ Complex number type composed of two single-precision floating-point
+ numbers.
+ """)
+
+add_newdoc_for_scalar_type('cdouble', ['cfloat', 'complex_'],
+ """
+ Complex number type composed of two double-precision floating-point
+ numbers, compatible with Python `complex`.
+ """)
+
+add_newdoc_for_scalar_type('clongdouble', ['clongfloat', 'longcomplex'],
+ """
+ Complex number type composed of two extended-precision floating-point
+ numbers.
+ """)
+
+add_newdoc_for_scalar_type('object_', [],
+ """
+ Any Python object.
+ """)
+
+add_newdoc_for_scalar_type('str_', ['unicode_'],
+ r"""
+ A unicode string.
+
+ When used in arrays, this type strips trailing null codepoints.
+
+ Unlike the builtin `str`, this supports the :ref:`python:bufferobjects`, exposing its
+ contents as UCS4:
+
+ >>> m = memoryview(np.str_("abc"))
+ >>> m.format
+ '3w'
+ >>> m.tobytes()
+ b'a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00'
+ """)
+
+add_newdoc_for_scalar_type('bytes_', ['string_'],
+ r"""
+ A byte string.
+
+ When used in arrays, this type strips trailing null bytes.
+ """)
+
+add_newdoc_for_scalar_type('void', [],
+ r"""
+ Either an opaque sequence of bytes, or a structure.
+
+ >>> np.void(b'abcd')
+ void(b'\x61\x62\x63\x64')
+
+ Structured `void` scalars can only be constructed via extraction from :ref:`structured_arrays`:
+
+ >>> arr = np.array((1, 2), dtype=[('x', np.int8), ('y', np.int8)])
+ >>> arr[()]
+ (1, 2) # looks like a tuple, but is `np.void`
+ """)
+
+add_newdoc_for_scalar_type('datetime64', [],
+ """
+ A datetime stored as a 64-bit integer, counting from ``1970-01-01T00:00:00``.
+
+ >>> np.datetime64(10, 'Y')
+ numpy.datetime64('1980')
+ >>> np.datetime64(10, 'D')
+ numpy.datetime64('1970-01-11')
+
+ See :ref:`arrays.datetime` for more information.
+ """)
+
+add_newdoc_for_scalar_type('timedelta64', [],
+ """
+ A timedelta stored as a 64-bit integer.
+
+ See :ref:`arrays.datetime` for more information.
+ """)
+
+# TODO: work out how to put this on the base class, np.floating
+for float_name in ('half', 'single', 'double', 'longdouble'):
+ add_newdoc('numpy.core.numerictypes', float_name, ('as_integer_ratio',
+ """
+ {ftype}.as_integer_ratio() -> (int, int)
+
+ Return a pair of integers, whose ratio is exactly equal to the original
+ floating point number, and with a positive denominator.
+ Raise `OverflowError` on infinities and a `ValueError` on NaNs.
+
+ >>> np.{ftype}(10.0).as_integer_ratio()
+ (10, 1)
+ >>> np.{ftype}(0.0).as_integer_ratio()
+ (0, 1)
+ >>> np.{ftype}(-.25).as_integer_ratio()
+ (-1, 4)
+ """.format(ftype=float_name)))
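
The new ``bool_`` docstring warns that it is not an integer subclass. A quick illustration of how it differs from Python's built-in ``bool``:

    import numpy as np

    # Python's bool subclasses int, but NumPy's bool_ does not:
    issubclass(bool, int)               # True
    issubclass(np.bool_, np.integer)    # False
    isinstance(np.bool_(True), int)     # False
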
diff --git a/numpy/core/_asarray.py b/numpy/core/_asarray.py
index df569f22d..a406308f3 100644
--- a/numpy/core/_asarray.py
+++ b/numpy/core/_asarray.py
@@ -3,7 +3,11 @@ Functions in the ``as*array`` family that promote array-likes into arrays.
`require` fits this category despite its name not matching this pattern.
"""
-from .overrides import set_module
+from .overrides import (
+ array_function_dispatch,
+ set_array_function_like_doc,
+ set_module,
+)
from .multiarray import array
@@ -11,8 +15,14 @@ __all__ = [
"asarray", "asanyarray", "ascontiguousarray", "asfortranarray", "require",
]
+
+def _asarray_dispatcher(a, dtype=None, order=None, *, like=None):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def asarray(a, dtype=None, order=None):
+def asarray(a, dtype=None, order=None, *, like=None):
"""Convert the input to an array.
Parameters
@@ -23,10 +33,16 @@ def asarray(a, dtype=None, order=None):
of lists and ndarrays.
dtype : data-type, optional
By default, the data-type is inferred from the input data.
- order : {'C', 'F'}, optional
- Whether to use row-major (C-style) or
- column-major (Fortran-style) memory representation.
+ order : {'C', 'F', 'A', 'K'}, optional
+ Memory layout. 'A' and 'K' depend on the order of input array a.
+ 'C' row-major (C-style),
+ 'F' column-major (Fortran-style) memory representation.
+ 'A' (any) means 'F' if `a` is Fortran contiguous, 'C' otherwise
+ 'K' (keep) preserve input order
Defaults to 'C'.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -80,11 +96,20 @@ def asarray(a, dtype=None, order=None):
True
"""
+ if like is not None:
+ return _asarray_with_like(a, dtype=dtype, order=order, like=like)
+
return array(a, dtype, copy=False, order=order)
+_asarray_with_like = array_function_dispatch(
+ _asarray_dispatcher
+)(asarray)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def asanyarray(a, dtype=None, order=None):
+def asanyarray(a, dtype=None, order=None, *, like=None):
"""Convert the input to an ndarray, but pass ndarray subclasses through.
Parameters
@@ -95,9 +120,16 @@ def asanyarray(a, dtype=None, order=None):
tuples of lists, and ndarrays.
dtype : data-type, optional
By default, the data-type is inferred from the input data.
- order : {'C', 'F'}, optional
- Whether to use row-major (C-style) or column-major
- (Fortran-style) memory representation. Defaults to 'C'.
+ order : {'C', 'F', 'A', 'K'}, optional
+ Memory layout. 'A' and 'K' depend on the order of input array a.
+ 'C' row-major (C-style),
+ 'F' column-major (Fortran-style) memory representation.
+ 'A' (any) means 'F' if `a` is Fortran contiguous, 'C' otherwise
+ 'K' (keep) preserve input order
+ Defaults to 'C'.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -133,11 +165,24 @@ def asanyarray(a, dtype=None, order=None):
True
"""
+ if like is not None:
+ return _asanyarray_with_like(a, dtype=dtype, order=order, like=like)
+
return array(a, dtype, copy=False, order=order, subok=True)
+_asanyarray_with_like = array_function_dispatch(
+ _asarray_dispatcher
+)(asanyarray)
+
+
+def _asarray_contiguous_fortran_dispatcher(a, dtype=None, *, like=None):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def ascontiguousarray(a, dtype=None):
+def ascontiguousarray(a, dtype=None, *, like=None):
"""
Return a contiguous array (ndim >= 1) in memory (C order).
@@ -147,6 +192,9 @@ def ascontiguousarray(a, dtype=None):
Input array.
dtype : str or dtype object, optional
Data-type of returned array.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -174,11 +222,20 @@ def ascontiguousarray(a, dtype=None):
so it will not preserve 0-d arrays.
"""
+ if like is not None:
+ return _ascontiguousarray_with_like(a, dtype=dtype, like=like)
+
return array(a, dtype, copy=False, order='C', ndmin=1)
+_ascontiguousarray_with_like = array_function_dispatch(
+ _asarray_contiguous_fortran_dispatcher
+)(ascontiguousarray)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def asfortranarray(a, dtype=None):
+def asfortranarray(a, dtype=None, *, like=None):
"""
Return an array (ndim >= 1) laid out in Fortran order in memory.
@@ -188,6 +245,9 @@ def asfortranarray(a, dtype=None):
Input array.
dtype : str or dtype object, optional
By default, the data-type is inferred from the input data.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -215,11 +275,24 @@ def asfortranarray(a, dtype=None):
so it will not preserve 0-d arrays.
"""
+ if like is not None:
+ return _asfortranarray_with_like(a, dtype=dtype, like=like)
+
return array(a, dtype, copy=False, order='F', ndmin=1)
+_asfortranarray_with_like = array_function_dispatch(
+ _asarray_contiguous_fortran_dispatcher
+)(asfortranarray)
+
+
+def _require_dispatcher(a, dtype=None, requirements=None, *, like=None):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def require(a, dtype=None, requirements=None):
+def require(a, dtype=None, requirements=None, *, like=None):
"""
Return an ndarray of the provided type that satisfies requirements.
@@ -243,6 +316,9 @@ def require(a, dtype=None, requirements=None):
* 'WRITEABLE' ('W') - ensure a writable array
* 'OWNDATA' ('O') - ensure an array that owns its own data
* 'ENSUREARRAY', ('E') - ensure a base array, instead of a subclass
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -286,6 +362,14 @@ def require(a, dtype=None, requirements=None):
UPDATEIFCOPY : False
"""
+ if like is not None:
+ return _require_with_like(
+ a,
+ dtype=dtype,
+ requirements=requirements,
+ like=like,
+ )
+
possible_flags = {'C': 'C', 'C_CONTIGUOUS': 'C', 'CONTIGUOUS': 'C',
'F': 'F', 'F_CONTIGUOUS': 'F', 'FORTRAN': 'F',
'A': 'A', 'ALIGNED': 'A',
@@ -320,3 +404,8 @@ def require(a, dtype=None, requirements=None):
arr = arr.copy(order)
break
return arr
+
+
+_require_with_like = array_function_dispatch(
+ _require_dispatcher
+)(require)
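
The ``like=`` handling above forwards the call to ``_asarray_with_like``, which dispatches through ``__array_function__`` (NEP 35). A minimal sketch of how a downstream array library could hook into this; the ``LoggingArray`` class is purely illustrative and simply delegates back to NumPy:

    import numpy as np

    class LoggingArray:
        # Toy duck array: creation functions called with like=LoggingArray()
        # are routed here via __array_function__.
        def __array_function__(self, func, types, args, kwargs):
            print('dispatched:', func.__name__)
            kwargs.pop('like', None)        # drop like= before delegating
            return func(*args, **kwargs)    # fall back to the default implementation

    out = np.asarray([1, 2, 3], like=LoggingArray())
    # prints "dispatched: asarray"; a real library would return its own
    # array type here instead of delegating back to NumPy.
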
diff --git a/numpy/core/_asarray.pyi b/numpy/core/_asarray.pyi
new file mode 100644
index 000000000..8c200ba22
--- /dev/null
+++ b/numpy/core/_asarray.pyi
@@ -0,0 +1,77 @@
+import sys
+from typing import TypeVar, Union, Iterable, overload
+
+from numpy import ndarray, _OrderKACF
+from numpy.typing import ArrayLike, DTypeLike
+
+if sys.version_info >= (3, 8):
+ from typing import Literal
+else:
+ from typing_extensions import Literal
+
+_ArrayType = TypeVar("_ArrayType", bound=ndarray)
+
+def asarray(
+ a: object,
+ dtype: DTypeLike = ...,
+ order: _OrderKACF = ...,
+ *,
+ like: ArrayLike = ...
+) -> ndarray: ...
+@overload
+def asanyarray(
+ a: _ArrayType,
+ dtype: None = ...,
+ order: _OrderKACF = ...,
+ *,
+ like: ArrayLike = ...
+) -> _ArrayType: ...
+@overload
+def asanyarray(
+ a: object,
+ dtype: DTypeLike = ...,
+ order: _OrderKACF = ...,
+ *,
+ like: ArrayLike = ...
+) -> ndarray: ...
+def ascontiguousarray(
+ a: object, dtype: DTypeLike = ..., *, like: ArrayLike = ...
+) -> ndarray: ...
+def asfortranarray(
+ a: object, dtype: DTypeLike = ..., *, like: ArrayLike = ...
+) -> ndarray: ...
+
+_Requirements = Literal[
+ "C", "C_CONTIGUOUS", "CONTIGUOUS",
+ "F", "F_CONTIGUOUS", "FORTRAN",
+ "A", "ALIGNED",
+ "W", "WRITEABLE",
+ "O", "OWNDATA"
+]
+_E = Literal["E", "ENSUREARRAY"]
+_RequirementsWithE = Union[_Requirements, _E]
+
+@overload
+def require(
+ a: _ArrayType,
+ dtype: None = ...,
+ requirements: Union[None, _Requirements, Iterable[_Requirements]] = ...,
+ *,
+ like: ArrayLike = ...
+) -> _ArrayType: ...
+@overload
+def require(
+ a: object,
+ dtype: DTypeLike = ...,
+ requirements: Union[_E, Iterable[_RequirementsWithE]] = ...,
+ *,
+ like: ArrayLike = ...
+) -> ndarray: ...
+@overload
+def require(
+ a: object,
+ dtype: DTypeLike = ...,
+ requirements: Union[None, _Requirements, Iterable[_Requirements]] = ...,
+ *,
+ like: ArrayLike = ...
+) -> ndarray: ...
diff --git a/numpy/core/_dtype.py b/numpy/core/_dtype.py
index 50aeeb5bc..4249071ff 100644
--- a/numpy/core/_dtype.py
+++ b/numpy/core/_dtype.py
@@ -176,7 +176,7 @@ def _byte_order_str(dtype):
def _datetime_metadata_str(dtype):
- # TODO: this duplicates the C append_metastr_to_string
+ # TODO: this duplicates the C metastr_to_unicode functionality
unit, count = np.datetime_data(dtype)
if unit == 'generic':
return ''
diff --git a/numpy/core/_exceptions.py b/numpy/core/_exceptions.py
index 99172e23d..5e17ed3b2 100644
--- a/numpy/core/_exceptions.py
+++ b/numpy/core/_exceptions.py
@@ -26,8 +26,6 @@ def _display_as_base(cls):
"""
assert issubclass(cls, Exception)
cls.__name__ = cls.__base__.__name__
- cls.__qualname__ = cls.__base__.__qualname__
- set_module(cls.__base__.__module__)(cls)
return cls
diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py
index 85853622a..449926f58 100644
--- a/numpy/core/_internal.py
+++ b/numpy/core/_internal.py
@@ -17,26 +17,25 @@ except ImportError:
IS_PYPY = platform.python_implementation() == 'PyPy'
-if (sys.byteorder == 'little'):
+if sys.byteorder == 'little':
_nbo = '<'
else:
_nbo = '>'
def _makenames_list(adict, align):
allfields = []
- fnames = list(adict.keys())
- for fname in fnames:
- obj = adict[fname]
+
+ for fname, obj in adict.items():
n = len(obj)
- if not isinstance(obj, tuple) or n not in [2, 3]:
+ if not isinstance(obj, tuple) or n not in (2, 3):
raise ValueError("entry not a 2- or 3- tuple")
- if (n > 2) and (obj[2] == fname):
+ if n > 2 and obj[2] == fname:
continue
num = int(obj[1])
- if (num < 0):
+ if num < 0:
raise ValueError("invalid offset.")
format = dtype(obj[0], align=align)
- if (n > 2):
+ if n > 2:
title = obj[2]
else:
title = None
@@ -68,7 +67,7 @@ def _usefields(adict, align):
res = adict[name]
formats.append(res[0])
offsets.append(res[1])
- if (len(res) > 2):
+ if len(res) > 2:
titles.append(res[2])
else:
titles.append(None)
@@ -108,7 +107,7 @@ def _array_descr(descriptor):
for field in ordered_fields:
if field[1] > offset:
num = field[1] - offset
- result.append(('', '|V%d' % num))
+ result.append(('', f'|V{num}'))
offset += num
elif field[1] < offset:
raise ValueError(
@@ -128,7 +127,7 @@ def _array_descr(descriptor):
if descriptor.itemsize > offset:
num = descriptor.itemsize - offset
- result.append(('', '|V%d' % num))
+ result.append(('', f'|V{num}'))
return result
@@ -191,7 +190,7 @@ def _commastring(astr):
(order1, order2))
order = order1
- if order in ['|', '=', _nbo]:
+ if order in ('|', '=', _nbo):
order = ''
dtype = order + dtype
if (repeats == ''):
@@ -223,7 +222,7 @@ def _getintp_ctype():
val = dummy_ctype(np.intp)
else:
char = dtype('p').char
- if (char == 'i'):
+ if char == 'i':
val = ctypes.c_int
elif char == 'l':
val = ctypes.c_long
@@ -379,7 +378,7 @@ def _newnames(datatype, order):
raise ValueError(f"unknown field name: {name}") from None
seen.add(name)
return tuple(list(order) + nameslist)
- raise ValueError("unsupported order value: %s" % (order,))
+ raise ValueError(f"unsupported order value: {order}")
def _copy_fields(ary):
"""Return copy of structured array with padding between fields removed.
@@ -680,8 +679,7 @@ def __dtype_from_pep3118(stream, is_subdtype):
if not (is_padding and name is None):
if name is not None and name in field_spec['names']:
- raise RuntimeError("Duplicate field name '%s' in PEP3118 format"
- % name)
+ raise RuntimeError(f"Duplicate field name '{name}' in PEP3118 format")
field_spec['names'].append(name)
field_spec['formats'].append(value)
field_spec['offsets'].append(offset)
@@ -717,7 +715,7 @@ def _fix_names(field_spec):
j = 0
while True:
- name = 'f{}'.format(j)
+ name = f'f{j}'
if name not in names:
break
j = j + 1
@@ -790,7 +788,7 @@ def _ufunc_doc_signature_formatter(ufunc):
if ufunc.nin == 1:
in_args = 'x'
else:
- in_args = ', '.join('x{}'.format(i+1) for i in range(ufunc.nin))
+ in_args = ', '.join(f'x{i+1}' for i in range(ufunc.nin))
# output arguments are both keyword or positional
if ufunc.nout == 0:
diff --git a/numpy/core/_methods.py b/numpy/core/_methods.py
index 86ddf4d17..1867ba68c 100644
--- a/numpy/core/_methods.py
+++ b/numpy/core/_methods.py
@@ -4,6 +4,7 @@ and the Python code for the NumPy-namespace function
"""
import warnings
+from contextlib import nullcontext
from numpy.core import multiarray as mu
from numpy.core import umath as um
@@ -11,7 +12,7 @@ from numpy.core._asarray import asanyarray
from numpy.core import numerictypes as nt
from numpy.core import _exceptions
from numpy._globals import _NoValue
-from numpy.compat import pickle, os_fspath, contextlib_nullcontext
+from numpy.compat import pickle, os_fspath
# save those O(100) nanoseconds!
umr_maximum = um.maximum.reduce
@@ -50,20 +51,38 @@ def _prod(a, axis=None, dtype=None, out=None, keepdims=False,
initial=_NoValue, where=True):
return umr_prod(a, axis, dtype, out, keepdims, initial, where)
-def _any(a, axis=None, dtype=None, out=None, keepdims=False):
- return umr_any(a, axis, dtype, out, keepdims)
-
-def _all(a, axis=None, dtype=None, out=None, keepdims=False):
- return umr_all(a, axis, dtype, out, keepdims)
-
-def _count_reduce_items(arr, axis):
- if axis is None:
- axis = tuple(range(arr.ndim))
- if not isinstance(axis, tuple):
- axis = (axis,)
- items = 1
- for ax in axis:
- items *= arr.shape[mu.normalize_axis_index(ax, arr.ndim)]
+def _any(a, axis=None, dtype=None, out=None, keepdims=False, *, where=True):
+ # Parsing keyword arguments is currently fairly slow, so avoid it for now
+ if where is True:
+ return umr_any(a, axis, dtype, out, keepdims)
+ return umr_any(a, axis, dtype, out, keepdims, where=where)
+
+def _all(a, axis=None, dtype=None, out=None, keepdims=False, *, where=True):
+ # Parsing keyword arguments is currently fairly slow, so avoid it for now
+ if where is True:
+ return umr_all(a, axis, dtype, out, keepdims)
+ return umr_all(a, axis, dtype, out, keepdims, where=where)
+
+def _count_reduce_items(arr, axis, keepdims=False, where=True):
+ # fast-path for the default case
+ if where is True:
+ # no boolean mask given, calculate items according to axis
+ if axis is None:
+ axis = tuple(range(arr.ndim))
+ elif not isinstance(axis, tuple):
+ axis = (axis,)
+ items = nt.intp(1)
+ for ax in axis:
+ items *= arr.shape[mu.normalize_axis_index(ax, arr.ndim)]
+ else:
+        # TODO: Optimize case when `where` is broadcast along a non-reduction
+        # axis and the full sum is more work than necessary.
+
+ # guarded to protect circular imports
+ from numpy.lib.stride_tricks import broadcast_to
+ # count True values in (potentially broadcasted) boolean mask
+ items = umr_sum(broadcast_to(where, arr.shape), axis, nt.intp, None,
+ keepdims)
return items
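For reference, the two counting paths above can be reproduced with public NumPy functions; this is only an illustrative sketch (assuming NumPy >= 1.20), not the private helper itself:

    >>> import numpy as np
    >>> arr = np.arange(6.0).reshape(2, 3)
    >>> where = np.array([[True], [False]])
    >>> # default path: every element along the reduced axes is counted
    >>> np.intp(arr.shape[0] * arr.shape[1])
    6
    >>> # where path: only the True entries of the broadcast mask are counted
    >>> np.broadcast_to(where, arr.shape).sum(dtype=np.intp)
    3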
# Numpy 1.17.0, 2019-02-24
@@ -140,13 +159,13 @@ def _clip(a, min=None, max=None, out=None, *, casting=None, **kwargs):
return _clip_dep_invoke_with_casting(
um.clip, a, min, max, out=out, casting=casting, **kwargs)
-def _mean(a, axis=None, dtype=None, out=None, keepdims=False):
+def _mean(a, axis=None, dtype=None, out=None, keepdims=False, *, where=True):
arr = asanyarray(a)
is_float16_result = False
- rcount = _count_reduce_items(arr, axis)
- # Make this warning show up first
- if rcount == 0:
+
+ rcount = _count_reduce_items(arr, axis, keepdims=keepdims, where=where)
+ if rcount == 0 if where is True else umr_any(rcount == 0):
warnings.warn("Mean of empty slice.", RuntimeWarning, stacklevel=2)
# Cast bool, unsigned int, and int to float64 by default
@@ -157,7 +176,7 @@ def _mean(a, axis=None, dtype=None, out=None, keepdims=False):
dtype = mu.dtype('f4')
is_float16_result = True
- ret = umr_sum(arr, axis, dtype, out, keepdims)
+ ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
if isinstance(ret, mu.ndarray):
ret = um.true_divide(
ret, rcount, out=ret, casting='unsafe', subok=False)
@@ -173,12 +192,13 @@ def _mean(a, axis=None, dtype=None, out=None, keepdims=False):
return ret
-def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
+def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, *,
+ where=True):
arr = asanyarray(a)
- rcount = _count_reduce_items(arr, axis)
+ rcount = _count_reduce_items(arr, axis, keepdims=keepdims, where=where)
# Make this warning show up on top.
- if ddof >= rcount:
+ if ddof >= rcount if where is True else umr_any(ddof >= rcount):
warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning,
stacklevel=2)
@@ -189,10 +209,18 @@ def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
# Compute the mean.
# Note that if dtype is not of inexact type then arraymean will
# not be either.
- arrmean = umr_sum(arr, axis, dtype, keepdims=True)
+ arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
+    # The shape of rcount has to match arrmean so that broadcasting does not
+    # change the shape of out. Otherwise, the result cannot be stored back into arrmean.
+ if rcount.ndim == 0:
+ # fast-path for default case when where is True
+ div = rcount
+ else:
+ # matching rcount to arrmean when where is specified as array
+ div = rcount.reshape(arrmean.shape)
if isinstance(arrmean, mu.ndarray):
- arrmean = um.true_divide(
- arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
+ arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
+ subok=False)
else:
arrmean = arrmean.dtype.type(arrmean / rcount)
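The reshape above is needed because, for an array-valued ``where``, the per-slice counts come back without the reduced axis while ``arrmean`` is computed with ``keepdims=True``. A minimal sketch with public functions (assuming NumPy >= 1.20) showing the shape mismatch and the fix:

    >>> import numpy as np
    >>> arr = np.arange(6.0).reshape(2, 3)
    >>> where = np.array([[True, False, True], [True, True, True]])
    >>> arrmean = np.sum(arr, axis=1, keepdims=True, where=where)
    >>> rcount = np.broadcast_to(where, arr.shape).sum(axis=1, dtype=np.intp)
    >>> arrmean.shape, rcount.shape
    ((2, 1), (2,))
    >>> arrmean / rcount.reshape(arrmean.shape)  # shapes now broadcast cleanly
    array([[1.],
           [4.]])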
@@ -213,10 +241,10 @@ def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
else:
x = um.multiply(x, um.conjugate(x), out=x).real
- ret = umr_sum(x, axis, dtype, out, keepdims)
+ ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
# Compute degrees of freedom and make sure it is not negative.
- rcount = max([rcount - ddof, 0])
+ rcount = um.maximum(rcount - ddof, 0)
# divide by degrees of freedom
if isinstance(ret, mu.ndarray):
@@ -229,9 +257,10 @@ def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
return ret
-def _std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
+def _std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, *,
+ where=True):
ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
- keepdims=keepdims)
+ keepdims=keepdims, where=where)
if isinstance(ret, mu.ndarray):
ret = um.sqrt(ret, out=ret)
@@ -251,7 +280,7 @@ def _ptp(a, axis=None, out=None, keepdims=False):
def _dump(self, file, protocol=2):
if hasattr(file, 'write'):
- ctx = contextlib_nullcontext(file)
+ ctx = nullcontext(file)
else:
ctx = open(os_fspath(file), "wb")
with ctx as f:
diff --git a/numpy/core/_type_aliases.pyi b/numpy/core/_type_aliases.pyi
new file mode 100644
index 000000000..6a1099cd3
--- /dev/null
+++ b/numpy/core/_type_aliases.pyi
@@ -0,0 +1,19 @@
+import sys
+from typing import Dict, Union, Type, List
+
+from numpy import generic, signedinteger, unsignedinteger, floating, complexfloating
+
+if sys.version_info >= (3, 8):
+ from typing import TypedDict
+else:
+ from typing_extensions import TypedDict
+
+class _SCTypes(TypedDict):
+ int: List[Type[signedinteger]]
+ uint: List[Type[unsignedinteger]]
+ float: List[Type[floating]]
+ complex: List[Type[complexfloating]]
+ others: List[type]
+
+sctypeDict: Dict[Union[int, str], Type[generic]]
+sctypes: _SCTypes
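For context, the runtime objects these stubs describe look like the following (the exact members of ``sctypes`` vary by platform):

    >>> import numpy as np
    >>> sorted(np.sctypes.keys())
    ['complex', 'float', 'int', 'others', 'uint']
    >>> np.sctypeDict['float64'] is np.float64
    True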
diff --git a/numpy/core/_ufunc_config.pyi b/numpy/core/_ufunc_config.pyi
new file mode 100644
index 000000000..e90f1c510
--- /dev/null
+++ b/numpy/core/_ufunc_config.pyi
@@ -0,0 +1,43 @@
+import sys
+from typing import Optional, Union, Callable, Any
+
+if sys.version_info >= (3, 8):
+ from typing import Literal, Protocol, TypedDict
+else:
+ from typing_extensions import Literal, Protocol, TypedDict
+
+_ErrKind = Literal["ignore", "warn", "raise", "call", "print", "log"]
+_ErrFunc = Callable[[str, int], Any]
+
+class _SupportsWrite(Protocol):
+ def write(self, __msg: str) -> Any: ...
+
+class _ErrDict(TypedDict):
+ divide: _ErrKind
+ over: _ErrKind
+ under: _ErrKind
+ invalid: _ErrKind
+
+class _ErrDictOptional(TypedDict, total=False):
+ all: Optional[_ErrKind]
+ divide: Optional[_ErrKind]
+ over: Optional[_ErrKind]
+ under: Optional[_ErrKind]
+ invalid: Optional[_ErrKind]
+
+def seterr(
+ all: Optional[_ErrKind] = ...,
+ divide: Optional[_ErrKind] = ...,
+ over: Optional[_ErrKind] = ...,
+ under: Optional[_ErrKind] = ...,
+ invalid: Optional[_ErrKind] = ...,
+) -> _ErrDict: ...
+def geterr() -> _ErrDict: ...
+def setbufsize(size: int) -> int: ...
+def getbufsize() -> int: ...
+def seterrcall(
+ func: Union[None, _ErrFunc, _SupportsWrite]
+) -> Union[None, _ErrFunc, _SupportsWrite]: ...
+def geterrcall() -> Union[None, _ErrFunc, _SupportsWrite]: ...
+
+# See `numpy/__init__.pyi` for the `errstate` class
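A short usage example of the functions annotated above, matching the ``_ErrKind``/``_ErrDict`` shapes (values chosen for illustration):

    >>> import numpy as np
    >>> old = np.seterr(divide='ignore', invalid='raise')  # returns the previous settings dict
    >>> np.geterr()['divide'], np.geterr()['invalid']
    ('ignore', 'raise')
    >>> _ = np.seterr(**old)  # restore the previous state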
diff --git a/numpy/core/arrayprint.py b/numpy/core/arrayprint.py
index 5d9642ea8..ad1530419 100644
--- a/numpy/core/arrayprint.py
+++ b/numpy/core/arrayprint.py
@@ -1628,6 +1628,3 @@ def set_string_function(f, repr=True):
return multiarray.set_string_function(_default_array_str, 0)
else:
return multiarray.set_string_function(f, repr)
-
-set_string_function(_default_array_str, False)
-set_string_function(_default_array_repr, True)
diff --git a/numpy/core/code_generators/cversions.txt b/numpy/core/code_generators/cversions.txt
index 1868610f4..2d3a65391 100644
--- a/numpy/core/code_generators/cversions.txt
+++ b/numpy/core/code_generators/cversions.txt
@@ -50,8 +50,9 @@
# Version 13 (NumPy 1.17) No change.
# Version 13 (NumPy 1.18) No change.
# Version 13 (NumPy 1.19) No change.
-# Version 13 (NumPy 1.20) No change.
0x0000000d = 5b0e8bbded00b166125974fc71e80a33
-# Version 14 (NumPy 1.19) DType related API additions
+# Version 14 (NumPy 1.20)
+# DType related API additions.
+# A new field was added to the end of PyArrayObject_fields.
0x0000000e = 17a0f366e55ec05e5c5c149123478452
diff --git a/numpy/core/code_generators/genapi.py b/numpy/core/code_generators/genapi.py
index 856db0410..ca6a22828 100644
--- a/numpy/core/code_generators/genapi.py
+++ b/numpy/core/code_generators/genapi.py
@@ -26,6 +26,7 @@ API_FILES = [join('multiarray', 'alloc.c'),
join('multiarray', 'array_assign_array.c'),
join('multiarray', 'array_assign_scalar.c'),
join('multiarray', 'array_coercion.c'),
+ join('multiarray', 'array_method.c'),
join('multiarray', 'arrayobject.c'),
join('multiarray', 'arraytypes.c.src'),
join('multiarray', 'buffer.c'),
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 2ce2fdb55..cb1147b93 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -8,12 +8,12 @@ sys.path.insert(0, os.path.dirname(__file__))
import ufunc_docstrings as docstrings
sys.path.pop(0)
-Zero = "PyInt_FromLong(0)"
-One = "PyInt_FromLong(1)"
+Zero = "PyLong_FromLong(0)"
+One = "PyLong_FromLong(1)"
True_ = "(Py_INCREF(Py_True), Py_True)"
False_ = "(Py_INCREF(Py_False), Py_False)"
None_ = object()
-AllOnes = "PyInt_FromLong(-1)"
+AllOnes = "PyLong_FromLong(-1)"
MinusInfinity = 'PyFloat_FromDouble(-NPY_INFINITY)'
ReorderableNone = "(Py_INCREF(Py_None), Py_None)"
@@ -48,8 +48,11 @@ class TypeDescription:
simd: list
        Available SIMD ufunc loops, dispatched at runtime in the specified order
        Currently only supported for simple types (see make_arrays)
+    dispatch: list
+        Available dispatch-able SIMD loops (names of '.dispatch.c*' sources),
+        resolved at runtime in the specified order
+        Currently only supported for simple types (see make_arrays)
"""
- def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None):
+ def __init__(self, type, f=None, in_=None, out=None, astype=None, simd=None, dispatch=None):
self.type = type
self.func_data = f
if astype is None:
@@ -62,6 +65,7 @@ class TypeDescription:
out = out.replace('P', type)
self.out = out
self.simd = simd
+ self.dispatch = dispatch
def finish_signature(self, nin, nout):
if self.in_ is None:
@@ -86,7 +90,7 @@ def build_func_data(types, f):
func_data = [_fdata_map.get(t, '%s') % (f,) for t in types]
return func_data
-def TD(types, f=None, astype=None, in_=None, out=None, simd=None):
+def TD(types, f=None, astype=None, in_=None, out=None, simd=None, dispatch=None):
if f is not None:
if isinstance(f, str):
func_data = build_func_data(types, f)
@@ -115,7 +119,14 @@ def TD(types, f=None, astype=None, in_=None, out=None, simd=None):
simdt = [k for k, v in simd if t in v]
else:
simdt = []
- tds.append(TypeDescription(t, f=fd, in_=i, out=o, astype=astype, simd=simdt))
+ # [(dispatch file name without extension '.dispatch.c*', list of types)]
+ if dispatch:
+ dispt = [k for k, v in dispatch if t in v]
+ else:
+ dispt = []
+ tds.append(TypeDescription(
+ t, f=fd, in_=i, out=o, astype=astype, simd=simdt, dispatch=dispt
+ ))
return tds
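The ``dispatch`` filtering mirrors the existing ``simd`` handling: each entry pairs a dispatch-able source name with the type characters it covers, and only the matching names are attached to the ``TypeDescription``. A stand-alone sketch of that selection (``select_dispatch`` is a hypothetical helper written here for illustration):

    # entries: (source file name without the '.dispatch.c*' extension, type chars)
    dispatch = [('loops_unary_fp', 'fd')]

    def select_dispatch(type_char, dispatch):
        # keep only the dispatch sources that list this type character
        return [name for name, chars in dispatch if type_char in chars]

    print(select_dispatch('f', dispatch))  # ['loops_unary_fp']
    print(select_dispatch('D', dispatch))  # []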
class Ufunc:
@@ -341,14 +352,14 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.square'),
None,
- TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f', 'FDfd')]),
+ TD(ints+inexact, simd=[('avx2', ints), ('avx512f', 'FD')], dispatch=[('loops_unary_fp', 'fd')]),
TD(O, f='Py_square'),
),
'reciprocal':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.reciprocal'),
None,
- TD(ints+inexact, simd=[('avx2', ints), ('fma', 'fd'), ('avx512f','fd')]),
+ TD(ints+inexact, simd=[('avx2', ints)], dispatch=[('loops_unary_fp', 'fd')]),
TD(O, f='Py_reciprocal'),
),
# This is no longer used as numpy.ones_like, however it is
@@ -378,7 +389,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.absolute'),
'PyUFunc_AbsoluteTypeResolver',
- TD(bints+flts+timedeltaonly, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+ TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd')]),
TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')),
TD(O, f='PyNumber_Absolute'),
),
@@ -726,6 +737,7 @@ defdict = {
None,
TD('e', f='log', astype={'e':'f'}),
TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
+ TD('d', simd=[('avx512f', 'd')]),
TD('fdg' + cmplx, f='log'),
TD(P, f='log'),
),
@@ -755,7 +767,7 @@ defdict = {
docstrings.get('numpy.core.umath.sqrt'),
None,
TD('e', f='sqrt', astype={'e':'f'}),
- TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+ TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
TD('fdg' + cmplx, f='sqrt'),
TD(P, f='sqrt'),
),
@@ -1024,6 +1036,16 @@ def make_arrays(funcdict):
ISA=vt.upper(), isa=vt,
fname=name, type=tname, idx=k
))
+ if t.dispatch is not None:
+ for dname in t.dispatch:
+ code2list.append(textwrap.dedent("""\
+ #ifndef NPY_DISABLE_OPTIMIZATION
+ #include "{dname}.dispatch.h"
+ #endif
+ NPY_CPU_DISPATCH_CALL_XB({name}_functions[{k}] = {tname}_{name});
+ """).format(
+ dname=dname, name=name, tname=tname, k=k
+ ))
else:
funclist.append('NULL')
try:
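Rendered, the template above expands into a small C registration block per dispatched inner loop. The sketch below only formats the same string in isolation, with an illustrative name/type/index rather than values taken from a real build:

    import textwrap

    snippet = textwrap.dedent("""\
        #ifndef NPY_DISABLE_OPTIMIZATION
        #include "{dname}.dispatch.h"
        #endif
        NPY_CPU_DISPATCH_CALL_XB({name}_functions[{k}] = {tname}_{name});
        """).format(dname='loops_unary_fp', name='sqrt', tname='FLOAT', k=0)
    print(snippet)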
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index 82cd6fb27..b7edd2834 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -107,6 +107,13 @@ add_newdoc('numpy.core.umath', 'absolute',
>>> plt.imshow(np.abs(xx), extent=[-10, 10, -10, 10], cmap='gray')
>>> plt.show()
+ The `abs` function can be used as a shorthand for ``np.absolute`` on
+ ndarrays.
+
+ >>> x = np.array([-1.2, 1.2])
+ >>> abs(x)
+ array([1.2, 1.2])
+
""")
add_newdoc('numpy.core.umath', 'add',
@@ -141,6 +148,14 @@ add_newdoc('numpy.core.umath', 'add',
[ 3., 5., 7.],
[ 6., 8., 10.]])
+ The ``+`` operator can be used as a shorthand for ``np.add`` on ndarrays.
+
+ >>> x1 = np.arange(9.0).reshape((3, 3))
+ >>> x2 = np.arange(3.0)
+ >>> x1 + x2
+ array([[ 0., 2., 4.],
+ [ 3., 5., 7.],
+ [ 6., 8., 10.]])
""")
add_newdoc('numpy.core.umath', 'arccos',
@@ -608,6 +623,14 @@ add_newdoc('numpy.core.umath', 'bitwise_and',
>>> np.bitwise_and([True, True], [False, True])
array([False, True])
+ The ``&`` operator can be used as a shorthand for ``np.bitwise_and`` on
+ ndarrays.
+
+ >>> x1 = np.array([2, 5, 255])
+ >>> x2 = np.array([3, 14, 16])
+ >>> x1 & x2
+ array([ 2, 4, 16])
+
""")
add_newdoc('numpy.core.umath', 'bitwise_or',
@@ -667,6 +690,14 @@ add_newdoc('numpy.core.umath', 'bitwise_or',
>>> np.bitwise_or([True, True], [False, True])
array([ True, True])
+ The ``|`` operator can be used as a shorthand for ``np.bitwise_or`` on
+ ndarrays.
+
+ >>> x1 = np.array([2, 5, 255])
+ >>> x2 = np.array([4, 4, 4])
+ >>> x1 | x2
+ array([ 6, 5, 255])
+
""")
add_newdoc('numpy.core.umath', 'bitwise_xor',
@@ -719,6 +750,14 @@ add_newdoc('numpy.core.umath', 'bitwise_xor',
>>> np.bitwise_xor([True, True], [False, True])
array([ True, False])
+ The ``^`` operator can be used as a shorthand for ``np.bitwise_xor`` on
+ ndarrays.
+
+ >>> x1 = np.array([True, True])
+ >>> x2 = np.array([False, True])
+ >>> x1 ^ x2
+ array([ True, False])
+
""")
add_newdoc('numpy.core.umath', 'ceil',
@@ -1088,6 +1127,16 @@ add_newdoc('numpy.core.umath', 'divide',
>>> np.divide(1, 0)
0
+ The ``/`` operator can be used as a shorthand for ``np.divide`` on
+ ndarrays.
+
+ >>> x1 = np.arange(9.0).reshape((3, 3))
+ >>> x2 = 2 * np.ones(3)
+ >>> x1 / x2
+ array([[0. , 0.5, 1. ],
+ [1.5, 2. , 2.5],
+ [3. , 3.5, 4. ]])
+
""")
add_newdoc('numpy.core.umath', 'equal',
@@ -1123,6 +1172,14 @@ add_newdoc('numpy.core.umath', 'equal',
>>> np.equal(1, np.ones(1))
array([ True])
+ The ``==`` operator can be used as a shorthand for ``np.equal`` on
+ ndarrays.
+
+ >>> a = np.array([2, 4, 6])
+ >>> b = np.array([2, 4, 2])
+ >>> a == b
+ array([ True, True, False])
+
""")
add_newdoc('numpy.core.umath', 'exp',
@@ -1370,6 +1427,13 @@ add_newdoc('numpy.core.umath', 'floor_divide',
>>> np.floor_divide([1., 2., 3., 4.], 2.5)
array([ 0., 0., 1., 1.])
+ The ``//`` operator can be used as a shorthand for ``np.floor_divide``
+ on ndarrays.
+
+ >>> x1 = np.array([1., 2., 3., 4.])
+ >>> x1 // 2.5
+ array([0., 0., 1., 1.])
+
""")
add_newdoc('numpy.core.umath', 'fmod',
@@ -1458,10 +1522,11 @@ add_newdoc('numpy.core.umath', 'greater',
>>> np.greater([4,2],[2,2])
array([ True, False])
- If the inputs are ndarrays, then np.greater is equivalent to '>'.
+ The ``>`` operator can be used as a shorthand for ``np.greater`` on
+ ndarrays.
- >>> a = np.array([4,2])
- >>> b = np.array([2,2])
+ >>> a = np.array([4, 2])
+ >>> b = np.array([2, 2])
>>> a > b
array([ True, False])
@@ -1494,6 +1559,14 @@ add_newdoc('numpy.core.umath', 'greater_equal',
>>> np.greater_equal([4, 2, 1], [2, 2, 2])
array([ True, True, False])
+ The ``>=`` operator can be used as a shorthand for ``np.greater_equal``
+ on ndarrays.
+
+ >>> a = np.array([4, 2, 1])
+ >>> b = np.array([2, 2, 2])
+ >>> a >= b
+ array([ True, True, False])
+
""")
add_newdoc('numpy.core.umath', 'hypot',
@@ -1612,6 +1685,13 @@ add_newdoc('numpy.core.umath', 'invert',
>>> np.invert(np.array([True, False]))
array([False, True])
+ The ``~`` operator can be used as a shorthand for ``np.invert`` on
+ ndarrays.
+
+ >>> x1 = np.array([True, False])
+ >>> ~x1
+ array([False, True])
+
""")
add_newdoc('numpy.core.umath', 'isfinite',
@@ -1846,6 +1926,14 @@ add_newdoc('numpy.core.umath', 'left_shift',
>>> print(b, type(b))
254 <class 'numpy.uint8'>
+ The ``<<`` operator can be used as a shorthand for ``np.left_shift`` on
+ ndarrays.
+
+ >>> x1 = 5
+ >>> x2 = np.array([1, 2, 3])
+ >>> x1 << x2
+ array([10, 20, 40])
+
""")
add_newdoc('numpy.core.umath', 'less',
@@ -1875,11 +1963,18 @@ add_newdoc('numpy.core.umath', 'less',
>>> np.less([1, 2], [2, 2])
array([ True, False])
+ The ``<`` operator can be used as a shorthand for ``np.less`` on ndarrays.
+
+ >>> a = np.array([1, 2])
+ >>> b = np.array([2, 2])
+ >>> a < b
+ array([ True, False])
+
""")
add_newdoc('numpy.core.umath', 'less_equal',
"""
- Return the truth value of (x1 =< x2) element-wise.
+ Return the truth value of (x1 <= x2) element-wise.
Parameters
----------
@@ -1904,6 +1999,14 @@ add_newdoc('numpy.core.umath', 'less_equal',
>>> np.less_equal([4, 2, 1], [2, 2, 2])
array([False, True, True])
+ The ``<=`` operator can be used as a shorthand for ``np.less_equal`` on
+ ndarrays.
+
+ >>> a = np.array([4, 2, 1])
+ >>> b = np.array([2, 2, 2])
+ >>> a <= b
+ array([False, True, True])
+
""")
add_newdoc('numpy.core.umath', 'log',
@@ -2231,6 +2334,15 @@ add_newdoc('numpy.core.umath', 'logical_and',
>>> np.logical_and(x>1, x<4)
array([False, False, True, True, False])
+
+ The ``&`` operator can be used as a shorthand for ``np.logical_and`` on
+ boolean ndarrays.
+
+ >>> a = np.array([True, False])
+ >>> b = np.array([False, False])
+ >>> a & b
+ array([False, False])
+
""")
add_newdoc('numpy.core.umath', 'logical_not',
@@ -2301,6 +2413,14 @@ add_newdoc('numpy.core.umath', 'logical_or',
>>> np.logical_or(x < 1, x > 3)
array([ True, False, False, False, True])
+ The ``|`` operator can be used as a shorthand for ``np.logical_or`` on
+ boolean ndarrays.
+
+ >>> a = np.array([True, False])
+ >>> b = np.array([False, False])
+ >>> a | b
+ array([ True, False])
+
""")
add_newdoc('numpy.core.umath', 'logical_xor',
@@ -2646,8 +2766,8 @@ add_newdoc('numpy.core.umath', 'matmul',
Raises
------
ValueError
- If the last dimension of `a` is not the same size as
- the second-to-last dimension of `b`.
+ If the last dimension of `x1` is not the same size as
+ the second-to-last dimension of `x2`.
If a scalar value is passed in.
@@ -2738,6 +2858,14 @@ add_newdoc('numpy.core.umath', 'matmul',
...
ValueError: matmul: Input operand 1 does not have enough dimensions ...
+ The ``@`` operator can be used as a shorthand for ``np.matmul`` on
+ ndarrays.
+
+ >>> x1 = np.array([2j, 3j])
+ >>> x2 = np.array([2j, 3j])
+ >>> x1 @ x2
+ (-13+0j)
+
.. versionadded:: 1.10.0
""")
@@ -2814,6 +2942,16 @@ add_newdoc('numpy.core.umath', 'multiply',
[ 0., 4., 10.],
[ 0., 7., 16.]])
+ The ``*`` operator can be used as a shorthand for ``np.multiply`` on
+ ndarrays.
+
+ >>> x1 = np.arange(9.0).reshape((3, 3))
+ >>> x2 = np.arange(3.0)
+ >>> x1 * x2
+ array([[ 0., 1., 4.],
+ [ 0., 4., 10.],
+ [ 0., 7., 16.]])
+
""")
add_newdoc('numpy.core.umath', 'negative',
@@ -2837,6 +2975,13 @@ add_newdoc('numpy.core.umath', 'negative',
>>> np.negative([1.,-1.])
array([-1., 1.])
+ The unary ``-`` operator can be used as a shorthand for ``np.negative`` on
+ ndarrays.
+
+ >>> x1 = np.array(([1., -1.]))
+ >>> -x1
+ array([-1., 1.])
+
""")
add_newdoc('numpy.core.umath', 'positive',
@@ -2861,6 +3006,20 @@ add_newdoc('numpy.core.umath', 'positive',
Equivalent to `x.copy()`, but only defined for types that support
arithmetic.
+ Examples
+ --------
+
+ >>> x1 = np.array(([1., -1.]))
+ >>> np.positive(x1)
+ array([ 1., -1.])
+
+ The unary ``+`` operator can be used as a shorthand for ``np.positive`` on
+ ndarrays.
+
+ >>> x1 = np.array(([1., -1.]))
+ >>> +x1
+ array([ 1., -1.])
+
""")
add_newdoc('numpy.core.umath', 'not_equal',
@@ -2893,6 +3052,15 @@ add_newdoc('numpy.core.umath', 'not_equal',
array([[False, True],
[False, True]])
+ The ``!=`` operator can be used as a shorthand for ``np.not_equal`` on
+ ndarrays.
+
+ >>> a = np.array([1., 2.])
+ >>> b = np.array([1., 3.])
+ >>> a != b
+ array([False, True])
+
+
""")
add_newdoc('numpy.core.umath', '_ones_like',
@@ -2936,9 +3104,9 @@ add_newdoc('numpy.core.umath', 'power',
Examples
--------
- Cube each element in a list.
+ Cube each element in an array.
- >>> x1 = range(6)
+ >>> x1 = np.arange(6)
>>> x1
    array([0, 1, 2, 3, 4, 5])
>>> np.power(x1, 3)
@@ -2960,6 +3128,14 @@ add_newdoc('numpy.core.umath', 'power',
array([[ 0, 1, 8, 27, 16, 5],
[ 0, 1, 8, 27, 16, 5]])
+ The ``**`` operator can be used as a shorthand for ``np.power`` on
+ ndarrays.
+
+ >>> x2 = np.array([1, 2, 3, 3, 2, 1])
+ >>> x1 = np.arange(6)
+ >>> x1 ** x2
+ array([ 0, 1, 8, 27, 16, 5])
+
""")
add_newdoc('numpy.core.umath', 'float_power',
@@ -3183,6 +3359,13 @@ add_newdoc('numpy.core.umath', 'remainder',
>>> np.remainder(np.arange(7), 5)
array([0, 1, 2, 3, 4, 0, 1])
+ The ``%`` operator can be used as a shorthand for ``np.remainder`` on
+ ndarrays.
+
+ >>> x1 = np.arange(7)
+ >>> x1 % 5
+ array([0, 1, 2, 3, 4, 0, 1])
+
""")
add_newdoc('numpy.core.umath', 'divmod',
@@ -3225,6 +3408,13 @@ add_newdoc('numpy.core.umath', 'divmod',
>>> np.divmod(np.arange(5), 3)
(array([0, 0, 0, 1, 1]), array([0, 1, 2, 0, 1]))
+ The `divmod` function can be used as a shorthand for ``np.divmod`` on
+ ndarrays.
+
+ >>> x = np.arange(5)
+ >>> divmod(x, 3)
+ (array([0, 0, 0, 1, 1]), array([0, 1, 2, 0, 1]))
+
""")
add_newdoc('numpy.core.umath', 'right_shift',
@@ -3268,6 +3458,14 @@ add_newdoc('numpy.core.umath', 'right_shift',
>>> np.right_shift(10, [1,2,3])
array([5, 2, 1])
+ The ``>>`` operator can be used as a shorthand for ``np.right_shift`` on
+ ndarrays.
+
+ >>> x1 = 10
+ >>> x2 = np.array([1,2,3])
+ >>> x1 >> x2
+ array([5, 2, 1])
+
""")
add_newdoc('numpy.core.umath', 'rint',
@@ -3709,6 +3907,16 @@ add_newdoc('numpy.core.umath', 'subtract',
[ 3., 3., 3.],
[ 6., 6., 6.]])
+ The ``-`` operator can be used as a shorthand for ``np.subtract`` on
+ ndarrays.
+
+ >>> x1 = np.arange(9.0).reshape((3, 3))
+ >>> x2 = np.arange(3.0)
+ >>> x1 - x2
+ array([[0., 0., 0.],
+ [3., 3., 3.],
+ [6., 6., 6.]])
+
""")
add_newdoc('numpy.core.umath', 'tan',
@@ -3851,6 +4059,14 @@ add_newdoc('numpy.core.umath', 'true_divide',
>>> x//4
array([0, 0, 0, 0, 1])
+
+ The ``/`` operator can be used as a shorthand for ``np.true_divide`` on
+ ndarrays.
+
+ >>> x = np.arange(5)
+ >>> x / 4
+ array([0. , 0.25, 0.5 , 0.75, 1. ])
+
""")
add_newdoc('numpy.core.umath', 'frexp',
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
index 1d447b86a..9d7b54a1a 100644
--- a/numpy/core/defchararray.py
+++ b/numpy/core/defchararray.py
@@ -114,8 +114,8 @@ def equal(x1, x2):
Returns
-------
- out : ndarray or bool
- Output array of bools, or a single bool if x1 and x2 are scalars.
+ out : ndarray
+ Output array of bools.
See Also
--------
@@ -140,8 +140,8 @@ def not_equal(x1, x2):
Returns
-------
- out : ndarray or bool
- Output array of bools, or a single bool if x1 and x2 are scalars.
+ out : ndarray
+ Output array of bools.
See Also
--------
@@ -167,8 +167,8 @@ def greater_equal(x1, x2):
Returns
-------
- out : ndarray or bool
- Output array of bools, or a single bool if x1 and x2 are scalars.
+ out : ndarray
+ Output array of bools.
See Also
--------
@@ -193,8 +193,8 @@ def less_equal(x1, x2):
Returns
-------
- out : ndarray or bool
- Output array of bools, or a single bool if x1 and x2 are scalars.
+ out : ndarray
+ Output array of bools.
See Also
--------
@@ -219,8 +219,8 @@ def greater(x1, x2):
Returns
-------
- out : ndarray or bool
- Output array of bools, or a single bool if x1 and x2 are scalars.
+ out : ndarray
+ Output array of bools.
See Also
--------
@@ -245,8 +245,8 @@ def less(x1, x2):
Returns
-------
- out : ndarray or bool
- Output array of bools, or a single bool if x1 and x2 are scalars.
+ out : ndarray
+ Output array of bools.
See Also
--------
diff --git a/numpy/core/einsumfunc.py b/numpy/core/einsumfunc.py
index f65f4015c..e0942beca 100644
--- a/numpy/core/einsumfunc.py
+++ b/numpy/core/einsumfunc.py
@@ -1062,6 +1062,17 @@ def einsum(*operands, out=None, optimize=False, **kwargs):
--------
einsum_path, dot, inner, outer, tensordot, linalg.multi_dot
+ einops:
+        a similar, verbose interface is provided by the
+        `einops <https://github.com/arogozhnikov/einops>`_ package to cover
+ additional operations: transpose, reshape/flatten, repeat/tile,
+ squeeze/unsqueeze and reductions.
+
+ opt_einsum:
+ `opt_einsum <https://optimized-einsum.readthedocs.io/en/stable/>`_
+ optimizes contraction order for einsum-like expressions
+        in a backend-agnostic manner.
+
Notes
-----
.. versionadded:: 1.6.0
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index f8c11c015..efb052bc2 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -460,7 +460,7 @@ def repeat(a, repeats, axis=None):
--------
tile : Tile an array.
unique : Find the unique elements of an array.
-
+
Examples
--------
>>> np.repeat(3, 4)
@@ -1375,7 +1375,7 @@ def resize(a, new_shape):
reshaped_array : ndarray
The new array is formed from the data in the old array, repeated
if necessary to fill out the required number of elements. The
- data are repeated in the order that they are stored in memory.
+ data are repeated iterating over the array in C-order.
See Also
--------
@@ -1392,11 +1392,11 @@ def resize(a, new_shape):
Warning: This functionality does **not** consider axes separately,
i.e. it does not apply interpolation/extrapolation.
- It fills the return array with the required number of elements, taken
- from `a` as they are laid out in memory, disregarding strides and axes.
- (This is in case the new shape is smaller. For larger, see above.)
- This functionality is therefore not suitable to resize images,
- or data where each axis represents a separate and distinct entity.
+ It fills the return array with the required number of elements, iterating
+ over `a` in C-order, disregarding axes (and cycling back from the start if
+ the new shape is larger). This functionality is therefore not suitable to
+ resize images, or data where each axis represents a separate and distinct
+ entity.
Examples
--------
@@ -1840,11 +1840,11 @@ def nonzero(a):
.. note::
When called on a zero-d array or scalar, ``nonzero(a)`` is treated
- as ``nonzero(atleast1d(a))``.
+ as ``nonzero(atleast_1d(a))``.
.. deprecated:: 1.17.0
- Use `atleast1d` explicitly if this behavior is deliberate.
+ Use `atleast_1d` explicitly if this behavior is deliberate.
Parameters
----------
@@ -1941,7 +1941,7 @@ def shape(a):
See Also
--------
- alen
+ len
ndarray.shape : Equivalent array method.
Examples
@@ -2007,8 +2007,8 @@ def compress(condition, a, axis=None, out=None):
--------
take, choose, diag, diagonal, select
ndarray.compress : Equivalent method in ndarray
- np.extract: Equivalent method when working on 1-D arrays
- ufuncs-output-type
+ extract: Equivalent method when working on 1-D arrays
+ :ref:`ufuncs-output-type`
Examples
--------
@@ -2059,15 +2059,10 @@ def clip(a, a_min, a_max, out=None, **kwargs):
----------
a : array_like
Array containing elements to clip.
- a_min : scalar or array_like or None
- Minimum value. If None, clipping is not performed on lower
- interval edge. Not more than one of `a_min` and `a_max` may be
- None.
- a_max : scalar or array_like or None
- Maximum value. If None, clipping is not performed on upper
- interval edge. Not more than one of `a_min` and `a_max` may be
- None. If `a_min` or `a_max` are array_like, then the three
- arrays will be broadcasted to match their shapes.
+ a_min, a_max : array_like or None
+ Minimum and maximum value. If ``None``, clipping is not performed on
+ the corresponding edge. Only one of `a_min` and `a_max` may be
+ ``None``. Both are broadcast against `a`.
out : ndarray, optional
The results will be placed in this array. It may be the input
array for in-place clipping. `out` must be of the right shape
@@ -2087,7 +2082,7 @@ def clip(a, a_min, a_max, out=None, **kwargs):
See Also
--------
- ufuncs-output-type
+ :ref:`ufuncs-output-type`
Examples
--------
@@ -2253,12 +2248,13 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=np._NoValue,
initial=initial, where=where)
-def _any_dispatcher(a, axis=None, out=None, keepdims=None):
- return (a, out)
+def _any_dispatcher(a, axis=None, out=None, keepdims=None, *,
+ where=np._NoValue):
+ return (a, where, out)
@array_function_dispatch(_any_dispatcher)
-def any(a, axis=None, out=None, keepdims=np._NoValue):
+def any(a, axis=None, out=None, keepdims=np._NoValue, *, where=np._NoValue):
"""
Test whether any array element along a given axis evaluates to True.
@@ -2283,7 +2279,7 @@ def any(a, axis=None, out=None, keepdims=np._NoValue):
the same shape as the expected output and its type is preserved
(e.g., if it is of type float, then it will remain so, returning
1.0 for True and 0.0 for False, regardless of the type of `a`).
- See `ufuncs-output-type` for more details.
+ See :ref:`ufuncs-output-type` for more details.
keepdims : bool, optional
If this is set to True, the axes which are reduced are left
@@ -2296,6 +2292,12 @@ def any(a, axis=None, out=None, keepdims=np._NoValue):
sub-class' method does not implement `keepdims` any
exceptions will be raised.
+ where : array_like of bool, optional
+ Elements to include in checking for any `True` values.
+ See `~numpy.ufunc.reduce` for details.
+
+ .. versionadded:: 1.20.0
+
Returns
-------
any : bool or ndarray
@@ -2327,6 +2329,9 @@ def any(a, axis=None, out=None, keepdims=np._NoValue):
>>> np.any(np.nan)
True
+ >>> np.any([[True, False], [False, False]], where=[[False], [True]])
+ False
+
>>> o=np.array(False)
>>> z=np.any([-1, 4, 5], out=o)
>>> z, o
@@ -2338,15 +2343,17 @@ def any(a, axis=None, out=None, keepdims=np._NoValue):
(191614240, 191614240)
"""
- return _wrapreduction(a, np.logical_or, 'any', axis, None, out, keepdims=keepdims)
+ return _wrapreduction(a, np.logical_or, 'any', axis, None, out,
+ keepdims=keepdims, where=where)
-def _all_dispatcher(a, axis=None, out=None, keepdims=None):
- return (a, out)
+def _all_dispatcher(a, axis=None, out=None, keepdims=None, *,
+ where=None):
+ return (a, where, out)
@array_function_dispatch(_all_dispatcher)
-def all(a, axis=None, out=None, keepdims=np._NoValue):
+def all(a, axis=None, out=None, keepdims=np._NoValue, *, where=np._NoValue):
"""
Test whether all array elements along a given axis evaluate to True.
@@ -2368,7 +2375,7 @@ def all(a, axis=None, out=None, keepdims=np._NoValue):
Alternate output array in which to place the result.
It must have the same shape as the expected output and its
type is preserved (e.g., if ``dtype(out)`` is float, the result
- will consist of 0.0's and 1.0's). See `ufuncs-output-type` for more
+ will consist of 0.0's and 1.0's). See :ref:`ufuncs-output-type` for more
details.
keepdims : bool, optional
@@ -2382,6 +2389,12 @@ def all(a, axis=None, out=None, keepdims=np._NoValue):
sub-class' method does not implement `keepdims` any
exceptions will be raised.
+ where : array_like of bool, optional
+ Elements to include in checking for all `True` values.
+ See `~numpy.ufunc.reduce` for details.
+
+ .. versionadded:: 1.20.0
+
Returns
-------
all : ndarray, bool
@@ -2413,13 +2426,17 @@ def all(a, axis=None, out=None, keepdims=np._NoValue):
>>> np.all([1.0, np.nan])
True
+ >>> np.all([[True, True], [False, True]], where=[[True], [False]])
+ True
+
>>> o=np.array(False)
>>> z=np.all([-1, 4, 5], out=o)
>>> id(z), id(o), z
(28293632, 28293632, array(True)) # may vary
"""
- return _wrapreduction(a, np.logical_and, 'all', axis, None, out, keepdims=keepdims)
+ return _wrapreduction(a, np.logical_and, 'all', axis, None, out,
+ keepdims=keepdims, where=where)
def _cumsum_dispatcher(a, axis=None, dtype=None, out=None):
@@ -2447,7 +2464,7 @@ def cumsum(a, axis=None, dtype=None, out=None):
out : ndarray, optional
Alternative output array in which to place the result. It must
have the same shape and buffer length as the expected output
- but the type will be cast if necessary. See `ufuncs-output-type` for
+ but the type will be cast if necessary. See :ref:`ufuncs-output-type` for
more details.
Returns
@@ -2618,7 +2635,7 @@ def amax(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
out : ndarray, optional
Alternative output array in which to place the result. Must
be of the same shape and buffer length as the expected output.
- See `ufuncs-output-type` for more details.
+ See :ref:`ufuncs-output-type` for more details.
keepdims : bool, optional
If this is set to True, the axes which are reduced are left
@@ -2743,7 +2760,7 @@ def amin(a, axis=None, out=None, keepdims=np._NoValue, initial=np._NoValue,
out : ndarray, optional
Alternative output array in which to place the result. Must
be of the same shape and buffer length as the expected output.
- See `ufuncs-output-type` for more details.
+ See :ref:`ufuncs-output-type` for more details.
keepdims : bool, optional
If this is set to True, the axes which are reduced are left
@@ -2953,7 +2970,7 @@ def prod(a, axis=None, dtype=None, out=None, keepdims=np._NoValue,
See Also
--------
ndarray.prod : equivalent method
- ufuncs-output-type
+ :ref:`ufuncs-output-type`
Notes
-----
@@ -3049,7 +3066,7 @@ def cumprod(a, axis=None, dtype=None, out=None):
See Also
--------
- ufuncs-output-type
+ :ref:`ufuncs-output-type`
Notes
-----
@@ -3195,7 +3212,7 @@ def around(a, decimals=0, out=None):
out : ndarray, optional
Alternative output array in which to place the result. It must have
the same shape as the expected output, but the type of the output
- values will be cast if necessary. See `ufuncs-output-type` for more
+ values will be cast if necessary. See :ref:`ufuncs-output-type` for more
details.
Returns
@@ -3276,12 +3293,14 @@ def around(a, decimals=0, out=None):
return _wrapfunc(a, 'round', decimals=decimals, out=out)
-def _mean_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None):
- return (a, out)
+def _mean_dispatcher(a, axis=None, dtype=None, out=None, keepdims=None, *,
+ where=None):
+ return (a, where, out)
@array_function_dispatch(_mean_dispatcher)
-def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue):
+def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue, *,
+ where=np._NoValue):
"""
Compute the arithmetic mean along the specified axis.
@@ -3310,7 +3329,7 @@ def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue):
Alternate output array in which to place the result. The default
is ``None``; if provided, it must have the same shape as the
expected output, but the type will be cast if necessary.
- See `ufuncs-output-type` for more details.
+ See :ref:`ufuncs-output-type` for more details.
keepdims : bool, optional
If this is set to True, the axes which are reduced are left
@@ -3323,6 +3342,11 @@ def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue):
sub-class' method does not implement `keepdims` any
exceptions will be raised.
+ where : array_like of bool, optional
+ Elements to include in the mean. See `~numpy.ufunc.reduce` for details.
+
+ .. versionadded:: 1.20.0
+
Returns
-------
m : ndarray, see dtype parameter above
@@ -3371,10 +3395,19 @@ def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue):
>>> np.mean(a, dtype=np.float64)
0.55000000074505806 # may vary
+    Specifying a where argument:
+
+ >>> a = np.array([[5, 9, 13], [14, 10, 12], [11, 15, 19]])
+ >>> np.mean(a)
+ 12.0
+ >>> np.mean(a, where=[[True], [False], [False]])
+ 9.0
+
"""
kwargs = {}
if keepdims is not np._NoValue:
kwargs['keepdims'] = keepdims
+ if where is not np._NoValue:
+ kwargs['where'] = where
if type(a) is not mu.ndarray:
try:
mean = a.mean
@@ -3387,13 +3420,14 @@ def mean(a, axis=None, dtype=None, out=None, keepdims=np._NoValue):
out=out, **kwargs)
-def _std_dispatcher(
- a, axis=None, dtype=None, out=None, ddof=None, keepdims=None):
- return (a, out)
+def _std_dispatcher(a, axis=None, dtype=None, out=None, ddof=None,
+ keepdims=None, *, where=None):
+ return (a, where, out)
@array_function_dispatch(_std_dispatcher)
-def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
+def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue, *,
+ where=np._NoValue):
"""
Compute the standard deviation along the specified axis.
@@ -3436,6 +3470,12 @@ def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
sub-class' method does not implement `keepdims` any
exceptions will be raised.
+ where : array_like of bool, optional
+ Elements to include in the standard deviation.
+ See `~numpy.ufunc.reduce` for details.
+
+ .. versionadded:: 1.20.0
+
Returns
-------
standard_deviation : ndarray, see dtype parameter above.
@@ -3445,12 +3485,12 @@ def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
See Also
--------
var, mean, nanmean, nanstd, nanvar
- ufuncs-output-type
+ :ref:`ufuncs-output-type`
Notes
-----
The standard deviation is the square root of the average of the squared
- deviations from the mean, i.e., ``std = sqrt(mean(x))``, where
+ deviations from the mean, i.e., ``std = sqrt(mean(x))``, where
``x = abs(a - a.mean())**2``.
The average squared deviation is typically calculated as ``x.sum() / N``,
@@ -3495,11 +3535,20 @@ def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
>>> np.std(a, dtype=np.float64)
0.44999999925494177 # may vary
+ Specifying a where argument:
+
+ >>> a = np.array([[14, 8, 11, 10], [7, 9, 10, 11], [10, 15, 5, 10]])
+ >>> np.std(a)
+ 2.614064523559687 # may vary
+ >>> np.std(a, where=[[True], [True], [False]])
+ 2.0
+
"""
kwargs = {}
if keepdims is not np._NoValue:
kwargs['keepdims'] = keepdims
-
+ if where is not np._NoValue:
+ kwargs['where'] = where
if type(a) is not mu.ndarray:
try:
std = a.std
@@ -3512,13 +3561,14 @@ def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
**kwargs)
-def _var_dispatcher(
- a, axis=None, dtype=None, out=None, ddof=None, keepdims=None):
- return (a, out)
+def _var_dispatcher(a, axis=None, dtype=None, out=None, ddof=None,
+ keepdims=None, *, where=None):
+ return (a, where, out)
@array_function_dispatch(_var_dispatcher)
-def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
+def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue, *,
+ where=np._NoValue):
"""
Compute the variance along the specified axis.
@@ -3562,6 +3612,12 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
sub-class' method does not implement `keepdims` any
exceptions will be raised.
+ where : array_like of bool, optional
+ Elements to include in the variance. See `~numpy.ufunc.reduce` for
+ details.
+
+ .. versionadded:: 1.20.0
+
Returns
-------
variance : ndarray, see dtype parameter above
@@ -3571,7 +3627,7 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
See Also
--------
std, mean, nanmean, nanstd, nanvar
- ufuncs-output-type
+ :ref:`ufuncs-output-type`
Notes
-----
@@ -3619,10 +3675,20 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
>>> ((1-0.55)**2 + (0.1-0.55)**2)/2
0.2025
+ Specifying a where argument:
+
+ >>> a = np.array([[14, 8, 11, 10], [7, 9, 10, 11], [10, 15, 5, 10]])
+ >>> np.var(a)
+ 6.833333333333333 # may vary
+ >>> np.var(a, where=[[True], [True], [False]])
+ 4.0
+
"""
kwargs = {}
if keepdims is not np._NoValue:
kwargs['keepdims'] = keepdims
+ if where is not np._NoValue:
+ kwargs['where'] = where
if type(a) is not mu.ndarray:
try:
diff --git a/numpy/core/fromnumeric.pyi b/numpy/core/fromnumeric.pyi
new file mode 100644
index 000000000..66eb3bfb8
--- /dev/null
+++ b/numpy/core/fromnumeric.pyi
@@ -0,0 +1,483 @@
+import sys
+import datetime as dt
+from typing import Optional, Union, Sequence, Tuple, Any, overload, TypeVar
+
+from numpy import (
+ ndarray,
+ number,
+ integer,
+ bool_,
+ generic,
+ _OrderKACF,
+ _OrderACF,
+ _ArrayLikeBool,
+ _ArrayLikeIntOrBool,
+ _ModeKind,
+ _PartitionKind,
+ _SortKind,
+ _SortSide,
+)
+from numpy.typing import (
+ DTypeLike,
+ ArrayLike,
+ _ShapeLike,
+ _Shape,
+ _IntLike,
+ _BoolLike,
+ _NumberLike,
+)
+
+if sys.version_info >= (3, 8):
+ from typing import Literal
+else:
+ from typing_extensions import Literal
+
+# Various annotations for scalars
+
+# While dt.datetime and dt.timedelta are not technically part of NumPy,
+# they are one of the rare few builtin scalars which serve as valid return types.
+# See https://github.com/numpy/numpy-stubs/pull/67#discussion_r412604113.
+_ScalarNumpy = Union[generic, dt.datetime, dt.timedelta]
+_ScalarBuiltin = Union[str, bytes, dt.date, dt.timedelta, bool, int, float, complex]
+_Scalar = Union[_ScalarBuiltin, _ScalarNumpy]
+
+# Integers and booleans can generally be used interchangeably
+_ScalarIntOrBool = TypeVar("_ScalarIntOrBool", bound=Union[integer, bool_])
+_ScalarGeneric = TypeVar("_ScalarGeneric", bound=generic)
+_ScalarGenericDT = TypeVar(
+ "_ScalarGenericDT", bound=Union[dt.datetime, dt.timedelta, generic]
+)
+
+_Number = TypeVar("_Number", bound=number)
+
+# The signature of take() follows a common theme with its overloads:
+# 1. A generic comes in; the same generic comes out
+# 2. A scalar comes in; a generic comes out
+# 3. An array-like object comes in; some keyword ensures that a generic comes out
+# 4. An array-like object comes in; an ndarray or generic comes out
+@overload
+def take(
+ a: _ScalarGenericDT,
+ indices: int,
+ axis: Optional[int] = ...,
+ out: Optional[ndarray] = ...,
+ mode: _ModeKind = ...,
+) -> _ScalarGenericDT: ...
+@overload
+def take(
+ a: _Scalar,
+ indices: int,
+ axis: Optional[int] = ...,
+ out: Optional[ndarray] = ...,
+ mode: _ModeKind = ...,
+) -> _ScalarNumpy: ...
+@overload
+def take(
+ a: ArrayLike,
+ indices: int,
+ axis: Optional[int] = ...,
+ out: Optional[ndarray] = ...,
+ mode: _ModeKind = ...,
+) -> _ScalarNumpy: ...
+@overload
+def take(
+ a: ArrayLike,
+ indices: _ArrayLikeIntOrBool,
+ axis: Optional[int] = ...,
+ out: Optional[ndarray] = ...,
+ mode: _ModeKind = ...,
+) -> Union[_ScalarNumpy, ndarray]: ...
+def reshape(a: ArrayLike, newshape: _ShapeLike, order: _OrderACF = ...) -> ndarray: ...
+@overload
+def choose(
+ a: _ScalarIntOrBool,
+ choices: ArrayLike,
+ out: Optional[ndarray] = ...,
+ mode: _ModeKind = ...,
+) -> _ScalarIntOrBool: ...
+@overload
+def choose(
+ a: Union[_IntLike, _BoolLike], choices: ArrayLike, out: Optional[ndarray] = ..., mode: _ModeKind = ...
+) -> Union[integer, bool_]: ...
+@overload
+def choose(
+ a: _ArrayLikeIntOrBool,
+ choices: ArrayLike,
+ out: Optional[ndarray] = ...,
+ mode: _ModeKind = ...,
+) -> ndarray: ...
+def repeat(
+ a: ArrayLike, repeats: _ArrayLikeIntOrBool, axis: Optional[int] = ...
+) -> ndarray: ...
+def put(
+ a: ndarray, ind: _ArrayLikeIntOrBool, v: ArrayLike, mode: _ModeKind = ...
+) -> None: ...
+def swapaxes(a: ArrayLike, axis1: int, axis2: int) -> ndarray: ...
+def transpose(
+ a: ArrayLike, axes: Union[None, Sequence[int], ndarray] = ...
+) -> ndarray: ...
+def partition(
+ a: ArrayLike,
+ kth: _ArrayLikeIntOrBool,
+ axis: Optional[int] = ...,
+ kind: _PartitionKind = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+) -> ndarray: ...
+@overload
+def argpartition(
+ a: generic,
+ kth: _ArrayLikeIntOrBool,
+ axis: Optional[int] = ...,
+ kind: _PartitionKind = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+) -> integer: ...
+@overload
+def argpartition(
+ a: _ScalarBuiltin,
+ kth: _ArrayLikeIntOrBool,
+ axis: Optional[int] = ...,
+ kind: _PartitionKind = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+) -> ndarray: ...
+@overload
+def argpartition(
+ a: ArrayLike,
+ kth: _ArrayLikeIntOrBool,
+ axis: Optional[int] = ...,
+ kind: _PartitionKind = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+) -> ndarray: ...
+def sort(
+ a: ArrayLike,
+ axis: Optional[int] = ...,
+ kind: Optional[_SortKind] = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+) -> ndarray: ...
+def argsort(
+ a: ArrayLike,
+ axis: Optional[int] = ...,
+ kind: Optional[_SortKind] = ...,
+ order: Union[None, str, Sequence[str]] = ...,
+) -> ndarray: ...
+@overload
+def argmax(a: ArrayLike, axis: None = ..., out: Optional[ndarray] = ...) -> integer: ...
+@overload
+def argmax(
+ a: ArrayLike, axis: int = ..., out: Optional[ndarray] = ...
+) -> Union[integer, ndarray]: ...
+@overload
+def argmin(a: ArrayLike, axis: None = ..., out: Optional[ndarray] = ...) -> integer: ...
+@overload
+def argmin(
+ a: ArrayLike, axis: int = ..., out: Optional[ndarray] = ...
+) -> Union[integer, ndarray]: ...
+@overload
+def searchsorted(
+ a: ArrayLike,
+ v: _Scalar,
+ side: _SortSide = ...,
+ sorter: Optional[_ArrayLikeIntOrBool] = ..., # 1D int array
+) -> integer: ...
+@overload
+def searchsorted(
+ a: ArrayLike,
+ v: ArrayLike,
+ side: _SortSide = ...,
+ sorter: Optional[_ArrayLikeIntOrBool] = ..., # 1D int array
+) -> ndarray: ...
+def resize(a: ArrayLike, new_shape: _ShapeLike) -> ndarray: ...
+@overload
+def squeeze(a: _ScalarGeneric, axis: Optional[_ShapeLike] = ...) -> _ScalarGeneric: ...
+@overload
+def squeeze(a: ArrayLike, axis: Optional[_ShapeLike] = ...) -> ndarray: ...
+def diagonal(
+ a: ArrayLike, offset: int = ..., axis1: int = ..., axis2: int = ... # >= 2D array
+) -> ndarray: ...
+def trace(
+ a: ArrayLike, # >= 2D array
+ offset: int = ...,
+ axis1: int = ...,
+ axis2: int = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+) -> Union[number, ndarray]: ...
+def ravel(a: ArrayLike, order: _OrderKACF = ...) -> ndarray: ...
+def nonzero(a: ArrayLike) -> Tuple[ndarray, ...]: ...
+def shape(a: ArrayLike) -> _Shape: ...
+def compress(
+ condition: ArrayLike, # 1D bool array
+ a: ArrayLike,
+ axis: Optional[int] = ...,
+ out: Optional[ndarray] = ...,
+) -> ndarray: ...
+@overload
+def clip(
+ a: _Number,
+ a_min: ArrayLike,
+ a_max: Optional[ArrayLike],
+ out: Optional[ndarray] = ...,
+ **kwargs: Any,
+) -> _Number: ...
+@overload
+def clip(
+ a: _Number,
+ a_min: None,
+ a_max: ArrayLike,
+ out: Optional[ndarray] = ...,
+ **kwargs: Any,
+) -> _Number: ...
+@overload
+def clip(
+ a: ArrayLike,
+ a_min: ArrayLike,
+ a_max: Optional[ArrayLike],
+ out: Optional[ndarray] = ...,
+ **kwargs: Any,
+) -> Union[number, ndarray]: ...
+@overload
+def clip(
+ a: ArrayLike,
+ a_min: None,
+ a_max: ArrayLike,
+ out: Optional[ndarray] = ...,
+ **kwargs: Any,
+) -> Union[number, ndarray]: ...
+@overload
+def sum(
+ a: _Number,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> _Number: ...
+@overload
+def sum(
+ a: ArrayLike,
+ axis: _ShapeLike = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> Union[number, ndarray]: ...
+@overload
+def all(
+ a: ArrayLike,
+ axis: None = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: Literal[False] = ...,
+) -> bool_: ...
+@overload
+def all(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+) -> Union[bool_, ndarray]: ...
+@overload
+def any(
+ a: ArrayLike,
+ axis: None = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: Literal[False] = ...,
+) -> bool_: ...
+@overload
+def any(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+) -> Union[bool_, ndarray]: ...
+def cumsum(
+ a: ArrayLike,
+ axis: Optional[int] = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+) -> ndarray: ...
+@overload
+def ptp(
+ a: _Number,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+) -> _Number: ...
+@overload
+def ptp(
+ a: ArrayLike,
+ axis: None = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: Literal[False] = ...,
+) -> number: ...
+@overload
+def ptp(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+) -> Union[number, ndarray]: ...
+@overload
+def amax(
+ a: _Number,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> _Number: ...
+@overload
+def amax(
+ a: ArrayLike,
+ axis: None = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: Literal[False] = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> number: ...
+@overload
+def amax(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> Union[number, ndarray]: ...
+@overload
+def amin(
+ a: _Number,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> _Number: ...
+@overload
+def amin(
+ a: ArrayLike,
+ axis: None = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: Literal[False] = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> number: ...
+@overload
+def amin(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> Union[number, ndarray]: ...
+
+# TODO: `np.prod()`: For object arrays `initial` does not necessarily
+# have to be a numerical scalar.
+# The only requirement is that it is compatible
+# with the `.__mul__()` method(s) of the passed array's elements.
+
+# Note that the same situation holds for all wrappers around
+# `np.ufunc.reduce`, e.g. `np.sum()` (`.__add__()`).
+@overload
+def prod(
+ a: _Number,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> _Number: ...
+@overload
+def prod(
+ a: ArrayLike,
+ axis: None = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ keepdims: Literal[False] = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> number: ...
+@overload
+def prod(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+ initial: _NumberLike = ...,
+ where: _ArrayLikeBool = ...,
+) -> Union[number, ndarray]: ...
+def cumprod(
+ a: ArrayLike,
+ axis: Optional[int] = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+) -> ndarray: ...
+def ndim(a: ArrayLike) -> int: ...
+def size(a: ArrayLike, axis: Optional[int] = ...) -> int: ...
+@overload
+def around(
+ a: _Number, decimals: int = ..., out: Optional[ndarray] = ...
+) -> _Number: ...
+@overload
+def around(
+ a: _NumberLike, decimals: int = ..., out: Optional[ndarray] = ...
+) -> number: ...
+@overload
+def around(
+ a: ArrayLike, decimals: int = ..., out: Optional[ndarray] = ...
+) -> ndarray: ...
+@overload
+def mean(
+ a: ArrayLike,
+ axis: None = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ keepdims: Literal[False] = ...,
+) -> number: ...
+@overload
+def mean(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+ keepdims: bool = ...,
+) -> Union[number, ndarray]: ...
+@overload
+def std(
+ a: ArrayLike,
+ axis: None = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ ddof: int = ...,
+ keepdims: Literal[False] = ...,
+) -> number: ...
+@overload
+def std(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+ ddof: int = ...,
+ keepdims: bool = ...,
+) -> Union[number, ndarray]: ...
+@overload
+def var(
+ a: ArrayLike,
+ axis: None = ...,
+ dtype: DTypeLike = ...,
+ out: None = ...,
+ ddof: int = ...,
+ keepdims: Literal[False] = ...,
+) -> number: ...
+@overload
+def var(
+ a: ArrayLike,
+ axis: Optional[_ShapeLike] = ...,
+ dtype: DTypeLike = ...,
+ out: Optional[ndarray] = ...,
+ ddof: int = ...,
+ keepdims: bool = ...,
+) -> Union[number, ndarray]: ...
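Several of the overloads above (``all``, ``any``, ``mean``, ``std``, ``var``, ...) separate the ``axis=None`` case, which yields a scalar, from an explicit axis, which may yield an array; the runtime behaviour they encode looks like this:

    >>> import numpy as np
    >>> a = np.array([[True, False], [True, True]])
    >>> np.all(a)          # axis=None -> np.bool_
    False
    >>> np.all(a, axis=0)  # explicit axis -> ndarray
    array([ True, False])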
diff --git a/numpy/core/function_base.py b/numpy/core/function_base.py
index f57e95742..e940ac230 100644
--- a/numpy/core/function_base.py
+++ b/numpy/core/function_base.py
@@ -34,6 +34,11 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None,
.. versionchanged:: 1.16.0
Non-scalar `start` and `stop` are now supported.
+ .. versionchanged:: 1.20.0
+ Values are rounded towards ``-inf`` instead of ``0`` when an
+ integer ``dtype`` is specified. The old behavior can
+ still be obtained with ``np.linspace(start, stop, num).astype(int)``
+
Parameters
----------
start : array_like
@@ -161,6 +166,9 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None,
if axis != 0:
y = _nx.moveaxis(y, 0, axis)
+ if _nx.issubdtype(dtype, _nx.integer):
+ _nx.floor(y, out=y)
+
if retstep:
return y.astype(dtype, copy=False), step
else:
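The ``floor`` call above is what implements the 1.20 rounding change for integer dtypes described in the ``versionchanged`` note; a small check (assuming NumPy >= 1.20):

    >>> import numpy as np
    >>> np.linspace(-1, 1, 5)
    array([-1. , -0.5,  0. ,  0.5,  1. ])
    >>> np.linspace(-1, 1, 5, dtype=int)   # new behaviour: rounds towards -inf
    array([-1, -1,  0,  0,  1])
    >>> np.linspace(-1, 1, 5).astype(int)  # old behaviour: truncates towards 0
    array([-1,  0,  0,  0,  1])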
@@ -199,7 +207,7 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,
endpoint : boolean, optional
If true, `stop` is the last sample. Otherwise, it is not included.
Default is True.
- base : float, optional
+ base : array_like, optional
The base of the log space. The step size between the elements in
``ln(samples) / ln(base)`` (or ``log_base(samples)``) is uniform.
Default is 10.0.
@@ -363,7 +371,7 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None, axis=0):
6.12323400e-17+1.00000000e+00j, 7.07106781e-01+7.07106781e-01j,
1.00000000e+00+0.00000000e+00j])
- Graphical illustration of ``endpoint`` parameter:
+ Graphical illustration of `endpoint` parameter:
>>> import matplotlib.pyplot as plt
>>> N = 10
diff --git a/numpy/core/function_base.pyi b/numpy/core/function_base.pyi
new file mode 100644
index 000000000..1490bed4a
--- /dev/null
+++ b/numpy/core/function_base.pyi
@@ -0,0 +1,56 @@
+import sys
+from typing import overload, Tuple, Union, Sequence, Any
+
+from numpy import ndarray, inexact
+from numpy.typing import ArrayLike, DTypeLike, _SupportsArray, _NumberLike
+
+if sys.version_info >= (3, 8):
+ from typing import SupportsIndex, Literal
+else:
+ from typing_extensions import Literal, Protocol
+
+ class SupportsIndex(Protocol):
+ def __index__(self) -> int: ...
+
+# TODO: wait for support for recursive types
+_ArrayLikeNested = Sequence[Sequence[Any]]
+_ArrayLikeNumber = Union[
+ _NumberLike, Sequence[_NumberLike], ndarray, _SupportsArray, _ArrayLikeNested
+]
+@overload
+def linspace(
+ start: _ArrayLikeNumber,
+ stop: _ArrayLikeNumber,
+ num: SupportsIndex = ...,
+ endpoint: bool = ...,
+ retstep: Literal[False] = ...,
+ dtype: DTypeLike = ...,
+ axis: SupportsIndex = ...,
+) -> ndarray: ...
+@overload
+def linspace(
+ start: _ArrayLikeNumber,
+ stop: _ArrayLikeNumber,
+ num: SupportsIndex = ...,
+ endpoint: bool = ...,
+ retstep: Literal[True] = ...,
+ dtype: DTypeLike = ...,
+ axis: SupportsIndex = ...,
+) -> Tuple[ndarray, inexact]: ...
+def logspace(
+ start: _ArrayLikeNumber,
+ stop: _ArrayLikeNumber,
+ num: SupportsIndex = ...,
+ endpoint: bool = ...,
+ base: _ArrayLikeNumber = ...,
+ dtype: DTypeLike = ...,
+ axis: SupportsIndex = ...,
+) -> ndarray: ...
+def geomspace(
+ start: _ArrayLikeNumber,
+ stop: _ArrayLikeNumber,
+ num: SupportsIndex = ...,
+ endpoint: bool = ...,
+ dtype: DTypeLike = ...,
+ axis: SupportsIndex = ...,
+) -> ndarray: ...
diff --git a/numpy/core/include/numpy/arrayscalars.h b/numpy/core/include/numpy/arrayscalars.h
index 6dce88df3..14a31988f 100644
--- a/numpy/core/include/numpy/arrayscalars.h
+++ b/numpy/core/include/numpy/arrayscalars.h
@@ -134,8 +134,7 @@ typedef struct {
char obval;
} PyScalarObject;
-#define PyStringScalarObject PyStringObject
-#define PyStringScalarObject PyStringObject
+#define PyStringScalarObject PyBytesObject
typedef struct {
/* note that the PyObject_HEAD macro lives right here */
PyUnicodeObject base;
@@ -150,6 +149,7 @@ typedef struct {
PyArray_Descr *descr;
int flags;
PyObject *base;
+ void *_buffer_info; /* private buffer info, tagged to allow warning */
} PyVoidScalarObject;
/* Macros
diff --git a/numpy/core/include/numpy/libdivide/LICENSE.txt b/numpy/core/include/numpy/libdivide/LICENSE.txt
new file mode 100644
index 000000000..d72a7c388
--- /dev/null
+++ b/numpy/core/include/numpy/libdivide/LICENSE.txt
@@ -0,0 +1,21 @@
+ zlib License
+ ------------
+
+ Copyright (C) 2010 - 2019 ridiculous_fish, <libdivide@ridiculousfish.com>
+ Copyright (C) 2016 - 2019 Kim Walisch, <kim.walisch@gmail.com>
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
diff --git a/numpy/core/include/numpy/libdivide/libdivide.h b/numpy/core/include/numpy/libdivide/libdivide.h
new file mode 100644
index 000000000..81057b7b4
--- /dev/null
+++ b/numpy/core/include/numpy/libdivide/libdivide.h
@@ -0,0 +1,2079 @@
+// libdivide.h - Optimized integer division
+// https://libdivide.com
+//
+// Copyright (C) 2010 - 2019 ridiculous_fish, <libdivide@ridiculousfish.com>
+// Copyright (C) 2016 - 2019 Kim Walisch, <kim.walisch@gmail.com>
+//
+// libdivide is dual-licensed under the Boost or zlib licenses.
+// You may use libdivide under the terms of either of these.
+// See LICENSE.txt for more details.
+
+#ifndef LIBDIVIDE_H
+#define LIBDIVIDE_H
+
+#define LIBDIVIDE_VERSION "3.0"
+#define LIBDIVIDE_VERSION_MAJOR 3
+#define LIBDIVIDE_VERSION_MINOR 0
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+ #include <cstdlib>
+ #include <cstdio>
+ #include <type_traits>
+#else
+ #include <stdlib.h>
+ #include <stdio.h>
+#endif
+
+#if defined(LIBDIVIDE_AVX512)
+ #include <immintrin.h>
+#elif defined(LIBDIVIDE_AVX2)
+ #include <immintrin.h>
+#elif defined(LIBDIVIDE_SSE2)
+ #include <emmintrin.h>
+#endif
+
+#if defined(_MSC_VER)
+ #include <intrin.h>
+ // disable warning C4146: unary minus operator applied
+ // to unsigned type, result still unsigned
+ #pragma warning(disable: 4146)
+ #define LIBDIVIDE_VC
+#endif
+
+#if !defined(__has_builtin)
+ #define __has_builtin(x) 0
+#endif
+
+#if defined(__SIZEOF_INT128__)
+ #define HAS_INT128_T
+ // clang-cl on Windows does not yet support 128-bit division
+ #if !(defined(__clang__) && defined(LIBDIVIDE_VC))
+ #define HAS_INT128_DIV
+ #endif
+#endif
+
+#if defined(__x86_64__) || defined(_M_X64)
+ #define LIBDIVIDE_X86_64
+#endif
+
+#if defined(__i386__)
+ #define LIBDIVIDE_i386
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+ #define LIBDIVIDE_GCC_STYLE_ASM
+#endif
+
+#if defined(__cplusplus) || defined(LIBDIVIDE_VC)
+ #define LIBDIVIDE_FUNCTION __FUNCTION__
+#else
+ #define LIBDIVIDE_FUNCTION __func__
+#endif
+
+#define LIBDIVIDE_ERROR(msg) \
+ do { \
+ fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", \
+ __LINE__, LIBDIVIDE_FUNCTION, msg); \
+ abort(); \
+ } while (0)
+
+#if defined(LIBDIVIDE_ASSERTIONS_ON)
+ #define LIBDIVIDE_ASSERT(x) \
+ do { \
+ if (!(x)) { \
+ fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", \
+ __LINE__, LIBDIVIDE_FUNCTION, #x); \
+ abort(); \
+ } \
+ } while (0)
+#else
+ #define LIBDIVIDE_ASSERT(x)
+#endif
+
+#ifdef __cplusplus
+namespace libdivide {
+#endif
+
+// pack divider structs to prevent compilers from padding.
+// This reduces memory usage by up to 43% when using a large
+// array of libdivide dividers and improves performance
+// by up to 10% because of reduced memory bandwidth.
+#pragma pack(push, 1)
+
+struct libdivide_u32_t {
+ uint32_t magic;
+ uint8_t more;
+};
+
+struct libdivide_s32_t {
+ int32_t magic;
+ uint8_t more;
+};
+
+struct libdivide_u64_t {
+ uint64_t magic;
+ uint8_t more;
+};
+
+struct libdivide_s64_t {
+ int64_t magic;
+ uint8_t more;
+};
+
+struct libdivide_u32_branchfree_t {
+ uint32_t magic;
+ uint8_t more;
+};
+
+struct libdivide_s32_branchfree_t {
+ int32_t magic;
+ uint8_t more;
+};
+
+struct libdivide_u64_branchfree_t {
+ uint64_t magic;
+ uint8_t more;
+};
+
+struct libdivide_s64_branchfree_t {
+ int64_t magic;
+ uint8_t more;
+};
+
+#pragma pack(pop)
+
+// Explanation of the "more" field:
+//
+// * Bits 0-5 is the shift value (for shift path or mult path).
+// * Bit 6 is the add indicator for mult path.
+// * Bit 7 is set if the divisor is negative. We use bit 7 as the negative
+// divisor indicator so that we can efficiently use sign extension to
+// create a bitmask with all bits set to 1 (if the divisor is negative)
+// or 0 (if the divisor is positive).
+//
+// u32: [0-4] shift value
+// [5] ignored
+// [6] add indicator
+// magic number of 0 indicates shift path
+//
+// s32: [0-4] shift value
+// [5] ignored
+// [6] add indicator
+// [7] indicates negative divisor
+// magic number of 0 indicates shift path
+//
+// u64: [0-5] shift value
+// [6] add indicator
+// magic number of 0 indicates shift path
+//
+// s64: [0-5] shift value
+// [6] add indicator
+// [7] indicates negative divisor
+// magic number of 0 indicates shift path
+//
+// In the s32 and s64 branchfull strategy, the magic number is negated
+// according to whether the divisor is negated. In the branchfree strategy,
+// it is not negated.
+
+enum {
+ LIBDIVIDE_32_SHIFT_MASK = 0x1F,
+ LIBDIVIDE_64_SHIFT_MASK = 0x3F,
+ LIBDIVIDE_ADD_MARKER = 0x40,
+ LIBDIVIDE_NEGATIVE_DIVISOR = 0x80
+};
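+
+// Illustrative sketch (not part of upstream libdivide): unpacking the "more"
+// field documented above using the masks from the enum. The helper name is
+// hypothetical and exists only to restate the bit layout in code.
+static inline uint8_t libdivide_example_unpack_more(uint8_t more,
+ int *add_indicator, int *negative_divisor) {
+ *add_indicator = (more & LIBDIVIDE_ADD_MARKER) != 0; // bit 6
+ *negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR) != 0; // bit 7, signed dividers only
+ return more & LIBDIVIDE_32_SHIFT_MASK; // bits [0-4]; use the 64-bit mask for u64/s64
+}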
+
+static inline struct libdivide_s32_t libdivide_s32_gen(int32_t d);
+static inline struct libdivide_u32_t libdivide_u32_gen(uint32_t d);
+static inline struct libdivide_s64_t libdivide_s64_gen(int64_t d);
+static inline struct libdivide_u64_t libdivide_u64_gen(uint64_t d);
+
+static inline struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d);
+static inline struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d);
+static inline struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d);
+static inline struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d);
+
+static inline int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom);
+static inline uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom);
+static inline int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom);
+static inline uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom);
+
+static inline int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom);
+static inline uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom);
+static inline int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom);
+static inline uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom);
+
+static inline int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
+static inline uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
+static inline int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
+static inline uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);
+
+static inline int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom);
+static inline uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom);
+static inline int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom);
+static inline uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom);
+
+//////// Internal Utility Functions
+
+static inline uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) {
+ uint64_t xl = x, yl = y;
+ uint64_t rl = xl * yl;
+ return (uint32_t)(rl >> 32);
+}
+
+static inline int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {
+ int64_t xl = x, yl = y;
+ int64_t rl = xl * yl;
+ // needs to be arithmetic shift
+ return (int32_t)(rl >> 32);
+}
+
+static inline uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
+#if defined(LIBDIVIDE_VC) && \
+ defined(LIBDIVIDE_X86_64)
+ return __umulh(x, y);
+#elif defined(HAS_INT128_T)
+ __uint128_t xl = x, yl = y;
+ __uint128_t rl = xl * yl;
+ return (uint64_t)(rl >> 64);
+#else
+ // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
+ uint32_t mask = 0xFFFFFFFF;
+ uint32_t x0 = (uint32_t)(x & mask);
+ uint32_t x1 = (uint32_t)(x >> 32);
+ uint32_t y0 = (uint32_t)(y & mask);
+ uint32_t y1 = (uint32_t)(y >> 32);
+ uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
+ uint64_t x0y1 = x0 * (uint64_t)y1;
+ uint64_t x1y0 = x1 * (uint64_t)y0;
+ uint64_t x1y1 = x1 * (uint64_t)y1;
+ uint64_t temp = x1y0 + x0y0_hi;
+ uint64_t temp_lo = temp & mask;
+ uint64_t temp_hi = temp >> 32;
+
+ return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);
+#endif
+}
+
+static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
+#if defined(LIBDIVIDE_VC) && \
+ defined(LIBDIVIDE_X86_64)
+ return __mulh(x, y);
+#elif defined(HAS_INT128_T)
+ __int128_t xl = x, yl = y;
+ __int128_t rl = xl * yl;
+ return (int64_t)(rl >> 64);
+#else
+ // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
+ uint32_t mask = 0xFFFFFFFF;
+ uint32_t x0 = (uint32_t)(x & mask);
+ uint32_t y0 = (uint32_t)(y & mask);
+ int32_t x1 = (int32_t)(x >> 32);
+ int32_t y1 = (int32_t)(y >> 32);
+ uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
+ int64_t t = x1 * (int64_t)y0 + x0y0_hi;
+ int64_t w1 = x0 * (int64_t)y1 + (t & mask);
+
+ return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32);
+#endif
+}
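+
+// Worked example (illustrative only): the mullhi helpers return the high half
+// of the full-width product. For instance, libdivide_mullhi_u32(0x80000000u, 4)
+// is 2, because the 64-bit product 0x200000000 has 2 in its upper 32 bits,
+// while libdivide_mullhi_s32(-2, 3) is -1, because the signed product -6
+// sign-extends to all ones in its upper 32 bits.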
+
+static inline int32_t libdivide_count_leading_zeros32(uint32_t val) {
+#if defined(__GNUC__) || \
+ __has_builtin(__builtin_clz)
+ // Fast way to count leading zeros
+ return __builtin_clz(val);
+#elif defined(LIBDIVIDE_VC)
+ unsigned long result;
+ if (_BitScanReverse(&result, val)) {
+ return 31 - result;
+ }
+ return 0;
+#else
+ if (val == 0)
+ return 32;
+ int32_t result = 8;
+ uint32_t hi = 0xFFU << 24;
+ while ((val & hi) == 0) {
+ hi >>= 8;
+ result += 8;
+ }
+ while (val & hi) {
+ result -= 1;
+ hi <<= 1;
+ }
+ return result;
+#endif
+}
+
+static inline int32_t libdivide_count_leading_zeros64(uint64_t val) {
+#if defined(__GNUC__) || \
+ __has_builtin(__builtin_clzll)
+ // Fast way to count leading zeros
+ return __builtin_clzll(val);
+#elif defined(LIBDIVIDE_VC) && defined(_WIN64)
+ unsigned long result;
+ if (_BitScanReverse64(&result, val)) {
+ return 63 - result;
+ }
+ return 0;
+#else
+ uint32_t hi = val >> 32;
+ uint32_t lo = val & 0xFFFFFFFF;
+ if (hi != 0) return libdivide_count_leading_zeros32(hi);
+ return 32 + libdivide_count_leading_zeros32(lo);
+#endif
+}
+
+// libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit
+// uint {v}. The result must fit in 32 bits.
+// Returns the quotient directly and the remainder in *r
+static inline uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
+#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && \
+ defined(LIBDIVIDE_GCC_STYLE_ASM)
+ uint32_t result;
+ __asm__("divl %[v]"
+ : "=a"(result), "=d"(*r)
+ : [v] "r"(v), "a"(u0), "d"(u1)
+ );
+ return result;
+#else
+ uint64_t n = ((uint64_t)u1 << 32) | u0;
+ uint32_t result = (uint32_t)(n / v);
+ *r = (uint32_t)(n - result * (uint64_t)v);
+ return result;
+#endif
+}
+
+// libdivide_128_div_64_to_64: divides a 128-bit uint {u1, u0} by a 64-bit
+// uint {v}. The result must fit in 64 bits.
+// Returns the quotient directly and the remainder in *r
+static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
+#if defined(LIBDIVIDE_X86_64) && \
+ defined(LIBDIVIDE_GCC_STYLE_ASM)
+ uint64_t result;
+ __asm__("divq %[v]"
+ : "=a"(result), "=d"(*r)
+ : [v] "r"(v), "a"(u0), "d"(u1)
+ );
+ return result;
+#elif defined(HAS_INT128_T) && \
+ defined(HAS_INT128_DIV)
+ __uint128_t n = ((__uint128_t)u1 << 64) | u0;
+ uint64_t result = (uint64_t)(n / v);
+ *r = (uint64_t)(n - result * (__uint128_t)v);
+ return result;
+#else
+ // Code taken from Hacker's Delight:
+ // http://www.hackersdelight.org/HDcode/divlu.c.
+ // License permits inclusion here per:
+ // http://www.hackersdelight.org/permissions.htm
+
+ const uint64_t b = (1ULL << 32); // Number base (32 bits)
+ uint64_t un1, un0; // Norm. dividend LSD's
+ uint64_t vn1, vn0; // Norm. divisor digits
+ uint64_t q1, q0; // Quotient digits
+ uint64_t un64, un21, un10; // Dividend digit pairs
+ uint64_t rhat; // A remainder
+ int32_t s; // Shift amount for norm
+
+ // If overflow, set rem. to an impossible value,
+ // and return the largest possible quotient
+ if (u1 >= v) {
+ *r = (uint64_t) -1;
+ return (uint64_t) -1;
+ }
+
+ // count leading zeros
+ s = libdivide_count_leading_zeros64(v);
+ if (s > 0) {
+ // Normalize divisor
+ v = v << s;
+ un64 = (u1 << s) | (u0 >> (64 - s));
+ un10 = u0 << s; // Shift dividend left
+ } else {
+ // Avoid undefined behavior of (u0 >> 64).
+ // The behavior is undefined if the right operand is
+ // negative, or greater than or equal to the length
+ // in bits of the promoted left operand.
+ un64 = u1;
+ un10 = u0;
+ }
+
+ // Break divisor up into two 32-bit digits
+ vn1 = v >> 32;
+ vn0 = v & 0xFFFFFFFF;
+
+ // Break right half of dividend into two digits
+ un1 = un10 >> 32;
+ un0 = un10 & 0xFFFFFFFF;
+
+ // Compute the first quotient digit, q1
+ q1 = un64 / vn1;
+ rhat = un64 - q1 * vn1;
+
+ while (q1 >= b || q1 * vn0 > b * rhat + un1) {
+ q1 = q1 - 1;
+ rhat = rhat + vn1;
+ if (rhat >= b)
+ break;
+ }
+
+ // Multiply and subtract
+ un21 = un64 * b + un1 - q1 * v;
+
+ // Compute the second quotient digit
+ q0 = un21 / vn1;
+ rhat = un21 - q0 * vn1;
+
+ while (q0 >= b || q0 * vn0 > b * rhat + un0) {
+ q0 = q0 - 1;
+ rhat = rhat + vn1;
+ if (rhat >= b)
+ break;
+ }
+
+ *r = (un21 * b + un0 - q0 * v) >> s;
+ return q1 * b + q0;
+#endif
+}
+
+// Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
+static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
+ if (signed_shift > 0) {
+ uint32_t shift = signed_shift;
+ *u1 <<= shift;
+ *u1 |= *u0 >> (64 - shift);
+ *u0 <<= shift;
+ }
+ else if (signed_shift < 0) {
+ uint32_t shift = -signed_shift;
+ *u0 >>= shift;
+ *u0 |= *u1 << (64 - shift);
+ *u1 >>= shift;
+ }
+}
+
+// Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
+static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
+#if defined(HAS_INT128_T) && \
+ defined(HAS_INT128_DIV)
+ __uint128_t ufull = u_hi;
+ __uint128_t vfull = v_hi;
+ ufull = (ufull << 64) | u_lo;
+ vfull = (vfull << 64) | v_lo;
+ uint64_t res = (uint64_t)(ufull / vfull);
+ __uint128_t remainder = ufull - (vfull * res);
+ *r_lo = (uint64_t)remainder;
+ *r_hi = (uint64_t)(remainder >> 64);
+ return res;
+#else
+ // Adapted from "Unsigned Doubleword Division" in Hacker's Delight
+ // We want to compute u / v
+ typedef struct { uint64_t hi; uint64_t lo; } u128_t;
+ u128_t u = {u_hi, u_lo};
+ u128_t v = {v_hi, v_lo};
+
+ if (v.hi == 0) {
+ // divisor v is a 64 bit value, so we just need one 128/64 division
+ // Note that we are simpler than Hacker's Delight here, because we know
+ // the quotient fits in 64 bits whereas Hacker's Delight demands a full
+ // 128 bit quotient
+ *r_hi = 0;
+ return libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo);
+ }
+ // Here v >= 2**64
+ // We know that v.hi != 0, so count leading zeros is OK
+ // We have 0 <= n <= 63
+ uint32_t n = libdivide_count_leading_zeros64(v.hi);
+
+ // Normalize the divisor so its MSB is 1
+ u128_t v1t = v;
+ libdivide_u128_shift(&v1t.hi, &v1t.lo, n);
+ uint64_t v1 = v1t.hi; // i.e. v1 = v1t >> 64
+
+ // To ensure no overflow
+ u128_t u1 = u;
+ libdivide_u128_shift(&u1.hi, &u1.lo, -1);
+
+ // Get quotient from divide unsigned insn.
+ uint64_t rem_ignored;
+ uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored);
+
+ // Undo normalization and division of u by 2.
+ u128_t q0 = {0, q1};
+ libdivide_u128_shift(&q0.hi, &q0.lo, n);
+ libdivide_u128_shift(&q0.hi, &q0.lo, -63);
+
+ // Make q0 correct or too small by 1
+ // Equivalent to `if (q0 != 0) q0 = q0 - 1;`
+ if (q0.hi != 0 || q0.lo != 0) {
+ q0.hi -= (q0.lo == 0); // borrow
+ q0.lo -= 1;
+ }
+
+ // Now q0 is correct.
+ // Compute q0 * v as q0v
+ // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo)
+ // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) +
+ // (q0.lo * v.hi << 64) + q0.lo * v.lo)
+ // Each term is 128 bit
+ // High half of full product (upper 128 bits!) is dropped
+ u128_t q0v = {0, 0};
+ q0v.hi = q0.hi*v.lo + q0.lo*v.hi + libdivide_mullhi_u64(q0.lo, v.lo);
+ q0v.lo = q0.lo*v.lo;
+
+ // Compute u - q0v as u_q0v
+ // This is the remainder
+ u128_t u_q0v = u;
+ u_q0v.hi -= q0v.hi + (u.lo < q0v.lo); // second term is borrow
+ u_q0v.lo -= q0v.lo;
+
+ // Check if u_q0v >= v
+ // This checks if our remainder is larger than the divisor
+ if ((u_q0v.hi > v.hi) ||
+ (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) {
+ // Increment q0
+ q0.lo += 1;
+ q0.hi += (q0.lo == 0); // carry
+
+ // Subtract v from remainder
+ u_q0v.hi -= v.hi + (u_q0v.lo < v.lo);
+ u_q0v.lo -= v.lo;
+ }
+
+ *r_hi = u_q0v.hi;
+ *r_lo = u_q0v.lo;
+
+ LIBDIVIDE_ASSERT(q0.hi == 0);
+ return q0.lo;
+#endif
+}
+
+////////// UINT32
+
+static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int branchfree) {
+ if (d == 0) {
+ LIBDIVIDE_ERROR("divider must be != 0");
+ }
+
+ struct libdivide_u32_t result;
+ uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d);
+
+ // Power of 2
+ if ((d & (d - 1)) == 0) {
+ // We need to subtract 1 from the shift value in case of an unsigned
+ // branchfree divider because there is a hardcoded right shift by 1
+ // in its division algorithm. Because of this we also need to add back
+ // 1 in its recovery algorithm.
+ result.magic = 0;
+ result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
+ } else {
+ uint8_t more;
+ uint32_t rem, proposed_m;
+ proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem);
+
+ LIBDIVIDE_ASSERT(rem > 0 && rem < d);
+ const uint32_t e = d - rem;
+
+ // This power works if e < 2**floor_log_2_d.
+ if (!branchfree && (e < (1U << floor_log_2_d))) {
+ // This power works
+ more = floor_log_2_d;
+ } else {
+ // We have to use the general 33-bit algorithm. We need to compute
+ // (2**power) / d. However, we already have (2**(power-1))/d and
+ // its remainder. By doubling both, and then correcting the
+ // remainder, we can compute the larger division.
+ // don't care about overflow here - in fact, we expect it
+ proposed_m += proposed_m;
+ const uint32_t twice_rem = rem + rem;
+ if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
+ more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
+ }
+ result.magic = 1 + proposed_m;
+ result.more = more;
+ // result.more's shift should in general be ceil_log_2_d. But if we
+ // used the smaller power, we subtract one from the shift because we're
+ // using the smaller power. If we're using the larger power, we
+ // subtract one from the shift because it's taken care of by the add
+ // indicator. So floor_log_2_d happens to be correct in both cases.
+ }
+ return result;
+}
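+
+// Worked example (illustrative commentary, not from upstream): for d = 7,
+// floor_log_2_d = 2 and 7 is not a power of 2. The first division gives
+// proposed_m = (4 * 2^32) / 7 = 2454267026 with rem = 2, so e = 7 - 2 = 5.
+// Since 5 >= 2^2, the general 33-bit path doubles proposed_m to 4908534052
+// (twice_rem = 4 < 7, so no extra increment), giving magic = 4908534053,
+// which wraps to 0x24924925 in 32 bits, and more = 2 | LIBDIVIDE_ADD_MARKER.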
+
+struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
+ return libdivide_internal_u32_gen(d, 0);
+}
+
+struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
+ if (d == 1) {
+ LIBDIVIDE_ERROR("branchfree divider must be != 1");
+ }
+ struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1);
+ struct libdivide_u32_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)};
+ return ret;
+}
+
+uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ return numer >> more;
+ }
+ else {
+ uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ uint32_t t = ((numer - q) >> 1) + q;
+ return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
+ }
+ else {
+ // All upper bits are 0,
+ // don't need to mask them off.
+ return q >> more;
+ }
+ }
+}
+
+uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
+ uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
+ uint32_t t = ((numer - q) >> 1) + q;
+ return t >> denom->more;
+}
+
+uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) {
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+
+ if (!denom->magic) {
+ return 1U << shift;
+ } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
+ // We compute q = n/d = n*m / 2^(32 + shift)
+ // Therefore we have d = 2^(32 + shift) / m
+ // We need to ceil it.
+ // We know d is not a power of 2, so m is not a power of 2,
+ // so we can just add 1 to the floor
+ uint32_t hi_dividend = 1U << shift;
+ uint32_t rem_ignored;
+ return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored);
+ } else {
+ // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
+ // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
+ // Also note that shift may be as high as 31, so shift + 1 will
+ // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
+ // then double the quotient and remainder.
+ uint64_t half_n = 1ULL << (32 + shift);
+ uint64_t d = (1ULL << 32) | denom->magic;
+ // Note that the quotient is guaranteed <= 32 bits, but the remainder
+ // may need 33!
+ uint32_t half_q = (uint32_t)(half_n / d);
+ uint64_t rem = half_n % d;
+ // We computed 2^(32+shift)/(m+2^32)
+ // Need to double it, and then add 1 to the quotient if doubling the
+ // remainder would increase the quotient.
+ // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
+ uint32_t full_q = half_q + half_q + ((rem<<1) >= d);
+
+ // We rounded down in gen (hence +1)
+ return full_q + 1;
+ }
+}
+
+uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) {
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+
+ if (!denom->magic) {
+ return 1U << (shift + 1);
+ } else {
+ // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
+ // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
+ // Also note that shift may be as high as 31, so shift + 1 will
+ // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
+ // then double the quotient and remainder.
+ uint64_t half_n = 1ULL << (32 + shift);
+ uint64_t d = (1ULL << 32) | denom->magic;
+ // Note that the quotient is guaranteed <= 32 bits, but the remainder
+ // may need 33!
+ uint32_t half_q = (uint32_t)(half_n / d);
+ uint64_t rem = half_n % d;
+ // We computed 2^(32+shift)/(m+2^32)
+ // Need to double it, and then add 1 to the quotient if doubling the
+ // remainder would increase the quotient.
+ // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
+ uint32_t full_q = half_q + half_q + ((rem<<1) >= d);
+
+ // We rounded down in gen (hence +1)
+ return full_q + 1;
+ }
+}
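+
+// Illustrative usage sketch (not part of upstream libdivide); the helper name
+// is hypothetical. A divider is generated once and reused for many divisions,
+// which is where the win over hardware division comes from. Continuing the
+// d = 7 example: for numer = 100 the mullhi step gives q = 14, the add path
+// computes t = ((100 - 14) >> 1) + 14 = 57, and 57 >> 2 == 14 == 100 / 7.
+static inline uint32_t libdivide_example_u32_divide(uint32_t numer, uint32_t d) {
+ struct libdivide_u32_t divider = libdivide_u32_gen(d); // precompute once per divisor
+ LIBDIVIDE_ASSERT(libdivide_u32_recover(&divider) == d); // recover() round-trips d
+ return libdivide_u32_do(numer, &divider); // same result as numer / d
+}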
+
+/////////// UINT64
+
+static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int branchfree) {
+ if (d == 0) {
+ LIBDIVIDE_ERROR("divider must be != 0");
+ }
+
+ struct libdivide_u64_t result;
+ uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d);
+
+ // Power of 2
+ if ((d & (d - 1)) == 0) {
+ // We need to subtract 1 from the shift value in case of an unsigned
+ // branchfree divider because there is a hardcoded right shift by 1
+ // in its division algorithm. Because of this we also need to add back
+ // 1 in its recovery algorithm.
+ result.magic = 0;
+ result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
+ } else {
+ uint64_t proposed_m, rem;
+ uint8_t more;
+ // (1 << (64 + floor_log_2_d)) / d
+ proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem);
+
+ LIBDIVIDE_ASSERT(rem > 0 && rem < d);
+ const uint64_t e = d - rem;
+
+ // This power works if e < 2**floor_log_2_d.
+ if (!branchfree && e < (1ULL << floor_log_2_d)) {
+ // This power works
+ more = floor_log_2_d;
+ } else {
+ // We have to use the general 65-bit algorithm. We need to compute
+ // (2**power) / d. However, we already have (2**(power-1))/d and
+ // its remainder. By doubling both, and then correcting the
+ // remainder, we can compute the larger division.
+ // don't care about overflow here - in fact, we expect it
+ proposed_m += proposed_m;
+ const uint64_t twice_rem = rem + rem;
+ if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
+ more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
+ }
+ result.magic = 1 + proposed_m;
+ result.more = more;
+ // result.more's shift should in general be ceil_log_2_d. But if we
+ // used the smaller power, we subtract one from the shift because we're
+ // using the smaller power. If we're using the larger power, we
+ // subtract one from the shift because it's taken care of by the add
+ // indicator. So floor_log_2_d happens to be correct in both cases,
+ // which is why we do it outside of the if statement.
+ }
+ return result;
+}
+
+struct libdivide_u64_t libdivide_u64_gen(uint64_t d) {
+ return libdivide_internal_u64_gen(d, 0);
+}
+
+struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {
+ if (d == 1) {
+ LIBDIVIDE_ERROR("branchfree divider must be != 1");
+ }
+ struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1);
+ struct libdivide_u64_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)};
+ return ret;
+}
+
+uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ return numer >> more;
+ }
+ else {
+ uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ uint64_t t = ((numer - q) >> 1) + q;
+ return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
+ }
+ else {
+ // All upper bits are 0,
+ // don't need to mask them off.
+ return q >> more;
+ }
+ }
+}
+
+uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
+ uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
+ uint64_t t = ((numer - q) >> 1) + q;
+ return t >> denom->more;
+}
+
+uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) {
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+
+ if (!denom->magic) {
+ return 1ULL << shift;
+ } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
+ // We compute q = n/d = n*m / 2^(64 + shift)
+ // Therefore we have d = 2^(64 + shift) / m
+ // We need to ceil it.
+ // We know d is not a power of 2, so m is not a power of 2,
+ // so we can just add 1 to the floor
+ uint64_t hi_dividend = 1ULL << shift;
+ uint64_t rem_ignored;
+ return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored);
+ } else {
+ // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
+ // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
+ // libdivide_u32_recover for more on what we do here.
+ // TODO: do something better than 128 bit math
+
+ // Full n is a (potentially) 129 bit value
+ // half_n is a 128 bit value
+ // Compute the hi half of half_n. Low half is 0.
+ uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0;
+ // d is a 65 bit value. The high bit is always set to 1.
+ const uint64_t d_hi = 1, d_lo = denom->magic;
+ // Note that the quotient is guaranteed <= 64 bits,
+ // but the remainder may need 65!
+ uint64_t r_hi, r_lo;
+ uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
+ // We computed 2^(64+shift)/(m+2^64)
+ // Double the remainder ('dr') and check if that is larger than d
+ // Note that d is a 65 bit value, so the high half of the remainder (r_hi)
+ // is at most 1 and r_hi + r_hi cannot overflow
+ uint64_t dr_lo = r_lo + r_lo;
+ uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry
+ int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
+ uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
+ return full_q + 1;
+ }
+}
+
+uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) {
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+
+ if (!denom->magic) {
+ return 1ULL << (shift + 1);
+ } else {
+ // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
+ // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
+ // libdivide_u32_recover for more on what we do here.
+ // TODO: do something better than 128 bit math
+
+ // Full n is a (potentially) 129 bit value
+ // half_n is a 128 bit value
+ // Compute the hi half of half_n. Low half is 0.
+ uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0;
+ // d is a 65 bit value. The high bit is always set to 1.
+ const uint64_t d_hi = 1, d_lo = denom->magic;
+ // Note that the quotient is guaranteed <= 64 bits,
+ // but the remainder may need 65!
+ uint64_t r_hi, r_lo;
+ uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
+ // We computed 2^(64+shift)/(m+2^64)
+ // Double the remainder ('dr') and check if that is larger than d
+ // Note that d is a 65 bit value, so the high half of the remainder (r_hi)
+ // is at most 1 and r_hi + r_hi cannot overflow
+ uint64_t dr_lo = r_lo + r_lo;
+ uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry
+ int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
+ uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
+ return full_q + 1;
+ }
+}
+
+/////////// SINT32
+
+static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int branchfree) {
+ if (d == 0) {
+ LIBDIVIDE_ERROR("divider must be != 0");
+ }
+
+ struct libdivide_s32_t result;
+
+ // If d is a power of 2, or the negative of a power of 2, we have to use a shift.
+ // This is especially important because the magic algorithm fails for -1.
+ // To check if d is a power of 2 or its inverse, it suffices to check
+ // whether its absolute value has exactly one bit set. This works even for
+ // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
+ // and is a power of 2.
+ uint32_t ud = (uint32_t)d;
+ uint32_t absD = (d < 0) ? -ud : ud;
+ uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD);
+ // check if exactly one bit is set,
+ // don't care if absD is 0 since that's divide by zero
+ if ((absD & (absD - 1)) == 0) {
+ // Branchfree and normal paths are exactly the same
+ result.magic = 0;
+ result.more = floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0);
+ } else {
+ LIBDIVIDE_ASSERT(floor_log_2_d >= 1);
+
+ uint8_t more;
+ // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word
+ // is 0 and the high word is 2**(floor_log_2_d - 1)
+ uint32_t rem, proposed_m;
+ proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem);
+ const uint32_t e = absD - rem;
+
+ // We are going to start with a power of floor_log_2_d - 1.
+ // This works if e < 2**floor_log_2_d.
+ if (!branchfree && e < (1U << floor_log_2_d)) {
+ // This power works
+ more = floor_log_2_d - 1;
+ } else {
+ // We need to go one higher. This should not make proposed_m
+ // overflow, but it will make it negative when interpreted as an
+ // int32_t.
+ proposed_m += proposed_m;
+ const uint32_t twice_rem = rem + rem;
+ if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
+ more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
+ }
+
+ proposed_m += 1;
+ int32_t magic = (int32_t)proposed_m;
+
+ // Mark if we are negative. Note we only negate the magic number in the
+ // branchfull case.
+ if (d < 0) {
+ more |= LIBDIVIDE_NEGATIVE_DIVISOR;
+ if (!branchfree) {
+ magic = -magic;
+ }
+ }
+
+ result.more = more;
+ result.magic = magic;
+ }
+ return result;
+}
+
+struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
+ return libdivide_internal_s32_gen(d, 0);
+}
+
+struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
+ struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1);
+ struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more};
+ return result;
+}
+
+int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+
+ if (!denom->magic) {
+ uint32_t sign = (int8_t)more >> 7;
+ uint32_t mask = (1U << shift) - 1;
+ uint32_t uq = numer + ((numer >> 31) & mask);
+ int32_t q = (int32_t)uq;
+ q >>= shift;
+ q = (q ^ sign) - sign;
+ return q;
+ } else {
+ uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer);
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // must be arithmetic shift and then sign extend
+ int32_t sign = (int8_t)more >> 7;
+ // q += (more < 0 ? -numer : numer)
+ // cast required to avoid UB
+ uq += ((uint32_t)numer ^ sign) - sign;
+ }
+ int32_t q = (int32_t)uq;
+ q >>= shift;
+ q += (q < 0);
+ return q;
+ }
+}
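+
+// Worked example (illustrative only): the shift path above truncates toward
+// zero like C division. For d = 4 (magic == 0, shift == 2) and numer = -7,
+// the tweak adds (numer >> 31) & 3 == 3, so uq = -4 and -4 >> 2 == -1,
+// matching -7 / 4 == -1; a bare arithmetic shift of -7 would give -2
+// (rounding toward -inf) instead.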
+
+int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+ // must be arithmetic shift and then sign extend
+ int32_t sign = (int8_t)more >> 7;
+ int32_t magic = denom->magic;
+ int32_t q = libdivide_mullhi_s32(magic, numer);
+ q += numer;
+
+ // If q is non-negative, we have nothing to do
+ // If q is negative, we want to add either (2**shift)-1 if d is a power of
+ // 2, or (2**shift) if it is not a power of 2
+ uint32_t is_power_of_2 = (magic == 0);
+ uint32_t q_sign = (uint32_t)(q >> 31);
+ q += q_sign & ((1U << shift) - is_power_of_2);
+
+ // Now arithmetic right shift
+ q >>= shift;
+ // Negate if needed
+ q = (q ^ sign) - sign;
+
+ return q;
+}
+
+int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) {
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+ if (!denom->magic) {
+ uint32_t absD = 1U << shift;
+ if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
+ absD = -absD;
+ }
+ return (int32_t)absD;
+ } else {
+ // Unsigned math is much easier
+ // We negate the magic number only in the branchfull case, and we don't
+ // know which case we're in. However we have enough information to
+ // determine the correct sign of the magic number. The divisor was
+ // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
+ // the magic number's sign is opposite that of the divisor.
+ // We want to compute the positive magic number.
+ int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
+ int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER)
+ ? denom->magic > 0 : denom->magic < 0;
+
+ // Handle the power of 2 case (including branchfree)
+ if (denom->magic == 0) {
+ int32_t result = 1U << shift;
+ return negative_divisor ? -result : result;
+ }
+
+ uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic);
+ uint64_t n = 1ULL << (32 + shift); // this shift cannot exceed 30
+ uint32_t q = (uint32_t)(n / d);
+ int32_t result = (int32_t)q;
+ result += 1;
+ return negative_divisor ? -result : result;
+ }
+}
+
+int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) {
+ return libdivide_s32_recover((const struct libdivide_s32_t *)denom);
+}
+
+///////////// SINT64
+
+static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int branchfree) {
+ if (d == 0) {
+ LIBDIVIDE_ERROR("divider must be != 0");
+ }
+
+ struct libdivide_s64_t result;
+
+ // If d is a power of 2, or the negative of a power of 2, we have to use a shift.
+ // This is especially important because the magic algorithm fails for -1.
+ // To check if d is a power of 2 or its inverse, it suffices to check
+ // whether its absolute value has exactly one bit set. This works even for
+ // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
+ // and is a power of 2.
+ uint64_t ud = (uint64_t)d;
+ uint64_t absD = (d < 0) ? -ud : ud;
+ uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD);
+ // check if exactly one bit is set,
+ // don't care if absD is 0 since that's divide by zero
+ if ((absD & (absD - 1)) == 0) {
+ // Branchfree and non-branchfree cases are the same
+ result.magic = 0;
+ result.more = floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0);
+ } else {
+ // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word
+ // is 0 and the high word is 2**(floor_log_2_d - 1)
+ uint8_t more;
+ uint64_t rem, proposed_m;
+ proposed_m = libdivide_128_div_64_to_64(1ULL << (floor_log_2_d - 1), 0, absD, &rem);
+ const uint64_t e = absD - rem;
+
+ // We are going to start with a power of floor_log_2_d - 1.
+ // This works if e < 2**floor_log_2_d.
+ if (!branchfree && e < (1ULL << floor_log_2_d)) {
+ // This power works
+ more = floor_log_2_d - 1;
+ } else {
+ // We need to go one higher. This should not make proposed_m
+ // overflow, but it will make it negative when interpreted as an
+ // int32_t.
+ proposed_m += proposed_m;
+ const uint64_t twice_rem = rem + rem;
+ if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
+ // note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we
+ // also set ADD_MARKER; this is an annoying optimization that
+ // enables algorithm #4 to avoid the mask. However, we always set it
+ // in the branchfree case.
+ more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
+ }
+ proposed_m += 1;
+ int64_t magic = (int64_t)proposed_m;
+
+ // Mark if we are negative
+ if (d < 0) {
+ more |= LIBDIVIDE_NEGATIVE_DIVISOR;
+ if (!branchfree) {
+ magic = -magic;
+ }
+ }
+
+ result.more = more;
+ result.magic = magic;
+ }
+ return result;
+}
+
+struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
+ return libdivide_internal_s64_gen(d, 0);
+}
+
+struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
+ struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1);
+ struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more};
+ return ret;
+}
+
+int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+
+ if (!denom->magic) { // shift path
+ uint64_t mask = (1ULL << shift) - 1;
+ uint64_t uq = numer + ((numer >> 63) & mask);
+ int64_t q = (int64_t)uq;
+ q >>= shift;
+ // must be arithmetic shift and then sign-extend
+ int64_t sign = (int8_t)more >> 7;
+ q = (q ^ sign) - sign;
+ return q;
+ } else {
+ uint64_t uq = (uint64_t)libdivide_mullhi_s64(denom->magic, numer);
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // must be arithmetic shift and then sign extend
+ int64_t sign = (int8_t)more >> 7;
+ // q += (more < 0 ? -numer : numer)
+ // cast required to avoid UB
+ uq += ((uint64_t)numer ^ sign) - sign;
+ }
+ int64_t q = (int64_t)uq;
+ q >>= shift;
+ q += (q < 0);
+ return q;
+ }
+}
+
+int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+ // must be arithmetic shift and then sign extend
+ int64_t sign = (int8_t)more >> 7;
+ int64_t magic = denom->magic;
+ int64_t q = libdivide_mullhi_s64(magic, numer);
+ q += numer;
+
+ // If q is non-negative, we have nothing to do.
+ // If q is negative, we want to add either (2**shift)-1 if d is a power of
+ // 2, or (2**shift) if it is not a power of 2.
+ uint64_t is_power_of_2 = (magic == 0);
+ uint64_t q_sign = (uint64_t)(q >> 63);
+ q += q_sign & ((1ULL << shift) - is_power_of_2);
+
+ // Arithmetic right shift
+ q >>= shift;
+ // Negate if needed
+ q = (q ^ sign) - sign;
+
+ return q;
+}
+
+int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) {
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+ if (denom->magic == 0) { // shift path
+ uint64_t absD = 1ULL << shift;
+ if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
+ absD = -absD;
+ }
+ return (int64_t)absD;
+ } else {
+ // Unsigned math is much easier
+ int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
+ int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER)
+ ? denom->magic > 0 : denom->magic < 0;
+
+ uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic);
+ uint64_t n_hi = 1ULL << shift, n_lo = 0;
+ uint64_t rem_ignored;
+ uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored);
+ int64_t result = (int64_t)(q + 1);
+ if (negative_divisor) {
+ result = -result;
+ }
+ return result;
+ }
+}
+
+int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) {
+ return libdivide_s64_recover((const struct libdivide_s64_t *)denom);
+}
+
+#if defined(LIBDIVIDE_AVX512)
+
+static inline __m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom);
+static inline __m512i libdivide_s32_do_vector(__m512i numers, const struct libdivide_s32_t *denom);
+static inline __m512i libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom);
+static inline __m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom);
+
+static inline __m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom);
+static inline __m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom);
+static inline __m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom);
+static inline __m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom);
+
+//////// Internal Utility Functions
+
+static inline __m512i libdivide_s64_signbits(__m512i v) {
+ return _mm512_srai_epi64(v, 63);
+}
+
+static inline __m512i libdivide_s64_shift_right_vector(__m512i v, int amt) {
+ return _mm512_srai_epi64(v, amt);
+}
+
+// Here, b is assumed to contain one 32-bit value repeated.
+static inline __m512i libdivide_mullhi_u32_vector(__m512i a, __m512i b) {
+ __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32);
+ __m512i a1X3X = _mm512_srli_epi64(a, 32);
+ __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
+ __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask);
+ return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
+}
+
+// b is one 32-bit value repeated.
+static inline __m512i libdivide_mullhi_s32_vector(__m512i a, __m512i b) {
+ __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32);
+ __m512i a1X3X = _mm512_srli_epi64(a, 32);
+ __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
+ __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask);
+ return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
+}
+
+// Here, y is assumed to contain one 64-bit value repeated.
+// https://stackoverflow.com/a/28827013
+static inline __m512i libdivide_mullhi_u64_vector(__m512i x, __m512i y) {
+ __m512i lomask = _mm512_set1_epi64(0xffffffff);
+ __m512i xh = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM) 0xB1);
+ __m512i yh = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM) 0xB1);
+ __m512i w0 = _mm512_mul_epu32(x, y);
+ __m512i w1 = _mm512_mul_epu32(x, yh);
+ __m512i w2 = _mm512_mul_epu32(xh, y);
+ __m512i w3 = _mm512_mul_epu32(xh, yh);
+ __m512i w0h = _mm512_srli_epi64(w0, 32);
+ __m512i s1 = _mm512_add_epi64(w1, w0h);
+ __m512i s1l = _mm512_and_si512(s1, lomask);
+ __m512i s1h = _mm512_srli_epi64(s1, 32);
+ __m512i s2 = _mm512_add_epi64(w2, s1l);
+ __m512i s2h = _mm512_srli_epi64(s2, 32);
+ __m512i hi = _mm512_add_epi64(w3, s1h);
+ hi = _mm512_add_epi64(hi, s2h);
+
+ return hi;
+}
+
+// y is one 64-bit value repeated.
+static inline __m512i libdivide_mullhi_s64_vector(__m512i x, __m512i y) {
+ __m512i p = libdivide_mullhi_u64_vector(x, y);
+ __m512i t1 = _mm512_and_si512(libdivide_s64_signbits(x), y);
+ __m512i t2 = _mm512_and_si512(libdivide_s64_signbits(y), x);
+ p = _mm512_sub_epi64(p, t1);
+ p = _mm512_sub_epi64(p, t2);
+ return p;
+}
+
+////////// UINT32
+
+__m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ return _mm512_srli_epi32(numers, more);
+ }
+ else {
+ __m512i q = libdivide_mullhi_u32_vector(numers, _mm512_set1_epi32(denom->magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // uint32_t t = ((numer - q) >> 1) + q;
+ // return t >> denom->shift;
+ uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+ __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
+ return _mm512_srli_epi32(t, shift);
+ }
+ else {
+ return _mm512_srli_epi32(q, more);
+ }
+ }
+}
+
+__m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom) {
+ __m512i q = libdivide_mullhi_u32_vector(numers, _mm512_set1_epi32(denom->magic));
+ __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
+ return _mm512_srli_epi32(t, denom->more);
+}
+
+////////// UINT64
+
+__m512i libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ return _mm512_srli_epi64(numers, more);
+ }
+ else {
+ __m512i q = libdivide_mullhi_u64_vector(numers, _mm512_set1_epi64(denom->magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // uint32_t t = ((numer - q) >> 1) + q;
+ // return t >> denom->shift;
+ uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+ __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
+ return _mm512_srli_epi64(t, shift);
+ }
+ else {
+ return _mm512_srli_epi64(q, more);
+ }
+ }
+}
+
+__m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom) {
+ __m512i q = libdivide_mullhi_u64_vector(numers, _mm512_set1_epi64(denom->magic));
+ __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
+ return _mm512_srli_epi64(t, denom->more);
+}
+
+////////// SINT32
+
+__m512i libdivide_s32_do_vector(__m512i numers, const struct libdivide_s32_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+ uint32_t mask = (1U << shift) - 1;
+ __m512i roundToZeroTweak = _mm512_set1_epi32(mask);
+ // q = numer + ((numer >> 31) & roundToZeroTweak);
+ __m512i q = _mm512_add_epi32(numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak));
+ q = _mm512_srai_epi32(q, shift);
+ __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
+ // q = (q ^ sign) - sign;
+ q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);
+ return q;
+ }
+ else {
+ __m512i q = libdivide_mullhi_s32_vector(numers, _mm512_set1_epi32(denom->magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // must be arithmetic shift
+ __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
+ // q += ((numer ^ sign) - sign);
+ q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign));
+ }
+ // q >>= shift
+ q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
+ q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0)
+ return q;
+ }
+}
+
+__m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom) {
+ int32_t magic = denom->magic;
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+ // must be arithmetic shift
+ __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
+ __m512i q = libdivide_mullhi_s32_vector(numers, _mm512_set1_epi32(magic));
+ q = _mm512_add_epi32(q, numers); // q += numers
+
+ // If q is non-negative, we have nothing to do
+ // If q is negative, we want to add either (2**shift)-1 if d is
+ // a power of 2, or (2**shift) if it is not a power of 2
+ uint32_t is_power_of_2 = (magic == 0);
+ __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31
+ __m512i mask = _mm512_set1_epi32((1U << shift) - is_power_of_2);
+ q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask)
+ q = _mm512_srai_epi32(q, shift); // q >>= shift
+ q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign
+ return q;
+}
+
+////////// SINT64
+
+__m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom) {
+ uint8_t more = denom->more;
+ int64_t magic = denom->magic;
+ if (magic == 0) { // shift path
+ uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+ uint64_t mask = (1ULL << shift) - 1;
+ __m512i roundToZeroTweak = _mm512_set1_epi64(mask);
+ // q = numer + ((numer >> 63) & roundToZeroTweak);
+ __m512i q = _mm512_add_epi64(numers, _mm512_and_si512(libdivide_s64_signbits(numers), roundToZeroTweak));
+ q = libdivide_s64_shift_right_vector(q, shift);
+ __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
+ // q = (q ^ sign) - sign;
+ q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);
+ return q;
+ }
+ else {
+ __m512i q = libdivide_mullhi_s64_vector(numers, _mm512_set1_epi64(magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // must be arithmetic shift
+ __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
+ // q += ((numer ^ sign) - sign);
+ q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign));
+ }
+ // q >>= denom->mult_path.shift
+ q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK);
+ q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0)
+ return q;
+ }
+}
+
+__m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom) {
+ int64_t magic = denom->magic;
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+ // must be arithmetic shift
+ __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
+
+ // libdivide_mullhi_s64(numers, magic);
+ __m512i q = libdivide_mullhi_s64_vector(numers, _mm512_set1_epi64(magic));
+ q = _mm512_add_epi64(q, numers); // q += numers
+
+ // If q is non-negative, we have nothing to do.
+ // If q is negative, we want to add either (2**shift)-1 if d is
+ // a power of 2, or (2**shift) if it is not a power of 2.
+ uint32_t is_power_of_2 = (magic == 0);
+ __m512i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63
+ __m512i mask = _mm512_set1_epi64((1ULL << shift) - is_power_of_2);
+ q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask)
+ q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift
+ q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign
+ return q;
+}
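+
+// Illustrative usage sketch (not part of upstream libdivide); the helper name
+// is hypothetical. The same divider struct produced by libdivide_u32_gen()
+// drives both the scalar and the vector paths, so one generation call serves
+// 16 lanes per 512-bit operation.
+static inline void libdivide_example_u32_divide_avx512(const uint32_t *in,
+ uint32_t *out, size_t n, uint32_t d) {
+ struct libdivide_u32_t divider = libdivide_u32_gen(d); // precompute once
+ size_t i;
+ for (i = 0; i + 16 <= n; i += 16) {
+ __m512i numers = _mm512_loadu_si512((const void *)(in + i));
+ _mm512_storeu_si512((void *)(out + i), libdivide_u32_do_vector(numers, &divider));
+ }
+ for (; i < n; i++) {
+ out[i] = libdivide_u32_do(in[i], &divider); // scalar tail
+ }
+}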
+
+#elif defined(LIBDIVIDE_AVX2)
+
+static inline __m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom);
+static inline __m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom);
+static inline __m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom);
+static inline __m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom);
+
+static inline __m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom);
+static inline __m256i libdivide_s32_branchfree_do_vector(__m256i numers, const struct libdivide_s32_branchfree_t *denom);
+static inline __m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom);
+static inline __m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom);
+
+//////// Internal Utility Functions
+
+// Implementation of _mm256_srai_epi64(v, 63) (from AVX512).
+static inline __m256i libdivide_s64_signbits(__m256i v) {
+ __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
+ __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31);
+ return signBits;
+}
+
+// Implementation of _mm256_srai_epi64 (from AVX512).
+static inline __m256i libdivide_s64_shift_right_vector(__m256i v, int amt) {
+ const int b = 64 - amt;
+ __m256i m = _mm256_set1_epi64x(1ULL << (b - 1));
+ __m256i x = _mm256_srli_epi64(v, amt);
+ __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m);
+ return result;
+}
+
+// Here, b is assumed to contain one 32-bit value repeated.
+static inline __m256i libdivide_mullhi_u32_vector(__m256i a, __m256i b) {
+ __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32);
+ __m256i a1X3X = _mm256_srli_epi64(a, 32);
+ __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
+ __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epu32(a1X3X, b), mask);
+ return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);
+}
+
+// b is one 32-bit value repeated.
+static inline __m256i libdivide_mullhi_s32_vector(__m256i a, __m256i b) {
+ __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32);
+ __m256i a1X3X = _mm256_srli_epi64(a, 32);
+ __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
+ __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epi32(a1X3X, b), mask);
+ return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);
+}
+
+// Here, y is assumed to contain one 64-bit value repeated.
+// https://stackoverflow.com/a/28827013
+static inline __m256i libdivide_mullhi_u64_vector(__m256i x, __m256i y) {
+ __m256i lomask = _mm256_set1_epi64x(0xffffffff);
+ __m256i xh = _mm256_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h
+ __m256i yh = _mm256_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h
+ __m256i w0 = _mm256_mul_epu32(x, y); // x0l*y0l, x1l*y1l
+ __m256i w1 = _mm256_mul_epu32(x, yh); // x0l*y0h, x1l*y1h
+ __m256i w2 = _mm256_mul_epu32(xh, y); // x0h*y0l, x1h*y1l
+ __m256i w3 = _mm256_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h
+ __m256i w0h = _mm256_srli_epi64(w0, 32);
+ __m256i s1 = _mm256_add_epi64(w1, w0h);
+ __m256i s1l = _mm256_and_si256(s1, lomask);
+ __m256i s1h = _mm256_srli_epi64(s1, 32);
+ __m256i s2 = _mm256_add_epi64(w2, s1l);
+ __m256i s2h = _mm256_srli_epi64(s2, 32);
+ __m256i hi = _mm256_add_epi64(w3, s1h);
+ hi = _mm256_add_epi64(hi, s2h);
+
+ return hi;
+}
+
+// y is one 64-bit value repeated.
+static inline __m256i libdivide_mullhi_s64_vector(__m256i x, __m256i y) {
+ __m256i p = libdivide_mullhi_u64_vector(x, y);
+ __m256i t1 = _mm256_and_si256(libdivide_s64_signbits(x), y);
+ __m256i t2 = _mm256_and_si256(libdivide_s64_signbits(y), x);
+ p = _mm256_sub_epi64(p, t1);
+ p = _mm256_sub_epi64(p, t2);
+ return p;
+}
+
+////////// UINT32
+
+__m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ return _mm256_srli_epi32(numers, more);
+ }
+ else {
+ __m256i q = libdivide_mullhi_u32_vector(numers, _mm256_set1_epi32(denom->magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // uint32_t t = ((numer - q) >> 1) + q;
+ // return t >> denom->shift;
+ uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+ __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
+ return _mm256_srli_epi32(t, shift);
+ }
+ else {
+ return _mm256_srli_epi32(q, more);
+ }
+ }
+}
+
+__m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom) {
+ __m256i q = libdivide_mullhi_u32_vector(numers, _mm256_set1_epi32(denom->magic));
+ __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
+ return _mm256_srli_epi32(t, denom->more);
+}
+
+////////// UINT64
+
+__m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ return _mm256_srli_epi64(numers, more);
+ }
+ else {
+ __m256i q = libdivide_mullhi_u64_vector(numers, _mm256_set1_epi64x(denom->magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // uint32_t t = ((numer - q) >> 1) + q;
+ // return t >> denom->shift;
+ uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+ __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
+ return _mm256_srli_epi64(t, shift);
+ }
+ else {
+ return _mm256_srli_epi64(q, more);
+ }
+ }
+}
+
+__m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom) {
+ __m256i q = libdivide_mullhi_u64_vector(numers, _mm256_set1_epi64x(denom->magic));
+ __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
+ return _mm256_srli_epi64(t, denom->more);
+}
+
+////////// SINT32
+
+__m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+ uint32_t mask = (1U << shift) - 1;
+ __m256i roundToZeroTweak = _mm256_set1_epi32(mask);
+ // q = numer + ((numer >> 31) & roundToZeroTweak);
+ __m256i q = _mm256_add_epi32(numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak));
+ q = _mm256_srai_epi32(q, shift);
+ __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
+ // q = (q ^ sign) - sign;
+ q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign);
+ return q;
+ }
+ else {
+ __m256i q = libdivide_mullhi_s32_vector(numers, _mm256_set1_epi32(denom->magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // must be arithmetic shift
+ __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
+ // q += ((numer ^ sign) - sign);
+ q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign));
+ }
+ // q >>= shift
+ q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
+ q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0)
+ return q;
+ }
+}
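The shift path above uses the standard round-toward-zero tweak for signed division by a power of two: add `(1 << shift) - 1` only when the numerator is negative, then arithmetic-shift. A scalar sketch (illustrative only, not part of libdivide) of that identity:

```python
# Illustrative check of the roundToZeroTweak used in the shift path above.
import random

def trunc_div_pow2(n, shift):
    tweak = (1 << shift) - 1
    # In the 32-bit code this is (n >> 31) & tweak; Python's >> is already
    # arithmetic, so the mask is simply added only for negative n.
    return (n + (tweak if n < 0 else 0)) >> shift

for _ in range(10_000):
    n = random.randint(-2**31, 2**31 - 1)
    shift = random.randint(0, 30)
    expected = int(n / (1 << shift))     # C-style truncating division
    assert trunc_div_pow2(n, shift) == expected
```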
+
+__m256i libdivide_s32_branchfree_do_vector(__m256i numers, const struct libdivide_s32_branchfree_t *denom) {
+ int32_t magic = denom->magic;
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+ // must be arithmetic shift
+ __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
+ __m256i q = libdivide_mullhi_s32_vector(numers, _mm256_set1_epi32(magic));
+ q = _mm256_add_epi32(q, numers); // q += numers
+
+ // If q is non-negative, we have nothing to do
+ // If q is negative, we want to add either (2**shift)-1 if d is
+ // a power of 2, or (2**shift) if it is not a power of 2
+ uint32_t is_power_of_2 = (magic == 0);
+ __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31
+ __m256i mask = _mm256_set1_epi32((1U << shift) - is_power_of_2);
+ q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask)
+ q = _mm256_srai_epi32(q, shift); // q >>= shift
+ q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign
+ return q;
+}
+
+////////// SINT64
+
+__m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom) {
+ uint8_t more = denom->more;
+ int64_t magic = denom->magic;
+ if (magic == 0) { // shift path
+ uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+ uint64_t mask = (1ULL << shift) - 1;
+ __m256i roundToZeroTweak = _mm256_set1_epi64x(mask);
+ // q = numer + ((numer >> 63) & roundToZeroTweak);
+ __m256i q = _mm256_add_epi64(numers, _mm256_and_si256(libdivide_s64_signbits(numers), roundToZeroTweak));
+ q = libdivide_s64_shift_right_vector(q, shift);
+ __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
+ // q = (q ^ sign) - sign;
+ q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign);
+ return q;
+ }
+ else {
+ __m256i q = libdivide_mullhi_s64_vector(numers, _mm256_set1_epi64x(magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // must be arithmetic shift
+ __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
+ // q += ((numer ^ sign) - sign);
+ q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign));
+ }
+ // q >>= denom->mult_path.shift
+ q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK);
+ q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0)
+ return q;
+ }
+}
+
+__m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom) {
+ int64_t magic = denom->magic;
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+ // must be arithmetic shift
+ __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
+
+ // libdivide_mullhi_s64(numers, magic);
+ __m256i q = libdivide_mullhi_s64_vector(numers, _mm256_set1_epi64x(magic));
+ q = _mm256_add_epi64(q, numers); // q += numers
+
+ // If q is non-negative, we have nothing to do.
+ // If q is negative, we want to add either (2**shift)-1 if d is
+ // a power of 2, or (2**shift) if it is not a power of 2.
+ uint32_t is_power_of_2 = (magic == 0);
+ __m256i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63
+ __m256i mask = _mm256_set1_epi64x((1ULL << shift) - is_power_of_2);
+ q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask)
+ q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift
+ q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign
+ return q;
+}
+
+#elif defined(LIBDIVIDE_SSE2)
+
+static inline __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom);
+static inline __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom);
+static inline __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom);
+static inline __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom);
+
+static inline __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom);
+static inline __m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom);
+static inline __m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom);
+static inline __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom);
+
+//////// Internal Utility Functions
+
+// Implementation of _mm_srai_epi64(v, 63) (from AVX512).
+static inline __m128i libdivide_s64_signbits(__m128i v) {
+ __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
+ __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31);
+ return signBits;
+}
+
+// Implementation of _mm_srai_epi64 (from AVX512).
+static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) {
+ const int b = 64 - amt;
+ __m128i m = _mm_set1_epi64x(1ULL << (b - 1));
+ __m128i x = _mm_srli_epi64(v, amt);
+ __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m);
+ return result;
+}
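SSE2 lacks a 64-bit arithmetic right shift, so the helper above emulates it with a logical shift followed by an XOR/subtract against the shifted-down sign-bit position. A scalar sketch (illustrative only, with 64-bit wraparound made explicit) of the same identity:

```python
# Illustrative check of the logical-shift + xor/sub sign-extension trick above.
import random

MASK64 = (1 << 64) - 1

def sra64_emulated(v, amt):              # v is a 64-bit pattern, 0 <= amt <= 63
    m = 1 << (63 - amt)                  # where the sign bit lands after the shift
    x = v >> amt                         # logical shift (v is non-negative here)
    return ((x ^ m) - m) & MASK64        # sign-extend the cleared top bits

for _ in range(10_000):
    amt = random.randint(0, 63)
    signed = random.randint(-2**63, 2**63 - 1)
    pattern = signed & MASK64            # two's-complement bit pattern
    assert sra64_emulated(pattern, amt) == (signed >> amt) & MASK64
```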
+
+// Here, b is assumed to contain one 32-bit value repeated.
+static inline __m128i libdivide_mullhi_u32_vector(__m128i a, __m128i b) {
+ __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32);
+ __m128i a1X3X = _mm_srli_epi64(a, 32);
+ __m128i mask = _mm_set_epi32(-1, 0, -1, 0);
+ __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask);
+ return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3);
+}
+
+// SSE2 does not have a signed multiplication instruction, but we can convert
+// unsigned to signed pretty efficiently. Again, b is just a 32 bit value
+// repeated four times.
+static inline __m128i libdivide_mullhi_s32_vector(__m128i a, __m128i b) {
+ __m128i p = libdivide_mullhi_u32_vector(a, b);
+ // t1 = (a >> 31) & y, arithmetic shift
+ __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b);
+ __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a);
+ p = _mm_sub_epi32(p, t1);
+ p = _mm_sub_epi32(p, t2);
+ return p;
+}
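The two subtractions above come from reinterpreting the signed inputs as unsigned: the unsigned high product is too large by `b` when `a` is negative and by `a` when `b` is negative, while the cross term vanishes modulo 2**32. A scalar sketch (illustrative only, not part of libdivide) of the 32-bit case:

```python
# Illustrative check of the unsigned -> signed high-multiply fixup used above.
import random

MASK32 = 0xFFFFFFFF

def mulhi_u32(a, b):                     # high 32 bits of an unsigned product
    return ((a & MASK32) * (b & MASK32)) >> 32

def mulhi_s32(a, b):                     # built from the unsigned version,
    p = mulhi_u32(a, b)                  # as in libdivide_mullhi_s32_vector
    t1 = (b & MASK32) if a < 0 else 0    # (a >> 31) & b
    t2 = (a & MASK32) if b < 0 else 0    # (b >> 31) & a
    return (p - t1 - t2) & MASK32        # 32-bit wraparound

for _ in range(10_000):
    a = random.randint(-2**31, 2**31 - 1)
    b = random.randint(-2**31, 2**31 - 1)
    assert mulhi_s32(a, b) == ((a * b) >> 32) & MASK32
```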
+
+// Here, y is assumed to contain one 64-bit value repeated.
+// https://stackoverflow.com/a/28827013
+static inline __m128i libdivide_mullhi_u64_vector(__m128i x, __m128i y) {
+ __m128i lomask = _mm_set1_epi64x(0xffffffff);
+ __m128i xh = _mm_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h
+ __m128i yh = _mm_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h
+ __m128i w0 = _mm_mul_epu32(x, y); // x0l*y0l, x1l*y1l
+ __m128i w1 = _mm_mul_epu32(x, yh); // x0l*y0h, x1l*y1h
+ __m128i w2 = _mm_mul_epu32(xh, y); // x0h*y0l, x1h*y1l
+ __m128i w3 = _mm_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h
+ __m128i w0h = _mm_srli_epi64(w0, 32);
+ __m128i s1 = _mm_add_epi64(w1, w0h);
+ __m128i s1l = _mm_and_si128(s1, lomask);
+ __m128i s1h = _mm_srli_epi64(s1, 32);
+ __m128i s2 = _mm_add_epi64(w2, s1l);
+ __m128i s2h = _mm_srli_epi64(s2, 32);
+ __m128i hi = _mm_add_epi64(w3, s1h);
+ hi = _mm_add_epi64(hi, s2h);
+
+ return hi;
+}
+
+// y is one 64-bit value repeated.
+static inline __m128i libdivide_mullhi_s64_vector(__m128i x, __m128i y) {
+ __m128i p = libdivide_mullhi_u64_vector(x, y);
+ __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y);
+ __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x);
+ p = _mm_sub_epi64(p, t1);
+ p = _mm_sub_epi64(p, t2);
+ return p;
+}
+
+////////// UINT32
+
+__m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ return _mm_srli_epi32(numers, more);
+ }
+ else {
+ __m128i q = libdivide_mullhi_u32_vector(numers, _mm_set1_epi32(denom->magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // uint32_t t = ((numer - q) >> 1) + q;
+ // return t >> denom->shift;
+ uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+ __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
+ return _mm_srli_epi32(t, shift);
+ }
+ else {
+ return _mm_srli_epi32(q, more);
+ }
+ }
+}
+
+__m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom) {
+ __m128i q = libdivide_mullhi_u32_vector(numers, _mm_set1_epi32(denom->magic));
+ __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
+ return _mm_srli_epi32(t, denom->more);
+}
+
+////////// UINT64
+
+__m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ return _mm_srli_epi64(numers, more);
+ }
+ else {
+ __m128i q = libdivide_mullhi_u64_vector(numers, _mm_set1_epi64x(denom->magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // uint32_t t = ((numer - q) >> 1) + q;
+ // return t >> denom->shift;
+ uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+ __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
+ return _mm_srli_epi64(t, shift);
+ }
+ else {
+ return _mm_srli_epi64(q, more);
+ }
+ }
+}
+
+__m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom) {
+ __m128i q = libdivide_mullhi_u64_vector(numers, _mm_set1_epi64x(denom->magic));
+ __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
+ return _mm_srli_epi64(t, denom->more);
+}
+
+////////// SINT32
+
+__m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom) {
+ uint8_t more = denom->more;
+ if (!denom->magic) {
+ uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+ uint32_t mask = (1U << shift) - 1;
+ __m128i roundToZeroTweak = _mm_set1_epi32(mask);
+ // q = numer + ((numer >> 31) & roundToZeroTweak);
+ __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
+ q = _mm_srai_epi32(q, shift);
+ __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
+ // q = (q ^ sign) - sign;
+ q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign);
+ return q;
+ }
+ else {
+ __m128i q = libdivide_mullhi_s32_vector(numers, _mm_set1_epi32(denom->magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // must be arithmetic shift
+ __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
+ // q += ((numer ^ sign) - sign);
+ q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign));
+ }
+ // q >>= shift
+ q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
+ q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0)
+ return q;
+ }
+}
+
+__m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom) {
+ int32_t magic = denom->magic;
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+ // must be arithmetic shift
+ __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
+ __m128i q = libdivide_mullhi_s32_vector(numers, _mm_set1_epi32(magic));
+ q = _mm_add_epi32(q, numers); // q += numers
+
+ // If q is non-negative, we have nothing to do
+ // If q is negative, we want to add either (2**shift)-1 if d is
+ // a power of 2, or (2**shift) if it is not a power of 2
+ uint32_t is_power_of_2 = (magic == 0);
+ __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31
+ __m128i mask = _mm_set1_epi32((1U << shift) - is_power_of_2);
+ q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
+ q = _mm_srai_epi32(q, shift); // q >>= shift
+ q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
+ return q;
+}
+
+////////// SINT64
+
+__m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom) {
+ uint8_t more = denom->more;
+ int64_t magic = denom->magic;
+ if (magic == 0) { // shift path
+ uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+ uint64_t mask = (1ULL << shift) - 1;
+ __m128i roundToZeroTweak = _mm_set1_epi64x(mask);
+ // q = numer + ((numer >> 63) & roundToZeroTweak);
+ __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
+ q = libdivide_s64_shift_right_vector(q, shift);
+ __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
+ // q = (q ^ sign) - sign;
+ q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign);
+ return q;
+ }
+ else {
+ __m128i q = libdivide_mullhi_s64_vector(numers, _mm_set1_epi64x(magic));
+ if (more & LIBDIVIDE_ADD_MARKER) {
+ // must be arithmetic shift
+ __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
+ // q += ((numer ^ sign) - sign);
+ q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign));
+ }
+ // q >>= denom->mult_path.shift
+ q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK);
+ q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
+ return q;
+ }
+}
+
+__m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom) {
+ int64_t magic = denom->magic;
+ uint8_t more = denom->more;
+ uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+ // must be arithmetic shift
+ __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
+
+ // libdivide_mullhi_s64(numers, magic);
+ __m128i q = libdivide_mullhi_s64_vector(numers, _mm_set1_epi64x(magic));
+ q = _mm_add_epi64(q, numers); // q += numers
+
+ // If q is non-negative, we have nothing to do.
+ // If q is negative, we want to add either (2**shift)-1 if d is
+ // a power of 2, or (2**shift) if it is not a power of 2.
+ uint32_t is_power_of_2 = (magic == 0);
+ __m128i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63
+ __m128i mask = _mm_set1_epi64x((1ULL << shift) - is_power_of_2);
+ q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
+ q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift
+ q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
+ return q;
+}
+
+#endif
+
+/////////// C++ stuff
+
+#ifdef __cplusplus
+
+// The C++ divider class is templated on both an integer type
+// (like uint64_t) and an algorithm type.
+// * BRANCHFULL is the default algorithm type.
+// * BRANCHFREE is the branchfree algorithm type.
+enum {
+ BRANCHFULL,
+ BRANCHFREE
+};
+
+#if defined(LIBDIVIDE_AVX512)
+ #define LIBDIVIDE_VECTOR_TYPE __m512i
+#elif defined(LIBDIVIDE_AVX2)
+ #define LIBDIVIDE_VECTOR_TYPE __m256i
+#elif defined(LIBDIVIDE_SSE2)
+ #define LIBDIVIDE_VECTOR_TYPE __m128i
+#endif
+
+#if !defined(LIBDIVIDE_VECTOR_TYPE)
+ #define LIBDIVIDE_DIVIDE_VECTOR(ALGO)
+#else
+ #define LIBDIVIDE_DIVIDE_VECTOR(ALGO) \
+ LIBDIVIDE_VECTOR_TYPE divide(LIBDIVIDE_VECTOR_TYPE n) const { \
+ return libdivide_##ALGO##_do_vector(n, &denom); \
+ }
+#endif
+
+// The DISPATCHER_GEN() macro generates C++ methods (for the given integer
+// and algorithm types) that redirect to libdivide's C API.
+#define DISPATCHER_GEN(T, ALGO) \
+ libdivide_##ALGO##_t denom; \
+ dispatcher() { } \
+ dispatcher(T d) \
+ : denom(libdivide_##ALGO##_gen(d)) \
+ { } \
+ T divide(T n) const { \
+ return libdivide_##ALGO##_do(n, &denom); \
+ } \
+ LIBDIVIDE_DIVIDE_VECTOR(ALGO) \
+ T recover() const { \
+ return libdivide_##ALGO##_recover(&denom); \
+ }
+
+// The dispatcher selects a specific division algorithm for a given
+// type and ALGO using partial template specialization.
+template<bool IS_INTEGRAL, bool IS_SIGNED, int SIZEOF, int ALGO> struct dispatcher { };
+
+template<> struct dispatcher<true, true, sizeof(int32_t), BRANCHFULL> { DISPATCHER_GEN(int32_t, s32) };
+template<> struct dispatcher<true, true, sizeof(int32_t), BRANCHFREE> { DISPATCHER_GEN(int32_t, s32_branchfree) };
+template<> struct dispatcher<true, false, sizeof(uint32_t), BRANCHFULL> { DISPATCHER_GEN(uint32_t, u32) };
+template<> struct dispatcher<true, false, sizeof(uint32_t), BRANCHFREE> { DISPATCHER_GEN(uint32_t, u32_branchfree) };
+template<> struct dispatcher<true, true, sizeof(int64_t), BRANCHFULL> { DISPATCHER_GEN(int64_t, s64) };
+template<> struct dispatcher<true, true, sizeof(int64_t), BRANCHFREE> { DISPATCHER_GEN(int64_t, s64_branchfree) };
+template<> struct dispatcher<true, false, sizeof(uint64_t), BRANCHFULL> { DISPATCHER_GEN(uint64_t, u64) };
+template<> struct dispatcher<true, false, sizeof(uint64_t), BRANCHFREE> { DISPATCHER_GEN(uint64_t, u64_branchfree) };
+
+// This is the main divider class for use by the user (C++ API).
+// The actual division algorithm is selected using the dispatcher struct
+// based on the integer and algorithm template parameters.
+template<typename T, int ALGO = BRANCHFULL>
+class divider {
+public:
+ // We leave the default constructor empty so that creating
+ // an array of dividers and then initializing them
+ // later doesn't slow us down.
+ divider() { }
+
+ // Constructor that takes the divisor as a parameter
+ divider(T d) : div(d) { }
+
+ // Divides n by the divisor
+ T divide(T n) const {
+ return div.divide(n);
+ }
+
+ // Recovers the divisor, returns the value that was
+ // used to initialize this divider object.
+ T recover() const {
+ return div.recover();
+ }
+
+ bool operator==(const divider<T, ALGO>& other) const {
+ return div.denom.magic == other.denom.magic &&
+ div.denom.more == other.denom.more;
+ }
+
+ bool operator!=(const divider<T, ALGO>& other) const {
+ return !(*this == other);
+ }
+
+#if defined(LIBDIVIDE_VECTOR_TYPE)
+ // Treats the vector as packed integer values with the same type as
+ // the divider (e.g. s32, u32, s64, u64) and divides each of
+ // them by the divider, returning the packed quotients.
+ LIBDIVIDE_VECTOR_TYPE divide(LIBDIVIDE_VECTOR_TYPE n) const {
+ return div.divide(n);
+ }
+#endif
+
+private:
+ // Storage for the actual divisor
+ dispatcher<std::is_integral<T>::value,
+ std::is_signed<T>::value, sizeof(T), ALGO> div;
+};
+
+// Overload of operator / for scalar division
+template<typename T, int ALGO>
+T operator/(T n, const divider<T, ALGO>& div) {
+ return div.divide(n);
+}
+
+// Overload of operator /= for scalar division
+template<typename T, int ALGO>
+T& operator/=(T& n, const divider<T, ALGO>& div) {
+ n = div.divide(n);
+ return n;
+}
+
+#if defined(LIBDIVIDE_VECTOR_TYPE)
+ // Overload of operator / for vector division
+ template<typename T, int ALGO>
+ LIBDIVIDE_VECTOR_TYPE operator/(LIBDIVIDE_VECTOR_TYPE n, const divider<T, ALGO>& div) {
+ return div.divide(n);
+ }
+ // Overload of operator /= for vector division
+ template<typename T, int ALGO>
+ LIBDIVIDE_VECTOR_TYPE& operator/=(LIBDIVIDE_VECTOR_TYPE& n, const divider<T, ALGO>& div) {
+ n = div.divide(n);
+ return n;
+ }
+#endif
+
+// libdivide::branchfree_divider<T>
+template <typename T>
+using branchfree_divider = divider<T, BRANCHFREE>;
+
+} // namespace libdivide
+
+#endif // __cplusplus
+
+#endif // LIBDIVIDE_H
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index bbcf468c1..63e8bf974 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -210,6 +210,7 @@ typedef enum {
/* For specifying allowed casting in operations which support it */
typedef enum {
+ _NPY_ERROR_OCCURRED_IN_CAST = -1,
/* Only allow identical types */
NPY_NO_CASTING=0,
/* Allow identical and byte swapped types */
@@ -219,7 +220,14 @@ typedef enum {
/* Allow safe casts or casts within the same kind */
NPY_SAME_KIND_CASTING=3,
/* Allow any casts */
- NPY_UNSAFE_CASTING=4
+ NPY_UNSAFE_CASTING=4,
+ /*
+ * Flag to allow signalling that a cast is a view; this flag is not
+ * valid when requesting a cast of specific safety.
+ * _NPY_CAST_IS_VIEW|NPY_EQUIV_CASTING means the same as NPY_NO_CASTING.
+ */
+ // TODO-DTYPES: Needs to be documented.
+ _NPY_CAST_IS_VIEW = 1 << 16,
} NPY_CASTING;
typedef enum {
@@ -701,6 +709,7 @@ typedef struct tagPyArrayObject_fields {
int flags;
/* For weak references */
PyObject *weakreflist;
+ void *_buffer_info; /* private buffer info, tagged to allow warning */
} PyArrayObject_fields;
/*
@@ -720,7 +729,18 @@ typedef struct tagPyArrayObject {
} PyArrayObject;
#endif
-#define NPY_SIZEOF_PYARRAYOBJECT (sizeof(PyArrayObject_fields))
+/*
+ * Removed 2020-Nov-25, NumPy 1.20
+ * #define NPY_SIZEOF_PYARRAYOBJECT (sizeof(PyArrayObject_fields))
+ *
+ * The above macro was removed as it gave a false sense of a stable ABI
+ * with respect to the structure's size. If you require a runtime constant,
+ * you can use `PyArray_Type.tp_basicsize` instead. Otherwise, please
+ * see the PyArrayObject documentation or ask the NumPy developers for
+ * information on how to correctly replace the macro in a way that is
+ * compatible with multiple NumPy versions.
+ */
+
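The same runtime constant can also be read from Python, since a type's `tp_basicsize` slot is exposed as its `__basicsize__` attribute; a minimal sketch:

```python
# PyArray_Type.tp_basicsize, seen from Python; the value varies across
# NumPy versions and builds, so never hard-code it.
import numpy as np

print(np.ndarray.__basicsize__)
```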
/* Array Flags Object */
typedef struct PyArrayFlagsObject {
@@ -1759,8 +1779,8 @@ typedef struct {
} npy_stride_sort_item;
/************************************************************
- * This is the form of the struct that's returned pointed by the
- * PyCObject attribute of an array __array_struct__. See
+ * This is the form of the struct that's stored in the
+ * PyCapsule returned by an array's __array_struct__ attribute. See
* https://docs.scipy.org/doc/numpy/reference/arrays.interface.html for the full
* documentation.
************************************************************/
@@ -1839,6 +1859,10 @@ typedef void (PyDataMem_EventHookFunc)(void *inp, void *outp, size_t size,
PyArray_DTypeMeta *cls, PyTypeObject *obj);
typedef PyArray_Descr *(default_descr_function)(PyArray_DTypeMeta *cls);
+ typedef PyArray_DTypeMeta *(common_dtype_function)(
+ PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2);
+ typedef PyArray_Descr *(common_instance_function)(
+ PyArray_Descr *dtype1, PyArray_Descr *dtype2);
/*
* While NumPy DTypes would not need to be heap types the plan is to
@@ -1894,6 +1918,14 @@ typedef void (PyDataMem_EventHookFunc)(void *inp, void *outp, size_t size,
discover_descr_from_pyobject_function *discover_descr_from_pyobject;
is_known_scalar_type_function *is_known_scalar_type;
default_descr_function *default_descr;
+ common_dtype_function *common_dtype;
+ common_instance_function *common_instance;
+ /*
+ * Dictionary of ArrayMethods representing most possible casts
+ * (structured and object are exceptions).
+ * This should potentially become a weak mapping in the future.
+ */
+ PyObject *castingimpls;
};
#endif /* NPY_INTERNAL_BUILD */
diff --git a/numpy/core/include/numpy/npy_3kcompat.h b/numpy/core/include/numpy/npy_3kcompat.h
index 7e6b01924..191cd244f 100644
--- a/numpy/core/include/numpy/npy_3kcompat.h
+++ b/numpy/core/include/numpy/npy_3kcompat.h
@@ -28,6 +28,30 @@ extern "C" {
* PyInt -> PyLong
*/
+
+/*
+ * This is a renamed copy of the Python non-limited API function _PyLong_AsInt. It is
+ * included here because it is missing from the PyPy API. It completes the PyLong_As*
+ * group of functions and can be useful in replacing PyInt_Check.
+ */
+static NPY_INLINE int
+Npy__PyLong_AsInt(PyObject *obj)
+{
+ int overflow;
+ long result = PyLong_AsLongAndOverflow(obj, &overflow);
+
+ /* INT_MAX and INT_MIN are defined in Python.h */
+ if (overflow || result > INT_MAX || result < INT_MIN) {
+ /* XXX: could be cute and give a different
+ message for overflow == -1 */
+ PyErr_SetString(PyExc_OverflowError,
+ "Python int too large to convert to C int");
+ return -1;
+ }
+ return (int)result;
+}
+
+
#if defined(NPY_PY3K)
/* Return True only if the long fits in a C long */
static NPY_INLINE int PyInt_Check(PyObject *op) {
@@ -39,6 +63,7 @@ static NPY_INLINE int PyInt_Check(PyObject *op) {
return (overflow == 0);
}
+
#define PyInt_FromLong PyLong_FromLong
#define PyInt_AsLong PyLong_AsLong
#define PyInt_AS_LONG PyLong_AsLong
@@ -65,6 +90,8 @@ static NPY_INLINE int PyInt_Check(PyObject *op) {
#define Py_SET_TYPE(obj, type) ((Py_TYPE(obj) = (type)), (void)0)
/* Introduced in https://github.com/python/cpython/commit/b10dc3e7a11fcdb97e285882eba6da92594f90f9 */
#define Py_SET_SIZE(obj, size) ((Py_SIZE(obj) = (size)), (void)0)
+ /* Introduced in https://github.com/python/cpython/commit/c86a11221df7e37da389f9c6ce6e47ea22dc44ff */
+ #define Py_SET_REFCNT(obj, refcnt) ((Py_REFCNT(obj) = (refcnt)), (void)0)
#endif
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 5706e0576..c8495db8e 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -1,22 +1,34 @@
#ifndef _NPY_COMMON_H_
#define _NPY_COMMON_H_
+/* need Python.h for npy_intp, npy_uintp */
+#include <Python.h>
+
/* numpyconfig.h is auto-generated */
#include "numpyconfig.h"
#ifdef HAVE_NPY_CONFIG_H
#include <npy_config.h>
#endif
-/* need Python.h for npy_intp, npy_uintp */
-#include <Python.h>
-
+// compile time environment variables
+#ifndef NPY_RELAXED_STRIDES_CHECKING
+ #define NPY_RELAXED_STRIDES_CHECKING 0
+#endif
+#ifndef NPY_RELAXED_STRIDES_DEBUG
+ #define NPY_RELAXED_STRIDES_DEBUG 0
+#endif
+#ifndef NPY_USE_NEW_CASTINGIMPL
+ #define NPY_USE_NEW_CASTINGIMPL 0
+#endif
/*
* using static inline modifiers when defining npy_math functions
* allows the compiler to make optimizations when possible
*/
-#if defined(NPY_INTERNAL_BUILD) && NPY_INTERNAL_BUILD
#ifndef NPY_INLINE_MATH
-#define NPY_INLINE_MATH 1
+#if defined(NPY_INTERNAL_BUILD) && NPY_INTERNAL_BUILD
+ #define NPY_INLINE_MATH 1
+#else
+ #define NPY_INLINE_MATH 0
#endif
#endif
@@ -262,11 +274,10 @@ typedef Py_uintptr_t npy_uintp;
#define constchar char
/* NPY_INTP_FMT Note:
- * Unlike the other NPY_*_FMT macros which are used with
- * PyOS_snprintf, NPY_INTP_FMT is used with PyErr_Format and
- * PyString_Format. These functions use different formatting
- * codes which are portably specified according to the Python
- * documentation. See ticket #1795.
+ * Unlike the other NPY_*_FMT macros, which are used with PyOS_snprintf,
+ * NPY_INTP_FMT is used with PyErr_Format and PyUnicode_FromFormat. Those
+ * functions use different formatting codes that are portably specified
+ * according to the Python documentation. See issue gh-2388.
*/
#if NPY_SIZEOF_PY_INTPTR_T == NPY_SIZEOF_INT
#define NPY_INTP NPY_INT
diff --git a/numpy/core/include/numpy/npy_cpu.h b/numpy/core/include/numpy/npy_cpu.h
index 509e23a51..4dbf9d84e 100644
--- a/numpy/core/include/numpy/npy_cpu.h
+++ b/numpy/core/include/numpy/npy_cpu.h
@@ -24,7 +24,6 @@
#define _NPY_CPUARCH_H_
#include "numpyconfig.h"
-#include <string.h> /* for memcpy */
#if defined( __i386__ ) || defined(i386) || defined(_M_IX86)
/*
@@ -111,8 +110,6 @@
information about your platform (OS, CPU and compiler)
#endif
-#define NPY_COPY_PYOBJECT_PTR(dst, src) memcpy(dst, src, sizeof(PyObject *))
-
#if (defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64))
#define NPY_CPU_HAVE_UNALIGNED_ACCESS 1
#else
diff --git a/numpy/core/include/numpy/npy_interrupt.h b/numpy/core/include/numpy/npy_interrupt.h
index 40cb7ac5e..bcb539326 100644
--- a/numpy/core/include/numpy/npy_interrupt.h
+++ b/numpy/core/include/numpy/npy_interrupt.h
@@ -1,79 +1,18 @@
-
-/* Signal handling:
-
-This header file defines macros that allow your code to handle
-interrupts received during processing. Interrupts that
-could reasonably be handled:
-
-SIGINT, SIGABRT, SIGALRM, SIGSEGV
-
-****Warning***************
-
-Do not allow code that creates temporary memory or increases reference
-counts of Python objects to be interrupted unless you handle it
-differently.
-
-**************************
-
-The mechanism for handling interrupts is conceptually simple:
-
- - replace the signal handler with our own home-grown version
- and store the old one.
- - run the code to be interrupted -- if an interrupt occurs
- the handler should basically just cause a return to the
- calling function for finish work.
- - restore the old signal handler
-
-Of course, every code that allows interrupts must account for
-returning via the interrupt and handle clean-up correctly. But,
-even still, the simple paradigm is complicated by at least three
-factors.
-
- 1) platform portability (i.e. Microsoft says not to use longjmp
- to return from signal handling. They have a __try and __except
- extension to C instead but what about mingw?).
-
- 2) how to handle threads: apparently whether signals are delivered to
- every thread of the process or the "invoking" thread is platform
- dependent. --- we don't handle threads for now.
-
- 3) do we need to worry about re-entrance. For now, assume the
- code will not call-back into itself.
-
-Ideas:
-
- 1) Start by implementing an approach that works on platforms that
- can use setjmp and longjmp functionality and does nothing
- on other platforms.
-
- 2) Ignore threads --- i.e. do not mix interrupt handling and threads
-
- 3) Add a default signal_handler function to the C-API but have the rest
- use macros.
-
-
-Simple Interface:
-
-
-In your C-extension: around a block of code you want to be interruptible
-with a SIGINT
-
-NPY_SIGINT_ON
-[code]
-NPY_SIGINT_OFF
-
-In order for this to work correctly, the
-[code] block must not allocate any memory or alter the reference count of any
-Python objects. In other words [code] must be interruptible so that continuation
-after NPY_SIGINT_OFF will only be "missing some computations"
-
-Interrupt handling does not work well with threads.
-
-*/
-
-/* Add signal handling macros
- Make the global variable and signal handler part of the C-API
-*/
+/*
+ * This API is only provided because it is part of publicly exported
+ * headers. Its use is considered DEPRECATED, and it will be removed
+ * eventually.
+ * (This includes the _PyArray_SigintHandler and _PyArray_GetSigintBuf
+ * functions, which are, however, public API and not headers.)
+ *
+ * Instead of using these non-threadsafe macros, periodically querying
+ * `PyErr_CheckSignals()` or `PyOS_InterruptOccurred()` will work.
+ * Both of these require holding the GIL, although cpython could add a
+ * version of `PyOS_InterruptOccurred()` which does not. Such a version
+ * actually exists as private API in Python 3.10, and was backported to 3.9
+ * and 3.8 (see also https://bugs.python.org/issue41037 and
+ * https://github.com/python/cpython/pull/20599).
+ */
#ifndef NPY_INTERRUPT_H
#define NPY_INTERRUPT_H
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index a07f49501..7d71c36cc 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -5,14 +5,12 @@
extern "C" {
#endif
+#include <numpy/npy_common.h>
+
#include <math.h>
#ifdef __SUNPRO_CC
#include <sunmath.h>
#endif
-#ifdef HAVE_NPY_CONFIG_H
-#include <npy_config.h>
-#endif
-#include <numpy/npy_common.h>
/* By adding static inline specifiers to npy_math function definitions when
appropriate, compiler is given the opportunity to optimize */
@@ -213,7 +211,7 @@ double npy_spacing(double x);
/* use builtins to avoid function calls in tight loops
* only available if npy_config.h is available (= numpys own build) */
-#if HAVE___BUILTIN_ISNAN
+#ifdef HAVE___BUILTIN_ISNAN
#define npy_isnan(x) __builtin_isnan(x)
#else
#ifndef NPY_HAVE_DECL_ISNAN
@@ -229,7 +227,7 @@ double npy_spacing(double x);
/* only available if npy_config.h is available (= numpys own build) */
-#if HAVE___BUILTIN_ISFINITE
+#ifdef HAVE___BUILTIN_ISFINITE
#define npy_isfinite(x) __builtin_isfinite(x)
#else
#ifndef NPY_HAVE_DECL_ISFINITE
@@ -244,7 +242,7 @@ double npy_spacing(double x);
#endif
/* only available if npy_config.h is available (= numpys own build) */
-#if HAVE___BUILTIN_ISINF
+#ifdef HAVE___BUILTIN_ISINF
#define npy_isinf(x) __builtin_isinf(x)
#else
#ifndef NPY_HAVE_DECL_ISINF
diff --git a/numpy/core/include/numpy/numpyconfig.h b/numpy/core/include/numpy/numpyconfig.h
index 8eaf446b7..a1b1de0ef 100644
--- a/numpy/core/include/numpy/numpyconfig.h
+++ b/numpy/core/include/numpy/numpyconfig.h
@@ -41,6 +41,7 @@
#define NPY_1_17_API_VERSION 0x00000008
#define NPY_1_18_API_VERSION 0x00000008
#define NPY_1_19_API_VERSION 0x00000008
-#define NPY_1_20_API_VERSION 0x00000008
+#define NPY_1_20_API_VERSION 0x0000000e
+#define NPY_1_21_API_VERSION 0x0000000e
#endif
diff --git a/numpy/core/memmap.py b/numpy/core/memmap.py
index cb025736e..892ad2540 100644
--- a/numpy/core/memmap.py
+++ b/numpy/core/memmap.py
@@ -1,8 +1,8 @@
+from contextlib import nullcontext
+
import numpy as np
from .numeric import uint8, ndarray, dtype
-from numpy.compat import (
- os_fspath, contextlib_nullcontext, is_pathlib_path
-)
+from numpy.compat import os_fspath, is_pathlib_path
from numpy.core.overrides import set_module
__all__ = ['memmap']
@@ -37,7 +37,10 @@ class memmap(ndarray):
This class may at some point be turned into a factory function
which returns a view into an mmap buffer.
- Delete the memmap instance to close the memmap file.
+ Flush the memmap instance to write the changes to the file. Currently there
+ is no API to close the underlying ``mmap``. It is tricky to ensure the
+ resource is actually closed, since it may be shared between different
+ memmap instances.
Parameters
@@ -97,7 +100,7 @@ class memmap(ndarray):
flush
Flush any changes in memory to file on disk.
When you delete a memmap object, flush is called first to write
- changes to disk before removing the object.
+ changes to disk.
See also
@@ -109,7 +112,7 @@ class memmap(ndarray):
The memmap object can be used anywhere an ndarray is accepted.
Given a memmap ``fp``, ``isinstance(fp, numpy.ndarray)`` returns
``True``.
-
+
Memory-mapped files cannot be larger than 2GB on 32-bit systems.
When a memmap causes a file to be created or extended beyond its
@@ -148,9 +151,9 @@ class memmap(ndarray):
>>> fp.filename == path.abspath(filename)
True
- Deletion flushes memory changes to disk before removing the object:
+ Flush memory changes to disk in order to read them back:
- >>> del fp
+ >>> fp.flush()
Load the memmap and verify data was stored:
@@ -220,7 +223,7 @@ class memmap(ndarray):
raise ValueError("shape must be given")
if hasattr(filename, 'read'):
- f_ctx = contextlib_nullcontext(filename)
+ f_ctx = nullcontext(filename)
else:
f_ctx = open(os_fspath(filename), ('r' if mode == 'c' else mode)+'b')
diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py
index 0becc2393..f736973de 100644
--- a/numpy/core/multiarray.py
+++ b/numpy/core/multiarray.py
@@ -38,7 +38,7 @@ __all__ = [
'nested_iters', 'normalize_axis_index', 'packbits',
'promote_types', 'putmask', 'ravel_multi_index', 'result_type', 'scalar',
'set_datetimeparse_function', 'set_legacy_print_mode', 'set_numeric_ops',
- 'set_string_function', 'set_typeDict', 'shares_memory', 'test_interrupt',
+ 'set_string_function', 'set_typeDict', 'shares_memory',
'tracemalloc_domain', 'typeinfo', 'unpackbits', 'unravel_index', 'vdot',
'where', 'zeros']
@@ -90,14 +90,14 @@ def empty_like(prototype, dtype=None, order=None, subok=None, shape=None):
.. versionadded:: 1.6.0
order : {'C', 'F', 'A', or 'K'}, optional
Overrides the memory layout of the result. 'C' means C-order,
- 'F' means F-order, 'A' means 'F' if ``prototype`` is Fortran
- contiguous, 'C' otherwise. 'K' means match the layout of ``prototype``
+ 'F' means F-order, 'A' means 'F' if `prototype` is Fortran
+ contiguous, 'C' otherwise. 'K' means match the layout of `prototype`
as closely as possible.
.. versionadded:: 1.6.0
subok : bool, optional.
If True, then the newly created array will use the sub-class
- type of 'a', otherwise it will be a base-class array. Defaults
+ type of `prototype`, otherwise it will be a base-class array. Defaults
to True.
shape : int or sequence of ints, optional.
Overrides the shape of the result. If order='K' and the number of
@@ -141,9 +141,9 @@ def empty_like(prototype, dtype=None, order=None, subok=None, shape=None):
@array_function_from_c_func_and_dispatcher(_multiarray_umath.concatenate)
-def concatenate(arrays, axis=None, out=None):
+def concatenate(arrays, axis=None, out=None, *, dtype=None, casting=None):
"""
- concatenate((a1, a2, ...), axis=0, out=None)
+ concatenate((a1, a2, ...), axis=0, out=None, dtype=None, casting="same_kind")
Join a sequence of arrays along an existing axis.
@@ -159,6 +159,16 @@ def concatenate(arrays, axis=None, out=None):
If provided, the destination to place the result. The shape must be
correct, matching that of what concatenate would have returned if no
out argument were specified.
+ dtype : str or dtype
+ If provided, the destination array will have this dtype. Cannot be
+ provided together with `out`.
+
+ .. versionadded:: 1.20.0
+
+ casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
+ Controls what kind of data casting may occur. Defaults to 'same_kind'.
+
+ .. versionadded:: 1.20.0
Returns
-------
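A minimal usage sketch of the two new keywords documented above (assumes NumPy 1.20 or later; not part of the docstring itself):

```python
import numpy as np

a = np.array([1, 2], dtype=np.int32)
b = np.array([3, 4], dtype=np.int32)

# Request the output dtype directly; int32 -> float64 is a safe cast,
# so the default casting="same_kind" is sufficient.
out = np.concatenate((a, b), dtype=np.float64)
print(out.dtype, out)       # float64 [1. 2. 3. 4.]
```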
@@ -396,7 +406,7 @@ def lexsort(keys, axis=None):
for the primary sort order, the second-to-last key for the secondary sort
order, and so on. The keys argument must be a sequence of objects that
can be converted to arrays of the same shape. If a 2D array is provided
- for the keys argument, it's rows are interpreted as the sorting keys and
+ for the keys argument, its rows are interpreted as the sorting keys and
sorting is according to the last row, second last row etc.
Parameters
@@ -989,7 +999,7 @@ def ravel_multi_index(multi_index, dims, mode=None, order=None):
@array_function_from_c_func_and_dispatcher(_multiarray_umath.unravel_index)
-def unravel_index(indices, shape=None, order=None, dims=None):
+def unravel_index(indices, shape=None, order=None):
"""
unravel_index(indices, shape, order='C')
@@ -1035,9 +1045,6 @@ def unravel_index(indices, shape=None, order=None, dims=None):
(3, 1, 4, 1)
"""
- if dims is not None:
- warnings.warn("'shape' argument should be used instead of 'dims'",
- DeprecationWarning, stacklevel=3)
return (indices,)
@@ -1090,7 +1097,7 @@ def putmask(a, mask, values):
Parameters
----------
- a : array_like
+ a : ndarray
Target array.
mask : array_like
Boolean mask array. It has to be the same shape as `a`.
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index e77161f59..c95c48d71 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -21,7 +21,7 @@ from .multiarray import (
from . import overrides
from . import umath
from . import shape_base
-from .overrides import set_module
+from .overrides import set_array_function_like_doc, set_module
from .umath import (multiply, invert, sin, PINF, NAN)
from . import numerictypes
from .numerictypes import longlong, intc, int_, float_, complex_, bool_
@@ -95,7 +95,7 @@ def zeros_like(a, dtype=None, order='K', subok=True, shape=None):
.. versionadded:: 1.6.0
subok : bool, optional.
If True, then the newly created array will use the sub-class
- type of 'a', otherwise it will be a base-class array. Defaults
+ type of `a`, otherwise it will be a base-class array. Defaults
to True.
shape : int or sequence of ints, optional.
Overrides the shape of the result. If order='K' and the number of
@@ -141,8 +141,13 @@ def zeros_like(a, dtype=None, order='K', subok=True, shape=None):
return res
+def _ones_dispatcher(shape, dtype=None, order=None, *, like=None):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def ones(shape, dtype=None, order='C'):
+def ones(shape, dtype=None, order='C', *, like=None):
"""
Return a new array of given shape and type, filled with ones.
@@ -157,6 +162,9 @@ def ones(shape, dtype=None, order='C'):
Whether to store multi-dimensional data in row-major
(C-style) or column-major (Fortran-style) order in
memory.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -189,11 +197,19 @@ def ones(shape, dtype=None, order='C'):
[1., 1.]])
"""
+ if like is not None:
+ return _ones_with_like(shape, dtype=dtype, order=order, like=like)
+
a = empty(shape, dtype, order)
multiarray.copyto(a, 1, casting='unsafe')
return a
+_ones_with_like = array_function_dispatch(
+ _ones_dispatcher
+)(ones)
+
+
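The `_ones_with_like` indirection above is what routes the new ``like=`` argument through ``__array_function__``. A minimal sketch of how a duck-array type can hook it; ``MyArray`` is hypothetical, and the sketch assumes the ``_implementation`` attribute that ``array_function_dispatch`` attaches to its wrappers:

```python
import numpy as np

class MyArray:
    """Hypothetical duck array, used only to illustrate the dispatch."""
    def __init__(self, data):
        self.data = np.asarray(data)

    def __array_function__(self, func, types, args, kwargs):
        kwargs = dict(kwargs)
        kwargs.pop("like", None)             # don't re-dispatch on ourselves
        # func is the dispatching wrapper; its _implementation attribute is
        # the plain NumPy implementation (see array_function_dispatch).
        return MyArray(func._implementation(*args, **kwargs))

ref = MyArray([0.0])
out = np.ones((2, 3), like=ref)              # dispatches on the `like` argument
print(type(out).__name__)                    # MyArray
```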
def _ones_like_dispatcher(a, dtype=None, order=None, subok=None, shape=None):
return (a,)
@@ -221,7 +237,7 @@ def ones_like(a, dtype=None, order='K', subok=True, shape=None):
.. versionadded:: 1.6.0
subok : bool, optional.
If True, then the newly created array will use the sub-class
- type of 'a', otherwise it will be a base-class array. Defaults
+ type of `a`, otherwise it will be a base-class array. Defaults
to True.
shape : int or sequence of ints, optional.
Overrides the shape of the result. If order='K' and the number of
@@ -265,8 +281,13 @@ def ones_like(a, dtype=None, order='K', subok=True, shape=None):
return res
+def _full_dispatcher(shape, fill_value, dtype=None, order=None, *, like=None):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def full(shape, fill_value, dtype=None, order='C'):
+def full(shape, fill_value, dtype=None, order='C', *, like=None):
"""
Return a new array of given shape and type, filled with `fill_value`.
@@ -282,6 +303,9 @@ def full(shape, fill_value, dtype=None, order='C'):
order : {'C', 'F'}, optional
Whether to store multidimensional data in C- or Fortran-contiguous
(row- or column-wise) order in memory.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -309,6 +333,9 @@ def full(shape, fill_value, dtype=None, order='C'):
[1, 2]])
"""
+ if like is not None:
+ return _full_with_like(shape, fill_value, dtype=dtype, order=order, like=like)
+
if dtype is None:
fill_value = asarray(fill_value)
dtype = fill_value.dtype
@@ -317,6 +344,11 @@ def full(shape, fill_value, dtype=None, order='C'):
return a
+_full_with_like = array_function_dispatch(
+ _full_dispatcher
+)(full)
+
+
def _full_like_dispatcher(a, fill_value, dtype=None, order=None, subok=None, shape=None):
return (a,)
@@ -342,7 +374,7 @@ def full_like(a, fill_value, dtype=None, order='K', subok=True, shape=None):
as possible.
subok : bool, optional.
If True, then the newly created array will use the sub-class
- type of 'a', otherwise it will be a base-class array. Defaults
+ type of `a`, otherwise it will be a base-class array. Defaults
to True.
shape : int or sequence of ints, optional.
Overrides the shape of the result. If order='K' and the number of
@@ -377,7 +409,7 @@ def full_like(a, fill_value, dtype=None, order='K', subok=True, shape=None):
>>> y = np.arange(6, dtype=np.double)
>>> np.full_like(y, 0.1)
- array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
+ array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
"""
res = empty_like(a, dtype=dtype, order=order, subok=subok, shape=shape)
@@ -1754,8 +1786,13 @@ def indices(dimensions, dtype=int, sparse=False):
return res
+def _fromfunction_dispatcher(function, shape, *, dtype=None, like=None, **kwargs):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def fromfunction(function, shape, *, dtype=float, **kwargs):
+def fromfunction(function, shape, *, dtype=float, like=None, **kwargs):
"""
Construct an array by executing a function over each coordinate.
@@ -1776,6 +1813,9 @@ def fromfunction(function, shape, *, dtype=float, **kwargs):
dtype : data-type, optional
Data-type of the coordinate arrays passed to `function`.
By default, `dtype` is float.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -1806,10 +1846,18 @@ def fromfunction(function, shape, *, dtype=float, **kwargs):
[2, 3, 4]])
"""
+ if like is not None:
+ return _fromfunction_with_like(function, shape, dtype=dtype, like=like, **kwargs)
+
args = indices(shape, dtype=dtype)
return function(*args, **kwargs)
+_fromfunction_with_like = array_function_dispatch(
+ _fromfunction_dispatcher
+)(fromfunction)
+
+
def _frombuffer(buf, dtype, shape, order):
return frombuffer(buf, dtype=dtype).reshape(shape, order=order)
@@ -2082,8 +2130,13 @@ def _maketup(descr, val):
return tuple(res)
+def _identity_dispatcher(n, dtype=None, *, like=None):
+ return (like,)
+
+
+@set_array_function_like_doc
@set_module('numpy')
-def identity(n, dtype=None):
+def identity(n, dtype=None, *, like=None):
"""
Return the identity array.
@@ -2096,6 +2149,9 @@ def identity(n, dtype=None):
Number of rows (and columns) in `n` x `n` output.
dtype : data-type, optional
Data-type of the output. Defaults to ``float``.
+ ${ARRAY_FUNCTION_LIKE}
+
+ .. versionadded:: 1.20.0
Returns
-------
@@ -2111,8 +2167,16 @@ def identity(n, dtype=None):
[0., 0., 1.]])
"""
+ if like is not None:
+ return _identity_with_like(n, dtype=dtype, like=like)
+
from numpy import eye
- return eye(n, dtype=dtype)
+ return eye(n, dtype=dtype, like=like)
+
+
+_identity_with_like = array_function_dispatch(
+ _identity_dispatcher
+)(identity)
def _allclose_dispatcher(a, b, rtol=None, atol=None, equal_nan=None):
@@ -2173,6 +2237,8 @@ def allclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False):
``allclose(a, b)`` to evaluate to True. The same is true for
`equal` but not `array_equal`.
+ `allclose` is not defined for non-numeric data types.
+
Examples
--------
>>> np.allclose([1e10,1e-7], [1.00001e10,1e-8])
@@ -2252,6 +2318,8 @@ def isclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False):
`atol` should be carefully selected for the use case at hand. A zero value
for `atol` will result in `False` if either `a` or `b` is zero.
+ `isclose` is not defined for non-numeric data types.
+
Examples
--------
>>> np.isclose([1e10,1e-7], [1.00001e10,1e-8])
diff --git a/numpy/core/numeric.pyi b/numpy/core/numeric.pyi
new file mode 100644
index 000000000..d91cb31c2
--- /dev/null
+++ b/numpy/core/numeric.pyi
@@ -0,0 +1,189 @@
+import sys
+from typing import (
+ Any,
+ Optional,
+ Union,
+ Sequence,
+ Tuple,
+ Callable,
+ List,
+ overload,
+ TypeVar,
+ Iterable,
+)
+
+from numpy import ndarray, generic, dtype, bool_, signedinteger, _OrderKACF, _OrderCF
+from numpy.typing import ArrayLike, DTypeLike, _ShapeLike
+
+if sys.version_info >= (3, 8):
+ from typing import Literal
+else:
+ from typing_extensions import Literal
+
+_T = TypeVar("_T")
+_ArrayType = TypeVar("_ArrayType", bound=ndarray)
+
+_CorrelateMode = Literal["valid", "same", "full"]
+
+@overload
+def zeros_like(
+ a: _ArrayType,
+ dtype: None = ...,
+ order: _OrderKACF = ...,
+ subok: Literal[True] = ...,
+ shape: None = ...,
+) -> _ArrayType: ...
+@overload
+def zeros_like(
+ a: ArrayLike,
+ dtype: DTypeLike = ...,
+ order: _OrderKACF = ...,
+ subok: bool = ...,
+ shape: Optional[_ShapeLike] = ...,
+) -> ndarray: ...
+def ones(
+ shape: _ShapeLike,
+ dtype: DTypeLike = ...,
+ order: _OrderCF = ...,
+ *,
+ like: ArrayLike = ...,
+) -> ndarray: ...
+@overload
+def ones_like(
+ a: _ArrayType,
+ dtype: None = ...,
+ order: _OrderKACF = ...,
+ subok: Literal[True] = ...,
+ shape: None = ...,
+) -> _ArrayType: ...
+@overload
+def ones_like(
+ a: ArrayLike,
+ dtype: DTypeLike = ...,
+ order: _OrderKACF = ...,
+ subok: bool = ...,
+ shape: Optional[_ShapeLike] = ...,
+) -> ndarray: ...
+@overload
+def empty_like(
+ a: _ArrayType,
+ dtype: None = ...,
+ order: _OrderKACF = ...,
+ subok: Literal[True] = ...,
+ shape: None = ...,
+) -> _ArrayType: ...
+@overload
+def empty_like(
+ a: ArrayLike,
+ dtype: DTypeLike = ...,
+ order: _OrderKACF = ...,
+ subok: bool = ...,
+ shape: Optional[_ShapeLike] = ...,
+) -> ndarray: ...
+def full(
+ shape: _ShapeLike,
+ fill_value: Any,
+ dtype: DTypeLike = ...,
+ order: _OrderCF = ...,
+ *,
+ like: ArrayLike = ...,
+) -> ndarray: ...
+@overload
+def full_like(
+ a: _ArrayType,
+ fill_value: Any,
+ dtype: None = ...,
+ order: _OrderKACF = ...,
+ subok: Literal[True] = ...,
+ shape: None = ...,
+) -> _ArrayType: ...
+@overload
+def full_like(
+ a: ArrayLike,
+ fill_value: Any,
+ dtype: DTypeLike = ...,
+ order: _OrderKACF = ...,
+ subok: bool = ...,
+ shape: Optional[_ShapeLike] = ...,
+) -> ndarray: ...
+@overload
+def count_nonzero(
+ a: ArrayLike, axis: None = ..., *, keepdims: Literal[False] = ...
+) -> int: ...
+@overload
+def count_nonzero(
+ a: ArrayLike, axis: _ShapeLike = ..., *, keepdims: bool = ...
+) -> Union[signedinteger[Any], ndarray]: ... # TODO: np.intp
+def isfortran(a: Union[ndarray, generic]) -> bool: ...
+def argwhere(a: ArrayLike) -> ndarray: ...
+def flatnonzero(a: ArrayLike) -> ndarray: ...
+def correlate(a: ArrayLike, v: ArrayLike, mode: _CorrelateMode = ...) -> ndarray: ...
+def convolve(a: ArrayLike, v: ArrayLike, mode: _CorrelateMode = ...) -> ndarray: ...
+@overload
+def outer(a: ArrayLike, b: ArrayLike, out: None = ...) -> ndarray: ...
+@overload
+def outer(a: ArrayLike, b: ArrayLike, out: _ArrayType = ...) -> _ArrayType: ...
+def tensordot(
+ a: ArrayLike,
+ b: ArrayLike,
+ axes: Union[int, Tuple[_ShapeLike, _ShapeLike]] = ...,
+) -> ndarray: ...
+def roll(
+ a: ArrayLike,
+ shift: _ShapeLike,
+ axis: Optional[_ShapeLike] = ...,
+) -> ndarray: ...
+def rollaxis(a: ndarray, axis: int, start: int = ...) -> ndarray: ...
+def moveaxis(
+ a: ndarray,
+ source: _ShapeLike,
+ destination: _ShapeLike,
+) -> ndarray: ...
+def cross(
+ a: ArrayLike,
+ b: ArrayLike,
+ axisa: int = ...,
+ axisb: int = ...,
+ axisc: int = ...,
+ axis: Optional[int] = ...,
+) -> ndarray: ...
+@overload
+def indices(
+ dimensions: Sequence[int],
+ dtype: DTypeLike = ...,
+ sparse: Literal[False] = ...,
+) -> ndarray: ...
+@overload
+def indices(
+ dimensions: Sequence[int],
+ dtype: DTypeLike = ...,
+ sparse: Literal[True] = ...,
+) -> Tuple[ndarray, ...]: ...
+def fromfunction(
+ function: Callable[..., _T],
+ shape: Sequence[int],
+ *,
+ dtype: DTypeLike = ...,
+ like: ArrayLike = ...,
+ **kwargs: Any,
+) -> _T: ...
+def isscalar(element: Any) -> bool: ...
+def binary_repr(num: int, width: Optional[int] = ...) -> str: ...
+def base_repr(number: int, base: int = ..., padding: int = ...) -> str: ...
+def identity(n: int, dtype: DTypeLike = ..., *, like: ArrayLike = ...) -> ndarray: ...
+def allclose(
+ a: ArrayLike,
+ b: ArrayLike,
+ rtol: float = ...,
+ atol: float = ...,
+ equal_nan: bool = ...,
+) -> bool: ...
+def isclose(
+ a: ArrayLike,
+ b: ArrayLike,
+ rtol: float = ...,
+ atol: float = ...,
+ equal_nan: bool = ...,
+) -> Union[bool_, ndarray]: ...
+def array_equal(a1: ArrayLike, a2: ArrayLike) -> bool: ...
+def array_equiv(a1: ArrayLike, a2: ArrayLike) -> bool: ...
diff --git a/numpy/core/numerictypes.py b/numpy/core/numerictypes.py
index 2a015f48f..e705dd3ea 100644
--- a/numpy/core/numerictypes.py
+++ b/numpy/core/numerictypes.py
@@ -358,13 +358,15 @@ def issubsctype(arg1, arg2):
@set_module('numpy')
def issubdtype(arg1, arg2):
- """
+ r"""
Returns True if first argument is a typecode lower/equal in type hierarchy.
+ This is like the builtin :func:`issubclass`, but for `dtype`\ s.
+
Parameters
----------
arg1, arg2 : dtype_like
- dtype or string representing a typecode.
+ `dtype` or object coercible to one
Returns
-------
@@ -372,15 +374,45 @@ def issubdtype(arg1, arg2):
See Also
--------
+ :ref:`arrays.scalars` : Overview of the numpy type hierarchy.
issubsctype, issubclass_
- numpy.core.numerictypes : Overview of numpy type hierarchy.
Examples
--------
- >>> np.issubdtype('S1', np.string_)
+ `issubdtype` can be used to check the type of arrays:
+
+ >>> ints = np.array([1, 2, 3], dtype=np.int32)
+ >>> np.issubdtype(ints.dtype, np.integer)
+ True
+ >>> np.issubdtype(ints.dtype, np.floating)
+ False
+
+ >>> floats = np.array([1, 2, 3], dtype=np.float32)
+ >>> np.issubdtype(floats.dtype, np.integer)
+ False
+ >>> np.issubdtype(floats.dtype, np.floating)
True
+
+ Similar types of different sizes are not subdtypes of each other:
+
>>> np.issubdtype(np.float64, np.float32)
False
+ >>> np.issubdtype(np.float32, np.float64)
+ False
+
+ but both are subtypes of `floating`:
+
+ >>> np.issubdtype(np.float64, np.floating)
+ True
+ >>> np.issubdtype(np.float32, np.floating)
+ True
+
+ For convenience, dtype-like objects are allowed too:
+
+ >>> np.issubdtype('S1', np.string_)
+ True
+ >>> np.issubdtype('i4', np.signedinteger)
+ True
"""
if not issubclass_(arg1, generic):
diff --git a/numpy/core/numerictypes.pyi b/numpy/core/numerictypes.pyi
new file mode 100644
index 000000000..192015ff1
--- /dev/null
+++ b/numpy/core/numerictypes.pyi
@@ -0,0 +1,29 @@
+from typing import TypeVar, Optional, Type, Union, Tuple, Sequence, overload, Any
+
+from numpy import generic, ndarray, dtype
+from numpy.typing import DTypeLike
+
+_DefaultType = TypeVar("_DefaultType")
+
+def maximum_sctype(t: DTypeLike) -> dtype: ...
+def issctype(rep: object) -> bool: ...
+@overload
+def obj2sctype(rep: object) -> Optional[generic]: ...
+@overload
+def obj2sctype(rep: object, default: None) -> Optional[generic]: ...
+@overload
+def obj2sctype(
+ rep: object, default: Type[_DefaultType]
+) -> Union[generic, Type[_DefaultType]]: ...
+def issubclass_(arg1: object, arg2: Union[object, Tuple[object, ...]]) -> bool: ...
+def issubsctype(
+ arg1: Union[ndarray, DTypeLike], arg2: Union[ndarray, DTypeLike]
+) -> bool: ...
+def issubdtype(arg1: DTypeLike, arg2: DTypeLike) -> bool: ...
+def sctype2char(sctype: object) -> str: ...
+def find_common_type(
+ array_types: Sequence[DTypeLike], scalar_types: Sequence[DTypeLike]
+) -> dtype: ...
+
+# TODO: Add annotations for the following objects:
+# typeDict, nbytes, cast, ScalarType & typecodes
diff --git a/numpy/core/overrides.py b/numpy/core/overrides.py
index 816b11293..c2b5fb7fa 100644
--- a/numpy/core/overrides.py
+++ b/numpy/core/overrides.py
@@ -12,6 +12,27 @@ from numpy.compat._inspect import getargspec
ARRAY_FUNCTION_ENABLED = bool(
int(os.environ.get('NUMPY_EXPERIMENTAL_ARRAY_FUNCTION', 1)))
+array_function_like_doc = (
+ """like : array_like
+ Reference object to allow the creation of arrays which are not
+ NumPy arrays. If an array-like passed in as ``like`` supports
+ the ``__array_function__`` protocol, the result will be defined
+ by it. In this case, it ensures the creation of an array object
+ compatible with that passed in via this argument.
+
+ .. note::
+ The ``like`` keyword is an experimental feature pending
+ acceptance of :ref:`NEP 35 <NEP35>`."""
+)
+
+def set_array_function_like_doc(public_api):
+ if public_api.__doc__ is not None:
+ public_api.__doc__ = public_api.__doc__.replace(
+ "${ARRAY_FUNCTION_LIKE}",
+ array_function_like_doc,
+ )
+ return public_api
+
add_docstring(
implement_array_function,
diff --git a/numpy/core/records.py b/numpy/core/records.py
index 0d3fd9118..00d456658 100644
--- a/numpy/core/records.py
+++ b/numpy/core/records.py
@@ -36,12 +36,11 @@ Record arrays allow us to access fields as properties::
import os
import warnings
from collections import Counter, OrderedDict
+from contextlib import nullcontext
from . import numeric as sb
from . import numerictypes as nt
-from numpy.compat import (
- isfileobj, os_fspath, contextlib_nullcontext
-)
+from numpy.compat import os_fspath
from numpy.core.overrides import set_module
from .arrayprint import get_printoptions
@@ -374,7 +373,7 @@ class recarray(ndarray):
See Also
--------
- rec.fromrecords : Construct a record array from data.
+ core.records.fromrecords : Construct a record array from data.
record : fundamental data-type for `recarray`.
format_parser : determine a data-type from formats, names, titles.
@@ -630,7 +629,7 @@ def fromarrays(arrayList, dtype=None, shape=None, formats=None,
>>> x1[1]=34
>>> r.a
array([1, 2, 3, 4])
-
+
>>> x1 = np.array([1, 2, 3, 4])
>>> x2 = np.array(['a', 'dd', 'xyz', '12'])
>>> x3 = np.array([1.1, 2, 3,4])
@@ -847,13 +846,12 @@ def fromstring(datastring, dtype=None, shape=None, offset=0, formats=None,
return _array
def get_remaining_size(fd):
+ pos = fd.tell()
try:
- fn = fd.fileno()
- except AttributeError:
- return os.path.getsize(fd.name) - fd.tell()
- st = os.fstat(fn)
- size = st.st_size - fd.tell()
- return size
+ fd.seek(0, 2)
+ return fd.tell() - pos
+ finally:
+ fd.seek(pos, 0)
def fromfile(fd, dtype=None, shape=None, offset=0, formats=None,
names=None, titles=None, aligned=False, byteorder=None):
@@ -911,9 +909,11 @@ def fromfile(fd, dtype=None, shape=None, offset=0, formats=None,
elif isinstance(shape, int):
shape = (shape,)
- if isfileobj(fd):
+ if hasattr(fd, 'readinto'):
+ # GH issue 2504. fd supports io.RawIOBase or io.BufferedIOBase interface.
+ # Example of fd: gzip, BytesIO, BufferedReader
# file already opened
- ctx = contextlib_nullcontext(fd)
+ ctx = nullcontext(fd)
else:
# open file
ctx = open(os_fspath(fd), 'rb')
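Duck-typing on ``readinto`` (instead of the old ``isfileobj`` check) lets
``fromfile`` accept any raw or buffered binary stream, and ``nullcontext`` keeps a
caller-supplied, already-open stream from being closed. A small sketch of the call
pattern this enables; the record layout here is chosen only for illustration:

    import io
    import numpy as np

    rec_dtype = np.dtype([("x", "<i4"), ("y", "<f8")])
    raw = np.zeros(3, dtype=rec_dtype).tobytes()

    # BytesIO has no usable fileno(), but it does implement readinto()
    buf = io.BytesIO(raw)
    r = np.core.records.fromfile(buf, dtype=rec_dtype, shape=3)
    assert r.shape == (3,)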
@@ -957,7 +957,7 @@ def array(obj, dtype=None, shape=None, offset=0, strides=None, formats=None,
"""
Construct a record array from a wide-variety of objects.
- A general-purpose record array constructor that dispatches to the
+ A general-purpose record array constructor that dispatches to the
appropriate `recarray` creation function based on the inputs (see Notes).
Parameters
@@ -995,7 +995,7 @@ def array(obj, dtype=None, shape=None, offset=0, strides=None, formats=None,
`obj` is a string, then call the `fromstring` constructor. If `obj` is a
list or a tuple, then if the first object is an `~numpy.ndarray`, call
`fromarrays`, otherwise call `fromrecords`. If `obj` is a
- `~numpy.recarray`, then make a copy of the data in the recarray
+ `~numpy.recarray`, then make a copy of the data in the recarray
(if ``copy=True``) and use the new formats, names, and titles. If `obj`
is a file, then call `fromfile`. Finally, if obj is an `ndarray`, then
return ``obj.view(recarray)``, making a copy of the data if ``copy=True``.
@@ -1036,7 +1036,7 @@ def array(obj, dtype=None, shape=None, offset=0, strides=None, formats=None,
array('def', dtype='<U3')
"""
- if ((isinstance(obj, (type(None), str)) or isfileobj(obj)) and
+ if ((isinstance(obj, (type(None), str)) or hasattr(obj, 'readinto')) and
formats is None and dtype is None):
raise ValueError("Must define formats (or dtype) if object is "
"None, string, or an open file")
@@ -1078,7 +1078,7 @@ def array(obj, dtype=None, shape=None, offset=0, strides=None, formats=None,
new = new.copy()
return new
- elif isfileobj(obj):
+ elif hasattr(obj, 'readinto'):
return fromfile(obj, dtype=dtype, shape=shape, offset=offset)
elif isinstance(obj, ndarray):
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index aede12080..2ec5e1a64 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -9,7 +9,7 @@ from os.path import join
from numpy.distutils import log
from distutils.dep_util import newer
-from distutils.sysconfig import get_config_var
+from sysconfig import get_config_var
from numpy.compat import npy_load_module
from setup_common import * # noqa: F403
@@ -23,6 +23,11 @@ NPY_RELAXED_STRIDES_CHECKING = (os.environ.get('NPY_RELAXED_STRIDES_CHECKING', "
NPY_RELAXED_STRIDES_DEBUG = (os.environ.get('NPY_RELAXED_STRIDES_DEBUG', "0") != "0")
NPY_RELAXED_STRIDES_DEBUG = NPY_RELAXED_STRIDES_DEBUG and NPY_RELAXED_STRIDES_CHECKING
+# Set to True to use the new casting implementation as much as implemented.
+# Allows running the full test suite to exercise the new machinery until
+# it is used as the default and the old version is eventually deleted.
+NPY_USE_NEW_CASTINGIMPL = os.environ.get('NPY_USE_NEW_CASTINGIMPL', "0") != "0"
+
# XXX: ugly, we use a class to avoid calling twice some expensive functions in
# config.h/numpyconfig.h. I don't see a better way because distutils force
# config.h generation inside an Extension class, and as such sharing
@@ -102,7 +107,7 @@ def win32_checks(deflist):
if a == "Intel" or a == "AMD64":
deflist.append('FORCE_NO_LONG_DOUBLE_FORMATTING')
-def check_math_capabilities(config, moredefs, mathlibs):
+def check_math_capabilities(config, ext, moredefs, mathlibs):
def check_func(func_name):
return config.check_func(func_name, libraries=mathlibs,
decl=True, call=True)
@@ -167,6 +172,14 @@ def check_math_capabilities(config, moredefs, mathlibs):
for dec, fn in OPTIONAL_FUNCTION_ATTRIBUTES:
if config.check_gcc_function_attribute(dec, fn):
moredefs.append((fname2def(fn), 1))
+ if fn == 'attribute_target_avx512f':
+ # GH-14787: Work around GCC<8.4 bug when compiling with AVX512
+ # support on Windows-based platforms
+ if (sys.platform in ('win32', 'cygwin') and
+ config.check_compiler_gcc() and
+ not config.check_gcc_version_at_least(8, 4)):
+ ext.extra_compile_args.extend(
+ ['-ffixed-xmm%s' % n for n in range(16, 32)])
for dec, fn, code, header in OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS:
if config.check_gcc_function_attribute_with_intrinsics(dec, fn, code,
@@ -434,7 +447,7 @@ def configuration(parent_package='',top_path=None):
mathlibs = check_mathlib(config_cmd)
moredefs.append(('MATHLIB', ','.join(mathlibs)))
- check_math_capabilities(config_cmd, moredefs, mathlibs)
+ check_math_capabilities(config_cmd, ext, moredefs, mathlibs)
moredefs.extend(cocache.check_ieee_macros(config_cmd)[0])
moredefs.extend(cocache.check_complex(config_cmd, mathlibs)[0])
@@ -460,6 +473,10 @@ def configuration(parent_package='',top_path=None):
if NPY_RELAXED_STRIDES_DEBUG:
moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 1))
+ # Use the new experimental casting implementation in NumPy 1.20:
+ if NPY_USE_NEW_CASTINGIMPL:
+ moredefs.append(('NPY_USE_NEW_CASTINGIMPL', 1))
+
# Get long double representation
rep = check_long_double_representation(config_cmd)
moredefs.append(('HAVE_LDOUBLE_%s' % rep, 1))
@@ -618,6 +635,7 @@ def configuration(parent_package='',top_path=None):
config.add_include_dirs(join('src', 'multiarray'))
config.add_include_dirs(join('src', 'umath'))
config.add_include_dirs(join('src', 'npysort'))
+ config.add_include_dirs(join('src', '_simd'))
config.add_define_macros([("NPY_INTERNAL_BUILD", "1")]) # this macro indicates that Numpy build is in process
config.add_define_macros([("HAVE_NPY_CONFIG_H", "1")])
@@ -687,26 +705,6 @@ def configuration(parent_package='',top_path=None):
subst_dict)
#######################################################################
- # npysort library #
- #######################################################################
-
- # This library is created for the build but it is not installed
- npysort_sources = [join('src', 'common', 'npy_sort.h.src'),
- join('src', 'npysort', 'quicksort.c.src'),
- join('src', 'npysort', 'mergesort.c.src'),
- join('src', 'npysort', 'timsort.c.src'),
- join('src', 'npysort', 'heapsort.c.src'),
- join('src', 'npysort', 'radixsort.c.src'),
- join('src', 'common', 'npy_partition.h.src'),
- join('src', 'npysort', 'selection.c.src'),
- join('src', 'common', 'npy_binsearch.h.src'),
- join('src', 'npysort', 'binsearch.c.src'),
- ]
- config.add_library('npysort',
- sources=npysort_sources,
- include_dirs=[])
-
- #######################################################################
# multiarray_tests module #
#######################################################################
@@ -780,6 +778,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'arraytypes.h'),
join('src', 'multiarray', 'arrayfunction_override.h'),
join('src', 'multiarray', 'array_coercion.h'),
+ join('src', 'multiarray', 'array_method.h'),
join('src', 'multiarray', 'npy_buffer.h'),
join('src', 'multiarray', 'calculation.h'),
join('src', 'multiarray', 'common.h'),
@@ -790,9 +789,12 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'descriptor.h'),
join('src', 'multiarray', 'dtypemeta.h'),
join('src', 'multiarray', 'dragon4.h'),
+ join('src', 'multiarray', 'einsum_debug.h'),
+ join('src', 'multiarray', 'einsum_sumprod.h'),
join('src', 'multiarray', 'getset.h'),
join('src', 'multiarray', 'hashdescr.h'),
join('src', 'multiarray', 'iterators.h'),
+ join('src', 'multiarray', 'legacy_dtype_implementation.h'),
join('src', 'multiarray', 'mapping.h'),
join('src', 'multiarray', 'methods.h'),
join('src', 'multiarray', 'multiarraymodule.h'),
@@ -825,7 +827,7 @@ def configuration(parent_package='',top_path=None):
join('include', 'numpy', 'npy_1_7_deprecated_api.h'),
# add library sources as distuils does not consider libraries
# dependencies
- ] + npysort_sources + npymath_sources
+ ] + npymath_sources
multiarray_src = [
join('src', 'multiarray', 'abstractdtypes.c'),
@@ -833,6 +835,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'arrayobject.c'),
join('src', 'multiarray', 'arraytypes.c.src'),
join('src', 'multiarray', 'array_coercion.c'),
+ join('src', 'multiarray', 'array_method.c'),
join('src', 'multiarray', 'array_assign_scalar.c'),
join('src', 'multiarray', 'array_assign_array.c'),
join('src', 'multiarray', 'arrayfunction_override.c'),
@@ -853,11 +856,13 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'dragon4.c'),
join('src', 'multiarray', 'dtype_transfer.c'),
join('src', 'multiarray', 'einsum.c.src'),
+ join('src', 'multiarray', 'einsum_sumprod.c.src'),
join('src', 'multiarray', 'flagsobject.c'),
join('src', 'multiarray', 'getset.c'),
join('src', 'multiarray', 'hashdescr.c'),
join('src', 'multiarray', 'item_selection.c'),
join('src', 'multiarray', 'iterators.c'),
+ join('src', 'multiarray', 'legacy_dtype_implementation.c'),
join('src', 'multiarray', 'lowlevel_strided_loops.c.src'),
join('src', 'multiarray', 'mapping.c'),
join('src', 'multiarray', 'methods.c'),
@@ -877,6 +882,16 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'typeinfo.c'),
join('src', 'multiarray', 'usertypes.c'),
join('src', 'multiarray', 'vdot.c'),
+ join('src', 'common', 'npy_sort.h.src'),
+ join('src', 'npysort', 'quicksort.c.src'),
+ join('src', 'npysort', 'mergesort.c.src'),
+ join('src', 'npysort', 'timsort.c.src'),
+ join('src', 'npysort', 'heapsort.c.src'),
+ join('src', 'npysort', 'radixsort.c.src'),
+ join('src', 'common', 'npy_partition.h.src'),
+ join('src', 'npysort', 'selection.c.src'),
+ join('src', 'common', 'npy_binsearch.h.src'),
+ join('src', 'npysort', 'binsearch.c.src'),
]
#######################################################################
@@ -902,6 +917,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'loops.h.src'),
join('src', 'umath', 'loops.c.src'),
+ join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'clip.h.src'),
@@ -927,7 +943,7 @@ def configuration(parent_package='',top_path=None):
config.add_extension('_multiarray_umath',
sources=multiarray_src + umath_src +
- npymath_sources + common_src +
+ common_src +
[generate_config_h,
generate_numpyconfig_h,
generate_numpy_api,
@@ -938,7 +954,7 @@ def configuration(parent_package='',top_path=None):
],
depends=deps + multiarray_deps + umath_deps +
common_deps,
- libraries=['npymath', 'npysort'],
+ libraries=['npymath'],
extra_info=extra_info)
#######################################################################
@@ -973,6 +989,28 @@ def configuration(parent_package='',top_path=None):
config.add_extension('_operand_flag_tests',
sources=[join('src', 'umath', '_operand_flag_tests.c.src')])
+ #######################################################################
+ # SIMD module #
+ #######################################################################
+
+ config.add_extension('_simd', sources=[
+ join('src', 'common', 'npy_cpu_features.c.src'),
+ join('src', '_simd', '_simd.c'),
+ join('src', '_simd', '_simd_inc.h.src'),
+ join('src', '_simd', '_simd_data.inc.src'),
+ join('src', '_simd', '_simd.dispatch.c.src'),
+ ], depends=[
+ join('src', 'common', 'npy_cpu_dispatch.h'),
+ join('src', 'common', 'simd', 'simd.h'),
+ join('src', '_simd', '_simd.h'),
+ join('src', '_simd', '_simd_inc.h.src'),
+ join('src', '_simd', '_simd_data.inc.src'),
+ join('src', '_simd', '_simd_arg.inc'),
+ join('src', '_simd', '_simd_convert.inc'),
+ join('src', '_simd', '_simd_easyintrin.inc'),
+ join('src', '_simd', '_simd_vector.inc'),
+ ])
+
config.add_subpackage('tests')
config.add_data_dir('tests/data')
config.add_data_dir('tests/examples')
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 8c0149497..2d85e0718 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -40,7 +40,8 @@ C_ABI_VERSION = 0x01000009
# 0x0000000c - 1.14.x
# 0x0000000c - 1.15.x
# 0x0000000d - 1.16.x
-# 0x0000000e - 1.19.x
+# 0x0000000d - 1.19.x
+# 0x0000000e - 1.20.x
C_API_VERSION = 0x0000000e
class MismatchCAPIWarning(Warning):
@@ -50,7 +51,7 @@ def is_released(config):
"""Return True if a released version of numpy is detected."""
from distutils.version import LooseVersion
- v = config.get_version('../version.py')
+ v = config.get_version('../_version.py')
if v is None:
raise ValueError("Could not get version")
pv = LooseVersion(vstring=v).version
@@ -178,6 +179,9 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))',
# gcc 4.8.4 support attributes but not with intrisics
# tested via "#include<%s> int %s %s(void *){code; return 0;};" % (header, attribute, name, code)
# function name will be converted to HAVE_<upper-case-name> preprocessor macro
+# The _mm512_castps_si512 instruction is a specific check for AVX-512F support
+# in gcc-4.9, which is missing a subset of intrinsics. See
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61878
OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2,fma")))',
'attribute_target_avx2_with_intrinsics',
'__m256 temp = _mm256_set1_ps(1.0); temp = \
@@ -185,11 +189,12 @@ OPTIONAL_FUNCTION_ATTRIBUTES_WITH_INTRINSICS = [('__attribute__((target("avx2,fm
'immintrin.h'),
('__attribute__((target("avx512f")))',
'attribute_target_avx512f_with_intrinsics',
- '__m512 temp = _mm512_set1_ps(1.0)',
+ '__m512i temp = _mm512_castps_si512(_mm512_set1_ps(1.0))',
'immintrin.h'),
('__attribute__((target ("avx512f,avx512dq,avx512bw,avx512vl,avx512cd")))',
'attribute_target_avx512_skx_with_intrinsics',
'__mmask8 temp = _mm512_fpclass_pd_mask(_mm512_set1_pd(1.0), 0x01);\
+ __m512i temp = _mm512_castps_si512(_mm512_set1_ps(1.0));\
_mm_mask_storeu_epi8(NULL, 0xFF, _mm_broadcastmb_epi64(temp))',
'immintrin.h'),
]
diff --git a/numpy/core/shape_base.py b/numpy/core/shape_base.py
index 7a76bbf9d..e4dc30d4c 100644
--- a/numpy/core/shape_base.py
+++ b/numpy/core/shape_base.py
@@ -539,7 +539,8 @@ def _accumulate(values):
def _concatenate_shapes(shapes, axis):
"""Given array shapes, return the resulting shape and slices prefixes.
- These help in nested concatation.
+ These help in nested concatenation.
+
Returns
-------
shape: tuple of int
diff --git a/numpy/core/shape_base.pyi b/numpy/core/shape_base.pyi
new file mode 100644
index 000000000..b20598b1a
--- /dev/null
+++ b/numpy/core/shape_base.pyi
@@ -0,0 +1,41 @@
+import sys
+from typing import TypeVar, overload, List, Sequence
+
+from numpy import ndarray
+from numpy.typing import ArrayLike
+
+if sys.version_info >= (3, 8):
+ from typing import SupportsIndex
+else:
+ from typing_extensions import Protocol
+ class SupportsIndex(Protocol):
+ def __index__(self) -> int: ...
+
+_ArrayType = TypeVar("_ArrayType", bound=ndarray)
+
+@overload
+def atleast_1d(__arys: ArrayLike) -> ndarray: ...
+@overload
+def atleast_1d(*arys: ArrayLike) -> List[ndarray]: ...
+
+@overload
+def atleast_2d(__arys: ArrayLike) -> ndarray: ...
+@overload
+def atleast_2d(*arys: ArrayLike) -> List[ndarray]: ...
+
+@overload
+def atleast_3d(__arys: ArrayLike) -> ndarray: ...
+@overload
+def atleast_3d(*arys: ArrayLike) -> List[ndarray]: ...
+
+def vstack(tup: Sequence[ArrayLike]) -> ndarray: ...
+def hstack(tup: Sequence[ArrayLike]) -> ndarray: ...
+@overload
+def stack(
+ arrays: Sequence[ArrayLike], axis: SupportsIndex = ..., out: None = ...
+) -> ndarray: ...
+@overload
+def stack(
+ arrays: Sequence[ArrayLike], axis: SupportsIndex = ..., out: _ArrayType = ...
+) -> _ArrayType: ...
+def block(arrays: ArrayLike) -> ndarray: ...
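These stubs mirror the runtime behaviour: a single positional argument to
``atleast_1d``/``atleast_2d``/``atleast_3d`` is typed as a bare ``ndarray``, two or
more arguments as ``List[ndarray]``, and ``stack`` returns the type of ``out`` when
one is passed. A short sketch of the calls the overloads are meant to cover (the
comments show the return type a checker should infer):

    import numpy as np

    a = np.atleast_1d([1, 2, 3])                        # ndarray
    b = np.atleast_1d([1, 2], [3, 4])                   # List[ndarray]
    s = np.stack([np.ones(2), np.zeros(2)], axis=0)     # ndarray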
diff --git a/numpy/core/src/_simd/_simd.c b/numpy/core/src/_simd/_simd.c
new file mode 100644
index 000000000..b1fdd4478
--- /dev/null
+++ b/numpy/core/src/_simd/_simd.c
@@ -0,0 +1,73 @@
+#include "_simd.h"
+
+PyMODINIT_FUNC PyInit__simd(void)
+{
+ static struct PyModuleDef defs = {
+ .m_base = PyModuleDef_HEAD_INIT,
+ .m_name = "numpy.core._simd",
+ .m_size = -1
+ };
+ if (npy_cpu_init() < 0) {
+ return NULL;
+ }
+ PyObject *m = PyModule_Create(&defs);
+ if (m == NULL) {
+ return NULL;
+ }
+ PyObject *targets = PyDict_New();
+ if (targets == NULL) {
+ goto err;
+ }
+ if (PyModule_AddObject(m, "targets", targets) < 0) {
+ Py_DECREF(targets);
+ goto err;
+ }
+ // add keys for non-supported optimizations with None value
+ #define ATTACH_MODULE(TESTED_FEATURES, TARGET_NAME, MAKE_MSVC_HAPPY) \
+ { \
+ PyObject *simd_mod; \
+ if (!TESTED_FEATURES) { \
+ Py_INCREF(Py_None); \
+ simd_mod = Py_None; \
+ } else { \
+ simd_mod = NPY_CAT(simd_create_module_, TARGET_NAME)(); \
+ if (simd_mod == NULL) { \
+ goto err; \
+ } \
+ } \
+ const char *target_name = NPY_TOSTRING(TARGET_NAME); \
+ if (PyDict_SetItemString(targets, target_name, simd_mod) < 0) { \
+ Py_DECREF(simd_mod); \
+ goto err; \
+ } \
+ Py_INCREF(simd_mod); \
+ if (PyModule_AddObject(m, target_name, simd_mod) < 0) { \
+ Py_DECREF(simd_mod); \
+ goto err; \
+ } \
+ }
+
+ #define ATTACH_BASELINE_MODULE(MAKE_MSVC_HAPPY) \
+ { \
+ PyObject *simd_mod = simd_create_module(); \
+ if (simd_mod == NULL) { \
+ goto err; \
+ } \
+ if (PyDict_SetItemString(targets, "baseline", simd_mod) < 0) { \
+ Py_DECREF(simd_mod); \
+ goto err; \
+ } \
+ Py_INCREF(simd_mod); \
+ if (PyModule_AddObject(m, "baseline", simd_mod) < 0) { \
+ Py_DECREF(simd_mod); \
+ goto err; \
+ } \
+ }
+
+ NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, ATTACH_MODULE, MAKE_MSVC_HAPPY)
+ NPY__CPU_DISPATCH_BASELINE_CALL(ATTACH_BASELINE_MODULE, MAKE_MSVC_HAPPY)
+ return m;
+err:
+ Py_DECREF(m);
+ return NULL;
+}
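``PyInit__simd`` above builds a ``targets`` dictionary that maps every compiled
dispatch target, plus ``"baseline"``, either to a per-target submodule or to ``None``
when the running CPU does not support that target. A minimal usage sketch, assuming a
NumPy build that ships this new testing module:

    from numpy.core._simd import targets

    for name, mod in targets.items():
        # mod is None when the target was compiled but is unsupported at runtime
        print(name, "available" if mod is not None else "not supported")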
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
new file mode 100644
index 000000000..e3dbcdece
--- /dev/null
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -0,0 +1,612 @@
+/*@targets $werror #simd_test*/
+#include "_simd.h"
+#include "_simd_inc.h"
+
+#if NPY_SIMD
+#include "_simd_data.inc"
+#include "_simd_convert.inc"
+#include "_simd_vector.inc"
+#include "_simd_arg.inc"
+#include "_simd_easyintrin.inc"
+
+//#########################################################################
+//## Defining NPYV intrinsics as module functions
+//#########################################################################
+/**begin repeat
+ * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
+ * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
+ * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
+ * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
+ * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
+ * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
+ * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
+ */
+#if @simd_sup@
+/***************************
+ * Memory
+ ***************************/
+/**begin repeat1
+ * # intrin = load, loada, loads, loadl#
+ */
+SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, q@sfx@)
+/**end repeat1**/
+/**begin repeat1
+ * # intrin = store, storea, stores, storel, storeh#
+ */
+// special definition due to the nature of @intrin@
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+ if (!PyArg_ParseTuple(
+ args, "O&O&:@intrin@_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &vec_arg
+ )) {
+ return NULL;
+ }
+ npyv_@intrin@_@sfx@(seq_arg.data.q@sfx@, vec_arg.data.v@sfx@);
+ // write-back
+ if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
+ simd_arg_free(&seq_arg);
+ return NULL;
+ }
+ simd_arg_free(&seq_arg);
+ Py_RETURN_NONE;
+}
+/**end repeat1**/
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if @ncont_sup@
+// Partial Load
+SIMD_IMPL_INTRIN_3(load_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@)
+SIMD_IMPL_INTRIN_2(load_tillz_@sfx@, v@sfx@, q@sfx@, u32)
+
+// Partial Store
+static PyObject *
+simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg nlane_arg = {.dtype = simd_data_u32};
+ simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+ if (!PyArg_ParseTuple(
+ args, "O&O&O&:store_till_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &nlane_arg,
+ simd_arg_converter, &vec_arg
+ )) {
+ return NULL;
+ }
+ npyv_store_till_@sfx@(
+ seq_arg.data.q@sfx@, nlane_arg.data.u32, vec_arg.data.v@sfx@
+ );
+ // write-back
+ if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
+ simd_arg_free(&seq_arg);
+ return NULL;
+ }
+ simd_arg_free(&seq_arg);
+ Py_RETURN_NONE;
+}
+
+// Non-contiguous Load
+/**begin repeat1
+ * #intrin = loadn, loadn_till, loadn_tillz#
+ * #till = 0, 1, 1#
+ * #fill = 0, 1, 0#
+ * #format = , O&O&, O&#
+ */
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg stride_arg = {.dtype = simd_data_s64};
+#if @till@
+ simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if @fill@
+ simd_arg fill_arg = {.dtype = simd_data_@sfx@};
+#endif
+ if (!PyArg_ParseTuple(
+ args, "@format@O&O&:@intrin@_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &stride_arg
+#if @till@
+ ,simd_arg_converter, &nlane_arg
+#endif
+#if @fill@
+ ,simd_arg_converter, &fill_arg
+#endif
+ )) {
+ return NULL;
+ }
+ npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@;
+ npy_intp stride = (npy_intp)stride_arg.data.s64;
+ Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+ Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
+ if (stride < 0) {
+ seq_ptr += cur_seq_len -1;
+ min_seq_len = -min_seq_len;
+ }
+ if (cur_seq_len < min_seq_len) {
+ PyErr_Format(PyExc_ValueError,
+ "@intrin@_@sfx@(), according to provided stride %d, the "
+ "minimum acceptable size of the required sequence is %d, given(%d)",
+ stride, min_seq_len, cur_seq_len
+ );
+ goto err;
+ }
+ npyv_@sfx@ rvec = npyv_@intrin@_@sfx@(
+ seq_ptr, stride
+ #if @till@
+ , nlane_arg.data.u32
+ #endif
+ #if @fill@
+ , fill_arg.data.@sfx@
+ #endif
+ );
+ simd_arg ret = {
+ .dtype = simd_data_v@sfx@, .data = {.v@sfx@=rvec}
+ };
+ simd_arg_free(&seq_arg);
+ return simd_arg_to_obj(&ret);
+err:
+ simd_arg_free(&seq_arg);
+ return NULL;
+}
+/**end repeat1**/
+
+// Non-contiguous Store
+/**begin repeat1
+ * #intrin = storen, storen_till#
+ * #till = 0, 1#
+ * #format = , O&#
+ */
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ simd_arg seq_arg = {.dtype = simd_data_q@sfx@};
+ simd_arg stride_arg = {.dtype = simd_data_s64};
+ simd_arg vec_arg = {.dtype = simd_data_v@sfx@};
+#if @till@
+ simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+ if (!PyArg_ParseTuple(
+ args, "@format@O&O&O&:storen_@sfx@",
+ simd_arg_converter, &seq_arg,
+ simd_arg_converter, &stride_arg
+#if @till@
+ ,simd_arg_converter, &nlane_arg
+#endif
+ ,simd_arg_converter, &vec_arg
+ )) {
+ return NULL;
+ }
+ npyv_lanetype_@sfx@ *seq_ptr = seq_arg.data.q@sfx@;
+ npy_intp stride = (npy_intp)stride_arg.data.s64;
+ Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+ Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@;
+ if (stride < 0) {
+ seq_ptr += cur_seq_len -1;
+ min_seq_len = -min_seq_len;
+ }
+ // overflow guard
+ if (cur_seq_len < min_seq_len) {
+ PyErr_Format(PyExc_ValueError,
+ "@intrin@_@sfx@(), according to provided stride %d, the"
+ "minimum acceptable size of the required sequence is %d, given(%d)",
+ stride, min_seq_len, cur_seq_len
+ );
+ goto err;
+ }
+ npyv_@intrin@_@sfx@(
+ seq_ptr, stride
+ #if @till@
+ ,nlane_arg.data.u32
+ #endif
+ ,vec_arg.data.v@sfx@
+ );
+ // write-back
+ if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) {
+ goto err;
+ }
+ simd_arg_free(&seq_arg);
+ Py_RETURN_NONE;
+err:
+ simd_arg_free(&seq_arg);
+ return NULL;
+}
+/**end repeat1**/
+#endif // @ncont_sup@
+
+/***************************
+ * Misc
+ ***************************/
+SIMD_IMPL_INTRIN_0(zero_@sfx@, v@sfx@)
+SIMD_IMPL_INTRIN_1(setall_@sfx@, v@sfx@, @sfx@)
+SIMD_IMPL_INTRIN_3(select_@sfx@, v@sfx@, v@bsfx@, v@sfx@, v@sfx@)
+
+/**begin repeat1
+ * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ */
+#if @simd_sup2@
+SIMD_IMPL_INTRIN_1(reinterpret_@sfx_to@_@sfx@, v@sfx_to@, v@sfx@)
+#endif // simd_sup2
+/**end repeat1**/
+
+/**
+ * special definition due to the nature of intrinsics
+ * npyv_setf_@sfx@ and npyv_set_@sfx@.
+*/
+/**begin repeat1
+ * #intrin = setf, set#
+ */
+static PyObject *
+simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+ npyv_lanetype_@sfx@ *data = simd_sequence_from_iterable(args, simd_data_q@sfx@, npyv_nlanes_@sfx@);
+ if (data == NULL) {
+ return NULL;
+ }
+ simd_data r = {.v@sfx@ = npyv_@intrin@_@sfx@(
+ data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
+ data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15],
+ data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
+ data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
+ data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39],
+ data[40], data[41], data[42], data[43], data[44], data[45], data[46], data[47],
+ data[48], data[49], data[50], data[51], data[52], data[53], data[54], data[55],
+ data[56], data[57], data[58], data[59], data[60], data[61], data[62], data[63],
+ data[64] // for setf
+ )};
+ simd_sequence_free(data);
+ return (PyObject*)PySIMDVector_FromData(r, simd_data_v@sfx@);
+}
+/**end repeat1**/
+
+/***************************
+ * Reorder
+ ***************************/
+/**begin repeat1
+ * # intrin = combinel, combineh#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+/**begin repeat1
+ * # intrin = combine, zip#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+#if @rev64_sup@
+SIMD_IMPL_INTRIN_1(rev64_@sfx@, v@sfx@, v@sfx@)
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if @shl_imm@ > 0
+SIMD_IMPL_INTRIN_2(shl_@sfx@, v@sfx@, v@sfx@, u8)
+SIMD_IMPL_INTRIN_2(shr_@sfx@, v@sfx@, v@sfx@, u8)
+// immediate constant
+SIMD_IMPL_INTRIN_2IMM(shli_@sfx@, v@sfx@, v@sfx@, @shl_imm@)
+SIMD_IMPL_INTRIN_2IMM(shri_@sfx@, v@sfx@, v@sfx@, @shr_imm@)
+#endif // shl_imm
+
+/**begin repeat1
+ * #intrin = and, or, xor#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+SIMD_IMPL_INTRIN_1(not_@sfx@, v@sfx@, v@sfx@)
+
+/**begin repeat1
+ * #intrin = cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@bsfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_IMPL_INTRIN_1(cvt_@sfx@_@bsfx@, v@sfx@, v@bsfx@)
+SIMD_IMPL_INTRIN_1(cvt_@bsfx@_@sfx@, v@bsfx@, v@sfx@)
+
+/***************************
+ * Arithmetic
+ ***************************/
+/**begin repeat1
+ * #intrin = add, sub#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+
+#if @sat_sup@
+/**begin repeat1
+ * #intrin = adds, subs#
+ */
+SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+#endif // sat_sup
+
+#if @mul_sup@
+SIMD_IMPL_INTRIN_2(mul_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+#endif // mul_sup
+
+#if @div_sup@
+SIMD_IMPL_INTRIN_2(div_@sfx@, v@sfx@, v@sfx@, v@sfx@)
+#endif // div_sup
+
+#if @fused_sup@
+/**begin repeat1
+ * #intrin = muladd, mulsub, nmuladd, nmulsub#
+ */
+SIMD_IMPL_INTRIN_3(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+#endif // fused_sup
+
+#if @sum_sup@
+SIMD_IMPL_INTRIN_1(sum_@sfx@, @sfx@, v@sfx@)
+#endif // sum_sup
+
+/***************************
+ * Math
+ ***************************/
+#if @fp_only@
+/**begin repeat1
+ * #intrin = sqrt, recip, abs, square#
+ */
+SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
+/**end repeat1**/
+#endif
+
+#endif // simd_sup
+/**end repeat**/
+/*************************************************************************
+ * Variant
+ ************************************************************************/
+SIMD_IMPL_INTRIN_0N(cleanup)
+/*************************************************************************
+ * A special section for boolean intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Conversions
+ ***************************/
+// Convert mask vector to integer bitfield
+/**begin repeat
+ * #bsfx = b8, b16, b32, b64#
+ */
+SIMD_IMPL_INTRIN_1(tobits_@bsfx@, u64, v@bsfx@)
+/**end repeat**/
+
+//#########################################################################
+//## Attach module functions
+//#########################################################################
+static PyMethodDef simd__intrinsics_methods[] = {
+/**begin repeat
+ * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #bsfx = b8, b8, b16, b16, b32, b32, b64, b64, b32, b64#
+ * #simd_sup = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ * #fp_only = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sat_sup = 1, 1, 1, 1, 0, 0, 0, 0, 0, 0#
+ * #mul_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 1#
+ * #div_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #fused_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #sum_sup = 0, 0, 0, 0, 0, 0, 0, 0, 1, 1#
+ * #rev64_sup = 1, 1, 1, 1, 1, 1, 0, 0, 1, 0#
+ * #ncont_sup = 0, 0, 0, 0, 1, 1, 1, 1, 1, 1#
+ * #shl_imm = 0, 0, 15, 15, 31, 31, 63, 63, 0, 0#
+ * #shr_imm = 0, 0, 16, 16, 32, 32, 64, 64, 0, 0#
+ */
+#if @simd_sup@
+
+/***************************
+ * Memory
+ ***************************/
+/**begin repeat1
+ * # intrin = load, loada, loads, loadl, store, storea, stores, storel, storeh#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if @ncont_sup@
+/**begin repeat1
+ * #intrin = load_till, load_tillz, loadn, loadn_till, loadn_tillz,
+ * store_till, storen, storen_till#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // ncont_sup
+
+/***************************
+ * Misc
+ ***************************/
+/**begin repeat1
+ * #sfx_to = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ * #simd_sup2 = 1, 1, 1, 1, 1, 1, 1, 1, 1, NPY_SIMD_F64#
+ */
+#if @simd_sup2@
+SIMD_INTRIN_DEF(reinterpret_@sfx_to@_@sfx@)
+#endif // simd_sup2
+/**end repeat1**/
+
+/**begin repeat1
+ * # intrin = set, setf, setall, zero, select#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Reorder
+ ***************************/
+/**begin repeat1
+ * # intrin = combinel, combineh, combine, zip#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+#if @rev64_sup@
+SIMD_INTRIN_DEF(rev64_@sfx@)
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if @shl_imm@ > 0
+/**begin repeat1
+ * # intrin = shl, shr, shli, shri#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // shl_imm
+
+/**begin repeat1
+ * #intrin = and, or, xor, not, cmpeq, cmpneq, cmpgt, cmpge, cmplt, cmple#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_INTRIN_DEF(cvt_@sfx@_@bsfx@)
+SIMD_INTRIN_DEF(cvt_@bsfx@_@sfx@)
+
+/***************************
+ * Arithmetic
+ ***************************/
+/**begin repeat1
+ * #intrin = add, sub#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+
+#if @sat_sup@
+/**begin repeat1
+ * #intrin = adds, subs#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // sat_sup
+
+#if @mul_sup@
+SIMD_INTRIN_DEF(mul_@sfx@)
+#endif // mul_sup
+
+#if @div_sup@
+SIMD_INTRIN_DEF(div_@sfx@)
+#endif // div_sup
+
+#if @fused_sup@
+/**begin repeat1
+ * #intrin = muladd, mulsub, nmuladd, nmulsub#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif // fused_sup
+
+#if @sum_sup@
+SIMD_INTRIN_DEF(sum_@sfx@)
+#endif // sum_sup
+
+/***************************
+ * Math
+ ***************************/
+#if @fp_only@
+/**begin repeat1
+ * #intrin = sqrt, recip, abs, square#
+ */
+SIMD_INTRIN_DEF(@intrin@_@sfx@)
+/**end repeat1**/
+#endif
+#endif // simd_sup
+/**end repeat**/
+/*************************************************************************
+ * Variant
+ ************************************************************************/
+SIMD_INTRIN_DEF(cleanup)
+
+/*************************************************************************
+ * A special section for boolean intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Conversions
+ ***************************/
+// Convert mask vector to integer bitfield
+/**begin repeat
+ * #bsfx = b8, b16, b32, b64#
+ */
+SIMD_INTRIN_DEF(tobits_@bsfx@)
+/**end repeat**/
+
+/************************************************************************/
+{NULL, NULL, 0, NULL}
+}; // PyMethodDef
+
+#endif // NPY_SIMD
+
+//#########################################################################
+//## Defining a separate module for each target
+//#########################################################################
+NPY_VISIBILITY_HIDDEN PyObject *
+NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
+{
+ static struct PyModuleDef defs = {
+ .m_base = PyModuleDef_HEAD_INIT,
+ .m_size = -1,
+ #ifdef NPY__CPU_TARGET_CURRENT
+ .m_name = "numpy.core._simd." NPY_TOSTRING(NPY__CPU_TARGET_CURRENT),
+ #else
+ .m_name = "numpy.core._simd.baseline",
+ #endif
+ #if NPY_SIMD
+ .m_methods = simd__intrinsics_methods
+ #else
+ .m_methods = NULL
+ #endif
+ };
+ PyObject *m = PyModule_Create(&defs);
+ if (m == NULL) {
+ return NULL;
+ }
+ if (PyModule_AddIntConstant(m, "simd", NPY_SIMD)) {
+ goto err;
+ }
+ if (PyModule_AddIntConstant(m, "simd_f64", NPY_SIMD_F64)) {
+ goto err;
+ }
+ if (PyModule_AddIntConstant(m, "simd_width", NPY_SIMD_WIDTH)) {
+ goto err;
+ }
+#if NPY_SIMD
+ if (PySIMDVectorType_Init(m)) {
+ goto err;
+ }
+ /**begin repeat
+ * #sfx = u8, s8, u16, s16, u32, s32, u64, s64, f32, f64#
+ */
+ if (PyModule_AddIntConstant(m, "nlanes_@sfx@", npyv_nlanes_@sfx@)) {
+ goto err;
+ }
+ /**end repeat**/
+#endif // NPY_SIMD
+ return m;
+err:
+ Py_DECREF(m);
+ return NULL;
+}
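Every ``SIMD_INTRIN_DEF`` entry above becomes a Python-callable wrapper in the
per-target module, and ``simd_create_module`` also exposes the ``simd``,
``simd_f64``, ``simd_width`` and ``nlanes_*`` constants. A small sketch that
exercises a few of the generated wrappers through the baseline module (names follow
the ``@intrin@_@sfx@`` pattern; availability depends on how NumPy was built):

    from numpy.core._simd import baseline as npyv

    if npyv.simd:
        n = npyv.nlanes_f32
        v2 = npyv.setall_f32(2.0)             # broadcast a scalar to every lane
        v4 = npyv.add_f32(v2, v2)             # lane-wise addition
        assert npyv.sum_f32(v4) == 4.0 * n    # horizontal sum over all lanes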
diff --git a/numpy/core/src/_simd/_simd.h b/numpy/core/src/_simd/_simd.h
new file mode 100644
index 000000000..d9905c801
--- /dev/null
+++ b/numpy/core/src/_simd/_simd.h
@@ -0,0 +1,30 @@
+/**
+ * A module to expose the NumPy C SIMD vectorization interface "NPYV" for testing purposes.
+ *
+ * Please keep this module independent from other c-extension modules,
+ * since NPYV intrinsics may be involved in their functionality,
+ * which increases the degree of complexity in tracking and detecting errors.
+ *
+ * TODO: Add an independent sphinx doc.
+ *
+ * Please add any new NPYV intrinsics in '_simd.dispatch.c.src'.
+ */
+#ifndef _SIMD_SIMD_H_
+#define _SIMD_SIMD_H_
+
+#include <Python.h>
+#include "numpy/npy_common.h"
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+// autogenerated, required for CPU dispatch macros
+#include "_simd.dispatch.h"
+#endif
+/**
+ * Create a new module for each required optimization that contains all NPYV intrinsics.
+ *
+ * If the required optimization is not supported by NPYV, the module will still provide
+ * access to the NPYV constants NPY_SIMD, NPY_SIMD_F64, and NPY_SIMD_WIDTH, but without
+ * any intrinsics.
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_VISIBILITY_HIDDEN PyObject *simd_create_module, (void))
+#endif // _SIMD_SIMD_H_
diff --git a/numpy/core/src/_simd/_simd_arg.inc b/numpy/core/src/_simd/_simd_arg.inc
new file mode 100644
index 000000000..f5bcf5487
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_arg.inc
@@ -0,0 +1,85 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+/************************************
+ ** Protected Definitions
+ ************************************/
+static int
+simd_arg_from_obj(PyObject *obj, simd_arg *arg)
+{
+ assert(arg->dtype != 0);
+ const simd_data_info *info = simd_data_getinfo(arg->dtype);
+ if (info->is_scalar) {
+ arg->data = simd_scalar_from_number(obj, arg->dtype);
+ }
+ else if (info->is_sequence) {
+ unsigned min_seq_size = simd_data_getinfo(info->to_vector)->nlanes;
+ arg->data.qu8 = simd_sequence_from_iterable(obj, arg->dtype, min_seq_size);
+ }
+ else if (info->is_vectorx) {
+ arg->data = simd_vectorx_from_tuple(obj, arg->dtype);
+ }
+ else if (info->is_vector) {
+ arg->data = PySIMDVector_AsData((PySIMDVectorObject*)obj, arg->dtype);
+ } else {
+ arg->data.u64 = 0;
+ PyErr_Format(PyExc_RuntimeError,
+ "unhandled arg from obj type id:%d, name:%s", arg->dtype, info->pyname
+ );
+ return -1;
+ }
+ if (PyErr_Occurred()) {
+ return -1;
+ }
+ return 0;
+}
+
+static PyObject *
+simd_arg_to_obj(const simd_arg *arg)
+{
+ assert(arg->dtype != 0);
+ const simd_data_info *info = simd_data_getinfo(arg->dtype);
+ if (info->is_scalar) {
+ return simd_scalar_to_number(arg->data, arg->dtype);
+ }
+ if (info->is_sequence) {
+ return simd_sequence_to_list(arg->data.qu8, arg->dtype);
+ }
+ if (info->is_vectorx) {
+ return simd_vectorx_to_tuple(arg->data, arg->dtype);
+ }
+ if (info->is_vector) {
+ return (PyObject*)PySIMDVector_FromData(arg->data, arg->dtype);
+ }
+ PyErr_Format(PyExc_RuntimeError,
+ "unhandled arg to object type id:%d, name:%s", arg->dtype, info->pyname
+ );
+ return NULL;
+}
+
+static void
+simd_arg_free(simd_arg *arg)
+{
+ const simd_data_info *info = simd_data_getinfo(arg->dtype);
+ if (info->is_sequence) {
+ simd_sequence_free(arg->data.qu8);
+ }
+}
+
+static int
+simd_arg_converter(PyObject *obj, simd_arg *arg)
+{
+ if (obj != NULL) {
+ if (simd_arg_from_obj(obj, arg) < 0) {
+ return 0;
+ }
+ arg->obj = obj;
+ return Py_CLEANUP_SUPPORTED;
+ } else {
+ simd_arg_free(arg);
+ }
+ return 1;
+}
diff --git a/numpy/core/src/_simd/_simd_convert.inc b/numpy/core/src/_simd/_simd_convert.inc
new file mode 100644
index 000000000..73869ef1f
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_convert.inc
@@ -0,0 +1,210 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+/************************************
+ ** Protected Definitions
+ ************************************/
+static simd_data
+simd_scalar_from_number(PyObject *obj, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_scalar && info->lane_size > 0);
+ simd_data data;
+ if (info->is_float) {
+ data.f64 = PyFloat_AsDouble(obj);
+ if (dtype == simd_data_f32){
+ data.f32 = (float)data.f64;
+ }
+ } else {
+ data.u64 = PyLong_AsUnsignedLongLongMask(obj);
+ }
+ return data;
+}
+
+static PyObject *
+simd_scalar_to_number(simd_data data, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_scalar && info->lane_size > 0);
+ if (info->is_float) {
+ if (dtype == simd_data_f32) {
+ return PyFloat_FromDouble(data.f32);
+ }
+ return PyFloat_FromDouble(data.f64);
+ }
+ int leftb = (sizeof(npyv_lanetype_u64) - info->lane_size) * 8;
+ data.u64 <<= leftb;
+ if (info->is_signed) {
+ return PyLong_FromLongLong(data.s64 >> leftb);
+ }
+ return PyLong_FromUnsignedLongLong(data.u64 >> leftb);
+}
+
+typedef struct {
+ Py_ssize_t len;
+ void *ptr;
+} simd__alloc_data;
+
+static void *
+simd_sequence_new(Py_ssize_t len, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(len > 0 && info->is_sequence && info->lane_size > 0);
+ size_t size = sizeof(simd__alloc_data) + len * info->lane_size + NPY_SIMD_WIDTH;
+ void *ptr = malloc(size);
+ if (ptr == NULL) {
+ return PyErr_NoMemory();
+ }
+ // align the pointer
+ simd__alloc_data *a_ptr = (simd__alloc_data *)(
+ ((uintptr_t)ptr + sizeof(simd__alloc_data) + NPY_SIMD_WIDTH) & ~(uintptr_t)(NPY_SIMD_WIDTH-1)
+ );
+ a_ptr[-1].len = len;
+ a_ptr[-1].ptr = ptr;
+ return a_ptr;
+}
+
+static Py_ssize_t
+simd_sequence_len(void const *ptr)
+{
+ return ((simd__alloc_data const*)ptr)[-1].len;
+}
+
+static void
+simd_sequence_free(void *ptr)
+{
+ free(((simd__alloc_data *)ptr)[-1].ptr);
+}
+
+static void *
+simd_sequence_from_iterable(PyObject *obj, simd_data_type dtype, Py_ssize_t min_size)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_sequence && info->lane_size > 0);
+ PyObject *seq_obj = PySequence_Fast(obj, "expected a sequence");
+ if (seq_obj == NULL) {
+ return NULL;
+ }
+ Py_ssize_t seq_size = PySequence_Fast_GET_SIZE(seq_obj);
+ if (seq_size < min_size) {
+ PyErr_Format(PyExc_ValueError,
+ "minimum acceptable size of the required sequence is %d, given(%d)",
+ min_size, seq_size
+ );
+ return NULL;
+ }
+ npyv_lanetype_u8 *dst = simd_sequence_new(seq_size, dtype);
+ if (dst == NULL) {
+ return NULL;
+ }
+ PyObject **seq_items = PySequence_Fast_ITEMS(seq_obj);
+ for (Py_ssize_t i = 0; i < seq_size; ++i) {
+ simd_data data = simd_scalar_from_number(seq_items[i], info->to_scalar);
+ npyv_lanetype_u8 *sdst = dst + i * info->lane_size;
+ memcpy(sdst, &data.u64, info->lane_size);
+ }
+ Py_DECREF(seq_obj);
+
+ if (PyErr_Occurred()) {
+ simd_sequence_free(dst);
+ return NULL;
+ }
+ return dst;
+}
+
+static int
+simd_sequence_fill_iterable(PyObject *obj, const void *ptr, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ if (!PySequence_Check(obj)) {
+ PyErr_Format(PyExc_TypeError,
+ "a sequence object is required to fill %s", info->pyname
+ );
+ return -1;
+ }
+ const npyv_lanetype_u8 *src = ptr;
+ Py_ssize_t seq_len = simd_sequence_len(ptr);
+ for (Py_ssize_t i = 0; i < seq_len; ++i) {
+ const npyv_lanetype_u8 *ssrc = src + i * info->lane_size;
+ simd_data data;
+ memcpy(&data.u64, ssrc, info->lane_size);
+ PyObject *item = simd_scalar_to_number(data, info->to_scalar);
+ if (item == NULL) {
+ return -1;
+ }
+ int res = PySequence_SetItem(obj, i, item);
+ Py_DECREF(item);
+ if (res < 0) {
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static PyObject *
+simd_sequence_to_list(const void *ptr, simd_data_type dtype)
+{
+ PyObject *list = PyList_New(simd_sequence_len(ptr));
+ if (list == NULL) {
+ return NULL;
+ }
+ if (simd_sequence_fill_iterable(list, ptr, dtype) < 0) {
+ Py_DECREF(list);
+ return NULL;
+ }
+ return list;
+}
+
+static simd_data
+simd_vectorx_from_tuple(PyObject *obj, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ // NPYV currently only supports x2 and x3
+ assert(info->is_vectorx > 1 && info->is_vectorx < 4);
+
+ simd_data data = {.u64 = 0};
+ if (!PyTuple_Check(obj) || PyTuple_GET_SIZE(obj) != info->is_vectorx) {
+ PyErr_Format(PyExc_TypeError,
+ "a tuple of %d vector type %s is required",
+ info->is_vectorx, simd_data_getinfo(info->to_vector)->pyname
+ );
+ return data;
+ }
+ for (int i = 0; i < info->is_vectorx; ++i) {
+ PyObject *item = PyTuple_GET_ITEM(obj, i);
+ // get the max multi-vec and let the compiler do the rest
+ data.vu64x3.val[i] = PySIMDVector_AsData((PySIMDVectorObject*)item, info->to_vector).vu64;
+ if (PyErr_Occurred()) {
+ return data;
+ }
+ }
+ return data;
+}
+
+static PyObject *
+simd_vectorx_to_tuple(simd_data data, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ // NPYV currently only supports x2 and x3
+ assert(info->is_vectorx > 1 && info->is_vectorx < 4);
+
+ PyObject *tuple = PyTuple_New(info->is_vectorx);
+ if (tuple == NULL) {
+ return NULL;
+ }
+ for (int i = 0; i < info->is_vectorx; ++i) {
+ // get the max multi-vector and let the compiler handle the rest
+ simd_data vdata = {.vu64 = data.vu64x3.val[i]};
+ PyObject *item = (PyObject*)PySIMDVector_FromData(vdata, info->to_vector);
+ if (item == NULL) {
+ // TODO: improve log add item number
+ Py_DECREF(tuple);
+ return NULL;
+ }
+ PyTuple_SET_ITEM(tuple, i, item);
+ }
+ return tuple;
+}
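``simd_sequence_new`` above over-allocates by ``sizeof(simd__alloc_data) +
NPY_SIMD_WIDTH`` bytes, rounds the pointer up to a multiple of the SIMD width, and
stores the sequence length together with the original ``malloc`` pointer just in
front of the aligned block (which is why ``simd_sequence_len`` and
``simd_sequence_free`` index ``ptr[-1]``). A sketch of the same address arithmetic in
plain integers; the header and width values are stand-ins chosen for illustration:

    HEADER = 16    # stand-in for sizeof(simd__alloc_data)
    WIDTH = 32     # stand-in for NPY_SIMD_WIDTH

    def aligned_offset(raw_addr):
        # rounding (raw + HEADER + WIDTH) down to a multiple of WIDTH always
        # lands at least HEADER bytes past raw, so the metadata fits in front
        return (raw_addr + HEADER + WIDTH) & ~(WIDTH - 1)

    for raw in (0, 1, 17, 31, 32, 1000003):
        addr = aligned_offset(raw)
        assert addr % WIDTH == 0 and addr - raw >= HEADER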
diff --git a/numpy/core/src/_simd/_simd_data.inc.src b/numpy/core/src/_simd/_simd_data.inc.src
new file mode 100644
index 000000000..5c796487c
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_data.inc.src
@@ -0,0 +1,93 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+/************************************
+ ** Private Definitions
+ ************************************/
+static simd_data_info simd__data_registry[simd_data_end] =
+{
+ [simd_data_none] = {.pyname="none"},
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ * #name = int*8, float, float#
+ */
+ [simd_data_@sfx@] = {
+ .pyname="@name@", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_scalar=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // sequences
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ * #name = int*8, float, float#
+ */
+ [simd_data_q@sfx@] = {
+ .pyname="[@name@]", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_sequence=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // vectors
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ */
+ [simd_data_v@sfx@] = {
+ .pyname="npyv_@sfx@", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_vector=1, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // boolean vectors, treated as unsigned and converted internally
+ // to add compatibility among all SIMD extensions
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64#
+ * #bsfx = b8, b16, b32, b64#
+ */
+ [simd_data_v@bsfx@] = {
+ .pyname="npyv_@bsfx@", .is_bool=1, .is_vector=1,
+ .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=npyv_nlanes_@sfx@, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // multi-vectors x2
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ */
+ [simd_data_v@sfx@x2] = {
+ .pyname="npyv_@sfx@x2", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_vectorx=2, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=2, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+ // multi-vectors x3
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ * #sig = 0*4, 1*4, 0*2#
+ * #fp = 0*4, 0*4, 1*2#
+ */
+ [simd_data_v@sfx@x3] = {
+ .pyname="npyv_@sfx@x3", .is_unsigned=!@sig@&&!@fp@, .is_signed=@sig@, .is_float=@fp@,
+ .is_vectorx=3, .to_scalar = simd_data_@sfx@, .to_vector = simd_data_v@sfx@,
+ .nlanes=3, .lane_size = sizeof(npyv_lanetype_@sfx@)
+ },
+ /**end repeat**/
+};
+
+/************************************
+ ** Protected Definitions
+ ************************************/
+static const simd_data_info *
+simd_data_getinfo(simd_data_type dtype)
+{ return &simd__data_registry[dtype]; }
diff --git a/numpy/core/src/_simd/_simd_easyintrin.inc b/numpy/core/src/_simd/_simd_easyintrin.inc
new file mode 100644
index 000000000..54e7ccf01
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_easyintrin.inc
@@ -0,0 +1,214 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+#define SIMD_INTRIN_DEF(NAME) \
+ { NPY_TOSTRING(NAME), simd__intrin_##NAME, METH_VARARGS, NULL } , // comma
+
+#define SIMD_IMPL_INTRIN_0(NAME, RET) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ if (!PyArg_ParseTuple( \
+ args, ":" NPY_TOSTRING(NAME)) \
+ ) return NULL; \
+ simd_arg a = { \
+ .dtype = simd_data_##RET, \
+ .data = {.RET = npyv_##NAME()}, \
+ }; \
+ return simd_arg_to_obj(&a); \
+ }
+
+#define SIMD_IMPL_INTRIN_0N(NAME) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ if (!PyArg_ParseTuple( \
+ args, ":" NPY_TOSTRING(NAME)) \
+ ) return NULL; \
+ npyv_##NAME(); \
+ Py_RETURN_NONE; \
+ }
+
+#define SIMD_IMPL_INTRIN_1(NAME, RET, IN0) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg = {.dtype = simd_data_##IN0}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg \
+ )) return NULL; \
+ simd_data data = {.RET = npyv_##NAME( \
+ arg.data.IN0 \
+ )}; \
+ simd_arg_free(&arg); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+
+#define SIMD_IMPL_INTRIN_2(NAME, RET, IN0, IN1) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg1 = {.dtype = simd_data_##IN0}; \
+ simd_arg arg2 = {.dtype = simd_data_##IN1}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg1, \
+ simd_arg_converter, &arg2 \
+ )) return NULL; \
+ simd_data data = {.RET = npyv_##NAME( \
+ arg1.data.IN0, arg2.data.IN1 \
+ )}; \
+ simd_arg_free(&arg1); \
+ simd_arg_free(&arg2); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+
+#define SIMD__REPEAT_2IMM(C, NAME, IN0) \
+ C == arg2.data.u8 ? NPY_CAT(npyv_, NAME)(arg1.data.IN0, C) :
+
+#define SIMD_IMPL_INTRIN_2IMM(NAME, RET, IN0, CONST_RNG) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg1 = {.dtype = simd_data_##IN0}; \
+ simd_arg arg2 = {.dtype = simd_data_u8}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg1, \
+ simd_arg_converter, &arg2 \
+ )) return NULL; \
+ simd_data data; \
+ data.RET = NPY_CAT(SIMD__IMPL_COUNT_, CONST_RNG)( \
+ SIMD__REPEAT_2IMM, NAME, IN0 \
+ ) npyv_##NAME(arg1.data.IN0, 0); \
+ simd_arg_free(&arg1); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+
+#define SIMD_IMPL_INTRIN_3(NAME, RET, IN0, IN1, IN2) \
+ static PyObject *simd__intrin_##NAME \
+ (PyObject* NPY_UNUSED(self), PyObject *args) \
+ { \
+ simd_arg arg1 = {.dtype = simd_data_##IN0}; \
+ simd_arg arg2 = {.dtype = simd_data_##IN1}; \
+ simd_arg arg3 = {.dtype = simd_data_##IN2}; \
+ if (!PyArg_ParseTuple( \
+ args, "O&O&O&:"NPY_TOSTRING(NAME), \
+ simd_arg_converter, &arg1, \
+ simd_arg_converter, &arg2, \
+ simd_arg_converter, &arg3 \
+ )) return NULL; \
+ simd_data data = {.RET = npyv_##NAME( \
+ arg1.data.IN0, arg2.data.IN1, \
+ arg3.data.IN2 \
+ )}; \
+ simd_arg_free(&arg1); \
+ simd_arg_free(&arg2); \
+ simd_arg_free(&arg3); \
+ simd_arg ret = { \
+ .data = data, .dtype = simd_data_##RET \
+ }; \
+ return simd_arg_to_obj(&ret); \
+ }
+/**
+ * Helper macros for repeating and expanding a certain macro.
+ * Mainly used for converting a scalar to an immediate constant.
+ */
+#define SIMD__IMPL_COUNT_7(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_8(FN, ...) \
+ SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(8, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_15(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_16(FN, ...) \
+ SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(16, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_31(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_32(FN, ...) \
+ SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(32, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_47(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_48(FN, ...) \
+ SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(48, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_63(FN, ...) \
+ NPY_EXPAND(FN(0, __VA_ARGS__)) \
+ SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__)
+
+#define SIMD__IMPL_COUNT_64(FN, ...) \
+ SIMD__IMPL_COUNT_63_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(64, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_7_(FN, ...) \
+ NPY_EXPAND(FN(1, __VA_ARGS__)) \
+ NPY_EXPAND(FN(2, __VA_ARGS__)) NPY_EXPAND(FN(3, __VA_ARGS__)) \
+ NPY_EXPAND(FN(4, __VA_ARGS__)) NPY_EXPAND(FN(5, __VA_ARGS__)) \
+ NPY_EXPAND(FN(6, __VA_ARGS__)) NPY_EXPAND(FN(7, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_15_(FN, ...) \
+ SIMD__IMPL_COUNT_7_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(8, __VA_ARGS__)) NPY_EXPAND(FN(9, __VA_ARGS__)) \
+ NPY_EXPAND(FN(10, __VA_ARGS__)) NPY_EXPAND(FN(11, __VA_ARGS__)) \
+ NPY_EXPAND(FN(12, __VA_ARGS__)) NPY_EXPAND(FN(13, __VA_ARGS__)) \
+ NPY_EXPAND(FN(14, __VA_ARGS__)) NPY_EXPAND(FN(15, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_31_(FN, ...) \
+ SIMD__IMPL_COUNT_15_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(16, __VA_ARGS__)) NPY_EXPAND(FN(17, __VA_ARGS__)) \
+ NPY_EXPAND(FN(18, __VA_ARGS__)) NPY_EXPAND(FN(19, __VA_ARGS__)) \
+ NPY_EXPAND(FN(20, __VA_ARGS__)) NPY_EXPAND(FN(21, __VA_ARGS__)) \
+ NPY_EXPAND(FN(22, __VA_ARGS__)) NPY_EXPAND(FN(23, __VA_ARGS__)) \
+ NPY_EXPAND(FN(24, __VA_ARGS__)) NPY_EXPAND(FN(25, __VA_ARGS__)) \
+ NPY_EXPAND(FN(26, __VA_ARGS__)) NPY_EXPAND(FN(27, __VA_ARGS__)) \
+ NPY_EXPAND(FN(28, __VA_ARGS__)) NPY_EXPAND(FN(29, __VA_ARGS__)) \
+ NPY_EXPAND(FN(30, __VA_ARGS__)) NPY_EXPAND(FN(31, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_47_(FN, ...) \
+ SIMD__IMPL_COUNT_31_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(32, __VA_ARGS__)) NPY_EXPAND(FN(33, __VA_ARGS__)) \
+ NPY_EXPAND(FN(34, __VA_ARGS__)) NPY_EXPAND(FN(35, __VA_ARGS__)) \
+ NPY_EXPAND(FN(36, __VA_ARGS__)) NPY_EXPAND(FN(37, __VA_ARGS__)) \
+ NPY_EXPAND(FN(38, __VA_ARGS__)) NPY_EXPAND(FN(39, __VA_ARGS__)) \
+ NPY_EXPAND(FN(40, __VA_ARGS__)) NPY_EXPAND(FN(41, __VA_ARGS__)) \
+ NPY_EXPAND(FN(42, __VA_ARGS__)) NPY_EXPAND(FN(43, __VA_ARGS__)) \
+ NPY_EXPAND(FN(44, __VA_ARGS__)) NPY_EXPAND(FN(45, __VA_ARGS__)) \
+ NPY_EXPAND(FN(46, __VA_ARGS__)) NPY_EXPAND(FN(47, __VA_ARGS__))
+
+#define SIMD__IMPL_COUNT_63_(FN, ...) \
+ SIMD__IMPL_COUNT_47_(FN, __VA_ARGS__) \
+ NPY_EXPAND(FN(48, __VA_ARGS__)) NPY_EXPAND(FN(49, __VA_ARGS__)) \
+ NPY_EXPAND(FN(50, __VA_ARGS__)) NPY_EXPAND(FN(51, __VA_ARGS__)) \
+ NPY_EXPAND(FN(52, __VA_ARGS__)) NPY_EXPAND(FN(53, __VA_ARGS__)) \
+ NPY_EXPAND(FN(54, __VA_ARGS__)) NPY_EXPAND(FN(55, __VA_ARGS__)) \
+ NPY_EXPAND(FN(56, __VA_ARGS__)) NPY_EXPAND(FN(57, __VA_ARGS__)) \
+ NPY_EXPAND(FN(58, __VA_ARGS__)) NPY_EXPAND(FN(59, __VA_ARGS__)) \
+ NPY_EXPAND(FN(60, __VA_ARGS__)) NPY_EXPAND(FN(61, __VA_ARGS__)) \
+ NPY_EXPAND(FN(62, __VA_ARGS__)) NPY_EXPAND(FN(63, __VA_ARGS__))
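``SIMD_IMPL_INTRIN_2IMM`` cannot pass a runtime integer where an intrinsic demands a
compile-time immediate, so the ``SIMD__IMPL_COUNT_<N>`` helpers expand
``SIMD__REPEAT_2IMM`` into a chain of ternaries, one branch per admissible constant,
and each branch calls the intrinsic with a literal immediate. A small sketch that
prints roughly what that expansion looks like for ``shli_u16`` (constants 0..15; the
formatting is illustrative, not the exact preprocessor output):

    branches = " ".join(
        f"{c} == arg2.data.u8 ? npyv_shli_u16(arg1.data.vu16, {c}) :"
        for c in range(16)
    )
    fallback = "npyv_shli_u16(arg1.data.vu16, 0)"
    print(branches, fallback)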
diff --git a/numpy/core/src/_simd/_simd_inc.h.src b/numpy/core/src/_simd/_simd_inc.h.src
new file mode 100644
index 000000000..9858fc0dc
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_inc.h.src
@@ -0,0 +1,421 @@
+#ifndef _SIMD_SIMD_INC_H_
+#define _SIMD_SIMD_INC_H_
+
+#include <Python.h>
+#include "simd/simd.h"
+
+#if NPY_SIMD
+/************************************
+ ** Types
+ ************************************/
+/**
+ * Gather all data types supported by the module.
+*/
+typedef union
+{
+ // scalars
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ npyv_lanetype_@sfx@ @sfx@;
+ /**end repeat**/
+ // sequence
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ npyv_lanetype_@sfx@ *q@sfx@;
+ /**end repeat**/
+ // vectors
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, b8, b16, b32, b64#
+ */
+ npyv_@sfx@ v@sfx@;
+ /**end repeat**/
+ // multi-vectors x2
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32#
+ */
+ npyv_@sfx@x2 v@sfx@x2;
+ /**end repeat**/
+ // multi-vectors x3
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32#
+ */
+ npyv_@sfx@x3 v@sfx@x3;
+ /**end repeat**/
+#if NPY_SIMD_F64
+ npyv_f64 vf64;
+ npyv_f64x2 vf64x2;
+ npyv_f64x3 vf64x3;
+#endif
+} simd_data;
+
+/**
+ * Data type IDs and suffixes. Must be the same data types as the ones
+ * in union 'simd_data' to fit the macros in '_simd_easyintrin.inc'.
+*/
+typedef enum
+{
+ simd_data_none = 0,
+ // scalars
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_@sfx@,
+ /**end repeat**/
+ // sequences
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_q@sfx@,
+ /**end repeat**/
+ // vectors
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64, b8, b16, b32, b64#
+ */
+ simd_data_v@sfx@,
+ /**end repeat**/
+ // multi-vectors x2
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_v@sfx@x2,
+ /**end repeat**/
+ // multi-vectors x3
+ /**begin repeat
+ * #sfx = u8, u16, u32, u64, s8, s16, s32, s64, f32, f64#
+ */
+ simd_data_v@sfx@x3,
+ /**end repeat**/
+ simd_data_end,
+} simd_data_type;
+/************************************
+ ** Declarations (inc_data)
+ ************************************/
+/**
+ * simd_data_type information
+ */
+typedef struct
+{
+ // type name compatible with python style
+ const char *pyname;
+ // returns '1' if the type represents an unsigned integer
+ int is_unsigned:1;
+ // returns '1' if the type represents a signed integer
+ int is_signed:1;
+ // returns '1' if the type represents a single or double precision float
+ int is_float:1;
+ // returns '1' if the type represents a boolean
+ int is_bool:1;
+ // returns '1' if the type represents a sequence
+ int is_sequence:1;
+ // returns '1' if the type represents a scalar
+ int is_scalar:1;
+ // returns '1' if the type represents a vector
+ int is_vector:1;
+ // returns the length of the multi-vector if the type represents an x2 or x3 vector,
+ // otherwise returns 0, e.g. returns 2 if the data type is simd_data_vu8x2
+ int is_vectorx;
+ // returns the equivalent scalar data type, e.g. simd_data_vu8 -> simd_data_u8
+ simd_data_type to_scalar;
+ // returns the equivalent vector data type, e.g. simd_data_s8 -> simd_data_vs8
+ // NOTE: returns the equivalent "unsigned" vector type in case of a "boolean" vector
+ // e.g. simd_data_vb8 -> simd_data_vu8
+ simd_data_type to_vector;
+ // number of vector lanes
+ int nlanes;
+ // sizeof lane type
+ int lane_size;
+} simd_data_info;
+
+/**
+ * Returns the data info of a certain dtype.
+ *
+ * Example:
+ ** const simd_data_info *info = simd_data_getinfo(simd_data_vu8);
+ ** if (info->is_vector && info->is_unsigned) {
+ ** ...
+ ** }
+ */
+static const simd_data_info *
+simd_data_getinfo(simd_data_type dtype);
+
+/************************************
+ ** Declarations (inc_vector)
+ ************************************/
+typedef struct
+{
+ PyObject_HEAD
+ // vector type id
+ simd_data_type dtype;
+ // vector data, aligned for safe casting
+ npyv_lanetype_u8 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) data[NPY_SIMD_WIDTH];
+} PySIMDVectorObject;
+/**
+ * Create a Python obj(PySIMDVectorObject) from an NPYV vector based on the contents
+ * of `data`(simd_data), according to the vector data type `dtype`
+ * on range(simd_data_[vu8:vf64]).
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ *
+ * Example:
+ ** simd_data data = {.vu8 = npyv_setall_u8(0xff)};
+ ** PySIMDVectorObject *obj = PySIMDVector_FromData(data, simd_data_vu8);
+ ** if (obj != NULL) {
+ ** printf("I have a valid vector obj and first element is \n", obj->data[0]);
+ ** Py_DECREF(obj);
+ ** }
+ */
+static PySIMDVectorObject *
+PySIMDVector_FromData(simd_data data, simd_data_type dtype);
+/**
+ * Return an NPYV vector(simd_data) representation of `obj`(PySIMDVectorObject),
+ * according to the vector data type `dtype` on range (simd_data_[vu8:vf64]).
+ * Raise a Python exception on failure.
+ *
+ * Example:
+ ** simd_data data = PySIMDVector_AsData(vec_obj, simd_data_vf32);
+ ** if (!PyErr_Occurred()) {
+ ** npyv_f32 add_1 = npyv_add_f32(data.vf32, npyv_setall_f32(1));
+ ** ...
+ ** }
+ */
+static simd_data
+PySIMDVector_AsData(PySIMDVectorObject *obj, simd_data_type dtype);
+/**
+ * Initialize and register PySIMDVectorType to a certain PyModule;
+ * PySIMDVectorType can be reached through the attribute 'vector_type'.
+ * Return -1 on error, 0 on success.
+ */
+static int
+PySIMDVectorType_Init(PyObject *module);
+
+/************************************
+ ** Declarations (inc_convert)
+ ************************************/
+/**
+ * Return a C scalar(simd_data) representation of `obj`,
+ * according to the scalar data type `dtype` on range (simd_data_[u8:f64]).
+ * Raise a Python exception on failure.
+ *
+ * Example:
+ ** simd_data data = simd_scalar_from_number(obj, simd_data_f32);
+ ** if (!PyErr_Occurred()) {
+ ** printf("I have a valid float %d\n", data.f32);
+ ** }
+ */
+static simd_data
+simd_scalar_from_number(PyObject *obj, simd_data_type dtype);
+/**
+ * Create a Python scalar from a C scalar based on the contents
+ * of `data`(simd_data) and according to the scalar data type `dtype`
+ * on range(simd_data_[u8:f64]).
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ *
+ * Example:
+ ** simd_data data = {.u32 = 0x7fffffff};
+ ** PyObject *obj = simd_scalar_to_number(data, simd_data_s32);
+ ** if (obj != NULL) {
+ ** printf("I have a valid Python integer %d\n", PyLong_AsLong(obj));
+ ** Py_DECREF(obj);
+ ** }
+ */
+static PyObject *
+simd_scalar_to_number(simd_data data, simd_data_type dtype);
+/**
+ * Allocate a C array in memory according to number of elements `len`
+ * and sequence data type `dtype` on range(simd_data_[qu8:qf64]).
+ *
+ * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL
+ * with a Python exception on failure.
+ *
+ * Example:
+ ** npyv_lanetype_f64 *aligned_ptr = simd_sequence_new(npyv_nlanes_f64, simd_data_f64);
+ ** if (aligned_ptr != NULL) {
+ ** // aligned store
+ ** npyv_storea_f64(aligned_ptr, npyv_setall_f64(1.0));
+ ** printf("The first element of my array %f\n", aligned_ptr[0]);
+ ** simd_sequence_free(aligned_ptr);
+ ** }
+ */
+static void *
+simd_sequence_new(Py_ssize_t len, simd_data_type dtype);
+/**
+ * Return the number of elements of the C array `ptr` allocated
+ * by `simd_sequence_new()` or `simd_sequence_from_iterable()`.
+ */
+static Py_ssize_t
+simd_sequence_len(const void *ptr);
+/**
+ * Free a C array allocated by `simd_sequence_new()` or
+ * `simd_sequence_from_iterable()`.
+ */
+static void
+simd_sequence_free(void *ptr);
+/**
+ * Return a C array representation of a PyObject sequence `obj`,
+ * according to the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Note: parameter `min_size` is the minimum acceptable number of elements.
+ *
+ * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL
+ * with a Python exception on failure.
+ *
+ * Example:
+ ** npyv_lanetype_u32 *ptr = simd_sequence_from_iterable(seq_obj, simd_data_qu32, npyv_nlanes_u32);
+ ** if (ptr != NULL) {
+ ** npyv_u32 a = npyv_load_u32(ptr);
+ ** ...
+ ** simd_sequence_free(ptr);
+ ** }
+ **
+ */
+static void *
+simd_sequence_from_iterable(PyObject *obj, simd_data_type dtype, Py_ssize_t min_size);
+/**
+ * Fill a Python sequence object `obj` with a C array `ptr` allocated by
+ * `simd_sequence_new()` or `simd_sequence_from_iterable()` according to
+ * the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Return 0 on success or -1 with a Python exception on failure.
+ */
+static int
+simd_sequence_fill_iterable(PyObject *obj, const void *ptr, simd_data_type dtype);
+/**
+ * Create a Python list from a C array `ptr` allocated by
+ * `simd_sequence_new()` or `simd_sequence_from_iterable()` according to
+ * the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ */
+static PyObject *
+simd_sequence_to_list(const void *ptr, simd_data_type dtype);
+/**
+ * Return a SIMD multi-vector(simd_data) representation of a Python tuple of
+ * (simd_vector*,) `obj` according to the multi-vector data type `dtype`
+ * on range (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]).
+ *
+ * Raise a Python exception on failure.
+ *
+ * Example:
+ ** simd_data data = simd_vectorx_from_tuple(tuple_obj, simd_data_vf32x2);
+ ** if (!PyErr_Occurred()) {
+ ** npyv_f32 sum = npyv_add_f32(data.vf32x2.val[0], data.vf32x2.val[1]);
+ ** ...
+ ** }
+ **
+ */
+static simd_data
+simd_vectorx_from_tuple(PyObject *obj, simd_data_type dtype);
+/**
+ * Create a Python tuple of 'simd_vector' from a SIMD multi-vector
+ * based on the contents of `data`(simd_data) and according to
+ * the multi-vector data type `dtype` on range
+ * (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]).
+ *
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ */
+static PyObject *
+simd_vectorx_to_tuple(simd_data data, simd_data_type dtype);
+
+/************************************
+ ** Declarations (inc_arg)
+ ************************************/
+typedef struct
+{
+ simd_data_type dtype;
+ simd_data data;
+ // set by simd_arg_converter()
+ PyObject *obj;
+} simd_arg;
+/**
+ * The following functions gather all conversions between all data types
+ * and they can be used instead of all the above functions.
+ */
+/**
+ * Convert a Python object `obj` into simd_data `arg->data` according to the
+ * required data type `arg->dtype`.
+ *
+ * Return -1 and raise Python exception on failure, otherwise return 0.
+ *
+ * Notes:
+ * - requires `simd_arg_free()` or `simd_sequence_free()`
+ * to free the allocated C array, in case of sequence data types.
+ * - the minimum acceptable number of elements for sequence data
+ * types is the number of lanes of the equivalent vector data type.
+ *
+ * Example #1:
+ ** simd_arg arg = {.dtype = simd_data_qu8};
+ ** if (simd_arg_from_obj(seq_obj, &arg) < 0) {
+ ** // fails to convert a python sequence object to C array of uint8
+ ** return;
+ ** }
+ ** npyv_u8 v_u8 = npyv_load_u8(arg.data.qu8);
+ ** ...
+ ** simd_arg_free(&arg);
+ *
+ * Example #2:
+ ** simd_arg arg = {.dtype = simd_data_vf32};
+ ** if (simd_arg_from_obj(vector_obj, &arg) < 0) {
+ ** // fails to convert a python simd_vector to NPYV vector
+ ** return;
+ ** }
+ ** npyv_f32 add_one = npyv_add_f32(arg.data.vf32, npyv_setall_f32(1));
+ ** ...
+ */
+static int
+simd_arg_from_obj(PyObject *obj, simd_arg *arg);
+/**
+ * Convert a simd_data `arg->data` into a Python object according to the
+ * required data type `arg->dtype`.
+ *
+ * Return NULL and raise Python exception on failure, otherwise return
+ * new reference.
+ *
+ * Example:
+ ** simd_arg arg = {.dtype = simd_data_u32, .data = {.u32 = 0xffffffff}};
+ ** PyObject *obj = simd_arg_to_obj(&arg);
+ ** if (obj == NULL) {
+ ** // fails to convert C uint32 to Python integer.
+ ** return;
+ ** }
+ **
+ */
+static PyObject *
+simd_arg_to_obj(const simd_arg *arg);
+/**
+ * Converter function, similar to simd_arg_from_obj(), intended to be
+ * used with PyArg_Parse*().
+ *
+ * Notes:
+ * - requires `simd_arg_free()` or `simd_sequence_free()`
+ * to free the allocated C array, in case of sequence data types.
+ * - the minimum acceptable number of elements for sequence data
+ * types is the number of lanes of the equivalent vector data type.
+ * - use 'arg->obj' to retrieve the parameter obj.
+ *
+ * Example:
+ ** simd_arg seq_f32 = {.dtype = simd_data_qf32};
+ ** simd_arg vec_f32 = {.dtype = simd_data_vf32};
+ ** if (!PyArg_ParseTuple(
+ ** args, "O&O&:add_sum_f32",
+ ** simd_arg_converter, &seq_f32,
+ ** simd_arg_converter, &vec_f32
+ ** )) {
+ ** // fail
+ ** return;
+ ** }
+ ** npyv_f32 load_a = npyv_load_f32(seq_f32.data.qf32);
+ ** npyv_f32 sum = npyv_add_f32(load_a, vec_f32.data.vf32);
+ ** ...
+ ** simd_arg_free(&seq_f32);
+ */
+static int
+simd_arg_converter(PyObject *obj, simd_arg *arg);
+/**
+ * Free the allocated C array, if the arg holds a sequence data type.
+ */
+static void
+simd_arg_free(simd_arg *arg);
+
+#endif // NPY_SIMD
+#endif // _SIMD_SIMD_INC_H_
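
Taken together, the argument helpers declared above are enough to wrap an NPYV intrinsic as a Python-callable. A minimal sketch (hypothetical, not part of this patch) of such a wrapper for npyv_add_f32, using simd_arg_converter, simd_arg_free and simd_arg_to_obj exactly as documented above:

    static PyObject *
    simd__example_add_f32(PyObject *self, PyObject *args)
    {
        simd_arg a = {.dtype = simd_data_vf32};
        simd_arg b = {.dtype = simd_data_vf32};
        if (!PyArg_ParseTuple(args, "O&O&:add_f32",
                              simd_arg_converter, &a,
                              simd_arg_converter, &b)) {
            return NULL;
        }
        simd_arg ret = {
            .dtype = simd_data_vf32,
            .data  = {.vf32 = npyv_add_f32(a.data.vf32, b.data.vf32)}
        };
        // vector arguments hold no allocated sequence, so these frees are no-ops here
        simd_arg_free(&a);
        simd_arg_free(&b);
        return simd_arg_to_obj(&ret);
    }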
diff --git a/numpy/core/src/_simd/_simd_vector.inc b/numpy/core/src/_simd/_simd_vector.inc
new file mode 100644
index 000000000..2a1378f22
--- /dev/null
+++ b/numpy/core/src/_simd/_simd_vector.inc
@@ -0,0 +1,178 @@
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+/************************************
+ ** Private Definitions
+ ************************************/
+static Py_ssize_t
+simd__vector_length(PySIMDVectorObject *self)
+{
+ return simd_data_getinfo(self->dtype)->nlanes;
+}
+static PyObject *
+simd__vector_item(PySIMDVectorObject *self, Py_ssize_t i)
+{
+ const simd_data_info *info = simd_data_getinfo(self->dtype);
+ int nlanes = info->nlanes;
+ if (i >= nlanes) {
+ PyErr_SetString(PyExc_IndexError, "vector index out of range");
+ return NULL;
+ }
+ npyv_lanetype_u8 *src = self->data + i * info->lane_size;
+ simd_data data;
+ memcpy(&data.u64, src, info->lane_size);
+ return simd_scalar_to_number(data, info->to_scalar);
+}
+
+static PySequenceMethods simd__vector_as_sequence = {
+ .sq_length = (lenfunc) simd__vector_length,
+ .sq_item = (ssizeargfunc) simd__vector_item
+};
+
+static PyObject *
+simd__vector_name(PySIMDVectorObject *self)
+{
+ return PyUnicode_FromString(simd_data_getinfo(self->dtype)->pyname);
+}
+static PyGetSetDef simd__vector_getset[] = {
+ { "__name__", (getter)simd__vector_name, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL, NULL }
+};
+
+static PyObject *
+simd__vector_repr(PySIMDVectorObject *self)
+{
+ PyObject *obj = PySequence_List((PyObject*)self);
+ if (obj != NULL) {
+ const char *type_name = simd_data_getinfo(self->dtype)->pyname;
+ PyObject *repr = PyUnicode_FromFormat("<%s of %R>", type_name, obj);
+ Py_DECREF(obj);
+ return repr;
+ }
+ return obj;
+}
+static PyObject *
+simd__vector_compare(PyObject *self, PyObject *other, int cmp_op)
+{
+ PyObject *obj;
+ if (PyTuple_Check(other)) {
+ obj = PySequence_Tuple(self);
+ } else if (PyList_Check(other)) {
+ obj = PySequence_List(self);
+ } else {
+ obj = PySequence_Fast(self, "invalid argument, expected a vector");
+ }
+ if (obj != NULL) {
+ PyObject *rich = PyObject_RichCompare(obj, other, cmp_op);
+ Py_DECREF(obj);
+ return rich;
+ }
+ return obj;
+}
+static PyTypeObject PySIMDVectorType = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ .tp_name = NPY_TOSTRING(NPY_CPU_DISPATCH_CURFX(VECTOR)),
+ .tp_basicsize = sizeof(PySIMDVectorObject),
+ .tp_repr = (reprfunc)simd__vector_repr,
+ .tp_as_sequence = &simd__vector_as_sequence,
+ .tp_flags = Py_TPFLAGS_DEFAULT,
+ .tp_richcompare = simd__vector_compare,
+ .tp_getset = simd__vector_getset
+};
+
+/************************************
+ ** Protected Definitions
+ ************************************/
+static PySIMDVectorObject *
+PySIMDVector_FromData(simd_data data, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_vector && info->nlanes > 0);
+
+ PySIMDVectorObject *vec = PyObject_New(PySIMDVectorObject, &PySIMDVectorType);
+ if (vec == NULL) {
+ return (PySIMDVectorObject*)PyErr_NoMemory();
+ }
+ vec->dtype = dtype;
+ if (info->is_bool) {
+ // boolean vectors are internally treated as unsigned
+ // vectors to add compatibility among all SIMD extensions
+ switch(dtype) {
+ case simd_data_vb8:
+ data.vu8 = npyv_cvt_u8_b8(data.vb8);
+ break;
+ case simd_data_vb16:
+ data.vu16 = npyv_cvt_u16_b16(data.vb16);
+ break;
+ case simd_data_vb32:
+ data.vu32 = npyv_cvt_u32_b32(data.vb32);
+ break;
+ default:
+ data.vu64 = npyv_cvt_u64_b64(data.vb64);
+ }
+ }
+ npyv_store_u8(vec->data, data.vu8);
+ return vec;
+}
+
+static simd_data
+PySIMDVector_AsData(PySIMDVectorObject *vec, simd_data_type dtype)
+{
+ const simd_data_info *info = simd_data_getinfo(dtype);
+ assert(info->is_vector && info->nlanes > 0);
+
+ simd_data data = {.u64 = 0};
+ if (!PyObject_IsInstance(
+ (PyObject *)vec, (PyObject *)&PySIMDVectorType
+ )) {
+ PyErr_Format(PyExc_TypeError,
+ "a vector type %s is required", info->pyname
+ );
+ return data;
+ }
+ if (vec->dtype != dtype) {
+ PyErr_Format(PyExc_TypeError,
+ "a vector type %s is required, got(%s)",
+ info->pyname, simd_data_getinfo(vec->dtype)->pyname
+ );
+ return data;
+ }
+
+ data.vu8 = npyv_load_u8(vec->data);
+ if (info->is_bool) {
+ // boolean vectors are internally treated as unsigned
+ // vectors to add compatibility among all SIMD extensions
+ switch(dtype) {
+ case simd_data_vb8:
+ data.vb8 = npyv_cvt_b8_u8(data.vu8);
+ break;
+ case simd_data_vb16:
+ data.vb16 = npyv_cvt_b16_u16(data.vu16);
+ break;
+ case simd_data_vb32:
+ data.vb32 = npyv_cvt_b32_u32(data.vu32);
+ break;
+ default:
+ data.vb64 = npyv_cvt_b64_u64(data.vu64);
+ }
+ }
+ return data;
+}
+
+static int
+PySIMDVectorType_Init(PyObject *module)
+{
+ Py_INCREF(&PySIMDVectorType);
+ if (PyType_Ready(&PySIMDVectorType)) {
+ return -1;
+ }
+ if (PyModule_AddObject(
+ module, "vector_type",(PyObject *)&PySIMDVectorType
+ )) {
+ return -1;
+ }
+ return 0;
+}
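
As a usage note, a sketch (assuming NPY_SIMD is enabled and PySIMDVectorType_Init has run) of how PySIMDVector_FromData and PySIMDVector_AsData round-trip a vector through a Python object:

    simd_data in = {.vf32 = npyv_setall_f32(1.0f)};
    PySIMDVectorObject *vec = PySIMDVector_FromData(in, simd_data_vf32);
    if (vec != NULL) {
        simd_data out = PySIMDVector_AsData(vec, simd_data_vf32);
        if (!PyErr_Occurred()) {
            // out.vf32 now holds the same lanes that were stored into vec->data
        }
        Py_DECREF(vec);
    }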
diff --git a/numpy/core/src/common/array_assign.c b/numpy/core/src/common/array_assign.c
index d626d1260..c55f6bdb4 100644
--- a/numpy/core/src/common/array_assign.c
+++ b/numpy/core/src/common/array_assign.c
@@ -14,7 +14,6 @@
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
#include <numpy/ndarraytypes.h>
-
#include "npy_config.h"
#include "npy_pycompat.h"
@@ -65,19 +64,22 @@ broadcast_strides(int ndim, npy_intp const *shape,
return 0;
broadcast_error: {
- PyObject *errmsg;
-
- errmsg = PyUString_FromFormat("could not broadcast %s from shape ",
- strides_name);
- PyUString_ConcatAndDel(&errmsg,
- build_shape_string(strides_ndim, strides_shape));
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromString(" into shape "));
- PyUString_ConcatAndDel(&errmsg,
- build_shape_string(ndim, shape));
- PyErr_SetObject(PyExc_ValueError, errmsg);
- Py_DECREF(errmsg);
+ PyObject *shape1 = convert_shape_to_string(strides_ndim,
+ strides_shape, "");
+ if (shape1 == NULL) {
+ return -1;
+ }
+ PyObject *shape2 = convert_shape_to_string(ndim, shape, "");
+ if (shape2 == NULL) {
+ Py_DECREF(shape1);
+ return -1;
+ }
+ PyErr_Format(PyExc_ValueError,
+ "could not broadcast %s from shape %S into shape %S",
+ strides_name, shape1, shape2);
+ Py_DECREF(shape1);
+ Py_DECREF(shape2);
return -1;
}
}
diff --git a/numpy/core/src/common/lowlevel_strided_loops.h b/numpy/core/src/common/lowlevel_strided_loops.h
index f2f12a55b..12aa61822 100644
--- a/numpy/core/src/common/lowlevel_strided_loops.h
+++ b/numpy/core/src/common/lowlevel_strided_loops.h
@@ -30,10 +30,9 @@
* Use NPY_AUXDATA_CLONE and NPY_AUXDATA_FREE to deal with this data.
*
*/
-typedef void (PyArray_StridedUnaryOp)(char *dst, npy_intp dst_stride,
- char *src, npy_intp src_stride,
- npy_intp N, npy_intp src_itemsize,
- NpyAuxData *transferdata);
+typedef int (PyArray_StridedUnaryOp)(
+ char *dst, npy_intp dst_stride, char *src, npy_intp src_stride,
+ npy_intp N, npy_intp src_itemsize, NpyAuxData *transferdata);
/*
* This is for pointers to functions which behave exactly as
@@ -43,31 +42,10 @@ typedef void (PyArray_StridedUnaryOp)(char *dst, npy_intp dst_stride,
* In particular, the 'i'-th element is operated on if and only if
* mask[i*mask_stride] is true.
*/
-typedef void (PyArray_MaskedStridedUnaryOp)(char *dst, npy_intp dst_stride,
- char *src, npy_intp src_stride,
- npy_bool *mask, npy_intp mask_stride,
- npy_intp N, npy_intp src_itemsize,
- NpyAuxData *transferdata);
-
-/*
- * This function pointer is for binary operations that input two
- * arbitrarily strided one-dimensional array segments and output
- * an arbitrarily strided array segment of the same size.
- * It may be a fully general function, or a specialized function
- * when the strides or item size have particular known values.
- *
- * Examples of binary operations are the basic arithmetic operations,
- * logical operators AND, OR, and many others.
- *
- * The 'transferdata' parameter is slightly special, following a
- * generic auxiliary data pattern defined in ndarraytypes.h
- * Use NPY_AUXDATA_CLONE and NPY_AUXDATA_FREE to deal with this data.
- *
- */
-typedef void (PyArray_StridedBinaryOp)(char *dst, npy_intp dst_stride,
- char *src0, npy_intp src0_stride,
- char *src1, npy_intp src1_stride,
- npy_intp N, NpyAuxData *transferdata);
+typedef int (PyArray_MaskedStridedUnaryOp)(
+ char *dst, npy_intp dst_stride, char *src, npy_intp src_stride,
+ npy_bool *mask, npy_intp mask_stride,
+ npy_intp N, npy_intp src_itemsize, NpyAuxData *transferdata);
/*
* Gives back a function pointer to a specialized function for copying
@@ -271,6 +249,7 @@ PyArray_CastRawArrays(npy_intp count,
* The return value is the number of elements it couldn't copy. A return value
* of 0 means all elements were copied, a larger value means the end of
* the n-dimensional array was reached before 'count' elements were copied.
+ * A negative return value indicates an error occurred.
*
* ndim:
* The number of dimensions of the n-dimensional array.
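
For illustration, a transfer function matching the new int-returning PyArray_StridedUnaryOp signature could look like the following sketch (a hypothetical helper that assumes fixed 4-byte items and ignores `transferdata`); returning -1 with a Python exception set is what the new negative return value propagates to callers:

    static int
    example_strided_copy4(char *dst, npy_intp dst_stride,
                          char *src, npy_intp src_stride,
                          npy_intp N, npy_intp src_itemsize,
                          NpyAuxData *transferdata)
    {
        assert(src_itemsize == 4);
        for (npy_intp i = 0; i < N; ++i) {
            memcpy(dst, src, 4);   /* copy one item, then advance by the strides */
            dst += dst_stride;
            src += src_stride;
        }
        return 0;  /* 0 on success; -1 (with an exception set) on error */
    }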
diff --git a/numpy/core/src/common/npy_binsearch.h.src b/numpy/core/src/common/npy_binsearch.h.src
index ce3b34b0e..052c44482 100644
--- a/numpy/core/src/common/npy_binsearch.h.src
+++ b/numpy/core/src/common/npy_binsearch.h.src
@@ -40,12 +40,12 @@ typedef struct {
* cfloat, cdouble, clongdouble, datetime, timedelta#
*/
-NPY_VISIBILITY_HIDDEN void
+NPY_NO_EXPORT void
binsearch_@side@_@suff@(const char *arr, const char *key, char *ret,
npy_intp arr_len, npy_intp key_len,
npy_intp arr_str, npy_intp key_str, npy_intp ret_str,
PyArrayObject *unused);
-NPY_VISIBILITY_HIDDEN int
+NPY_NO_EXPORT int
argbinsearch_@side@_@suff@(const char *arr, const char *key,
const char *sort, char *ret,
npy_intp arr_len, npy_intp key_len,
@@ -54,12 +54,12 @@ argbinsearch_@side@_@suff@(const char *arr, const char *key,
PyArrayObject *unused);
/**end repeat1**/
-NPY_VISIBILITY_HIDDEN void
+NPY_NO_EXPORT void
npy_binsearch_@side@(const char *arr, const char *key, char *ret,
npy_intp arr_len, npy_intp key_len,
npy_intp arr_str, npy_intp key_str,
npy_intp ret_str, PyArrayObject *cmp);
-NPY_VISIBILITY_HIDDEN int
+NPY_NO_EXPORT int
npy_argbinsearch_@side@(const char *arr, const char *key,
const char *sort, char *ret,
npy_intp arr_len, npy_intp key_len,
diff --git a/numpy/core/src/common/npy_cblas.h b/numpy/core/src/common/npy_cblas.h
index 97308238a..072993ec2 100644
--- a/numpy/core/src/common/npy_cblas.h
+++ b/numpy/core/src/common/npy_cblas.h
@@ -47,8 +47,10 @@ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
#ifdef HAVE_BLAS_ILP64
#define CBLAS_INT npy_int64
+#define CBLAS_INT_MAX NPY_MAX_INT64
#else
#define CBLAS_INT int
+#define CBLAS_INT_MAX INT_MAX
#endif
#define BLASNAME(name) CBLAS_FUNC(name)
@@ -59,6 +61,39 @@ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
#undef BLASINT
#undef BLASNAME
+
+/*
+ * Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done
+ * (BLAS won't handle negative or zero strides the way we want).
+ */
+static NPY_INLINE CBLAS_INT
+blas_stride(npy_intp stride, unsigned itemsize)
+{
+ /*
+ * Should probably check pointer alignment also, but this may cause
+ * problems if we require complex to be 16 byte aligned.
+ */
+ if (stride > 0 && (stride % itemsize) == 0) {
+ stride /= itemsize;
+ if (stride <= CBLAS_INT_MAX) {
+ return stride;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Define a chunksize for CBLAS.
+ *
+ * The chunksize is the greatest power of two less than CBLAS_INT_MAX.
+ */
+#if NPY_MAX_INTP > CBLAS_INT_MAX
+# define NPY_CBLAS_CHUNK (CBLAS_INT_MAX / 2 + 1)
+#else
+# define NPY_CBLAS_CHUNK NPY_MAX_INTP
+#endif
+
+
#ifdef __cplusplus
}
#endif
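
A sketch of how the two additions are meant to be used together (the names `byte_stride`, `n`, `x` and the commented cblas_dscal call are only illustrative): convert the NumPy stride once, then process the data in NPY_CBLAS_CHUNK-sized pieces so each BLAS call stays within CBLAS_INT range.

    CBLAS_INT incx = blas_stride(byte_stride, sizeof(double));
    if (incx == 0) {
        /* stride not representable for BLAS: fall back to a strided C loop */
    }
    else {
        for (npy_intp done = 0; done < n; done += NPY_CBLAS_CHUNK) {
            CBLAS_INT chunk = (CBLAS_INT)((n - done) < NPY_CBLAS_CHUNK
                                          ? (n - done) : NPY_CBLAS_CHUNK);
            /* e.g. CBLAS_FUNC(cblas_dscal)(chunk, 2.0, x + done * incx, incx); */
        }
    }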
diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h
index 27328aa73..61cc3c7f1 100644
--- a/numpy/core/src/common/npy_config.h
+++ b/numpy/core/src/common/npy_config.h
@@ -19,6 +19,15 @@
#endif
+/* Disable broken functions on z/OS */
+#if defined (__MVS__)
+
+#undef HAVE_POWF
+#undef HAVE_EXPF
+#undef HAVE___THREAD
+
+#endif
+
/* Disable broken MS math functions */
#if (defined(_MSC_VER) && (_MSC_VER < 1900)) || defined(__MINGW32_VERSION)
diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h
index 846d1ebb9..a0f82fa3d 100644
--- a/numpy/core/src/common/npy_cpu_dispatch.h
+++ b/numpy/core/src/common/npy_cpu_dispatch.h
@@ -17,7 +17,7 @@
 * NumPy module's attributes `__cpu_baseline__` and `__cpu_dispatch__`.
*/
/**
- * Note: Always gaurd the genreated headers within 'NPY_DISABLE_OPTIMIZATION',
+ * Note: Always guard the generated headers within 'NPY_DISABLE_OPTIMIZATION',
 * due to the nature of the command argument '--disable-optimization',
* which is explicitly disabling the module ccompiler_opt.
*/
@@ -29,7 +29,7 @@
* It's better anyway to take them off and use built-in types(__vector, __pixel, __bool) instead,
* since c99 supports bool variables which may lead to ambiguous errors.
*/
- // backup 'bool' before including '_cpu_dispatch.h', since it may not defiend as a compiler token.
+ // backup 'bool' before including '_cpu_dispatch.h', since it may not be defined as a compiler token.
#define NPY__DISPATCH_DEFBOOL
typedef bool npy__dispatch_bkbool;
#endif
@@ -134,10 +134,10 @@
* NPY_CPU_DISPATCH_DECLARE(void dispatch_me, (const int*, int*))
* NPY_CPU_DISPATCH_DECLARE(extern cb_type callback_tab, [TAB_SIZE])
*
- * By assuming the provided config header drived from a dispatch-able source,
+ * By assuming the provided config header is derived from a dispatch-able source,
* that configured with "@targets baseline sse41 vsx3 asimdhp",
* they supported by the compiler and enabled via '--cpu-dspatch',
- * then the prototype declrations at the above example will equlivent to the follows:
+ * then the prototype declarations in the above example will be equivalent to the following:
*
* - x86:
* void dispatch_me(const int*, int*); // baseline
@@ -179,7 +179,7 @@
/**
* Macro NPY_CPU_DISPATCH_DECLARE_XB(LEFT, ...)
*
- * Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declration even
+ * Same as `NPY_CPU_DISPATCH_DECLARE` but excludes the baseline declaration even
* if it was provided within the configration statments.
*/
#define NPY_CPU_DISPATCH_DECLARE_XB(...) \
@@ -206,7 +206,7 @@
* In order to call or to assign the pointer of it from outside the dispatch-able source,
* you have to use this Macro as follows:
*
- * // bring the genreated config header of the dispatch-abel source
+ * // bring the generated config header of the dispatch-able source
* #ifndef NPY_DISABLE_OPTIMIZATION
* #include "dispatchable_source_name.dispatch.h"
* #endif
@@ -217,44 +217,49 @@
* func_type the_callee(const int *src, int *dst, func_type *cb)
* {
* // direct call
- * NPY_CPU_DISPATCH_CALL(dispatch_me, (src, dst))
+ * NPY_CPU_DISPATCH_CALL(dispatch_me, (src, dst));
* // assign the pointer
- * NPY_CPU_DISPATCH_CALL(*cb = dispatch_me)
+ * *cb = NPY_CPU_DISPATCH_CALL(dispatch_me);
+ * // or
+ * NPY_CPU_DISPATCH_CALL(*cb = dispatch_me);
* // return the pointer
- * NPY_CPU_DISPATCH_CALL(return dispatch_me)
+ * return NPY_CPU_DISPATCH_CALL(dispatch_me);
* }
*/
#define NPY_CPU_DISPATCH_CALL(...) \
- if (0) {/*DUMMY*/} \
NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_CB_, __VA_ARGS__) \
NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_CALL_BASE_CB_, __VA_ARGS__)
// Preprocessor callbacks
#define NPY_CPU_DISPATCH_CALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
- else if (TESTED_FEATURES) { NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__; }
+ (TESTED_FEATURES) ? (NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) :
#define NPY_CPU_DISPATCH_CALL_BASE_CB_(LEFT, ...) \
- else { LEFT __VA_ARGS__; }
+ (LEFT __VA_ARGS__)
/**
* Macro NPY_CPU_DISPATCH_CALL_XB(LEFT, ...)
*
- * Same as `NPY_CPU_DISPATCH_DECLARE` but exclude the baseline declration even
- * if it was provided within the configration statments.
+ * Same as `NPY_CPU_DISPATCH_CALL` but excludes the baseline call even
+ * if it was provided within the configuration statements.
+ * Returns void.
*/
+#define NPY_CPU_DISPATCH_CALL_XB_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
+ (TESTED_FEATURES) ? (void) (NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) :
#define NPY_CPU_DISPATCH_CALL_XB(...) \
- if (0) {/*DUMMY*/} \
- NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_CB_, __VA_ARGS__)
+ NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_XB_CB_, __VA_ARGS__) \
+ ((void) 0 /* discarded expression value */)
/**
* Macro NPY_CPU_DISPATCH_CALL_ALL(LEFT, ...)
*
* Same as `NPY_CPU_DISPATCH_CALL` but dispatching all the required optimizations for
* the exported functions and variables instead of highest interested one.
+ * Returns void.
*/
#define NPY_CPU_DISPATCH_CALL_ALL(...) \
- NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_ALL_CB_, __VA_ARGS__) \
- NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_, __VA_ARGS__)
+ (NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, NPY_CPU_DISPATCH_CALL_ALL_CB_, __VA_ARGS__) \
+ NPY__CPU_DISPATCH_BASELINE_CALL(NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_, __VA_ARGS__))
// Preprocessor callbacks
#define NPY_CPU_DISPATCH_CALL_ALL_CB_(TESTED_FEATURES, TARGET_NAME, LEFT, ...) \
- if (TESTED_FEATURES) { NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__; }
+ ((TESTED_FEATURES) ? (NPY_CAT(NPY_CAT(LEFT, _), TARGET_NAME) __VA_ARGS__) : (void) 0),
#define NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_(LEFT, ...) \
- { LEFT __VA_ARGS__; }
+ ( LEFT __VA_ARGS__ )
#endif // NPY_CPU_DISPATCH_H_
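
To make the change concrete, here is a sketch of how the former statement-style call now expands as a single expression (the target names AVX512_SKX and AVX2 only illustrate one possible build configuration):

    /* int r = NPY_CPU_DISPATCH_CALL(dispatch_me, (src, dst)); becomes, conceptually: */
    int r = (NPY_CPU_HAVE(AVX512_SKX)) ? dispatch_me_AVX512_SKX(src, dst) :
            (NPY_CPU_HAVE(AVX2))       ? dispatch_me_AVX2(src, dst)       :
                                         dispatch_me(src, dst);  /* baseline */

Because the whole chain is an expression rather than an if/else statement, the macro can now appear on the right-hand side of an assignment or after `return`, as the examples above show.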
diff --git a/numpy/core/src/common/npy_cpu_features.c.src b/numpy/core/src/common/npy_cpu_features.c.src
index dfcf98c74..69bbc83a2 100644
--- a/numpy/core/src/common/npy_cpu_features.c.src
+++ b/numpy/core/src/common/npy_cpu_features.c.src
@@ -19,11 +19,11 @@ npy__cpu_init_features(void);
* Multiple features can be present, and separated by space, comma, or tab.
* Raises an error if parsing fails or if the feature was not enabled
*/
-static void
+static int
npy__cpu_try_disable_env(void);
/* Ensure the build's CPU baseline features are supported at runtime */
-static void
+static int
npy__cpu_validate_baseline(void);
/******************** Public Definitions *********************/
@@ -40,11 +40,12 @@ NPY_VISIBILITY_HIDDEN int
npy_cpu_init(void)
{
npy__cpu_init_features();
- npy__cpu_validate_baseline();
- npy__cpu_try_disable_env();
-
- if (PyErr_Occurred())
+ if (npy__cpu_validate_baseline() < 0) {
+ return -1;
+ }
+ if (npy__cpu_try_disable_env() < 0) {
return -1;
+ }
return 0;
}
@@ -142,7 +143,7 @@ npy__cpu_dispatch_fid(const char *feature)
return 0;
}
-static void
+static int
npy__cpu_validate_baseline(void)
{
#if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_BASELINE_N > 0
@@ -165,16 +166,18 @@ npy__cpu_validate_baseline(void)
"(" NPY_WITH_CPU_BASELINE ") but your machine doesn't support:\n(%s).",
baseline_failure
);
+ return -1;
}
#endif
+ return 0;
}
-static void
+static int
npy__cpu_try_disable_env(void)
{
char *disenv = getenv("NPY_DISABLE_CPU_FEATURES");
if (disenv == NULL || disenv[0] == 0) {
- return;
+ return 0;
}
#define NPY__CPU_ENV_ERR_HEAD \
"During parsing environment variable 'NPY_DISABLE_CPU_FEATURES':\n"
@@ -187,7 +190,7 @@ npy__cpu_try_disable_env(void)
"Length of environment variable 'NPY_DISABLE_CPU_FEATURES' is %d, only %d accepted",
var_len, NPY__MAX_VAR_LEN - 1
);
- return;
+ return -1;
}
char disable_features[NPY__MAX_VAR_LEN];
memcpy(disable_features, disenv, var_len);
@@ -210,7 +213,7 @@ npy__cpu_try_disable_env(void)
"(" NPY_WITH_CPU_BASELINE ").",
feature
);
- break;
+ return -1;
}
// check if the feature is part of dispatched features
int feature_id = npy__cpu_dispatch_fid(feature);
@@ -236,36 +239,43 @@ npy__cpu_try_disable_env(void)
*nexist_cur = '\0';
if (nexist[0] != '\0') {
*(nexist_cur-1) = '\0'; // trim the last space
- PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
- NPY__CPU_ENV_ERR_HEAD
- "You cannot disable CPU features (%s), since "
- "they are not part of the dispatched optimizations\n"
- "(" NPY_WITH_CPU_DISPATCH ").",
- nexist
- );
+ if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
+ NPY__CPU_ENV_ERR_HEAD
+ "You cannot disable CPU features (%s), since "
+ "they are not part of the dispatched optimizations\n"
+ "(" NPY_WITH_CPU_DISPATCH ").",
+ nexist
+ ) < 0) {
+ return -1;
+ }
}
*notsupp_cur = '\0';
if (notsupp[0] != '\0') {
*(notsupp_cur-1) = '\0'; // trim the last space
- PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
- NPY__CPU_ENV_ERR_HEAD
- "You cannot disable CPU features (%s), since "
- "they are not supported by your machine.",
- notsupp
- );
+ if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
+ NPY__CPU_ENV_ERR_HEAD
+ "You cannot disable CPU features (%s), since "
+ "they are not supported by your machine.",
+ notsupp
+ ) < 0) {
+ return -1;
+ }
}
#else
- PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
- NPY__CPU_ENV_ERR_HEAD
- "You cannot use environment variable 'NPY_DISABLE_CPU_FEATURES', since "
- #ifdef NPY_DISABLE_OPTIMIZATION
- "the NumPy library was compiled with optimization disabled."
- #else
- "the NumPy library was compiled without any dispatched optimizations."
- #endif
- );
+ if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
+ NPY__CPU_ENV_ERR_HEAD
+ "You cannot use environment variable 'NPY_DISABLE_CPU_FEATURES', since "
+ #ifdef NPY_DISABLE_OPTIMIZATION
+ "the NumPy library was compiled with optimization disabled."
+ #else
+ "the NumPy library was compiled without any dispatched optimizations."
+ #endif
+ ) < 0) {
+ return -1;
+ }
#endif
+ return 0;
}
/****************************************************************
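
On the caller side (e.g., during module initialization), a sketch of how these helpers are now expected to be checked, since failures surface through the return value instead of a later PyErr_Occurred() probe:

    if (npy_cpu_init() < 0) {
        return NULL;  /* validation or env-parsing error is already set as a Python exception */
    }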
diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h
index 693a9857d..28dd00032 100644
--- a/numpy/core/src/common/npy_cpu_features.h
+++ b/numpy/core/src/common/npy_cpu_features.h
@@ -1,8 +1,8 @@
#ifndef _NPY_CPU_FEATURES_H_
#define _NPY_CPU_FEATURES_H_
-#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN
#include <Python.h> // for PyObject
+#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN
#ifdef __cplusplus
extern "C" {
diff --git a/numpy/core/src/common/npy_partition.h.src b/numpy/core/src/common/npy_partition.h.src
index 97dc2536b..72c2095f1 100644
--- a/numpy/core/src/common/npy_partition.h.src
+++ b/numpy/core/src/common/npy_partition.h.src
@@ -42,12 +42,12 @@
* npy_cdouble, npy_clongdouble#
*/
-NPY_VISIBILITY_HIDDEN int introselect_@suff@(@type@ *v, npy_intp num,
+NPY_NO_EXPORT int introselect_@suff@(@type@ *v, npy_intp num,
npy_intp kth,
npy_intp * pivots,
npy_intp * npiv,
void *NOT_USED);
-NPY_VISIBILITY_HIDDEN int aintroselect_@suff@(@type@ *v, npy_intp* tosort, npy_intp num,
+NPY_NO_EXPORT int aintroselect_@suff@(@type@ *v, npy_intp* tosort, npy_intp num,
npy_intp kth,
npy_intp * pivots,
npy_intp * npiv,
diff --git a/numpy/core/src/common/npy_sort.h.src b/numpy/core/src/common/npy_sort.h.src
index 16a105499..ddbde0c9b 100644
--- a/numpy/core/src/common/npy_sort.h.src
+++ b/numpy/core/src/common/npy_sort.h.src
@@ -33,14 +33,14 @@ static NPY_INLINE int npy_get_msb(npy_uintp unum)
* cfloat, cdouble, clongdouble, datetime, timedelta#
*/
-int quicksort_@suff@(void *vec, npy_intp cnt, void *null);
-int heapsort_@suff@(void *vec, npy_intp cnt, void *null);
-int mergesort_@suff@(void *vec, npy_intp cnt, void *null);
-int timsort_@suff@(void *vec, npy_intp cnt, void *null);
-int aquicksort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
-int aheapsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
-int amergesort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
-int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int quicksort_@suff@(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_@suff@(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_@suff@(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_@suff@(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
/**end repeat**/
@@ -50,8 +50,8 @@ int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
* longlong, ulonglong#
*/
-int radixsort_@suff@(void *vec, npy_intp cnt, void *null);
-int aradixsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int radixsort_@suff@(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
/**end repeat**/
@@ -69,14 +69,14 @@ int aradixsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
* #suff = string, unicode#
*/
-int quicksort_@suff@(void *vec, npy_intp cnt, void *arr);
-int heapsort_@suff@(void *vec, npy_intp cnt, void *arr);
-int mergesort_@suff@(void *vec, npy_intp cnt, void *arr);
-int timsort_@suff@(void *vec, npy_intp cnt, void *arr);
-int aquicksort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
-int aheapsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
-int amergesort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
-int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int quicksort_@suff@(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int heapsort_@suff@(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int mergesort_@suff@(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int timsort_@suff@(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int aquicksort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int aheapsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int amergesort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
/**end repeat**/
@@ -88,13 +88,13 @@ int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
*/
-int npy_quicksort(void *vec, npy_intp cnt, void *arr);
-int npy_heapsort(void *vec, npy_intp cnt, void *arr);
-int npy_mergesort(void *vec, npy_intp cnt, void *arr);
-int npy_timsort(void *vec, npy_intp cnt, void *arr);
-int npy_aquicksort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
-int npy_aheapsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
-int npy_amergesort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
-int npy_atimsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_quicksort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_heapsort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_mergesort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_timsort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_aquicksort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_aheapsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_amergesort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_atimsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
#endif
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
index 9d8b4ab5e..3a6dc9535 100644
--- a/numpy/core/src/common/simd/avx2/arithmetic.h
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -72,4 +72,71 @@
#define npyv_div_f32 _mm256_div_ps
#define npyv_div_f64 _mm256_div_pd
+/***************************
+ * FUSED
+ ***************************/
+#ifdef NPY_HAVE_FMA3
+ // multiply and add, a*b + c
+ #define npyv_muladd_f32 _mm256_fmadd_ps
+ #define npyv_muladd_f64 _mm256_fmadd_pd
+ // multiply and subtract, a*b - c
+ #define npyv_mulsub_f32 _mm256_fmsub_ps
+ #define npyv_mulsub_f64 _mm256_fmsub_pd
+ // negate multiply and add, -(a*b) + c
+ #define npyv_nmuladd_f32 _mm256_fnmadd_ps
+ #define npyv_nmuladd_f64 _mm256_fnmadd_pd
+ // negate multiply and subtract, -(a*b) - c
+ #define npyv_nmulsub_f32 _mm256_fnmsub_ps
+ #define npyv_nmulsub_f64 _mm256_fnmsub_pd
+#else
+ // multiply and add, a*b + c
+ NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_add_f32(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_add_f64(npyv_mul_f64(a, b), c); }
+ // multiply and subtract, a*b - c
+ NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_sub_f32(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_sub_f64(npyv_mul_f64(a, b), c); }
+ // negate multiply and add, -(a*b) + c
+ NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_sub_f32(c, npyv_mul_f32(a, b)); }
+ NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_sub_f64(c, npyv_mul_f64(a, b)); }
+ // negate multiply and subtract, -(a*b) - c
+ NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ {
+ npyv_f32 neg_a = npyv_xor_f32(a, npyv_setall_f32(-0.0f));
+ return npyv_sub_f32(npyv_mul_f32(neg_a, b), c);
+ }
+ NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0));
+ return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
+ }
+#endif // !NPY_HAVE_FMA3
+
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(__m256 a)
+{
+ __m256 sum_halves = _mm256_hadd_ps(a, a);
+ sum_halves = _mm256_hadd_ps(sum_halves, sum_halves);
+ __m128 lo = _mm256_castps256_ps128(sum_halves);
+ __m128 hi = _mm256_extractf128_ps(sum_halves, 1);
+ __m128 sum = _mm_add_ps(lo, hi);
+ return _mm_cvtss_f32(sum);
+}
+
+NPY_FINLINE double npyv_sum_f64(__m256d a)
+{
+ __m256d sum_halves = _mm256_hadd_pd(a, a);
+ __m128d lo = _mm256_castpd256_pd128(sum_halves);
+ __m128d hi = _mm256_extractf128_pd(sum_halves, 1);
+ __m128d sum = _mm_add_pd(lo, hi);
+ return _mm_cvtsd_f64(sum);
+}
+
#endif // _NPY_SIMD_AVX2_ARITHMETIC_H
+
+
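
A minimal sketch of how the new fused-multiply and horizontal-sum helpers compose, assuming an AVX2 build and `n` being a multiple of npyv_nlanes_f32:

    static float
    example_dot_f32(const float *a, const float *b, npy_intp n)
    {
        npyv_f32 acc = npyv_zero_f32();
        for (npy_intp i = 0; i < n; i += npyv_nlanes_f32) {
            // acc += a[i:i+nlanes] * b[i:i+nlanes]
            acc = npyv_muladd_f32(npyv_load_f32(a + i), npyv_load_f32(b + i), acc);
        }
        return npyv_sum_f32(acc);
    }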
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
index c99d628ee..6f0d3c0d9 100644
--- a/numpy/core/src/common/simd/avx2/avx2.h
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -5,6 +5,8 @@
#define NPY_SIMD 256
#define NPY_SIMD_WIDTH 32
#define NPY_SIMD_F64 1
+// A stride limit small enough to allow the safe use of _mm256_i32gather_*
+#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8)
typedef __m256i npyv_u8;
typedef __m256i npyv_s8;
@@ -65,3 +67,4 @@ typedef struct { __m256d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h
index 9fd86016d..f72678b54 100644
--- a/numpy/core/src/common/simd/avx2/conversion.h
+++ b/numpy/core/src/common/simd/avx2/conversion.h
@@ -14,8 +14,8 @@
#define npyv_cvt_s32_b32(A) A
#define npyv_cvt_u64_b64(A) A
#define npyv_cvt_s64_b64(A) A
-#define npyv_cvt_f32_b32(A) _mm256_castsi256_ps(A)
-#define npyv_cvt_f64_b64(A) _mm256_castsi256_pd(A)
+#define npyv_cvt_f32_b32 _mm256_castsi256_ps
+#define npyv_cvt_f64_b64 _mm256_castsi256_pd
// convert integer types to mask types
#define npyv_cvt_b8_u8(BL) BL
@@ -26,7 +26,21 @@
#define npyv_cvt_b32_s32(BL) BL
#define npyv_cvt_b64_u64(BL) BL
#define npyv_cvt_b64_s64(BL) BL
-#define npyv_cvt_b32_f32(BL) _mm256_castps_si256(BL)
-#define npyv_cvt_b64_f64(BL) _mm256_castpd_si256(BL)
+#define npyv_cvt_b32_f32 _mm256_castps_si256
+#define npyv_cvt_b64_f64 _mm256_castpd_si256
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{ return (npy_uint32)_mm256_movemask_epi8(a); }
+
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ __m128i pack = _mm_packs_epi16(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1));
+ return (npy_uint16)_mm_movemask_epi8(pack);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint8)_mm256_movemask_ps(_mm256_castsi256_ps(a)); }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)_mm256_movemask_pd(_mm256_castsi256_pd(a)); }
#endif // _NPY_SIMD_AVX2_CVT_H
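
A short sketch of what the new bitfield conversion enables (assuming AVX2, so a b32 vector holds 8 lanes; `ptr` is an assumed float buffer and npyv_cmpeq_f32 is the universal-intrinsics comparison):

    npyv_f32 v      = npyv_load_f32(ptr);
    npyv_b32 zero   = npyv_cmpeq_f32(v, npyv_zero_f32());
    npy_uint64 bits = npyv_tobits_b32(zero);
    // bit i of 'bits' is set when lane i of 'v' equals 0.0f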
diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h
new file mode 100644
index 000000000..b3eba6f5f
--- /dev/null
+++ b/numpy/core/src/common/simd/avx2/math.h
@@ -0,0 +1,40 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX2_MATH_H
+#define _NPY_SIMD_AVX2_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm256_sqrt_ps
+#define npyv_sqrt_f64 _mm256_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm256_div_ps(_mm256_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm256_div_pd(_mm256_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+ return _mm256_and_ps(
+ a, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))
+ );
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+ return _mm256_and_pd(
+ a, _mm256_castsi256_pd(npyv_setall_s64(0x7fffffffffffffffLL))
+ );
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm256_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm256_mul_pd(a, a); }
+
+#endif
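
As a one-line usage note (a sketch that also relies on the npyv_sum_f32 helper added to arithmetic.h in this patch; `ptr` is an assumed float buffer):

    float sum_abs = npyv_sum_f32(npyv_abs_f32(npyv_load_f32(ptr)));  // sum of |x| over one vector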
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h
index 5ea7414fd..e27bf15fe 100644
--- a/numpy/core/src/common/simd/avx2/memory.h
+++ b/numpy/core/src/common/simd/avx2/memory.h
@@ -2,6 +2,8 @@
#error "Not a standalone header"
#endif
+#include "misc.h"
+
#ifndef _NPY_SIMD_AVX2_MEMORY_H
#define _NPY_SIMD_AVX2_MEMORY_H
@@ -66,5 +68,289 @@ NPYV_IMPL_AVX2_MEM_INT(npy_int64, s64)
// store higher part
#define npyv_storeh_f32(PTR, VEC) _mm_storeu_ps(PTR, _mm256_extractf128_ps(VEC, 1))
#define npyv_storeh_f64(PTR, VEC) _mm_storeu_pd(PTR, _mm256_extractf128_pd(VEC, 1))
+/***************************
+ * Non-contiguous Load
+ ***************************/
+//// 32
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ const __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32((int)stride), steps);
+ return _mm256_i32gather_epi32((const int*)ptr, idx, 4);
+}
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{ return npyv_loadn_u32((const npy_uint32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{ return _mm256_castsi256_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); }
+//// 64
+#if 0 // slower
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{
+ const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
+ return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
+}
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{ return _mm256_castsi256_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
+#endif
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{
+ __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr));
+ __m128d a2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2)));
+ __m128d a01 = _mm_loadh_pd(a0, ptr + stride);
+ __m128d a23 = _mm_loadh_pd(a2, ptr + stride*3);
+ return _mm256_insertf128_pd(_mm256_castpd128_pd256(a01), a23, 1);
+}
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); }
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{
+ __m128i a0 = _mm256_castsi256_si128(a);
+ __m128i a1 = _mm256_extracti128_si256(a, 1);
+ ptr[stride * 0] = _mm_cvtsi128_si32(a0);
+ ptr[stride * 1] = _mm_extract_epi32(a0, 1);
+ ptr[stride * 2] = _mm_extract_epi32(a0, 2);
+ ptr[stride * 3] = _mm_extract_epi32(a0, 3);
+ ptr[stride * 4] = _mm_cvtsi128_si32(a1);
+ ptr[stride * 5] = _mm_extract_epi32(a1, 1);
+ ptr[stride * 6] = _mm_extract_epi32(a1, 2);
+ ptr[stride * 7] = _mm_extract_epi32(a1, 3);
+}
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, _mm256_castps_si256(a)); }
+//// 64
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{
+ __m128d a0 = _mm256_castpd256_pd128(a);
+ __m128d a1 = _mm256_extractf128_pd(a, 1);
+ _mm_storel_pd(ptr + stride * 0, a0);
+ _mm_storeh_pd(ptr + stride * 1, a0);
+ _mm_storel_pd(ptr + stride * 2, a1);
+ _mm_storeh_pd(ptr + stride * 3, a1);
+}
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); }
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); }
+
+/*********************************
+ * Partial Load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ const __m256i vfill = _mm256_set1_epi32(fill);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
+ __m256i payload = _mm256_maskload_epi32((const int*)ptr, mask);
+ return _mm256_blendv_epi8(vfill, payload, mask);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
+ return _mm256_maskload_epi32((const int*)ptr, mask);
+}
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ const __m256i vfill = _mm256_set1_epi64x(fill);
+ const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
+ __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
+ return _mm256_blendv_epi8(vfill, payload, mask);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
+ __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ return _mm256_maskload_epi64((const void*)ptr, mask);
+}
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32);
+ const __m256i vfill = _mm256_set1_epi32(fill);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ const __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32((int)stride), steps);
+ __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
+ return _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ const __m256i vfill = _mm256_set1_epi64x(fill);
+ const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
+ const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
+ __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s64
+npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
+ _mm256_maskstore_epi32((int*)ptr, mask, a);
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
+ __m256i vnlane = _mm256_set1_epi64x(nlane > 8 ? 8 : (int)nlane);
+ __m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
+ _mm256_maskstore_epi64((void*)ptr, mask, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ __m128i a0 = _mm256_castsi256_si128(a);
+ __m128i a1 = _mm256_extracti128_si256(a, 1);
+ switch(nlane) {
+ default:
+ ptr[stride*7] = _mm_extract_epi32(a1, 3);
+ case 7:
+ ptr[stride*6] = _mm_extract_epi32(a1, 2);
+ case 6:
+ ptr[stride*5] = _mm_extract_epi32(a1, 1);
+ case 5:
+ ptr[stride*4] = _mm_extract_epi32(a1, 0);
+ case 4:
+ ptr[stride*3] = _mm_extract_epi32(a0, 3);
+ case 3:
+ ptr[stride*2] = _mm_extract_epi32(a0, 2);
+ case 2:
+ ptr[stride*1] = _mm_extract_epi32(a0, 1);
+ case 1:
+ ptr[stride*0] = _mm_extract_epi32(a0, 0);
+ }
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a));
+ __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1);
+ double *dptr = (double*)ptr;
+ switch(nlane) {
+ default:
+ _mm_storeh_pd(dptr + stride * 3, a1);
+ case 3:
+ _mm_storel_pd(dptr + stride * 2, a1);
+ case 2:
+ _mm_storeh_pd(dptr + stride * 1, a0);
+ case 1:
+ _mm_storel_pd(dptr + stride * 0, a0);
+ }
+}
+
+/*****************************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via reinterpret cast
+ *****************************************************************************/
+#define NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u64, s64)
+NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f64, s64)
#endif // _NPY_SIMD_AVX2_MEMORY_H
diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h
index 5a9e68e32..4d6ec8f75 100644
--- a/numpy/core/src/common/simd/avx2/reorder.h
+++ b/numpy/core/src/common/simd/avx2/reorder.h
@@ -94,4 +94,36 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b)
return npyv_combine_f64(ab0, ab1);
}
+// Reverse elements of each 64-bit lane
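+// e.g. for 32-bit lanes: {a0, a1, a2, a3, a4, a5, a6, a7} -> {a1, a0, a3, a2, a5, a4, a7, a6}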
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return _mm256_shuffle_epi8(a, idx);
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+ const __m256i idx = _mm256_setr_epi8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return _mm256_shuffle_epi8(a, idx);
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ return _mm256_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+ return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
#endif // _NPY_SIMD_AVX2_REORDER_H
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
index fcaef0efd..7372ca29e 100644
--- a/numpy/core/src/common/simd/avx512/arithmetic.h
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -113,4 +113,63 @@ NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
#define npyv_div_f32 _mm512_div_ps
#define npyv_div_f64 _mm512_div_pd
+/***************************
+ * FUSED
+ ***************************/
+// multiply and add, a*b + c
+#define npyv_muladd_f32 _mm512_fmadd_ps
+#define npyv_muladd_f64 _mm512_fmadd_pd
+// multiply and subtract, a*b - c
+#define npyv_mulsub_f32 _mm512_fmsub_ps
+#define npyv_mulsub_f64 _mm512_fmsub_pd
+// negate multiply and add, -(a*b) + c
+#define npyv_nmuladd_f32 _mm512_fnmadd_ps
+#define npyv_nmuladd_f64 _mm512_fnmadd_pd
+// negate multiply and subtract, -(a*b) - c
+#define npyv_nmulsub_f32 _mm512_fnmsub_ps
+#define npyv_nmulsub_f64 _mm512_fnmsub_pd
+
+/***************************
+ * Reduce Sum
+ * There are three ways to implement reduce sum for AVX512:
+ * 1- split(256) /add /split(128) /add /hadd /hadd /extract
+ * 2- shuff(cross) /add /shuff(cross) /add /shuff /add /shuff /add /extract
+ * 3- _mm512_reduce_add_ps/pd
+ * The first one has been widely used by many projects.
+ *
+ * The second one is used by the Intel compiler, maybe because the
+ * latency of hadd increased by (2-3) starting from Skylake-X, which makes the two
+ * extra (non-cross) shuffles cheaper. See https://godbolt.org/z/s3G9Er for more info.
+ *
+ * The third one is almost the same as the second one, but it only works for
+ * the Intel compiler, GCC 7.1, and Clang 4; we still need to support older GCC.
+ ***************************/
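+// Illustrative trace of the shuffle-based reduction below (f32, 16 lanes):
+// 16 lanes -> 8 partial sums -> 4 -> 2 -> 1, with the result read from lane 0.
+// Note the pairwise summation order differs from a sequential scalar loop,
+// so the last bits of the result may differ as well.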
+#ifdef NPY_HAVE_AVX512F_REDUCE
+ #define npyv_sum_f32 _mm512_reduce_add_ps
+ #define npyv_sum_f64 _mm512_reduce_add_pd
+#else
+ NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+ {
+ __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2));
+ __m512 sum32 = _mm512_add_ps(a, h64);
+ __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2));
+ __m512 sum16 = _mm512_add_ps(sum32, h32);
+ __m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2));
+ __m512 sum8 = _mm512_add_ps(sum16, h16);
+ __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1));
+ __m512 sum4 = _mm512_add_ps(sum8, h4);
+ return _mm_cvtss_f32(_mm512_castps512_ps128(sum4));
+ }
+ NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
+ {
+ __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2));
+ __m512d sum32 = _mm512_add_pd(a, h64);
+ __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2));
+ __m512d sum16 = _mm512_add_pd(sum32, h32);
+ __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1));
+ __m512d sum8 = _mm512_add_pd(sum16, h16);
+ return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8));
+ }
+#endif
+
#endif // _NPY_SIMD_AVX512_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
index 96fdf72b9..2de33765a 100644
--- a/numpy/core/src/common/simd/avx512/avx512.h
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -4,6 +4,9 @@
#define NPY_SIMD 512
#define NPY_SIMD_WIDTH 64
#define NPY_SIMD_F64 1
+// Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
+#define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16)
+#define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
typedef __m512i npyv_u8;
typedef __m512i npyv_s8;
@@ -69,3 +72,4 @@ typedef struct { __m512d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h
index 0f7e27de3..6ad299dd5 100644
--- a/numpy/core/src/common/simd/avx512/conversion.h
+++ b/numpy/core/src/common/simd/avx512/conversion.h
@@ -51,4 +51,35 @@
#define npyv_cvt_b32_f32(A) npyv_cvt_b32_u32(_mm512_castps_si512(A))
#define npyv_cvt_b64_f64(A) npyv_cvt_b64_u64(_mm512_castpd_si512(A))
+// convert boolean vectors to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+#ifdef NPY_HAVE_AVX512BW_MASK
+ return (npy_uint64)_cvtmask64_u64(a);
+#elif defined(NPY_HAVE_AVX512BW)
+ return (npy_uint64)a;
+#else
+ int mask_lo = _mm256_movemask_epi8(npyv512_lower_si256(a));
+ int mask_hi = _mm256_movemask_epi8(npyv512_higher_si256(a));
+ return (unsigned)mask_lo | ((npy_uint64)(unsigned)mask_hi << 32);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+#ifdef NPY_HAVE_AVX512BW_MASK
+ return (npy_uint32)_cvtmask32_u32(a);
+#elif defined(NPY_HAVE_AVX512BW)
+ return (npy_uint32)a;
+#else
+ __m256i pack = _mm256_packs_epi16(
+ npyv512_lower_si256(a), npyv512_higher_si256(a)
+ );
+ return (npy_uint32)_mm256_movemask_epi8(_mm256_permute4x64_epi64(pack, _MM_SHUFFLE(3, 1, 2, 0)));
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint16)a; }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)a; }
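+// In the returned bitfield, bit i corresponds to lane i of the boolean vector,
+// e.g. (npyv_tobits_b32(mask_vector) >> i) & 1 tells whether lane i is set.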
+
#endif // _NPY_SIMD_AVX512_CVT_H
diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h
new file mode 100644
index 000000000..1db710670
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/math.h
@@ -0,0 +1,49 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_MATH_H
+#define _NPY_SIMD_AVX512_MATH_H
+
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm512_sqrt_ps
+#define npyv_sqrt_f64 _mm512_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm512_div_ps(_mm512_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm512_div_pd(_mm512_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+#if 0 // def NPY_HAVE_AVX512DQ
+ return _mm512_range_ps(a, a, 8);
+#else
+ return npyv_and_f32(
+ a, _mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff))
+ );
+#endif
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+#if 0 // def NPY_HAVE_AVX512DQ
+ return _mm512_range_pd(a, a, 8);
+#else
+ return npyv_and_f64(
+ a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL))
+ );
+#endif
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm512_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm512_mul_pd(a, a); }
+
+#endif
diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h
index e212c4555..bffd6e907 100644
--- a/numpy/core/src/common/simd/avx512/memory.h
+++ b/numpy/core/src/common/simd/avx512/memory.h
@@ -90,5 +90,243 @@ NPYV_IMPL_AVX512_MEM_INT(npy_int64, s64)
// store higher part
#define npyv_storeh_f32(PTR, VEC) _mm256_storeu_ps(PTR, npyv512_higher_ps256(VEC))
#define npyv_storeh_f64(PTR, VEC) _mm256_storeu_pd(PTR, npyv512_higher_pd256(VEC))
+/***************************
+ * Non-contiguous Load
+ ***************************/
+//// 32
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32);
+ const __m512i steps = npyv_set_s32(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ );
+ const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
+ return _mm512_i32gather_epi32(idx, (const __m512i*)ptr, 4);
+}
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{ return npyv_loadn_u32((const npy_uint32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{ return _mm512_castsi512_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); }
+//// 64
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{
+ const __m512i idx = _mm512_setr_epi64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ return _mm512_i64gather_epi64(idx, (const __m512i*)ptr, 8);
+}
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); }
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{ return _mm512_castsi512_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); }
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ assert(llabs(stride) <= NPY_SIMD_MAXSTORE_STRIDE32);
+ const __m512i steps = _mm512_setr_epi32(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ );
+ const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
+ _mm512_i32scatter_epi32((__m512i*)ptr, idx, a, 4);
+}
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen_u32((npy_uint32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_u32((npy_uint32*)ptr, stride, _mm512_castps_si512(a)); }
+//// 64
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{
+ const __m512i idx = _mm512_setr_epi64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ _mm512_i64scatter_epi64((__m512i*)ptr, idx, a, 8);
+}
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen_u64((npy_uint64*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); }
+
+/*********************************
+ * Partial Load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ const __m512i vfill = _mm512_set1_epi32(fill);
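+    // build a mask with the lowest nlane bits set (all lanes once nlane covers the vector)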
+ const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
+}
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ const __m512i vfill = _mm512_set1_epi64(fill);
+ const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
+}
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ assert(llabs(stride) <= NPY_SIMD_MAXLOAD_STRIDE32);
+ const __m512i steps = npyv_set_s32(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ );
+ const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
+ const __m512i vfill = _mm512_set1_epi32(fill);
+ const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ const __m512i idx = _mm512_setr_epi64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ const __m512i vfill = _mm512_set1_epi64(fill);
+ const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64
+npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_storeu_epi32((__m512i*)ptr, mask, a);
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ assert(llabs(stride) <= NPY_SIMD_MAXSTORE_STRIDE32);
+ const __m512i steps = _mm512_setr_epi32(
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ );
+ const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
+ const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_i32scatter_epi32((__m512i*)ptr, mask, idx, a, 4);
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ const __m512i idx = _mm512_setr_epi64(
+ 0*stride, 1*stride, 2*stride, 3*stride,
+ 4*stride, 5*stride, 6*stride, 7*stride
+ );
+ const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
+ _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8);
+}
+
+/*****************************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via reinterpret cast
+ *****************************************************************************/
+#define NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u64, s64)
+NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f64, s64)
#endif // _NPY_SIMD_AVX512_MEMORY_H
diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h
index cdbae7aac..f043004ec 100644
--- a/numpy/core/src/common/simd/avx512/reorder.h
+++ b/numpy/core/src/common/simd/avx512/reorder.h
@@ -167,4 +167,60 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b)
return r;
}
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#ifdef NPY_HAVE_AVX512BW
+ const __m512i idx = npyv_set_u8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return _mm512_shuffle_epi8(a, idx);
+#else
+ const __m256i idx = _mm256_setr_epi8(
+ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
+ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ );
+ __m256i lo = _mm256_shuffle_epi8(npyv512_lower_si256(a), idx);
+ __m256i hi = _mm256_shuffle_epi8(npyv512_higher_si256(a), idx);
+ return npyv512_combine_si256(lo, hi);
+#endif
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+#ifdef NPY_HAVE_AVX512BW
+ const __m512i idx = npyv_set_u8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return _mm512_shuffle_epi8(a, idx);
+#else
+ const __m256i idx = _mm256_setr_epi8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9,
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ __m256i lo = _mm256_shuffle_epi8(npyv512_lower_si256(a), idx);
+ __m256i hi = _mm256_shuffle_epi8(npyv512_higher_si256(a), idx);
+ return npyv512_combine_si256(lo, hi);
+#endif
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ return _mm512_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+ return _mm512_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
#endif // _NPY_SIMD_AVX512_REORDER_H
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
index ec8b8ecd0..87e00d5d1 100644
--- a/numpy/core/src/common/simd/neon/arithmetic.h
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -63,16 +63,84 @@
/***************************
* Division
***************************/
-#ifdef __aarch64__
+#if NPY_SIMD_F64
#define npyv_div_f32 vdivq_f32
#else
- NPY_FINLINE float32x4_t npyv_div_f32(float32x4_t a, float32x4_t b)
+ NPY_FINLINE npyv_f32 npyv_div_f32(npyv_f32 a, npyv_f32 b)
{
- float32x4_t recip = vrecpeq_f32(b);
- recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
- return vmulq_f32(a, recip);
+ // Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI
+        // initial estimate of 1/b
+ npyv_f32 recipe = vrecpeq_f32(b);
+ /**
+ * Newton-Raphson iteration:
+ * x[n+1] = x[n] * (2-d * x[n])
+ * converges to (1/d) if x0 is the result of VRECPE applied to d.
+ *
+        * NOTE: at least 3 iterations are needed to improve precision
+ */
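+        // VRECPS(b, x) computes (2 - b*x), so each multiply below performs one
+        // Newton-Raphson step refining the initial VRECPE estimate.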
+ recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(b, recipe), recipe);
+ // a/b = a*recip(b)
+ return vmulq_f32(a, recipe);
}
#endif
#define npyv_div_f64 vdivq_f64
+/***************************
+ * FUSED F32
+ ***************************/
+#ifdef NPY_HAVE_NEON_VFPV4 // FMA
+ // multiply and add, a*b + c
+ NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vfmaq_f32(c, a, b); }
+ // multiply and subtract, a*b - c
+ NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vfmaq_f32(vnegq_f32(c), a, b); }
+ // negate multiply and add, -(a*b) + c
+ NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vfmsq_f32(c, a, b); }
+ // negate multiply and subtract, -(a*b) - c
+ NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vfmsq_f32(vnegq_f32(c), a, b); }
+#else
+ // multiply and add, a*b + c
+ NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vmlaq_f32(c, a, b); }
+ // multiply and subtract, a*b - c
+ NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vmlaq_f32(vnegq_f32(c), a, b); }
+ // negate multiply and add, -(a*b) + c
+ NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vmlsq_f32(c, a, b); }
+ // negate multiply and subtract, -(a*b) - c
+ NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return vmlsq_f32(vnegq_f32(c), a, b); }
+#endif
+/***************************
+ * FUSED F64
+ ***************************/
+#if NPY_SIMD_F64
+ NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return vfmaq_f64(c, a, b); }
+ NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return vfmaq_f64(vnegq_f64(c), a, b); }
+ NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return vfmsq_f64(c, a, b); }
+ NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return vfmsq_f64(vnegq_f64(c), a, b); }
+#endif // NPY_SIMD_F64
+
+// Horizontal add: Calculates the sum of all vector elements.
+#if NPY_SIMD_F64
+ #define npyv_sum_f32 vaddvq_f32
+ #define npyv_sum_f64 vaddvq_f64
+#else
+ NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+ {
+ float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a));
+ return vget_lane_f32(vpadd_f32(r, r), 0);
+ }
+#endif
+
#endif // _NPY_SIMD_NEON_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h
index b286931d1..f9840b1cb 100644
--- a/numpy/core/src/common/simd/neon/conversion.h
+++ b/numpy/core/src/common/simd/neon/conversion.h
@@ -7,26 +7,68 @@
// convert boolean vectors to integer vectors
#define npyv_cvt_u8_b8(A) A
-#define npyv_cvt_s8_b8(A) vreinterpretq_s8_u8(A)
+#define npyv_cvt_s8_b8 vreinterpretq_s8_u8
#define npyv_cvt_u16_b16(A) A
-#define npyv_cvt_s16_b16(A) vreinterpretq_s16_u16(A)
+#define npyv_cvt_s16_b16 vreinterpretq_s16_u16
#define npyv_cvt_u32_b32(A) A
-#define npyv_cvt_s32_b32(A) vreinterpretq_s32_u32(A)
+#define npyv_cvt_s32_b32 vreinterpretq_s32_u32
#define npyv_cvt_u64_b64(A) A
-#define npyv_cvt_s64_b64(A) vreinterpretq_s64_u64(A)
-#define npyv_cvt_f32_b32(A) vreinterpretq_f32_u32(A)
-#define npyv_cvt_f64_b64(A) vreinterpretq_f64_u64(A)
+#define npyv_cvt_s64_b64 vreinterpretq_s64_u64
+#define npyv_cvt_f32_b32 vreinterpretq_f32_u32
+#define npyv_cvt_f64_b64 vreinterpretq_f64_u64
// convert integer vectors to boolean vectors
#define npyv_cvt_b8_u8(BL) BL
-#define npyv_cvt_b8_s8(BL) vreinterpretq_u8_s8(BL)
+#define npyv_cvt_b8_s8 vreinterpretq_u8_s8
#define npyv_cvt_b16_u16(BL) BL
-#define npyv_cvt_b16_s16(BL) vreinterpretq_u16_s16(BL)
+#define npyv_cvt_b16_s16 vreinterpretq_u16_s16
#define npyv_cvt_b32_u32(BL) BL
-#define npyv_cvt_b32_s32(BL) vreinterpretq_u32_s32(BL)
+#define npyv_cvt_b32_s32 vreinterpretq_u32_s32
#define npyv_cvt_b64_u64(BL) BL
-#define npyv_cvt_b64_s64(BL) vreinterpretq_u64_s64(BL)
-#define npyv_cvt_b32_f32(BL) vreinterpretq_u32_f32(BL)
-#define npyv_cvt_b64_f64(BL) vreinterpretq_u64_f64(BL)
+#define npyv_cvt_b64_s64 vreinterpretq_u64_s64
+#define npyv_cvt_b32_f32 vreinterpretq_u32_f32
+#define npyv_cvt_b64_f64 vreinterpretq_u64_f64
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+ const npyv_u8 scale = npyv_set_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+ npyv_u8 seq_scale = vandq_u8(a, scale);
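+    // each true lane now holds 2^(lane mod 8); summing the low and high halves
+    // yields the low and high bytes of the 16-bit bitfield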
+#if NPY_SIMD_F64
+ npy_uint8 sumlo = vaddv_u8(vget_low_u8(seq_scale));
+ npy_uint8 sumhi = vaddv_u8(vget_high_u8(seq_scale));
+ return sumlo + ((int)sumhi << 8);
+#else
+ npyv_u64 sumh = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(seq_scale)));
+ return vgetq_lane_u64(sumh, 0) + ((int)vgetq_lane_u64(sumh, 1) << 8);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ const npyv_u16 scale = npyv_set_u16(1, 2, 4, 8, 16, 32, 64, 128);
+ npyv_u16 seq_scale = vandq_u16(a, scale);
+#if NPY_SIMD_F64
+ return vaddvq_u16(seq_scale);
+#else
+ npyv_u64 sumh = vpaddlq_u32(vpaddlq_u16(seq_scale));
+ return vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{
+ const npyv_u32 scale = npyv_set_u32(1, 2, 4, 8);
+ npyv_u32 seq_scale = vandq_u32(a, scale);
+#if NPY_SIMD_F64
+ return vaddvq_u32(seq_scale);
+#else
+ npyv_u64 sumh = vpaddlq_u32(seq_scale);
+ return vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
+#endif
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{
+ npyv_u64 bit = vshrq_n_u64(a, 63);
+ return vgetq_lane_u64(bit, 0) | ((int)vgetq_lane_u64(bit, 1) << 1);
+}
#endif // _NPY_SIMD_NEON_CVT_H
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
new file mode 100644
index 000000000..a2bbdf2a5
--- /dev/null
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -0,0 +1,86 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_NEON_MATH_H
+#define _NPY_SIMD_NEON_MATH_H
+
+/***************************
+ * Elementary
+ ***************************/
+// Absolute
+#define npyv_abs_f32 vabsq_f32
+#define npyv_abs_f64 vabsq_f64
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return vmulq_f32(a, a); }
+#if NPY_SIMD_F64
+ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+ { return vmulq_f64(a, a); }
+#endif
+
+// Square root
+#if NPY_SIMD_F64
+ #define npyv_sqrt_f32 vsqrtq_f32
+ #define npyv_sqrt_f64 vsqrtq_f64
+#else
+ // Based on ARM doc, see https://developer.arm.com/documentation/dui0204/j/CIHDIACI
+ NPY_FINLINE npyv_f32 npyv_sqrt_f32(npyv_f32 a)
+ {
+ const npyv_f32 zero = vdupq_n_f32(0.0f);
+ const npyv_u32 pinf = vdupq_n_u32(0x7f800000);
+ npyv_u32 is_zero = vceqq_f32(a, zero), is_inf = vceqq_u32(vreinterpretq_u32_f32(a), pinf);
+        // guard against floating-point division-by-zero error
+        npyv_f32 guard_byz = vbslq_f32(is_zero, vreinterpretq_f32_u32(pinf), a);
+        // initial estimate of (1/√a)
+ npyv_f32 rsqrte = vrsqrteq_f32(guard_byz);
+ /**
+ * Newton-Raphson iteration:
+         * x[n+1] = x[n] * ((3 - d * x[n]*x[n]) / 2)
+         * converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
+         *
+         * NOTE: at least 3 iterations are needed to improve precision
+ */
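+        // VRSQRTS(a*x, x) computes (3 - (a*x)*x)/2, so each multiply below is one
+        // Newton-Raphson step toward 1/sqrt(a).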
+ rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+ rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+ rsqrte = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, rsqrte), rsqrte), rsqrte);
+ // a * (1/√a)
+ npyv_f32 sqrt = vmulq_f32(a, rsqrte);
+        // pick the original value for the special cases:
+        // - return zero if a is zero.
+        // - return positive infinity if a is positive infinity
+ return vbslq_f32(vorrq_u32(is_zero, is_inf), a, sqrt);
+ }
+#endif // NPY_SIMD_F64
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{
+#if NPY_SIMD_F64
+ const npyv_f32 one = vdupq_n_f32(1.0f);
+ return npyv_div_f32(one, a);
+#else
+ npyv_f32 recipe = vrecpeq_f32(a);
+ /**
+ * Newton-Raphson iteration:
+ * x[n+1] = x[n] * (2-d * x[n])
+ * converges to (1/d) if x0 is the result of VRECPE applied to d.
+ *
+     * NOTE: at least 3 iterations are needed to improve precision
+ */
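+    // VRECPS(a, x) yields (2 - a*x), so each multiply below performs one
+    // Newton-Raphson refinement of the initial VRECPE estimate.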
+ recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+ recipe = vmulq_f32(vrecpsq_f32(a, recipe), recipe);
+ return recipe;
+#endif
+}
+#if NPY_SIMD_F64
+ NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+ {
+ const npyv_f64 one = vdupq_n_f64(1.0);
+ return npyv_div_f64(one, a);
+ }
+#endif // NPY_SIMD_F64
+
+#endif // _NPY_SIMD_NEON_MATH_H
diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h
index afa703584..1e258f1bc 100644
--- a/numpy/core/src/common/simd/neon/memory.h
+++ b/numpy/core/src/common/simd/neon/memory.h
@@ -5,6 +5,8 @@
#ifndef _NPY_SIMD_NEON_MEMORY_H
#define _NPY_SIMD_NEON_MEMORY_H
+#include "misc.h"
+
/***************************
* load/store
***************************/
@@ -45,5 +47,290 @@ NPYV_IMPL_NEON_MEM(f32, float)
#if NPY_SIMD_F64
NPYV_IMPL_NEON_MEM(f64, double)
#endif
+/***************************
+ * Non-contiguous Load
+ ***************************/
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{
+ switch (stride) {
+ case 2:
+ return vld2q_s32((const int32_t*)ptr).val[0];
+ case 3:
+ return vld3q_s32((const int32_t*)ptr).val[0];
+ case 4:
+ return vld4q_s32((const int32_t*)ptr).val[0];
+ default:;
+ int32x2_t ax = vcreate_s32(*ptr);
+ int32x4_t a = vcombine_s32(ax, ax);
+ a = vld1q_lane_s32((const int32_t*)ptr + stride, a, 1);
+ a = vld1q_lane_s32((const int32_t*)ptr + stride*2, a, 2);
+ a = vld1q_lane_s32((const int32_t*)ptr + stride*3, a, 3);
+ return a;
+ }
+}
+
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ return npyv_reinterpret_u32_s32(
+ npyv_loadn_s32((const npy_int32*)ptr, stride)
+ );
+}
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{
+ return npyv_reinterpret_f32_s32(
+ npyv_loadn_s32((const npy_int32*)ptr, stride)
+ );
+}
+//// 64
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{
+ return vcombine_s64(
+ vld1_s64((const int64_t*)ptr), vld1_s64((const int64_t*)ptr + stride)
+ );
+}
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{
+ return npyv_reinterpret_u64_s64(
+ npyv_loadn_s64((const npy_int64*)ptr, stride)
+ );
+}
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{
+ return npyv_reinterpret_f64_s64(
+ npyv_loadn_s64((const npy_int64*)ptr, stride)
+ );
+}
+#endif
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{
+ vst1q_lane_s32((int32_t*)ptr, a, 0);
+ vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
+ vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2);
+ vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3);
+}
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, npyv_reinterpret_s32_u32(a)); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, npyv_reinterpret_s32_f32(a)); }
+//// 64
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{
+ vst1q_lane_s64((int64_t*)ptr, a, 0);
+ vst1q_lane_s64((int64_t*)ptr + stride, a, 1);
+}
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_u64(a)); }
+
+#if NPY_SIMD_F64
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_f64(a)); }
+#endif
+
+/*********************************
+ * Partial Load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ return vld1q_lane_s32((const int32_t*)ptr, vdupq_n_s32(fill), 0);
+ case 2:
+ return vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(fill));
+ case 3:
+ return vcombine_s32(
+ vld1_s32((const int32_t*)ptr),
+ vld1_lane_s32((const int32_t*)ptr + 2, vdup_n_s32(fill), 0)
+ );
+ default:
+ return npyv_load_s32(ptr);
+ }
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_till_s32(ptr, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return vcombine_s64(vld1_s64((const int64_t*)ptr), vdup_n_s64(fill));
+ }
+ return npyv_load_s64(ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ return npyv_load_till_s64(ptr, nlane, 0); }
+
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ int32x4_t vfill = vdupq_n_s32(fill);
+ switch(nlane) {
+ case 3:
+ vfill = vld1q_lane_s32((const int32_t*)ptr + stride*2, vfill, 2);
+ case 2:
+ vfill = vld1q_lane_s32((const int32_t*)ptr + stride, vfill, 1);
+ case 1:
+ vfill = vld1q_lane_s32((const int32_t*)ptr, vfill, 0);
+ return vfill;
+ default:
+ return npyv_loadn_s32(ptr, stride);
+ }
+}
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); }
+
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return vcombine_s64(vld1_s64((const int64_t*)ptr), vdup_n_s64(fill));
+ }
+ return npyv_loadn_s64(ptr, stride);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ vst1q_lane_s32((int32_t*)ptr, a, 0);
+ break;
+ case 2:
+ vst1_s32((int32_t*)ptr, vget_low_s32(a));
+ break;
+ case 3:
+ vst1_s32((int32_t*)ptr, vget_low_s32(a));
+ vst1q_lane_s32((int32_t*)ptr + 2, a, 2);
+ break;
+ default:
+ npyv_store_s32(ptr, a);
+ }
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ vst1q_lane_s64((int64_t*)ptr, a, 0);
+ return;
+ }
+ npyv_store_s64(ptr, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ default:
+ vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3);
+ case 3:
+ vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2);
+ case 2:
+ vst1q_lane_s32((int32_t*)ptr + stride, a, 1);
+ case 1:
+ vst1q_lane_s32((int32_t*)ptr, a, 0);
+ break;
+ }
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ vst1q_lane_s64((int64_t*)ptr, a, 0);
+ return;
+ }
+ npyv_storen_s64(ptr, stride, a);
+}
+
+/*****************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via casting
+ *****************************************************************/
+#define NPYV_IMPL_NEON_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u64, s64)
+#if NPY_SIMD_F64
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f64, s64)
+#endif
#endif // _NPY_SIMD_NEON_MEMORY_H
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index 280a34297..c8ddc92ad 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -72,3 +72,4 @@ typedef float64x2x3_t npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h
index 712a77982..50b06ed11 100644
--- a/numpy/core/src/common/simd/neon/reorder.h
+++ b/numpy/core/src/common/simd/neon/reorder.h
@@ -107,4 +107,13 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
#define npyv_zip_u64 npyv_combine_u64
#define npyv_zip_s64 npyv_combine_s64
+// Reverse elements of each 64-bit lane
+#define npyv_rev64_u8 vrev64q_u8
+#define npyv_rev64_s8 vrev64q_s8
+#define npyv_rev64_u16 vrev64q_u16
+#define npyv_rev64_s16 vrev64q_s16
+#define npyv_rev64_u32 vrev64q_u32
+#define npyv_rev64_s32 vrev64q_s32
+#define npyv_rev64_f32 vrev64q_f32
+
#endif // _NPY_SIMD_NEON_REORDER_H
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index 2f39c8427..8804223c9 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -49,6 +49,55 @@ typedef double npyv_lanetype_f64;
#define NPY_SIMD_WIDTH 0
#define NPY_SIMD_F64 0
#endif
+/**
+ * Some SIMD extensions (currently AVX2 and AVX512F) impose a de facto
+ * limit on the stride size when dealing with non-contiguous memory access.
+ *
+ * Therefore, the following functions must be used to check that a stride is within
+ * the acceptable limit before using any of the non-contiguous load/store intrinsics.
+ *
+ * For instance:
+ * npy_intp ld_stride = step[0] / sizeof(float);
+ * npy_intp st_stride = step[1] / sizeof(float);
+ *
+ *  if (npyv_loadable_stride_f32(ld_stride) && npyv_storable_stride_f32(st_stride)) {
+ *      for (;;) {
+ *          npyv_f32 a = npyv_loadn_f32(ld_pointer, ld_stride);
+ *          // ...
+ *          npyv_storen_f32(st_pointer, st_stride, a);
+ *      }
+ *  } else {
+ *      for (;;) {
+ *          // ... C scalars
+ *      }
+ *  }
+ */
+#ifndef NPY_SIMD_MAXLOAD_STRIDE32
+ #define NPY_SIMD_MAXLOAD_STRIDE32 0
+#endif
+#ifndef NPY_SIMD_MAXSTORE_STRIDE32
+ #define NPY_SIMD_MAXSTORE_STRIDE32 0
+#endif
+#ifndef NPY_SIMD_MAXLOAD_STRIDE64
+ #define NPY_SIMD_MAXLOAD_STRIDE64 0
+#endif
+#ifndef NPY_SIMD_MAXSTORE_STRIDE64
+ #define NPY_SIMD_MAXSTORE_STRIDE64 0
+#endif
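+// generates npyv_loadable_stride_##SFX / npyv_storable_stride_##SFX;
+// a limit of 0 means the extension accepts any stride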
+#define NPYV_IMPL_MAXSTRIDE(SFX, MAXLOAD, MAXSTORE) \
+ NPY_FINLINE int npyv_loadable_stride_##SFX(npy_intp stride) \
+ { return MAXLOAD > 0 ? llabs(stride) <= MAXLOAD : 1; } \
+ NPY_FINLINE int npyv_storable_stride_##SFX(npy_intp stride) \
+ { return MAXSTORE > 0 ? llabs(stride) <= MAXSTORE : 1; }
+#if NPY_SIMD
+ NPYV_IMPL_MAXSTRIDE(u32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
+ NPYV_IMPL_MAXSTRIDE(s32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
+ NPYV_IMPL_MAXSTRIDE(f32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
+ NPYV_IMPL_MAXSTRIDE(u64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
+ NPYV_IMPL_MAXSTRIDE(s64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
+#endif
+#if NPY_SIMD_F64
+ NPYV_IMPL_MAXSTRIDE(f64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
+#endif
#ifdef __cplusplus
}
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index 12d0af05c..8440cc52e 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -91,5 +91,87 @@ NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
// TODO: emulate integer division
#define npyv_div_f32 _mm_div_ps
#define npyv_div_f64 _mm_div_pd
+/***************************
+ * FUSED
+ ***************************/
+#ifdef NPY_HAVE_FMA3
+ // multiply and add, a*b + c
+ #define npyv_muladd_f32 _mm_fmadd_ps
+ #define npyv_muladd_f64 _mm_fmadd_pd
+ // multiply and subtract, a*b - c
+ #define npyv_mulsub_f32 _mm_fmsub_ps
+ #define npyv_mulsub_f64 _mm_fmsub_pd
+ // negate multiply and add, -(a*b) + c
+ #define npyv_nmuladd_f32 _mm_fnmadd_ps
+ #define npyv_nmuladd_f64 _mm_fnmadd_pd
+ // negate multiply and subtract, -(a*b) - c
+ #define npyv_nmulsub_f32 _mm_fnmsub_ps
+ #define npyv_nmulsub_f64 _mm_fnmsub_pd
+#elif defined(NPY_HAVE_FMA4)
+ // multiply and add, a*b + c
+ #define npyv_muladd_f32 _mm_macc_ps
+ #define npyv_muladd_f64 _mm_macc_pd
+ // multiply and subtract, a*b - c
+ #define npyv_mulsub_f32 _mm_msub_ps
+ #define npyv_mulsub_f64 _mm_msub_pd
+ // negate multiply and add, -(a*b) + c
+ #define npyv_nmuladd_f32 _mm_nmacc_ps
+ #define npyv_nmuladd_f64 _mm_nmacc_pd
+#else
+ // multiply and add, a*b + c
+ NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_add_f32(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_add_f64(npyv_mul_f64(a, b), c); }
+ // multiply and subtract, a*b - c
+ NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_sub_f32(npyv_mul_f32(a, b), c); }
+ NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_sub_f64(npyv_mul_f64(a, b), c); }
+ // negate multiply and add, -(a*b) + c
+ NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ { return npyv_sub_f32(c, npyv_mul_f32(a, b)); }
+ NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ { return npyv_sub_f64(c, npyv_mul_f64(a, b)); }
+#endif // NPY_HAVE_FMA3
+#ifndef NPY_HAVE_FMA3 // for FMA4 and NON-FMA3
+ // negate multiply and subtract, -(a*b) - c
+ NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
+ {
+ npyv_f32 neg_a = npyv_xor_f32(a, npyv_setall_f32(-0.0f));
+ return npyv_sub_f32(npyv_mul_f32(neg_a, b), c);
+ }
+ NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
+ {
+ npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0));
+ return npyv_sub_f64(npyv_mul_f64(neg_a, b), c);
+ }
+#endif // !NPY_HAVE_FMA3
+
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(__m128 a)
+{
+#ifdef NPY_HAVE_SSE3
+ __m128 sum_halves = _mm_hadd_ps(a, a);
+ return _mm_cvtss_f32(_mm_hadd_ps(sum_halves, sum_halves));
+#else
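+    // add the high half to the low half, then fold the two remaining partial sums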
+ __m128 t1 = _mm_movehl_ps(a, a);
+ __m128 t2 = _mm_add_ps(a, t1);
+ __m128 t3 = _mm_shuffle_ps(t2, t2, 1);
+ __m128 t4 = _mm_add_ss(t2, t3);
+ return _mm_cvtss_f32(t4);
+#endif
+}
+
+NPY_FINLINE double npyv_sum_f64(__m128d a)
+{
+#ifdef NPY_HAVE_SSE3
+ return _mm_cvtsd_f64(_mm_hadd_pd(a, a));
+#else
+ return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a)));
+#endif
+}
#endif // _NPY_SIMD_SSE_ARITHMETIC_H
+
+
diff --git a/numpy/core/src/common/simd/sse/conversion.h b/numpy/core/src/common/simd/sse/conversion.h
index ea9660d13..ab4beea96 100644
--- a/numpy/core/src/common/simd/sse/conversion.h
+++ b/numpy/core/src/common/simd/sse/conversion.h
@@ -14,8 +14,8 @@
#define npyv_cvt_s32_b32(BL) BL
#define npyv_cvt_u64_b64(BL) BL
#define npyv_cvt_s64_b64(BL) BL
-#define npyv_cvt_f32_b32(BL) _mm_castsi128_ps(BL)
-#define npyv_cvt_f64_b64(BL) _mm_castsi128_pd(BL)
+#define npyv_cvt_f32_b32 _mm_castsi128_ps
+#define npyv_cvt_f64_b64 _mm_castsi128_pd
// convert integer types to mask types
#define npyv_cvt_b8_u8(A) A
@@ -26,7 +26,20 @@
#define npyv_cvt_b32_s32(A) A
#define npyv_cvt_b64_u64(A) A
#define npyv_cvt_b64_s64(A) A
-#define npyv_cvt_b32_f32(A) _mm_castps_si128(A)
-#define npyv_cvt_b64_f64(A) _mm_castpd_si128(A)
+#define npyv_cvt_b32_f32 _mm_castps_si128
+#define npyv_cvt_b64_f64 _mm_castpd_si128
+
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{ return (npy_uint16)_mm_movemask_epi8(a); }
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ __m128i pack = _mm_packs_epi16(a, a);
+ return (npy_uint8)_mm_movemask_epi8(pack);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{ return (npy_uint8)_mm_movemask_ps(_mm_castsi128_ps(a)); }
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{ return (npy_uint8)_mm_movemask_pd(_mm_castsi128_pd(a)); }
#endif // _NPY_SIMD_SSE_CVT_H
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
new file mode 100644
index 000000000..b7203cd89
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -0,0 +1,40 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_MATH_H
+#define _NPY_SIMD_SSE_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 _mm_sqrt_ps
+#define npyv_sqrt_f64 _mm_sqrt_pd
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{ return _mm_div_ps(_mm_set1_ps(1.0f), a); }
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{ return _mm_div_pd(_mm_set1_pd(1.0), a); }
+
+// Absolute
+NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a)
+{
+ return _mm_and_ps(
+ a, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))
+ );
+}
+NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
+{
+ return _mm_and_pd(
+ a, _mm_castsi128_pd(npyv_setall_s64(0x7fffffffffffffffLL))
+ );
+}
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return _mm_mul_ps(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return _mm_mul_pd(a, a); }
+
+#endif
diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h
index 1a555d6f0..1074c3b02 100644
--- a/numpy/core/src/common/simd/sse/memory.h
+++ b/numpy/core/src/common/simd/sse/memory.h
@@ -5,6 +5,8 @@
#ifndef _NPY_SIMD_SSE_MEMORY_H
#define _NPY_SIMD_SSE_MEMORY_H
+#include "misc.h"
+
/***************************
* load/store
***************************/
@@ -70,5 +72,427 @@ NPYV_IMPL_SSE_MEM_INT(npy_int64, s64)
// store higher part
#define npyv_storeh_f32(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castps_si128(VEC))
#define npyv_storeh_f64(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castpd_si128(VEC))
+/***************************
+ * Non-contiguous Load
+ ***************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{
+ __m128i a = _mm_cvtsi32_si128(*ptr);
+#ifdef NPY_HAVE_SSE41
+ a = _mm_insert_epi32(a, ptr[stride], 1);
+ a = _mm_insert_epi32(a, ptr[stride*2], 2);
+ a = _mm_insert_epi32(a, ptr[stride*3], 3);
+#else
+ __m128i a1 = _mm_cvtsi32_si128(ptr[stride]);
+ __m128i a2 = _mm_cvtsi32_si128(ptr[stride*2]);
+ __m128i a3 = _mm_cvtsi32_si128(ptr[stride*3]);
+ a = _mm_unpacklo_epi32(a, a1);
+ a = _mm_unpacklo_epi64(a, _mm_unpacklo_epi32(a2, a3));
+#endif
+ return a;
+}
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return npyv_loadn_s32((const npy_int32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{ return _mm_castsi128_ps(npyv_loadn_s32((const npy_int32*)ptr, stride)); }
+//// 64
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{ return _mm_loadh_pd(npyv_loadl_f64(ptr), ptr + stride); }
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); }
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{
+ ptr[stride * 0] = _mm_cvtsi128_si32(a);
+#ifdef NPY_HAVE_SSE41
+ ptr[stride * 1] = _mm_extract_epi32(a, 1);
+ ptr[stride * 2] = _mm_extract_epi32(a, 2);
+ ptr[stride * 3] = _mm_extract_epi32(a, 3);
+#else
+ ptr[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
+ ptr[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
+ ptr[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
+#endif
+}
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, a); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_s32((npy_int32*)ptr, stride, _mm_castps_si128(a)); }
+//// 64
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{
+ _mm_storel_pd(ptr, a);
+ _mm_storeh_pd(ptr + stride, a);
+}
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); }
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); }
+
+/*********************************
+ * Partial Load
+ *********************************/
+#if defined(__clang__) && __clang_major__ > 7
+ /**
+     * Clang >= 8 performs aggressive optimizations that tend to
+     * zero the bits of the upper half of vectors even
+     * when we try to fill them with certain scalars,
+     * which may lead to division-by-zero errors.
+ */
+ #define NPYV__CLANG_ZEROUPPER
+#endif
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+#ifdef NPYV__CLANG_ZEROUPPER
+ if (nlane > 3) {
+ return npyv_load_s32(ptr);
+ }
+ npy_int32 NPY_DECL_ALIGNED(16) data[4] = {fill, fill, fill, fill};
+ for (npy_uint64 i = 0; i < nlane; ++i) {
+ data[i] = ptr[i];
+ }
+ return npyv_loada_s32(data);
+#else
+ #ifndef NPY_HAVE_SSE41
+ const short *wptr = (const short*)ptr;
+ #endif
+ const __m128i vfill = npyv_setall_s32(fill);
+ __m128i a;
+ switch(nlane) {
+ case 2:
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ #ifdef NPY_HAVE_SSE41
+ case 1:
+ return _mm_insert_epi32(vfill, ptr[0], 0);
+ case 3:
+ a = _mm_loadl_epi64((const __m128i*)ptr);
+ a = _mm_insert_epi32(a, ptr[2], 2);
+ a = _mm_insert_epi32(a, fill, 3);
+ return a;
+ #else
+ case 1:
+ a = _mm_insert_epi16(vfill, wptr[0], 0);
+ return _mm_insert_epi16(a, wptr[1], 1);
+ case 3:
+ a = _mm_loadl_epi64((const __m128i*)ptr);
+ a = _mm_unpacklo_epi64(a, vfill);
+ a = _mm_insert_epi16(a, wptr[4], 4);
+ a = _mm_insert_epi16(a, wptr[5], 5);
+ return a;
+ #endif // NPY_HAVE_SSE41
+ default:
+ return npyv_load_s32(ptr);
+ }
+#endif
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ return _mm_cvtsi32_si128(*ptr);
+ case 2:
+ return _mm_loadl_epi64((const __m128i*)ptr);
+ case 3:;
+ npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr);
+ #ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[2], 2);
+ #else
+ return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2]));
+ #endif
+ default:
+ return npyv_load_s32(ptr);
+ }
+}
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+#ifdef NPYV__CLANG_ZEROUPPER
+ if (nlane <= 2) {
+ npy_int64 NPY_DECL_ALIGNED(16) data[2] = {fill, fill};
+ for (npy_uint64 i = 0; i < nlane; ++i) {
+ data[i] = ptr[i];
+ }
+ return npyv_loada_s64(data);
+ }
+#else
+ if (nlane == 1) {
+ const __m128i vfill = npyv_setall_s64(fill);
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ }
+#endif
+ return npyv_load_s64(ptr);
+}
+// fill zero to rest lanes
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return _mm_loadl_epi64((const __m128i*)ptr);
+ }
+ return npyv_load_s64(ptr);
+}
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+#ifdef NPYV__CLANG_ZEROUPPER
+ if (nlane > 3) {
+ return npyv_loadn_s32(ptr, stride);
+ }
+ npy_int32 NPY_DECL_ALIGNED(16) data[4] = {fill, fill, fill, fill};
+ for (npy_uint64 i = 0; i < nlane; ++i) {
+ data[i] = ptr[stride*i];
+ }
+ return npyv_loada_s32(data);
+#else
+ __m128i vfill = npyv_setall_s32(fill);
+ #ifndef NPY_HAVE_SSE41
+ const short *wptr = (const short*)ptr;
+ #endif
+ switch(nlane) {
+ #ifdef NPY_HAVE_SSE41
+ case 3:
+ vfill = _mm_insert_epi32(vfill, ptr[stride*2], 2);
+ case 2:
+ vfill = _mm_insert_epi32(vfill, ptr[stride], 1);
+ case 1:
+ vfill = _mm_insert_epi32(vfill, ptr[0], 0);
+ break;
+ #else
+ case 3:
+ vfill = _mm_unpacklo_epi32(_mm_cvtsi32_si128(ptr[stride*2]), vfill);
+ case 2:
+ vfill = _mm_unpacklo_epi64(_mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*ptr), _mm_cvtsi32_si128(ptr[stride])
+ ), vfill);
+ break;
+ case 1:
+ vfill = _mm_insert_epi16(vfill, wptr[0], 0);
+ vfill = _mm_insert_epi16(vfill, wptr[1], 1);
+ break;
+ #endif // NPY_HAVE_SSE41
+ default:
+ return npyv_loadn_s32(ptr, stride);
+ } // switch
+ return vfill;
+#endif
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ return _mm_cvtsi32_si128(ptr[0]);
+ case 2:;
+ npyv_s32 a = _mm_cvtsi32_si128(ptr[0]);
+#ifdef NPY_HAVE_SSE41
+ return _mm_insert_epi32(a, ptr[stride], 1);
+#else
+ return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+#endif // NPY_HAVE_SSE41
+ case 3:;
+ a = _mm_cvtsi32_si128(ptr[0]);
+#ifdef NPY_HAVE_SSE41
+ a = _mm_insert_epi32(a, ptr[stride], 1);
+ a = _mm_insert_epi32(a, ptr[stride*2], 2);
+ return a;
+#else
+ a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride]));
+ a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2]));
+ return a;
+#endif // NPY_HAVE_SSE41
+ default:
+ return npyv_loadn_s32(ptr, stride);
+ }
+}
+//// 64
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+#ifdef NPYV__CLANG_ZEROUPPER
+ if (nlane <= 2) {
+ npy_int64 NPY_DECL_ALIGNED(16) data[2] = {fill, fill};
+ for (npy_uint64 i = 0; i < nlane; ++i) {
+ data[i] = ptr[i*stride];
+ }
+ return npyv_loada_s64(data);
+ }
+#else
+ if (nlane == 1) {
+ const __m128i vfill = npyv_setall_s64(fill);
+ return _mm_castpd_si128(
+ _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr)
+ );
+ }
+#endif
+ return npyv_loadn_s64(ptr, stride);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return _mm_loadl_epi64((const __m128i*)ptr);
+ }
+ return npyv_loadn_s64(ptr, stride);
+}
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ *ptr = _mm_cvtsi128_si32(a);
+ break;
+ case 2:
+ _mm_storel_epi64((__m128i *)ptr, a);
+ break;
+ case 3:
+ _mm_storel_epi64((__m128i *)ptr, a);
+ #ifdef NPY_HAVE_SSE41
+ ptr[2] = _mm_extract_epi32(a, 2);
+ #else
+ ptr[2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
+ #endif
+ break;
+ default:
+ npyv_store_s32(ptr, a);
+ }
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ _mm_storel_epi64((__m128i *)ptr, a);
+ return;
+ }
+ npyv_store_s64(ptr, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+#ifdef NPY_HAVE_SSE41
+ default:
+ ptr[stride*3] = _mm_extract_epi32(a, 3);
+ case 3:
+ ptr[stride*2] = _mm_extract_epi32(a, 2);
+ case 2:
+ ptr[stride*1] = _mm_extract_epi32(a, 1);
+#else
+ default:
+ ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3)));
+ case 3:
+ ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2)));
+ case 2:
+ ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1)));
+#endif
+ case 1:
+ ptr[stride*0] = _mm_cvtsi128_si32(a);
+ break;
+ }
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ _mm_storel_epi64((__m128i *)ptr, a);
+ return;
+ }
+ npyv_storen_s64(ptr, stride, a);
+}
+/*****************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via casting
+ *****************************************************************/
+#define NPYV_IMPL_SSE_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64)
#endif // _NPY_SIMD_SSE_MEMORY_H
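The `NPYV_IMPL_SSE_REST_PARTIAL_TYPES` macro above reuses the signed-integer implementations for the unsigned and floating-point lane types: the fill value is bit-punned through a union and the vectors are reinterpreted on the way in and out. As a rough hand expansion for `f32` (an illustration of what the macro generates, not an addition to the patch; the macro itself spells the scalar types via the `npyv_lanetype_*` typedefs):

NPY_FINLINE npyv_f32 npyv_load_till_f32
(const float *ptr, npy_uintp nlane, float fill)
{
    /* reinterpret the bits of the float fill value as an int32 */
    union { float from_f32; npy_int32 to_s32; } pun = {.from_f32 = fill};
    /* delegate to the s32 implementation, then view the result as f32 again */
    return npyv_reinterpret_f32_s32(npyv_load_till_s32(
        (const npy_int32 *)ptr, nlane, pun.to_s32
    ));
}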
diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h
index 3f68b4ad7..d96ab9c56 100644
--- a/numpy/core/src/common/simd/sse/reorder.h
+++ b/numpy/core/src/common/simd/sse/reorder.h
@@ -81,4 +81,45 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64)
NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps)
NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd)
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return _mm_shuffle_epi8(a, idx);
+#else
+ __m128i lo = _mm_shufflelo_epi16(a, _MM_SHUFFLE(0, 1, 2, 3));
+ return _mm_shufflehi_epi16(lo, _MM_SHUFFLE(0, 1, 2, 3));
+#endif
+}
+#define npyv_rev64_s16 npyv_rev64_u16
+
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#ifdef NPY_HAVE_SSSE3
+ const __m128i idx = _mm_setr_epi8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return _mm_shuffle_epi8(a, idx);
+#else
+ __m128i rev16 = npyv_rev64_u16(a);
+ // swap 8bit pairs
+ return _mm_or_si128(_mm_slli_epi16(rev16, 8), _mm_srli_epi16(rev16, 8));
+#endif
+}
+#define npyv_rev64_s8 npyv_rev64_u8
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ return _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+#define npyv_rev64_s32 npyv_rev64_u32
+
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{
+ return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));
+}
+
#endif // _NPY_SIMD_SSE_REORDER_H
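For reference, the `npyv_rev64_*` helpers reverse the element order inside each 64-bit half of the register, not across the whole vector. A scalar sketch of the 16-bit variant (illustration only, not part of the patch; the `_ref` name is invented for this sketch):

void rev64_u16_ref(npy_uint16 out[8], const npy_uint16 in[8])
{
    /* lane indices {0,1,2,3, 4,5,6,7} map to {3,2,1,0, 7,6,5,4} */
    for (int lane64 = 0; lane64 < 2; ++lane64) {
        for (int i = 0; i < 4; ++i) {
            out[lane64*4 + i] = in[lane64*4 + (3 - i)];
        }
    }
}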
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
index 364b4baf1..132d3d347 100644
--- a/numpy/core/src/common/simd/sse/sse.h
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -64,3 +64,4 @@ typedef struct { __m128d val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
index dd23b5b11..2f6762e63 100644
--- a/numpy/core/src/common/simd/vsx/arithmetic.h
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -100,4 +100,32 @@
#define npyv_div_f32 vec_div
#define npyv_div_f64 vec_div
+/***************************
+ * FUSED
+ ***************************/
+// multiply and add, a*b + c
+#define npyv_muladd_f32 vec_madd
+#define npyv_muladd_f64 vec_madd
+// multiply and subtract, a*b - c
+#define npyv_mulsub_f32 vec_msub
+#define npyv_mulsub_f64 vec_msub
+// negate multiply and add, -(a*b) + c
+#define npyv_nmuladd_f32 vec_nmsub // equivalent to -(a*b - c)
+#define npyv_nmuladd_f64 vec_nmsub
+// negate multiply and subtract, -(a*b) - c
+#define npyv_nmulsub_f32 vec_nmadd // equivalent to -(a*b + c)
+#define npyv_nmulsub_f64 vec_nmadd
+
+// Horizontal add: Calculates the sum of all vector elements.
+NPY_FINLINE float npyv_sum_f32(npyv_f32 a)
+{
+ npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a));
+ return vec_extract(sum, 0) + vec_extract(sum, 1);
+}
+
+NPY_FINLINE double npyv_sum_f64(npyv_f64 a)
+{
+ return vec_extract(a, 0) + vec_extract(a, 1);
+}
+
#endif // _NPY_SIMD_VSX_ARITHMETIC_H
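The VSX horizontal sums above reduce the vector with a fixed association order, which matters when bit-exact floating-point results are compared against a scalar loop. In scalar terms they compute the following (sketch only; the `_ref` names are illustrative):

float npyv_sum_f32_ref(const float a[4])
{ return (a[0] + a[2]) + (a[1] + a[3]); }  /* same pairing as vec_add + combineh */

double npyv_sum_f64_ref(const double a[2])
{ return a[0] + a[1]; }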
diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h
index 6ed135990..5803e1cdd 100644
--- a/numpy/core/src/common/simd/vsx/conversion.h
+++ b/numpy/core/src/common/simd/vsx/conversion.h
@@ -29,4 +29,26 @@
#define npyv_cvt_b32_f32(A) ((npyv_b32) A)
#define npyv_cvt_b64_f64(A) ((npyv_b64) A)
+// convert boolean vector to integer bitfield
+NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
+{
+ const npyv_u8 qperm = npyv_set_u8(120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0);
+ return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
+{
+ const npyv_u8 qperm = npyv_setf_u8(128, 112, 96, 80, 64, 48, 32, 16, 0);
+ return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
+{
+ const npyv_u8 qperm = npyv_setf_u8(128, 96, 64, 32, 0);
+ return vec_extract((npyv_u32)vec_vbpermq((npyv_u8)a, qperm), 2);
+}
+NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
+{
+ npyv_u64 bit = npyv_shri_u64((npyv_u64)a, 63);
+ return vec_extract(bit, 0) | (int)vec_extract(bit, 1) << 1;
+}
+
#endif // _NPY_SIMD_VSX_CVT_H
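The `npyv_tobits_*` conversions pack a boolean vector into an integer bitfield, with bit `i` of the result expected to reflect lane `i` of the mask (movemask-style behaviour, as on the other platforms). A scalar sketch of the 8-bit case (illustration only; the `_ref` name is made up):

npy_uint64 tobits_b8_ref(const npy_uint8 lanes[16])
{
    npy_uint64 bits = 0;
    for (int i = 0; i < 16; ++i) {
        /* each boolean lane is all-ones (true) or all-zeros (false) */
        bits |= (npy_uint64)(lanes[i] != 0) << i;
    }
    return bits;
}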
diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h
new file mode 100644
index 000000000..7c8610b19
--- /dev/null
+++ b/numpy/core/src/common/simd/vsx/math.h
@@ -0,0 +1,36 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_VSX_MATH_H
+#define _NPY_SIMD_VSX_MATH_H
+/***************************
+ * Elementary
+ ***************************/
+// Square root
+#define npyv_sqrt_f32 vec_sqrt
+#define npyv_sqrt_f64 vec_sqrt
+
+// Reciprocal
+NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
+{
+ const npyv_f32 one = npyv_setall_f32(1.0f);
+ return vec_div(one, a);
+}
+NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a)
+{
+ const npyv_f64 one = npyv_setall_f64(1.0);
+ return vec_div(one, a);
+}
+
+// Absolute
+#define npyv_abs_f32 vec_abs
+#define npyv_abs_f64 vec_abs
+
+// Square
+NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a)
+{ return vec_mul(a, a); }
+NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
+{ return vec_mul(a, a); }
+
+#endif // _NPY_SIMD_VSX_MATH_H
diff --git a/numpy/core/src/common/simd/vsx/memory.h b/numpy/core/src/common/simd/vsx/memory.h
index e0d908bf9..08a0a9276 100644
--- a/numpy/core/src/common/simd/vsx/memory.h
+++ b/numpy/core/src/common/simd/vsx/memory.h
@@ -4,147 +4,343 @@
#ifndef _NPY_SIMD_VSX_MEMORY_H
#define _NPY_SIMD_VSX_MEMORY_H
+
+#include "misc.h"
+
/****************************
- * load/store
+ * Private utilities
****************************/
// TODO: test load by cast
#define VSX__CAST_lOAD 0
#if VSX__CAST_lOAD
- #define npyv__load(PTR, T_VEC) (*((T_VEC*)(PTR)))
+ #define npyv__load(T_VEC, PTR) (*((T_VEC*)(PTR)))
#else
/**
* CLANG fails to load unaligned addresses via vec_xl, vec_xst
* so we failback to vec_vsx_ld, vec_vsx_st
*/
#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
- #define npyv__load(PTR, T_VEC) vec_vsx_ld(0, PTR)
+ #define npyv__load(T_VEC, PTR) vec_vsx_ld(0, PTR)
#else
- #define npyv__load(PTR, T_VEC) vec_xl(0, PTR)
+ #define npyv__load(T_VEC, PTR) vec_xl(0, PTR)
#endif
#endif
-// unaligned load
-#define npyv_load_u8(PTR) npyv__load(PTR, npyv_u8)
-#define npyv_load_s8(PTR) npyv__load(PTR, npyv_s8)
-#define npyv_load_u16(PTR) npyv__load(PTR, npyv_u16)
-#define npyv_load_s16(PTR) npyv__load(PTR, npyv_s16)
-#define npyv_load_u32(PTR) npyv__load(PTR, npyv_u32)
-#define npyv_load_s32(PTR) npyv__load(PTR, npyv_s32)
-#define npyv_load_f32(PTR) npyv__load(PTR, npyv_f32)
-#define npyv_load_f64(PTR) npyv__load(PTR, npyv_f64)
-#if VSX__CAST_lOAD
- #define npyv_load_u64(PTR) npyv__load(PTR, npyv_u64)
- #define npyv_load_s64(PTR) npyv__load(PTR, npyv_s64)
+// unaligned store
+#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
+ #define npyv__store(PTR, VEC) vec_vsx_st(VEC, 0, PTR)
#else
- #define npyv_load_u64(PTR) ((npyv_u64)npyv_load_u32((const unsigned int*)PTR))
- #define npyv_load_s64(PTR) ((npyv_s64)npyv_load_s32((const unsigned int*)PTR))
+ #define npyv__store(PTR, VEC) vec_xst(VEC, 0, PTR)
#endif
-// aligned load
-#define npyv_loada_u8(PTR) vec_ld(0, PTR)
-#define npyv_loada_s8 npyv_loada_u8
-#define npyv_loada_u16 npyv_loada_u8
-#define npyv_loada_s16 npyv_loada_u8
-#define npyv_loada_u32 npyv_loada_u8
-#define npyv_loada_s32 npyv_loada_u8
-#define npyv_loada_u64 npyv_load_u64
-#define npyv_loada_s64 npyv_load_s64
-#define npyv_loada_f32 npyv_loada_u8
-#define npyv_loada_f64 npyv_load_f64
-// stream load
-#define npyv_loads_u8 npyv_loada_u8
-#define npyv_loads_s8 npyv_loada_s8
-#define npyv_loads_u16 npyv_loada_u16
-#define npyv_loads_s16 npyv_loada_s16
-#define npyv_loads_u32 npyv_loada_u32
-#define npyv_loads_s32 npyv_loada_s32
-#define npyv_loads_u64 npyv_loada_u64
-#define npyv_loads_s64 npyv_loada_s64
-#define npyv_loads_f32 npyv_loada_f32
-#define npyv_loads_f64 npyv_loada_f64
-// load lower part
+
// avoid aliasing rules
#ifdef __cplusplus
template<typename T_PTR>
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(T_PTR *ptr)
- { return npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr; }
+ NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr)
+ { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
#else
- NPY_FINLINE npy_uint64 *npyv__ptr2u64(void *ptr)
- { npy_uint64 *ptr64 = ptr; return ptr64; }
+ NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
+ { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
#endif // __cplusplus
-#if defined(__clang__) && !defined(__IBMC__)
- // vec_promote doesn't support doubleword on clang
- #define npyv_loadl_u64(PTR) npyv_setall_u64(*npyv__ptr2u64(PTR))
-#else
- #define npyv_loadl_u64(PTR) vec_promote(*npyv__ptr2u64(PTR), 0)
-#endif
-#define npyv_loadl_u8(PTR) ((npyv_u8)npyv_loadl_u64(PTR))
-#define npyv_loadl_s8(PTR) ((npyv_s8)npyv_loadl_u64(PTR))
-#define npyv_loadl_u16(PTR) ((npyv_u16)npyv_loadl_u64(PTR))
-#define npyv_loadl_s16(PTR) ((npyv_s16)npyv_loadl_u64(PTR))
-#define npyv_loadl_u32(PTR) ((npyv_u32)npyv_loadl_u64(PTR))
-#define npyv_loadl_s32(PTR) ((npyv_s32)npyv_loadl_u64(PTR))
-#define npyv_loadl_s64(PTR) ((npyv_s64)npyv_loadl_u64(PTR))
-#define npyv_loadl_f32(PTR) ((npyv_f32)npyv_loadl_u64(PTR))
-#define npyv_loadl_f64(PTR) ((npyv_f64)npyv_loadl_u64(PTR))
-// unaligned store
-#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
- #define npyv_store_u8(PTR, VEC) vec_vsx_st(VEC, 0, PTR)
-#else
- #define npyv_store_u8(PTR, VEC) vec_xst(VEC, 0, PTR)
-#endif
-#define npyv_store_s8 npyv_store_u8
-#define npyv_store_u16 npyv_store_u8
-#define npyv_store_s16 npyv_store_u8
-#define npyv_store_u32 npyv_store_u8
-#define npyv_store_s32 npyv_store_u8
-#define npyv_store_u64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC)
-#define npyv_store_s64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC)
-#define npyv_store_f32 npyv_store_u8
-#define npyv_store_f64 npyv_store_u8
-// aligned store
-#define npyv_storea_u8(PTR, VEC) vec_st(VEC, 0, PTR)
-#define npyv_storea_s8 npyv_storea_u8
-#define npyv_storea_u16 npyv_storea_u8
-#define npyv_storea_s16 npyv_storea_u8
-#define npyv_storea_u32 npyv_storea_u8
-#define npyv_storea_s32 npyv_storea_u8
-#define npyv_storea_u64 npyv_store_u64
-#define npyv_storea_s64 npyv_store_s64
-#define npyv_storea_f32 npyv_storea_u8
-#define npyv_storea_f64 npyv_store_f64
-// stream store
-#define npyv_stores_u8 npyv_storea_u8
-#define npyv_stores_s8 npyv_storea_s8
-#define npyv_stores_u16 npyv_storea_u16
-#define npyv_stores_s16 npyv_storea_s16
-#define npyv_stores_u32 npyv_storea_u32
-#define npyv_stores_s32 npyv_storea_s32
-#define npyv_stores_u64 npyv_storea_u64
-#define npyv_stores_s64 npyv_storea_s64
-#define npyv_stores_f32 npyv_storea_f32
-#define npyv_stores_f64 npyv_storea_f64
+
+// load lower part
+NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
+{
+ #if defined(__clang__) && !defined(__IBMC__)
+ // vec_promote doesn't support doubleword on clang
+ return npyv_setall_u64(*npyv__ptr2u64(ptr));
+ #else
+ return vec_promote(*npyv__ptr2u64(ptr), 0);
+ #endif
+}
// store lower part
-#define npyv_storel_u8(PTR, VEC) \
+#define npyv__storel(PTR, VEC) \
*npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 0)
-#define npyv_storel_s8 npyv_storel_u8
-#define npyv_storel_u16 npyv_storel_u8
-#define npyv_storel_s16 npyv_storel_u8
-#define npyv_storel_u32 npyv_storel_u8
-#define npyv_storel_s32 npyv_storel_u8
-#define npyv_storel_s64 npyv_storel_u8
-#define npyv_storel_u64 npyv_storel_u8
-#define npyv_storel_f32 npyv_storel_u8
-#define npyv_storel_f64 npyv_storel_u8
-// store higher part
-#define npyv_storeh_u8(PTR, VEC) \
+
+#define npyv__storeh(PTR, VEC) \
*npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 1)
-#define npyv_storeh_s8 npyv_storeh_u8
-#define npyv_storeh_u16 npyv_storeh_u8
-#define npyv_storeh_s16 npyv_storeh_u8
-#define npyv_storeh_u32 npyv_storeh_u8
-#define npyv_storeh_s32 npyv_storeh_u8
-#define npyv_storeh_s64 npyv_storeh_u8
-#define npyv_storeh_u64 npyv_storeh_u8
-#define npyv_storeh_f32 npyv_storeh_u8
-#define npyv_storeh_f64 npyv_storeh_u8
+
+/****************************
+ * load/store
+ ****************************/
+#define NPYV_IMPL_VSX_MEM(SFX, DW_CAST) \
+ NPY_FINLINE npyv_##SFX npyv_load_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return (npyv_##SFX)npyv__load(npyv_##SFX, (const npyv_lanetype_##DW_CAST*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return (npyv_##SFX)vec_ld(0, (const npyv_lanetype_u32*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return npyv_loada_##SFX(ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return (npyv_##SFX)npyv__loadl(ptr); } \
+ NPY_FINLINE void npyv_store_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { npyv__store((npyv_lanetype_##DW_CAST*)ptr, (npyv_##DW_CAST)vec); } \
+ NPY_FINLINE void npyv_storea_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { vec_st((npyv_u32)vec, 0, (npyv_lanetype_u32*)ptr); } \
+ NPY_FINLINE void npyv_stores_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { npyv_storea_##SFX(ptr, vec); } \
+ NPY_FINLINE void npyv_storel_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { npyv__storel(ptr, vec); } \
+ NPY_FINLINE void npyv_storeh_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { npyv__storeh(ptr, vec); }
+
+NPYV_IMPL_VSX_MEM(u8, u8)
+NPYV_IMPL_VSX_MEM(s8, s8)
+NPYV_IMPL_VSX_MEM(u16, u16)
+NPYV_IMPL_VSX_MEM(s16, s16)
+NPYV_IMPL_VSX_MEM(u32, u32)
+NPYV_IMPL_VSX_MEM(s32, s32)
+NPYV_IMPL_VSX_MEM(u64, f64)
+NPYV_IMPL_VSX_MEM(s64, f64)
+NPYV_IMPL_VSX_MEM(f32, f32)
+NPYV_IMPL_VSX_MEM(f64, f64)
+
+/***************************
+ * Non-contiguous Load
+ ***************************/
+//// 32
+NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride)
+{
+ return npyv_set_u32(
+ ptr[stride * 0], ptr[stride * 1],
+ ptr[stride * 2], ptr[stride * 3]
+ );
+}
+NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride)
+{ return (npyv_s32)npyv_loadn_u32((const npy_uint32*)ptr, stride); }
+NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
+{ return (npyv_f32)npyv_loadn_u32((const npy_uint32*)ptr, stride); }
+//// 64
+NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
+{ return npyv_set_u64(ptr[0], ptr[stride]); }
+NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
+{ return npyv_set_s64(ptr[0], ptr[stride]); }
+NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
+{ return npyv_set_f64(ptr[0], ptr[stride]); }
+/***************************
+ * Non-contiguous Store
+ ***************************/
+//// 32
+NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
+{
+ ptr[stride * 0] = vec_extract(a, 0);
+ ptr[stride * 1] = vec_extract(a, 1);
+ ptr[stride * 2] = vec_extract(a, 2);
+ ptr[stride * 3] = vec_extract(a, 3);
+}
+NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
+{ npyv_storen_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
+{ npyv_storen_u32((npy_uint32*)ptr, stride, (npyv_u32)a); }
+//// 64
+NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
+{
+ ptr[stride * 0] = vec_extract(a, 0);
+ ptr[stride * 1] = vec_extract(a, 1);
+}
+NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
+{ npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); }
+NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
+{ npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); }
+
+/*********************************
+ * Partial Load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ npyv_s32 vfill = npyv_setall_s32(fill);
+ switch(nlane) {
+ case 1:
+ return vec_insert(ptr[0], vfill, 0);
+ case 2:
+ return (npyv_s32)vec_insert(
+ *npyv__ptr2u64(ptr), (npyv_u64)vfill, 0
+ );
+ case 3:
+ vfill = vec_insert(ptr[2], vfill, 2);
+ return (npyv_s32)vec_insert(
+ *npyv__ptr2u64(ptr), (npyv_u64)vfill, 0
+ );
+ default:
+ return npyv_load_s32(ptr);
+ }
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
+{ return npyv_load_till_s32(ptr, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return npyv_set_s64(ptr[0], fill);
+ }
+ return npyv_load_s64(ptr);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
+{ return npyv_load_till_s64(ptr, nlane, 0); }
+/*********************************
+ * Non-contiguous partial load
+ *********************************/
+//// 32
+NPY_FINLINE npyv_s32
+npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill)
+{
+ assert(nlane > 0);
+ npyv_s32 vfill = npyv_setall_s32(fill);
+ switch(nlane) {
+ case 3:
+ vfill = vec_insert(ptr[stride*2], vfill, 2);
+ case 2:
+ vfill = vec_insert(ptr[stride], vfill, 1);
+ case 1:
+ vfill = vec_insert(*ptr, vfill, 0);
+ break;
+ default:
+ return npyv_loadn_s32(ptr, stride);
+ } // switch
+ return vfill;
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s32
+npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); }
+//// 64
+NPY_FINLINE npyv_s64
+npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ return npyv_set_s64(*ptr, fill);
+ }
+ return npyv_loadn_s64(ptr, stride);
+}
+// fill the remaining lanes with zero
+NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
+{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); }
+/*********************************
+ * Partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ case 1:
+ *ptr = vec_extract(a, 0);
+ break;
+ case 2:
+ npyv_storel_s32(ptr, a);
+ break;
+ case 3:
+ npyv_storel_s32(ptr, a);
+ ptr[2] = vec_extract(a, 2);
+ break;
+ default:
+ npyv_store_s32(ptr, a);
+ }
+}
+//// 64
+NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ npyv_storel_s64(ptr, a);
+ return;
+ }
+ npyv_store_s64(ptr, a);
+}
+/*********************************
+ * Non-contiguous partial store
+ *********************************/
+//// 32
+NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
+{
+ assert(nlane > 0);
+ switch(nlane) {
+ default:
+ ptr[stride*3] = vec_extract(a, 3);
+ case 3:
+ ptr[stride*2] = vec_extract(a, 2);
+ case 2:
+ ptr[stride*1] = vec_extract(a, 1);
+ case 1:
+ ptr[stride*0] = vec_extract(a, 0);
+ break;
+ }
+}
+//// 64
+NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
+{
+ assert(nlane > 0);
+ if (nlane == 1) {
+ npyv_storel_s64(ptr, a);
+ return;
+ }
+ npyv_storen_s64(ptr, stride, a);
+}
+/*****************************************************************
+ * Implement partial load/store for u32/f32/u64/f64... via casting
+ *****************************************************************/
+#define NPYV_IMPL_VSX_REST_PARTIAL_TYPES(F_SFX, T_SFX) \
+ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \
+ npyv_lanetype_##F_SFX fill) \
+ { \
+ union { \
+ npyv_lanetype_##F_SFX from_##F_SFX; \
+ npyv_lanetype_##T_SFX to_##T_SFX; \
+ } pun = {.from_##F_SFX = fill}; \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, nlane \
+ )); \
+ } \
+ NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \
+ (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+ { \
+ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \
+ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+ )); \
+ } \
+ NPY_FINLINE void npyv_store_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_store_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_storen_till_##F_SFX \
+ (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+ { \
+ npyv_storen_till_##T_SFX( \
+ (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+ npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+ ); \
+ }
+
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(u32, s32)
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f32, s32)
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(u64, s64)
+NPYV_IMPL_VSX_REST_PARTIAL_TYPES(f64, s64)
#endif // _NPY_SIMD_VSX_MEMORY_H
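Taken together, the contiguous, strided, and partial variants let an element-wise kernel handle its loop tail with the same vector code path instead of a separate scalar cleanup loop. A sketch of typical usage (not part of the patch; `npyv_nlanes_s32` and `npyv_add_s32` are assumed to be the usual universal-intrinsic names for the lane count and lane-wise add):

void add_constant_s32(npy_int32 *dst, const npy_int32 *src, npy_intp len, npy_int32 c)
{
    const npyv_s32 vc = npyv_setall_s32(c);
    npy_intp i = 0;
    /* full vectors */
    for (; i + npyv_nlanes_s32 <= len; i += npyv_nlanes_s32) {
        npyv_store_s32(dst + i, npyv_add_s32(npyv_load_s32(src + i), vc));
    }
    /* tail: load only the valid lanes (rest zero-filled), store only the valid lanes */
    if (i < len) {
        npy_uintp rem = (npy_uintp)(len - i);
        npyv_s32 a = npyv_add_s32(npyv_load_tillz_s32(src + i, rem), vc);
        npyv_store_till_s32(dst + i, rem, a);
    }
}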
diff --git a/numpy/core/src/common/simd/vsx/reorder.h b/numpy/core/src/common/simd/vsx/reorder.h
index bfb9115fa..6533e5093 100644
--- a/numpy/core/src/common/simd/vsx/reorder.h
+++ b/numpy/core/src/common/simd/vsx/reorder.h
@@ -62,4 +62,45 @@ NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s64, s64)
NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f32, f32)
NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f64, f64)
+// Reverse elements of each 64-bit lane
+NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
+{
+#if defined(NPY_HAVE_VSX3) && ((defined(__GNUC__) && __GNUC__ > 7) || defined(__IBMC__))
+ return (npyv_u8)vec_revb((npyv_u64)a);
+#elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+ npyv_u8 ret;
+ __asm__ ("xxbrd %x0,%x1" : "=wa" (ret) : "wa" (a));
+ return ret;
+#else
+ const npyv_u8 idx = npyv_set_u8(
+ 7, 6, 5, 4, 3, 2, 1, 0,/*64*/15, 14, 13, 12, 11, 10, 9, 8
+ );
+ return vec_perm(a, a, idx);
+#endif
+}
+NPY_FINLINE npyv_s8 npyv_rev64_s8(npyv_s8 a)
+{ return (npyv_s8)npyv_rev64_u8((npyv_u8)a); }
+
+NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a)
+{
+ const npyv_u8 idx = npyv_set_u8(
+ 6, 7, 4, 5, 2, 3, 0, 1,/*64*/14, 15, 12, 13, 10, 11, 8, 9
+ );
+ return vec_perm(a, a, idx);
+}
+NPY_FINLINE npyv_s16 npyv_rev64_s16(npyv_s16 a)
+{ return (npyv_s16)npyv_rev64_u16((npyv_u16)a); }
+
+NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a)
+{
+ const npyv_u8 idx = npyv_set_u8(
+ 4, 5, 6, 7, 0, 1, 2, 3,/*64*/12, 13, 14, 15, 8, 9, 10, 11
+ );
+ return vec_perm(a, a, idx);
+}
+NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a)
+{ return (npyv_s32)npyv_rev64_u32((npyv_u32)a); }
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{ return (npyv_f32)npyv_rev64_u32((npyv_u32)a); }
+
#endif // _NPY_SIMD_VSX_REORDER_H
diff --git a/numpy/core/src/common/simd/vsx/vsx.h b/numpy/core/src/common/simd/vsx/vsx.h
index 5525dc1e6..27dde98e7 100644
--- a/numpy/core/src/common/simd/vsx/vsx.h
+++ b/numpy/core/src/common/simd/vsx/vsx.h
@@ -62,3 +62,4 @@ typedef struct { npyv_f64 val[3]; } npyv_f64x3;
#include "operators.h"
#include "conversion.h"
#include "arithmetic.h"
+#include "math.h"
diff --git a/numpy/core/src/multiarray/_datetime.h b/numpy/core/src/multiarray/_datetime.h
index 4e7ade5ed..c0d2f1967 100644
--- a/numpy/core/src/multiarray/_datetime.h
+++ b/numpy/core/src/multiarray/_datetime.h
@@ -200,17 +200,15 @@ convert_pyobject_to_datetime_metadata(PyObject *obj,
PyArray_DatetimeMetaData *out_meta);
/*
- * 'ret' is a PyUString containing the datetime string, and this
- * function appends the metadata string to it.
+ * Returns datetime metadata as a new reference to a Unicode object.
+ * Returns NULL on error.
*
* If 'skip_brackets' is true, skips the '[]'.
*
- * This function steals the reference 'ret'
*/
NPY_NO_EXPORT PyObject *
-append_metastr_to_string(PyArray_DatetimeMetaData *meta,
- int skip_brackets,
- PyObject *ret);
+metastr_to_unicode(PyArray_DatetimeMetaData *meta, int skip_brackets);
+
/*
* Tests for and converts a Python datetime.datetime or datetime.date
@@ -375,4 +373,7 @@ datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
NPY_NO_EXPORT PyArray_Descr *
find_object_datetime_type(PyObject *obj, int type_num);
+NPY_NO_EXPORT int
+PyArray_InitializeDatetimeCasts(void);
+
#endif
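The updated comment documents a change of contract: instead of appending to (and stealing) a passed-in string, `metastr_to_unicode` now hands back a new Unicode object and leaves concatenation to the caller. A hypothetical caller-side sketch of the new contract (the wrapper name is invented for illustration):

static PyObject *
format_datetime_with_meta(PyArray_DatetimeMetaData *meta, PyObject *datetime_str)
{
    /* new contract: returns a new reference, or NULL on error */
    PyObject *metastr = metastr_to_unicode(meta, /* skip_brackets */ 0);
    if (metastr == NULL) {
        return NULL;
    }
    /* the caller owns both references and concatenates explicitly */
    PyObject *res = PyUnicode_Concat(datetime_str, metastr);
    Py_DECREF(metastr);
    return res;
}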
diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src
index da631c830..3c8caefce 100644
--- a/numpy/core/src/multiarray/_multiarray_tests.c.src
+++ b/numpy/core/src/multiarray/_multiarray_tests.c.src
@@ -9,8 +9,7 @@
#include "common.h"
#include "mem_overlap.h"
#include "npy_extint128.h"
-#include "common.h"
-
+#include "array_method.h"
#if defined(MS_WIN32) || defined(__CYGWIN__)
#define EXPORT(x) __declspec(dllexport) x
@@ -38,6 +37,7 @@ IsPythonScalar(PyObject * dummy, PyObject *args)
#include "npy_pycompat.h"
+
/** Function to test calling via ctypes */
EXPORT(void*) forward_pointer(void *x)
{
@@ -178,17 +178,20 @@ test_neighborhood_iterator(PyObject* NPY_UNUSED(self), PyObject* args)
/* Compute boundaries for the neighborhood iterator */
for (i = 0; i < 2 * PyArray_NDIM(ax); ++i) {
PyObject* bound;
+
bound = PySequence_GetItem(b, i);
if (bound == NULL) {
goto clean_itx;
}
- if (!PyInt_Check(bound)) {
+ /* PyLong_AsSsize_t checks for PyLong */
+ bounds[i] = PyLong_AsSsize_t(bound);
+ if (error_converting(bounds[i])) {
+ PyErr_Clear();
PyErr_SetString(PyExc_ValueError,
- "bound not long");
+ "bound is invalid");
Py_DECREF(bound);
goto clean_itx;
}
- bounds[i] = PyInt_AsLong(bound);
Py_DECREF(bound);
}
@@ -337,17 +340,20 @@ test_neighborhood_iterator_oob(PyObject* NPY_UNUSED(self), PyObject* args)
/* Compute boundaries for the neighborhood iterator */
for (i = 0; i < 2 * PyArray_NDIM(ax); ++i) {
PyObject* bound;
+
bound = PySequence_GetItem(b1, i);
if (bound == NULL) {
goto clean_itx;
}
- if (!PyInt_Check(bound)) {
+ /* PyLong_AsSsize_t checks for PyLong */
+ bounds[i] = PyLong_AsSsize_t(bound);
+ if (error_converting(bounds[i])) {
+ PyErr_Clear();
PyErr_SetString(PyExc_ValueError,
- "bound not long");
+ "bound is invalid");
Py_DECREF(bound);
goto clean_itx;
}
- bounds[i] = PyInt_AsLong(bound);
Py_DECREF(bound);
}
@@ -361,17 +367,20 @@ test_neighborhood_iterator_oob(PyObject* NPY_UNUSED(self), PyObject* args)
for (i = 0; i < 2 * PyArray_NDIM(ax); ++i) {
PyObject* bound;
+
bound = PySequence_GetItem(b2, i);
if (bound == NULL) {
goto clean_itx;
}
- if (!PyInt_Check(bound)) {
+ /* PyLong_AsSsize_t checks for PyLong */
+ bounds[i] = PyLong_AsSsize_t(bound);
+ if (error_converting(bounds[i])) {
+ PyErr_Clear();
PyErr_SetString(PyExc_ValueError,
- "bound not long");
+ "bound is invalid");
Py_DECREF(bound);
goto clean_itx;
}
- bounds[i] = PyInt_AsLong(bound);
Py_DECREF(bound);
}
@@ -612,6 +621,105 @@ fromstring_null_term_c_api(PyObject *dummy, PyObject *byte_obj)
}
+/*
+ * Create a custom field dtype from an existing void one (and test some errors).
+ * The dtypes created by this function may be not be usable (or even crash
+ * while using).
+ */
+static PyObject *
+create_custom_field_dtype(PyObject *NPY_UNUSED(mod), PyObject *args)
+{
+ PyArray_Descr *dtype;
+ PyTypeObject *scalar_type;
+ PyTypeObject *original_type = NULL;
+ int error_path;
+
+ if (!PyArg_ParseTuple(args, "O!O!i",
+ &PyArrayDescr_Type, &dtype,
+ &PyType_Type, &scalar_type,
+ &error_path)) {
+ return NULL;
+ }
+ /* check that the result should be more or less valid */
+ if (dtype->type_num != NPY_VOID || dtype->fields == NULL ||
+ !PyDict_CheckExact(dtype->fields) ||
+ PyTuple_Size(dtype->names) != 1 ||
+ !PyDataType_REFCHK(dtype) ||
+ dtype->elsize != sizeof(PyObject *)) {
+ PyErr_SetString(PyExc_ValueError,
+ "Bad dtype passed to test function, must be an object "
+ "containing void with a single field.");
+ return NULL;
+ }
+
+ /* Copy and then appropriate this dtype */
+ original_type = Py_TYPE(dtype);
+ dtype = PyArray_DescrNew(dtype);
+ if (dtype == NULL) {
+ return NULL;
+ }
+
+ Py_INCREF(scalar_type);
+ Py_SETREF(dtype->typeobj, scalar_type);
+ if (error_path == 1) {
+ /* Test that we reject this, if fields was not already set */
+ Py_SETREF(dtype->fields, NULL);
+ }
+ else if (error_path == 2) {
+ /*
+ * Test that we reject this if the type is not set to something that
+ * we are pretty sure can be safely replaced.
+ */
+ Py_SET_TYPE(dtype, scalar_type);
+ }
+ else if (error_path != 0) {
+ PyErr_SetString(PyExc_ValueError,
+ "invalid error argument to test function.");
+ }
+ if (PyArray_RegisterDataType(dtype) < 0) {
+ /* Fix original type in the error_path == 2 case and delete it */
+ Py_SET_TYPE(dtype, original_type);
+ Py_DECREF(dtype);
+ return NULL;
+ }
+ Py_INCREF(dtype); /* hold on to the original (leaks a reference) */
+ return (PyObject *)dtype;
+}
+
+
+PyObject *
+corrupt_or_fix_bufferinfo(PyObject *dummy, PyObject *obj)
+{
+ void **buffer_info_ptr;
+ if (PyArray_Check(obj)) {
+ buffer_info_ptr = &((PyArrayObject_fields *)obj)->_buffer_info;
+ }
+ else if (PyArray_IsScalar(obj, Void)) {
+ buffer_info_ptr = &((PyVoidScalarObject *)obj)->_buffer_info;
+ }
+ else {
+ PyErr_SetString(PyExc_TypeError,
+ "argument must be an array or void scalar");
+ return NULL;
+ }
+ if (*buffer_info_ptr == NULL) {
+ /* set to an invalid value (as a subclass might accidentally) */
+ *buffer_info_ptr = obj;
+ assert(((uintptr_t)obj & 7) == 0);
+ }
+ else if (*buffer_info_ptr == obj) {
+ /* Reset to a NULL (good value) */
+ *buffer_info_ptr = NULL;
+ }
+ else {
+ PyErr_SetString(PyExc_TypeError,
+ "buffer was already exported, this test doesn't support that");
+ return NULL;
+ }
+ Py_RETURN_NONE;
+}
+
+
/* check no elison for avoided increfs */
static PyObject *
incref_elide(PyObject *dummy, PyObject *args)
@@ -905,6 +1013,79 @@ get_c_wrapping_array(PyObject* NPY_UNUSED(self), PyObject* arg)
}
+static PyObject *
+get_all_cast_information(PyObject *NPY_UNUSED(mod), PyObject *NPY_UNUSED(args))
+{
+ PyObject *result = PyList_New(0);
+ if (result == NULL) {
+ return NULL;
+ }
+ PyObject *classes = PyObject_CallMethod(
+ (PyObject *)&PyArrayDescr_Type, "__subclasses__", "");
+ if (classes == NULL) {
+ return NULL;
+ }
+ Py_SETREF(classes, PySequence_Fast(classes, NULL));
+ if (classes == NULL) {
+ goto fail;
+ }
+
+ Py_ssize_t nclass = PySequence_Length(classes);
+ for (Py_ssize_t i = 0; i < nclass; i++) {
+ PyArray_DTypeMeta *from_dtype = (
+ (PyArray_DTypeMeta *)PySequence_Fast_GET_ITEM(classes, i));
+ if (from_dtype->abstract) {
+ /*
+ * TODO: In principle this probably needs to be checked recursively;
+ * also, we may allow casts to abstract dtypes at some point.
+ */
+ continue;
+ }
+
+ PyObject *to_dtype, *cast_obj;
+ Py_ssize_t pos = 0;
+
+ while (PyDict_Next(from_dtype->castingimpls, &pos, &to_dtype, &cast_obj)) {
+ if (cast_obj == Py_None) {
+ continue;
+ }
+ PyArrayMethodObject *cast = (PyArrayMethodObject *)cast_obj;
+
+ /* Pass some information about this cast out! */
+ PyObject *cast_info = Py_BuildValue("{sOsOsisisisisisssi}",
+ "from", from_dtype,
+ "to", to_dtype,
+ "legacy", (cast->name != NULL &&
+ strncmp(cast->name, "legacy_", 7) == 0),
+ "casting", cast->casting & ~_NPY_CAST_IS_VIEW,
+ "requires_pyapi", cast->flags & NPY_METH_REQUIRES_PYAPI,
+ "supports_unaligned",
+ cast->flags & NPY_METH_SUPPORTS_UNALIGNED,
+ "no_floatingpoint_errors",
+ cast->flags & NPY_METH_NO_FLOATINGPOINT_ERRORS,
+ "name", cast->name,
+ "cast_is_view",
+ cast->casting & _NPY_CAST_IS_VIEW);
+ if (cast_info == NULL) {
+ goto fail;
+ }
+ int res = PyList_Append(result, cast_info);
+ Py_DECREF(cast_info);
+ if (res < 0) {
+ goto fail;
+ }
+ }
+ }
+ Py_DECREF(classes);
+ return result;
+
+ fail:
+ Py_XDECREF(classes);
+ Py_XDECREF(result);
+ return NULL;
+}
+
+
/*
* Test C-api level item getting.
*/
@@ -1155,11 +1336,11 @@ array_solve_diophantine(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject
}
for (j = 0; j < nterms; ++j) {
- terms[j].a = (npy_int64)PyInt_AsSsize_t(PyTuple_GET_ITEM(A, j));
+ terms[j].a = (npy_int64)PyLong_AsSsize_t(PyTuple_GET_ITEM(A, j));
if (error_converting(terms[j].a)) {
goto fail;
}
- terms[j].ub = (npy_int64)PyInt_AsSsize_t(PyTuple_GET_ITEM(U, j));
+ terms[j].ub = (npy_int64)PyLong_AsSsize_t(PyTuple_GET_ITEM(U, j));
if (error_converting(terms[j].ub)) {
goto fail;
}
@@ -1735,8 +1916,8 @@ get_struct_alignments(PyObject *NPY_UNUSED(self), PyObject *args) {
/**begin repeat
* #N = 1,2,3#
*/
- alignment = PyInt_FromLong(_ALIGN(struct TestStruct@N@));
- size = PyInt_FromLong(sizeof(struct TestStruct@N@));
+ alignment = PyLong_FromLong(_ALIGN(struct TestStruct@N@));
+ size = PyLong_FromLong(sizeof(struct TestStruct@N@));
val = PyTuple_Pack(2, alignment, size);
Py_DECREF(alignment);
Py_DECREF(size);
@@ -1902,7 +2083,7 @@ PrintFloat_Printf_g(PyObject *obj, int precision)
PyOS_snprintf(str, sizeof(str), "%.*g", precision, val);
}
- return PyUString_FromString(str);
+ return PyUnicode_FromString(str);
}
@@ -1938,6 +2119,18 @@ getset_numericops(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
return ret;
}
+
+static PyObject *
+uses_new_casts(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
+{
+#if NPY_USE_NEW_CASTINGIMPL
+ Py_RETURN_TRUE;
+#else
+ Py_RETURN_FALSE;
+#endif
+}
+
+
static PyObject *
run_byteorder_converter(PyObject* NPY_UNUSED(self), PyObject *args)
{
@@ -1952,7 +2145,7 @@ run_byteorder_converter(PyObject* NPY_UNUSED(self), PyObject *args)
case NPY_SWAP: return PyUnicode_FromString("NPY_SWAP");
case NPY_IGNORE: return PyUnicode_FromString("NPY_IGNORE");
}
- return PyInt_FromLong(byteorder);
+ return PyLong_FromLong(byteorder);
}
static PyObject *
@@ -1967,7 +2160,7 @@ run_sortkind_converter(PyObject* NPY_UNUSED(self), PyObject *args)
case NPY_HEAPSORT: return PyUnicode_FromString("NPY_HEAPSORT");
case NPY_STABLESORT: return PyUnicode_FromString("NPY_STABLESORT");
}
- return PyInt_FromLong(kind);
+ return PyLong_FromLong(kind);
}
static PyObject *
@@ -1980,7 +2173,7 @@ run_selectkind_converter(PyObject* NPY_UNUSED(self), PyObject *args)
switch (kind) {
case NPY_INTROSELECT: return PyUnicode_FromString("NPY_INTROSELECT");
}
- return PyInt_FromLong(kind);
+ return PyLong_FromLong(kind);
}
static PyObject *
@@ -1994,7 +2187,7 @@ run_searchside_converter(PyObject* NPY_UNUSED(self), PyObject *args)
case NPY_SEARCHLEFT: return PyUnicode_FromString("NPY_SEARCHLEFT");
case NPY_SEARCHRIGHT: return PyUnicode_FromString("NPY_SEARCHRIGHT");
}
- return PyInt_FromLong(side);
+ return PyLong_FromLong(side);
}
static PyObject *
@@ -2010,7 +2203,7 @@ run_order_converter(PyObject* NPY_UNUSED(self), PyObject *args)
case NPY_FORTRANORDER: return PyUnicode_FromString("NPY_FORTRANORDER");
case NPY_KEEPORDER: return PyUnicode_FromString("NPY_KEEPORDER");
}
- return PyInt_FromLong(order);
+ return PyLong_FromLong(order);
}
static PyObject *
@@ -2025,7 +2218,7 @@ run_clipmode_converter(PyObject* NPY_UNUSED(self), PyObject *args)
case NPY_WRAP: return PyUnicode_FromString("NPY_WRAP");
case NPY_RAISE: return PyUnicode_FromString("NPY_RAISE");
}
- return PyInt_FromLong(mode);
+ return PyLong_FromLong(mode);
}
static PyObject *
@@ -2041,8 +2234,8 @@ run_casting_converter(PyObject* NPY_UNUSED(self), PyObject *args)
case NPY_SAFE_CASTING: return PyUnicode_FromString("NPY_SAFE_CASTING");
case NPY_SAME_KIND_CASTING: return PyUnicode_FromString("NPY_SAME_KIND_CASTING");
case NPY_UNSAFE_CASTING: return PyUnicode_FromString("NPY_UNSAFE_CASTING");
+ default: return PyLong_FromLong(casting);
}
- return PyInt_FromLong(casting);
}
static PyObject *
@@ -2083,6 +2276,12 @@ static PyMethodDef Multiarray_TestsMethods[] = {
{"fromstring_null_term_c_api",
fromstring_null_term_c_api,
METH_O, NULL},
+ {"create_custom_field_dtype",
+ create_custom_field_dtype,
+ METH_VARARGS, NULL},
+ {"corrupt_or_fix_bufferinfo",
+ corrupt_or_fix_bufferinfo,
+ METH_O, NULL},
{"incref_elide",
incref_elide,
METH_VARARGS, NULL},
@@ -2119,6 +2318,12 @@ static PyMethodDef Multiarray_TestsMethods[] = {
{"get_c_wrapping_array",
get_c_wrapping_array,
METH_O, NULL},
+ {"get_all_cast_information",
+ get_all_cast_information,
+ METH_NOARGS,
+ "Return a list with info on all available casts. Some of the info "
+ "may differ for an actual cast if it uses value-based casting "
+ "(flexible types)."},
{"array_indexing",
array_indexing,
METH_VARARGS, NULL},
@@ -2179,6 +2384,9 @@ static PyMethodDef Multiarray_TestsMethods[] = {
{"getset_numericops",
getset_numericops,
METH_NOARGS, NULL},
+ {"uses_new_casts",
+ uses_new_casts,
+ METH_NOARGS, NULL},
/**begin repeat
* #name = cabs, carg#
*/
diff --git a/numpy/core/src/multiarray/alloc.c b/numpy/core/src/multiarray/alloc.c
index 795fc7315..887deff53 100644
--- a/numpy/core/src/multiarray/alloc.c
+++ b/numpy/core/src/multiarray/alloc.c
@@ -2,17 +2,12 @@
#include <Python.h>
#include "structmember.h"
-#if PY_VERSION_HEX >= 0x03060000
#include <pymem.h>
/* public api in 3.7 */
#if PY_VERSION_HEX < 0x03070000
#define PyTraceMalloc_Track _PyTraceMalloc_Track
#define PyTraceMalloc_Untrack _PyTraceMalloc_Untrack
#endif
-#else
-#define PyTraceMalloc_Track(...)
-#define PyTraceMalloc_Untrack(...)
-#endif
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c
index b8dc7d516..361964a5c 100644
--- a/numpy/core/src/multiarray/array_assign_array.c
+++ b/numpy/core/src/multiarray/array_assign_array.c
@@ -132,17 +132,22 @@ raw_array_assign_array(int ndim, npy_intp const *shape,
NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
/* Process the innermost dimension */
- stransfer(dst_data, dst_strides_it[0], src_data, src_strides_it[0],
- shape_it[0], src_itemsize, transferdata);
+ if (stransfer(
+ dst_data, dst_strides_it[0], src_data, src_strides_it[0],
+ shape_it[0], src_itemsize, transferdata) < 0) {
+ goto fail;
+ }
} NPY_RAW_ITER_TWO_NEXT(idim, ndim, coord, shape_it,
dst_data, dst_strides_it,
src_data, src_strides_it);
NPY_END_THREADS;
-
NPY_AUXDATA_FREE(transferdata);
-
- return (needs_api && PyErr_Occurred()) ? -1 : 0;
+ return 0;
+fail:
+ NPY_END_THREADS;
+ NPY_AUXDATA_FREE(transferdata);
+ return -1;
}
/*
diff --git a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c
index 41eb75f1c..023772776 100644
--- a/numpy/core/src/multiarray/array_assign_scalar.c
+++ b/numpy/core/src/multiarray/array_assign_scalar.c
@@ -82,16 +82,21 @@ raw_array_assign_scalar(int ndim, npy_intp const *shape,
NPY_RAW_ITER_START(idim, ndim, coord, shape_it) {
/* Process the innermost dimension */
- stransfer(dst_data, dst_strides_it[0], src_data, 0,
- shape_it[0], src_itemsize, transferdata);
+ if (stransfer(
+ dst_data, dst_strides_it[0], src_data, 0,
+ shape_it[0], src_itemsize, transferdata) < 0) {
+ goto fail;
+ }
} NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord,
shape_it, dst_data, dst_strides_it);
NPY_END_THREADS;
-
NPY_AUXDATA_FREE(transferdata);
-
- return (needs_api && PyErr_Occurred()) ? -1 : 0;
+ return 0;
+fail:
+ NPY_END_THREADS;
+ NPY_AUXDATA_FREE(transferdata);
+ return -1;
}
/*
diff --git a/numpy/core/src/multiarray/array_coercion.c b/numpy/core/src/multiarray/array_coercion.c
index 9d367da1f..53d891049 100644
--- a/numpy/core/src/multiarray/array_coercion.c
+++ b/numpy/core/src/multiarray/array_coercion.c
@@ -128,7 +128,9 @@ _prime_global_pytype_to_type_dict(void)
/**
- * Add a new mapping from a python type to the DType class.
+ * Add a new mapping from a python type to the DType class. For a
+ * user-defined legacy dtype, this function does nothing unless the
+ * pytype subclasses `np.generic`.
*
* This assumes that the DType class is guaranteed to hold on the
* python type (this assumption is guaranteed).
@@ -145,21 +147,29 @@ _PyArray_MapPyTypeToDType(
{
PyObject *Dtype_obj = (PyObject *)DType;
- if (userdef) {
+ if (userdef && !PyObject_IsSubclass(
+ (PyObject *)pytype, (PyObject *)&PyGenericArrType_Type)) {
/*
- * It seems we did not strictly enforce this in the legacy dtype
- * API, but assume that it is always true. Further, this could be
- * relaxed in the future. In particular we should have a new
- * superclass of ``np.generic`` in order to note enforce the array
- * scalar behaviour.
+ * We expect that user dtypes (for now) will subclass some numpy
+ * scalar class to allow automatic discovery.
*/
- if (!PyObject_IsSubclass((PyObject *)pytype, (PyObject *)&PyGenericArrType_Type)) {
- PyErr_Format(PyExc_RuntimeError,
- "currently it is only possible to register a DType "
- "for scalars deriving from `np.generic`, got '%S'.",
- (PyObject *)pytype);
- return -1;
+ if (DType->legacy) {
+ /*
+ * For legacy user dtypes, discovery relied on subclassing, but
+ * arbitrary type objects are supported, so do nothing.
+ */
+ return 0;
}
+ /*
+ * We currently enforce that user DTypes subclass from `np.generic`
+ * (this should become a `np.generic` base class and may be lifted
+ * entirely).
+ */
+ PyErr_Format(PyExc_RuntimeError,
+ "currently it is only possible to register a DType "
+ "for scalars deriving from `np.generic`, got '%S'.",
+ (PyObject *)pytype);
+ return -1;
}
/* Create the global dictionary if it does not exist */
@@ -288,7 +298,7 @@ discover_dtype_from_pyobject(
Py_INCREF(DType);
Py_DECREF(legacy_descr);
/* TODO: Enable warning about subclass handling */
- if (0 && !((*flags) & GAVE_SUBCLASS_WARNING)) {
+ if ((0) && !((*flags) & GAVE_SUBCLASS_WARNING)) {
if (DEPRECATE_FUTUREWARNING(
"in the future NumPy will not automatically find the "
"dtype for subclasses of scalars known to NumPy (i.e. "
@@ -306,51 +316,6 @@ discover_dtype_from_pyobject(
}
-/*
- * This function should probably become public API eventually. At this
- * time it is implemented by falling back to `PyArray_AdaptFlexibleDType`.
- * We will use `CastingImpl[from, to].adjust_descriptors(...)` to implement
- * this logic.
- */
-static NPY_INLINE PyArray_Descr *
-cast_descriptor_to_fixed_dtype(
- PyArray_Descr *descr, PyArray_DTypeMeta *fixed_DType)
-{
- if (fixed_DType == NULL) {
- /* Nothing to do, we only need to promote the new dtype */
- Py_INCREF(descr);
- return descr;
- }
-
- if (!fixed_DType->parametric) {
- /*
- * Don't actually do anything, the default is always the result
- * of any cast.
- */
- return fixed_DType->default_descr(fixed_DType);
- }
- if (PyObject_TypeCheck((PyObject *)descr, (PyTypeObject *)fixed_DType)) {
- Py_INCREF(descr);
- return descr;
- }
- /*
- * TODO: When this is implemented for all dtypes, the special cases
- * can be removed...
- */
- if (fixed_DType->legacy && fixed_DType->parametric &&
- NPY_DTYPE(descr)->legacy) {
- PyArray_Descr *flex_dtype = PyArray_DescrFromType(fixed_DType->type_num);
- return PyArray_AdaptFlexibleDType(descr, flex_dtype);
- }
-
- PyErr_SetString(PyExc_NotImplementedError,
- "Must use casting to find the correct dtype, this is "
- "not yet implemented! "
- "(It should not be possible to hit this code currently!)");
- return NULL;
-}
-
-
/**
* Discover the correct descriptor from a known DType class and scalar.
* If the fixed DType can discover a dtype instance/descr all is fine,
@@ -392,7 +357,7 @@ find_scalar_descriptor(
return descr;
}
- Py_SETREF(descr, cast_descriptor_to_fixed_dtype(descr, fixed_DType));
+ Py_SETREF(descr, PyArray_CastDescrToDType(descr, fixed_DType));
return descr;
}
@@ -434,7 +399,7 @@ PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value)
.flags = NPY_ARRAY_WRITEABLE, /* assume array is not behaved. */
};
Py_SET_TYPE(&arr_fields, &PyArray_Type);
- Py_REFCNT(&arr_fields) = 1;
+ Py_SET_REFCNT(&arr_fields, 1);
if (NPY_UNLIKELY(descr->type_num == NPY_OBJECT)) {
/*
@@ -495,12 +460,10 @@ PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value)
res = -1;
goto finish;
}
- stransfer(item, 0, data, 0, 1, tmp_descr->elsize, transferdata);
- NPY_AUXDATA_FREE(transferdata);
-
- if (needs_api && PyErr_Occurred()) {
+ if (stransfer(item, 0, data, 0, 1, tmp_descr->elsize, transferdata) < 0) {
res = -1;
}
+ NPY_AUXDATA_FREE(transferdata);
finish:
if (PyDataType_REFCHK(tmp_descr)) {
@@ -550,7 +513,7 @@ update_shape(int curr_ndim, int *max_ndim,
success = -1;
if (!sequence) {
/* Remove dimensions that we cannot use: */
- *max_ndim -= new_ndim + i;
+ *max_ndim -= new_ndim - i;
}
else {
assert(i == 0);
@@ -585,7 +548,7 @@ npy_new_coercion_cache(
cache = _coercion_cache_cache[_coercion_cache_num];
}
else {
- cache = PyObject_MALLOC(sizeof(coercion_cache_obj));
+ cache = PyMem_Malloc(sizeof(coercion_cache_obj));
}
if (cache == NULL) {
PyErr_NoMemory();
@@ -617,7 +580,7 @@ npy_unlink_coercion_cache(coercion_cache_obj *current)
_coercion_cache_num++;
}
else {
- PyObject_FREE(current);
+ PyMem_Free(current);
}
return next;
}
@@ -639,12 +602,13 @@ npy_free_coercion_cache(coercion_cache_obj *next) {
*
* @param out_descr The current descriptor.
* @param descr The newly found descriptor to promote with
+ * @param fixed_DType The user provided (fixed) DType or NULL
* @param flags dtype discover flags to signal failed promotion.
* @return -1 on error, 0 on success.
*/
static NPY_INLINE int
handle_promotion(PyArray_Descr **out_descr, PyArray_Descr *descr,
- enum _dtype_discovery_flags *flags)
+ PyArray_DTypeMeta *fixed_DType, enum _dtype_discovery_flags *flags)
{
assert(!(*flags & DESCRIPTOR_WAS_SET));
@@ -654,7 +618,11 @@ handle_promotion(PyArray_Descr **out_descr, PyArray_Descr *descr,
return 0;
}
PyArray_Descr *new_descr = PyArray_PromoteTypes(descr, *out_descr);
- if (new_descr == NULL) {
+ if (NPY_UNLIKELY(new_descr == NULL)) {
+ if (fixed_DType != NULL) {
+ /* If a DType is fixed, promotion must not fail. */
+ return -1;
+ }
PyErr_Clear();
*flags |= PROMOTION_FAILED;
/* Continue with object, since we may need the dimensionality */
@@ -669,13 +637,15 @@ handle_promotion(PyArray_Descr **out_descr, PyArray_Descr *descr,
* Handle a leave node (known scalar) during dtype and shape discovery.
*
* @param obj The python object or nested sequence to convert
- * @param max_dims The maximum number of dimensions.
* @param curr_dims The current number of dimensions (depth in the recursion)
+ * @param max_dims The maximum number of dimensions.
* @param out_shape The discovered output shape, will be filled
- * @param coercion_cache The coercion cache object to use.
- * @param DType the DType class that should be used, or NULL, if not provided.
+ * @param fixed_DType The user provided (fixed) DType or NULL
* @param flags used signal that this is a ragged array, used internally and
* can be expanded if necessary.
+ * @param DType the DType class that should be used, or NULL, if not provided.
+ *
+ * @return 0 on success, -1 on error
*/
static NPY_INLINE int
handle_scalar(
@@ -700,7 +670,7 @@ handle_scalar(
if (descr == NULL) {
return -1;
}
- if (handle_promotion(out_descr, descr, flags) < 0) {
+ if (handle_promotion(out_descr, descr, fixed_DType, flags) < 0) {
Py_DECREF(descr);
return -1;
}
@@ -729,8 +699,13 @@ find_descriptor_from_array(
enum _dtype_discovery_flags flags = 0;
*out_descr = NULL;
- if (NPY_UNLIKELY(DType != NULL && DType->parametric &&
- PyArray_ISOBJECT(arr))) {
+ if (DType == NULL) {
+ *out_descr = PyArray_DESCR(arr);
+ Py_INCREF(*out_descr);
+ return 0;
+ }
+
+ if (NPY_UNLIKELY(DType->parametric && PyArray_ISOBJECT(arr))) {
/*
* We have one special case, if (and only if) the input array is of
* object DType and the dtype is not fixed already but parametric.
@@ -779,7 +754,7 @@ find_descriptor_from_array(
}
Py_DECREF(iter);
}
- else if (DType != NULL && NPY_UNLIKELY(DType->type_num == NPY_DATETIME) &&
+ else if (NPY_UNLIKELY(DType->type_num == NPY_DATETIME) &&
PyArray_ISSTRING(arr)) {
/*
* TODO: This branch should be deprecated IMO, the workaround is
@@ -808,8 +783,7 @@ find_descriptor_from_array(
* If this is not an object array figure out the dtype cast,
* or simply use the returned DType.
*/
- *out_descr = cast_descriptor_to_fixed_dtype(
- PyArray_DESCR(arr), DType);
+ *out_descr = PyArray_CastDescrToDType(PyArray_DESCR(arr), DType);
if (*out_descr == NULL) {
return -1;
}
@@ -992,7 +966,7 @@ PyArray_DiscoverDTypeAndShape_Recursive(
/* object array with no elements, no need to promote/adjust. */
return max_dims;
}
- if (handle_promotion(out_descr, cast_descr, flags) < 0) {
+ if (handle_promotion(out_descr, cast_descr, fixed_DType, flags) < 0) {
Py_DECREF(cast_descr);
return -1;
}
@@ -1221,7 +1195,58 @@ PyArray_DiscoverDTypeAndShape(
}
else if (fixed_DType->type_num != NPY_OBJECT) {
/* Only object DType supports ragged cases unify error */
- if (!too_deep) {
+
+ /*
+ * We used to let certain ragged arrays pass if they also
+ * support e.g. conversion using `float(arr)`, which currently
+ * works for arrays with only one element.
+ * Thus we catch at least most of such cases here and give a
+ * DeprecationWarning instead of an error.
+ * Note that some of these will actually error later on when
+ * attempting to do the actual assign.
+ */
+ int deprecate_single_element_ragged = 0;
+ coercion_cache_obj *current = *coercion_cache_head;
+ while (current != NULL) {
+ if (current->sequence) {
+ if (current->depth == ndim) {
+ /*
+ * Assume that only array-likes will allow the deprecated
+ * behaviour
+ */
+ deprecate_single_element_ragged = 0;
+ break;
+ }
+ /* check next converted sequence/array-like */
+ current = current->next;
+ continue;
+ }
+ PyArrayObject *arr = (PyArrayObject *)(current->arr_or_sequence);
+ assert(PyArray_NDIM(arr) + current->depth >= ndim);
+ if (PyArray_NDIM(arr) != ndim - current->depth) {
+ /* This array is not compatible with the final shape */
+ if (PyArray_SIZE(arr) != 1) {
+ deprecate_single_element_ragged = 0;
+ break;
+ }
+ deprecate_single_element_ragged = 1;
+ }
+ current = current->next;
+ }
+
+ if (deprecate_single_element_ragged) {
+ /* Deprecated 2020-07-24, NumPy 1.20 */
+ if (DEPRECATE(
+ "setting an array element with a sequence. "
+ "This was supported in some cases where the elements "
+ "are arrays with a single element. For example "
+ "`np.array([1, np.array([2])], dtype=int)`. "
+ "In the future this will raise the same ValueError as "
+ "`np.array([1, [2]], dtype=int)`.") < 0) {
+ goto fail;
+ }
+ }
+ else if (!too_deep) {
PyObject *shape = PyArray_IntTupleFromIntp(ndim, out_shape);
PyErr_Format(PyExc_ValueError,
"setting an array element with a sequence. The "
@@ -1276,15 +1301,9 @@ PyArray_DiscoverDTypeAndShape(
* the correct default.
*/
if (fixed_DType != NULL) {
- if (fixed_DType->default_descr == NULL) {
- Py_INCREF(fixed_DType->singleton);
- *out_descr = fixed_DType->singleton;
- }
- else {
- *out_descr = fixed_DType->default_descr(fixed_DType);
- if (*out_descr == NULL) {
- goto fail;
- }
+ *out_descr = fixed_DType->default_descr(fixed_DType);
+ if (*out_descr == NULL) {
+ goto fail;
}
}
}
diff --git a/numpy/core/src/multiarray/array_method.c b/numpy/core/src/multiarray/array_method.c
new file mode 100644
index 000000000..cae452454
--- /dev/null
+++ b/numpy/core/src/multiarray/array_method.c
@@ -0,0 +1,614 @@
+/*
+ * This file implements an abstraction layer for "Array methods", which
+ * work with a specific DType class input and provide low-level C function
+ * pointers to do fast operations on the given input functions.
+ * It thus adds an abstraction layer around individual ufunc loops.
+ *
+ * Unlike methods, an ArrayMethod can have multiple inputs and outputs.
+ * This has some serious implications for garbage collection, and as far
+ * as I (@seberg) understand, it is not possible to always guarantee correct
+ * cyclic garbage collection of dynamically created DTypes with methods.
+ * The keyword (or rather the solution) for this seems to be an "ephemeron"
+ * which I believe should allow correct garbage collection but does not seem
+ * to be implemented in Python at this time.
+ * The vast majority of use-cases will not require correct garbage collection.
+ * Some use cases may require the user to be careful.
+ *
+ * Generally there are two main ways to solve this issue:
+ *
+ * 1. A method with a single input (or inputs of all the same DTypes) can
+ * be "owned" by that DType (it becomes unusable when the DType is deleted).
+ * This holds especially for all casts, which must have a defined output
+ * DType and must hold on to it strongly.
+ * 2. A method which can infer the output DType(s) from the input types does
+ * not need to keep the output type alive. (It can use NULL for the type,
+ * or an abstract base class which is known to be persistent.)
+ * It is then sufficient for a ufunc (or other owner) to only hold a
+ * weak reference to the input DTypes.
+ */
+
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include <npy_pycompat.h>
+#include "arrayobject.h"
+#include "array_method.h"
+#include "dtypemeta.h"
+#include "convert_datatype.h"
+
+
+/*
+ * The default descriptor resolution function. The logic is as follows:
+ *
+ * 1. The output is ensured to be canonical (currently native byte order),
+ * if it is of the correct DType.
+ * 2. If any DType was not defined, it is replaced by the common DType
+ * of all inputs. (If that common DType is parametric, this is an error.)
+ *
+ * We could allow setting the output descriptors specifically to simplify
+ * this step.
+ */
+static NPY_CASTING
+default_resolve_descriptors(
+ PyArrayMethodObject *method,
+ PyArray_DTypeMeta **dtypes,
+ PyArray_Descr **input_descrs,
+ PyArray_Descr **output_descrs)
+{
+ int nin = method->nin;
+ int nout = method->nout;
+ int all_defined = 1;
+
+ for (int i = 0; i < nin + nout; i++) {
+ PyArray_DTypeMeta *dtype = dtypes[i];
+ if (dtype == NULL) {
+ output_descrs[i] = NULL;
+ all_defined = 0;
+ continue;
+ }
+ if (NPY_DTYPE(input_descrs[i]) == dtype) {
+ output_descrs[i] = ensure_dtype_nbo(input_descrs[i]);
+ }
+ else {
+ output_descrs[i] = dtype->default_descr(dtype);
+ }
+ if (NPY_UNLIKELY(output_descrs[i] == NULL)) {
+ goto fail;
+ }
+ }
+ if (all_defined) {
+ return method->casting;
+ }
+
+ if (NPY_UNLIKELY(nin == 0 || dtypes[0] == NULL)) {
+ /* Registration should reject this, so this would indicate a bug */
+ PyErr_SetString(PyExc_RuntimeError,
+ "Invalid use of default resolver without inputs or with "
+ "input or output DType incorrectly missing.");
+ goto fail;
+ }
+ /* We find the common dtype of all inputs, and use it for the unknowns */
+ PyArray_DTypeMeta *common_dtype = dtypes[0];
+ assert(common_dtype != NULL);
+ for (int i = 1; i < nin; i++) {
+ Py_SETREF(common_dtype, PyArray_CommonDType(common_dtype, dtypes[i]));
+ if (common_dtype == NULL) {
+ goto fail;
+ }
+ }
+ for (int i = nin; i < nin + nout; i++) {
+ if (output_descrs[i] != NULL) {
+ continue;
+ }
+ if (NPY_DTYPE(input_descrs[i]) == common_dtype) {
+ output_descrs[i] = ensure_dtype_nbo(input_descrs[i]);
+ }
+ else {
+ output_descrs[i] = common_dtype->default_descr(common_dtype);
+ }
+ if (NPY_UNLIKELY(output_descrs[i] == NULL)) {
+ goto fail;
+ }
+ }
+
+ return method->casting;
+
+ fail:
+ for (int i = 0; i < nin + nout; i++) {
+ Py_XDECREF(output_descrs[i]);
+ }
+ return -1;
+}
+
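+/*
+ * Illustrative example (not normative): for a method bound to the DTypes
+ * (Float64, Float64) that is handed a byte-swapped float64 input descriptor,
+ * the first loop above returns the native-byte-order float64 for that
+ * position; a position whose given descriptor belongs to a different DType
+ * is replaced by the bound DType's default descriptor instead.
+ */
+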
+
+/**
+ * The default method to fetch the correct loop for a cast or ufunc
+ * (at the time of writing only casts).
+ * The default version is intended to return loops explicitly registered
+ * during method creation and to specialize contiguous loops (which requires
+ * checking all descriptors' itemsizes). The implementation below is
+ * currently a stub that only raises NotImplementedError.
+ *
+ * @param context
+ * @param aligned
+ * @param move_references UNUSED.
+ * @param strides
+ * @param out_loop
+ * @param out_transferdata
+ * @param flags
+ * @return 0 on success, -1 on failure.
+ */
+static int
+default_get_strided_loop(
+ PyArrayMethod_Context *NPY_UNUSED(context),
+ int NPY_UNUSED(aligned), int NPY_UNUSED(move_references),
+ npy_intp *NPY_UNUSED(strides),
+ PyArray_StridedUnaryOp **NPY_UNUSED(out_loop),
+ NpyAuxData **NPY_UNUSED(out_transferdata),
+ NPY_ARRAYMETHOD_FLAGS *NPY_UNUSED(flags))
+{
+ PyErr_SetString(PyExc_NotImplementedError,
+ "default loop getter is not implemented");
+ return -1;
+}
+
+
+/**
+ * Validate that the input is usable to create a new ArrayMethod.
+ *
+ * @param spec
+ * @return 0 on success, -1 on error.
+ */
+static int
+validate_spec(PyArrayMethod_Spec *spec)
+{
+ int nargs = spec->nin + spec->nout;
+ /* Check the passed spec for invalid fields/values */
+ if (spec->nin < 0 || spec->nout < 0 || nargs > NPY_MAXARGS) {
+ PyErr_Format(PyExc_ValueError,
+ "ArrayMethod inputs and outputs must not be negative and "
+ "must not exceed %d in total. (method: %s)", NPY_MAXARGS, spec->name);
+ return -1;
+ }
+ switch (spec->casting & ~_NPY_CAST_IS_VIEW) {
+ case NPY_NO_CASTING:
+ case NPY_EQUIV_CASTING:
+ case NPY_SAFE_CASTING:
+ case NPY_SAME_KIND_CASTING:
+ case NPY_UNSAFE_CASTING:
+ break;
+ default:
+ PyErr_Format(PyExc_TypeError,
+ "ArrayMethod has invalid casting `%d`. (method: %s)",
+ spec->casting, spec->name);
+ return -1;
+ }
+
+ for (int i = 0; i < nargs; i++) {
+ if (spec->dtypes[i] == NULL && i < spec->nin) {
+ PyErr_Format(PyExc_TypeError,
+ "ArrayMethod must have well defined input DTypes. "
+ "(method: %s)", spec->name);
+ return -1;
+ }
+ if (!PyObject_TypeCheck(spec->dtypes[i], &PyArrayDTypeMeta_Type)) {
+ PyErr_Format(PyExc_TypeError,
+ "ArrayMethod provided object %R is not a DType. "
+ "(method: %s)", spec->dtypes[i], spec->name);
+ return -1;
+ }
+ if (spec->dtypes[i]->abstract && i < spec->nin) {
+ PyErr_Format(PyExc_TypeError,
+ "abstract DType %S is currently not allowed for inputs. "
+ "(method: %s)", spec->dtypes[i], spec->name);
+ return -1;
+ }
+ }
+ return 0;
+}
+
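+/*
+ * Note (summary, not a new rule): validate_spec() above only requires the
+ * *input* DTypes to be set; an output DType may be left NULL when it can be
+ * inferred later (default_resolve_descriptors falls back to the common DType
+ * of the inputs for such positions).
+ */
+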
+
+/**
+ * Initialize a new BoundArrayMethodObject from slots. Slots which are
+ * not provided may be filled with defaults.
+ *
+ * @param res The new PyBoundArrayMethodObject to be filled.
+ * @param spec The specification list passed by the user.
+ * @param private Private flag to limit certain slots to use in NumPy.
+ * @return -1 on error, 0 on success
+ */
+static int
+fill_arraymethod_from_slots(
+ PyBoundArrayMethodObject *res, PyArrayMethod_Spec *spec,
+ int private)
+{
+ PyArrayMethodObject *meth = res->method;
+
+ /* Set the defaults */
+ meth->get_strided_loop = &default_get_strided_loop;
+ meth->resolve_descriptors = &default_resolve_descriptors;
+
+ /* Fill in the slots passed by the user */
+ /*
+ * TODO: This is reasonable for now, but it would be nice to find a
+ * shorter solution, and add some additional error checking (e.g.
+ * the same slot used twice). Python uses an array of slot offsets.
+ */
+ for (PyType_Slot *slot = &spec->slots[0]; slot->slot != 0; slot++) {
+ switch (slot->slot) {
+ case NPY_METH_resolve_descriptors:
+ meth->resolve_descriptors = slot->pfunc;
+ continue;
+ case NPY_METH_get_loop:
+ if (private) {
+ /* Only allow override for private functions initially */
+ meth->get_strided_loop = slot->pfunc;
+ continue;
+ }
+ break;
+ case NPY_METH_strided_loop:
+ meth->strided_loop = slot->pfunc;
+ continue;
+ case NPY_METH_contiguous_loop:
+ meth->contiguous_loop = slot->pfunc;
+ continue;
+ case NPY_METH_unaligned_strided_loop:
+ meth->unaligned_strided_loop = slot->pfunc;
+ continue;
+ case NPY_METH_unaligned_contiguous_loop:
+ meth->unaligned_contiguous_loop = slot->pfunc;
+ continue;
+ default:
+ break;
+ }
+ PyErr_Format(PyExc_RuntimeError,
+ "invalid slot number %d to ArrayMethod: %s",
+ slot->slot, spec->name);
+ return -1;
+ }
+
+ /* Check whether the slots are valid: */
+ if (meth->resolve_descriptors == &default_resolve_descriptors) {
+ for (int i = 0; i < meth->nin + meth->nout; i++) {
+ if (res->dtypes[i] == NULL) {
+ if (i < meth->nin) {
+ PyErr_Format(PyExc_TypeError,
+ "All input DTypes must be specified when using "
+ "the default `resolve_descriptors` function. "
+ "(method: %s)", spec->name);
+ return -1;
+ }
+ else if (meth->nin == 0) {
+ PyErr_Format(PyExc_TypeError,
+ "Must specify output DTypes or use custom "
+ "`resolve_descriptors` when there are no inputs. "
+ "(method: %s)", spec->name);
+ return -1;
+ }
+ }
+ if (i >= meth->nin && res->dtypes[i]->parametric) {
+ PyErr_Format(PyExc_TypeError,
+ "must provide a `resolve_descriptors` function if any "
+ "output DType is parametric. (method: %s)",
+ spec->name);
+ return -1;
+ }
+ }
+ }
+ if (meth->get_strided_loop != &default_get_strided_loop) {
+ /* Do not check the actual loop fields. */
+ return 0;
+ }
+
+ /* Check whether the provided loops make sense. */
+ if (meth->strided_loop == NULL) {
+ PyErr_Format(PyExc_TypeError,
+ "Must provide a strided inner loop function. (method: %s)",
+ spec->name);
+ return -1;
+ }
+ if (meth->contiguous_loop == NULL) {
+ meth->contiguous_loop = meth->strided_loop;
+ }
+ if (meth->unaligned_contiguous_loop != NULL &&
+ meth->unaligned_strided_loop == NULL) {
+ PyErr_Format(PyExc_TypeError,
+ "Must provide an unaligned strided inner loop when providing "
+ "an unaligned contiguous version. (method: %s)", spec->name);
+ return -1;
+ }
+ if ((meth->unaligned_strided_loop == NULL) !=
+ !(meth->flags & NPY_METH_SUPPORTS_UNALIGNED)) {
+ PyErr_Format(PyExc_TypeError,
+ "Must provide an unaligned strided inner loop exactly when the "
+ "NPY_METH_SUPPORTS_UNALIGNED flag is set. (method: %s)", spec->name);
+ return -1;
+ }
+
+ return 0;
+}
+
+
+/**
+ * Create a new ArrayMethod (internal version).
+ *
+ * @param spec A filled specification object carrying the generic information
+ *        about the method: its name, the DTypes, the casting level and flags,
+ *        and the slots (pairs of IDs and function pointers). Unused fields
+ *        must be NULL.
+ * @param private Some slots are currently considered private; if this is
+ *        not set, such slots will be rejected.
+ *
+ * @returns A new (bound) ArrayMethod object.
+ */
+NPY_NO_EXPORT PyBoundArrayMethodObject *
+PyArrayMethod_FromSpec_int(PyArrayMethod_Spec *spec, int private)
+{
+ int nargs = spec->nin + spec->nout;
+
+ if (spec->name == NULL) {
+ spec->name = "<unknown>";
+ }
+
+ if (validate_spec(spec) < 0) {
+ return NULL;
+ }
+
+ PyBoundArrayMethodObject *res;
+ res = PyObject_New(PyBoundArrayMethodObject, &PyBoundArrayMethod_Type);
+ if (res == NULL) {
+ return NULL;
+ }
+ res->method = NULL;
+
+ res->dtypes = PyMem_Malloc(sizeof(PyArray_DTypeMeta *) * nargs);
+ if (res->dtypes == NULL) {
+ Py_DECREF(res);
+ PyErr_NoMemory();
+ return NULL;
+ }
+ for (int i = 0; i < nargs ; i++) {
+ Py_XINCREF(spec->dtypes[i]);
+ res->dtypes[i] = spec->dtypes[i];
+ }
+
+ res->method = PyObject_New(PyArrayMethodObject, &PyArrayMethod_Type);
+ if (res->method == NULL) {
+ Py_DECREF(res);
+ PyErr_NoMemory();
+ return NULL;
+ }
+ memset((char *)(res->method) + sizeof(PyObject), 0,
+ sizeof(PyArrayMethodObject) - sizeof(PyObject));
+
+ res->method->nin = spec->nin;
+ res->method->nout = spec->nout;
+ res->method->flags = spec->flags;
+ res->method->casting = spec->casting;
+ if (fill_arraymethod_from_slots(res, spec, private) < 0) {
+ Py_DECREF(res);
+ return NULL;
+ }
+
+ ssize_t length = strlen(spec->name);
+ res->method->name = PyMem_Malloc(length + 1);
+ if (res->method->name == NULL) {
+ Py_DECREF(res);
+ PyErr_NoMemory();
+ return NULL;
+ }
+ strcpy(res->method->name, spec->name);
+
+ return res;
+}
+
+
+static void
+arraymethod_dealloc(PyObject *self)
+{
+ PyArrayMethodObject *meth;
+ meth = ((PyArrayMethodObject *)self);
+
+ PyMem_Free(meth->name);
+
+ Py_TYPE(self)->tp_free(self);
+}
+
+
+NPY_NO_EXPORT PyTypeObject PyArrayMethod_Type = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ .tp_name = "numpy._ArrayMethod",
+ .tp_basicsize = sizeof(PyArrayMethodObject),
+ .tp_flags = Py_TPFLAGS_DEFAULT,
+ .tp_dealloc = arraymethod_dealloc,
+};
+
+
+
+static PyObject *
+boundarraymethod_repr(PyBoundArrayMethodObject *self)
+{
+ int nargs = self->method->nin + self->method->nout;
+ PyObject *dtypes = PyTuple_New(nargs);
+ if (dtypes == NULL) {
+ return NULL;
+ }
+ for (int i = 0; i < nargs; i++) {
+ Py_INCREF(self->dtypes[i]);
+ PyTuple_SET_ITEM(dtypes, i, (PyObject *)self->dtypes[i]);
+ }
+ return PyUnicode_FromFormat(
+ "<np._BoundArrayMethod `%s` for dtypes %S>",
+ self->method->name, dtypes);
+}
+
+
+static void
+boundarraymethod_dealloc(PyObject *self)
+{
+ PyBoundArrayMethodObject *meth;
+ meth = ((PyBoundArrayMethodObject *)self);
+ int nargs = meth->method->nin + meth->method->nout;
+
+ for (int i = 0; i < nargs; i++) {
+ Py_XDECREF(meth->dtypes[i]);
+ }
+ PyMem_Free(meth->dtypes);
+
+ Py_XDECREF(meth->method);
+
+ Py_TYPE(self)->tp_free(self);
+}
+
+
+/*
+ * Calls resolve_descriptors() and returns the casting level and the resolved
+ * descriptors as a tuple. If the operation is impossible, it returns (-1, None).
+ * May raise an error, but usually should not.
+ * The function validates the casting attribute compared to the returned
+ * casting level.
+ */
+static PyObject *
+boundarraymethod__resolve_descriptors(
+ PyBoundArrayMethodObject *self, PyObject *descr_tuple)
+{
+ int nin = self->method->nin;
+ int nout = self->method->nout;
+
+ PyArray_Descr *given_descrs[NPY_MAXARGS];
+ PyArray_Descr *loop_descrs[NPY_MAXARGS];
+
+ if (!PyTuple_CheckExact(descr_tuple) ||
+ PyTuple_Size(descr_tuple) != nin + nout) {
+ PyErr_Format(PyExc_ValueError,
+ "_resolve_descriptors() takes exactly one tuple with as many "
+ "elements as the method takes arguments (%d+%d).", nin, nout);
+ return NULL;
+ }
+
+ for (int i = 0; i < nin + nout; i++) {
+ PyObject *tmp = PyTuple_GetItem(descr_tuple, i);
+ if (tmp == NULL) {
+ return NULL;
+ }
+ else if (tmp == Py_None) {
+ if (i < nin) {
+ PyErr_SetString(PyExc_ValueError,
+ "only output dtypes may be omitted (set to None).");
+ return NULL;
+ }
+ given_descrs[i] = NULL;
+ }
+ else if (PyArray_DescrCheck(tmp)) {
+ if (Py_TYPE(tmp) != (PyTypeObject *)self->dtypes[i]) {
+ PyErr_Format(PyExc_ValueError,
+ "input dtype %S was not an exact instance of the bound "
+ "DType class %S.", tmp, self->dtypes[i]);
+ return NULL;
+ }
+ given_descrs[i] = (PyArray_Descr *)tmp;
+ }
+ else {
+ PyErr_SetString(PyExc_TypeError,
+ "dtype tuple can only contain dtype instances or None.");
+ return NULL;
+ }
+ }
+
+ NPY_CASTING casting = self->method->resolve_descriptors(
+ self->method, self->dtypes, given_descrs, loop_descrs);
+
+ if (casting < 0 && PyErr_Occurred()) {
+ return NULL;
+ }
+ else if (casting < 0) {
+ return Py_BuildValue("iO", casting, Py_None);
+ }
+
+ PyObject *result_tuple = PyTuple_New(nin + nout);
+ if (result_tuple == NULL) {
+ return NULL;
+ }
+ for (int i = 0; i < nin + nout; i++) {
+ /* transfer ownership to the tuple. */
+ PyTuple_SET_ITEM(result_tuple, i, (PyObject *)loop_descrs[i]);
+ }
+
+ /*
+ * The casting flags should be the most generic casting level (except the
+ * cast-is-view flag). If no input is parametric, it must match exactly.
+ */
+ int parametric = 0;
+ for (int i = 0; i < nin + nout; i++) {
+ if (self->dtypes[i]->parametric) {
+ parametric = 1;
+ break;
+ }
+ }
+ if (!parametric) {
+ /*
+ * Non-parametric can only mismatch if it switches from no to equiv
+ * (e.g. due to byteorder changes).
+ */
+ if (self->method->casting != (casting & ~_NPY_CAST_IS_VIEW) &&
+ !(self->method->casting == NPY_NO_CASTING &&
+ casting == NPY_EQUIV_CASTING)) {
+ PyErr_Format(PyExc_RuntimeError,
+ "resolve_descriptors cast level did not match stored one "
+ "(expected %d, got %d) for method %s",
+ self->method->casting, (casting & ~_NPY_CAST_IS_VIEW),
+ self->method->name);
+ Py_DECREF(result_tuple);
+ return NULL;
+ }
+ }
+ else {
+ NPY_CASTING cast = casting & ~_NPY_CAST_IS_VIEW;
+ if (cast != PyArray_MinCastSafety(cast, self->method->casting)) {
+ PyErr_Format(PyExc_RuntimeError,
+ "resolve_descriptors cast level did not match stored one "
+ "(expected %d, got %d) for method %s",
+ self->method->casting, (casting & ~_NPY_CAST_IS_VIEW),
+ self->method->name);
+ Py_DECREF(result_tuple);
+ return NULL;
+ }
+ }
+
+ return Py_BuildValue("iN", casting, result_tuple);
+}
+
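+/*
+ * Illustrative Python-level usage (hypothetical; `m` is a bound method
+ * obtained elsewhere):
+ *
+ *     >>> m._resolve_descriptors((np.dtype("<f8"), None))
+ *     (casting_as_int, (resolved_input_dtype, resolved_output_dtype))
+ *
+ * or `(-1, None)` if the operation is impossible.
+ */
+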
+
+PyMethodDef boundarraymethod_methods[] = {
+ {"_resolve_descriptors", (PyCFunction)boundarraymethod__resolve_descriptors,
+ METH_O, "Resolve the given dtypes."},
+ {NULL, 0, 0, NULL},
+};
+
+
+static PyObject *
+boundarraymethod__supports_unaligned(PyBoundArrayMethodObject *self)
+{
+ return PyBool_FromLong(self->method->flags & NPY_METH_SUPPORTS_UNALIGNED);
+}
+
+
+PyGetSetDef boundarraymethods_getters[] = {
+ {"_supports_unaligned",
+ (getter)boundarraymethod__supports_unaligned, NULL,
+ "whether the method supports unaligned inputs/outputs.", NULL},
+ {NULL, NULL, NULL, NULL, NULL},
+};
+
+
+NPY_NO_EXPORT PyTypeObject PyBoundArrayMethod_Type = {
+ PyVarObject_HEAD_INIT(NULL, 0)
+ .tp_name = "numpy._BoundArrayMethod",
+ .tp_basicsize = sizeof(PyBoundArrayMethodObject),
+ .tp_flags = Py_TPFLAGS_DEFAULT,
+ .tp_repr = (reprfunc)boundarraymethod_repr,
+ .tp_dealloc = boundarraymethod_dealloc,
+ .tp_methods = boundarraymethod_methods,
+ .tp_getset = boundarraymethods_getters,
+};
diff --git a/numpy/core/src/multiarray/array_method.h b/numpy/core/src/multiarray/array_method.h
new file mode 100644
index 000000000..15ea948ce
--- /dev/null
+++ b/numpy/core/src/multiarray/array_method.h
@@ -0,0 +1,150 @@
+#ifndef _NPY_ARRAY_METHOD_H
+#define _NPY_ARRAY_METHOD_H
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+
+#include <Python.h>
+#include <numpy/ndarraytypes.h>
+#include <lowlevel_strided_loops.h>
+
+
+typedef enum {
+ /* Flag for whether the GIL is required */
+ NPY_METH_REQUIRES_PYAPI = 1 << 1,
+ /*
+ * Some functions cannot set floating point error flags, this flag
+ * gives us the option (not requirement) to skip floating point error
+ * setup/check. No function should set error flags and ignore them
+ * since it would interfere with chaining operations (e.g. casting).
+ */
+ NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 2,
+ /* Whether the method supports unaligned access (not runtime) */
+ NPY_METH_SUPPORTS_UNALIGNED = 1 << 3,
+
+ /* All flags which can change at runtime */
+ NPY_METH_RUNTIME_FLAGS = (
+ NPY_METH_REQUIRES_PYAPI |
+ NPY_METH_NO_FLOATINGPOINT_ERRORS),
+} NPY_ARRAYMETHOD_FLAGS;
+
+
+struct PyArrayMethodObject_tag;
+
+/*
+ * This struct is specific to an individual (possibly repeated) call of
+ * the ArrayMethod's strided operator, and as such is passed into the various
+ * methods of the ArrayMethod object (the resolve_descriptors function,
+ * the get_loop function, and the individual low-level strided operator calls).
+ * It thus has to persist for one end-user call, and can then be discarded.
+ *
+ * TODO: Before making this public, we should review which information should
+ * be stored on the Context/BoundArrayMethod vs. the ArrayMethod.
+ */
+typedef struct {
+ PyObject *caller; /* E.g. the original ufunc, may be NULL */
+ struct PyArrayMethodObject_tag *method;
+
+ /* Operand descriptors, filled in by resolve_descriptors */
+ PyArray_Descr **descriptors;
+} PyArrayMethod_Context;
+
+
+typedef NPY_CASTING (resolve_descriptors_function)(
+ struct PyArrayMethodObject_tag *method,
+ PyArray_DTypeMeta **dtypes,
+ PyArray_Descr **given_descrs,
+ PyArray_Descr **loop_descrs);
+
+
+typedef int (get_loop_function)(
+ PyArrayMethod_Context *context,
+ int aligned, int move_references,
+ npy_intp *strides,
+ PyArray_StridedUnaryOp **out_loop,
+ NpyAuxData **out_transferdata,
+ NPY_ARRAYMETHOD_FLAGS *flags);
+
+
+/*
+ * This struct will be public and necessary for creating a new ArrayMethod
+ * object (casting and ufuncs).
+ * We could version the struct, although since we allow passing arbitrary
+ * data using the slots, and have flags, that may be enough?
+ * (See also PyBoundArrayMethodObject.)
+ */
+typedef struct {
+ const char *name;
+ int nin, nout;
+ NPY_CASTING casting;
+ NPY_ARRAYMETHOD_FLAGS flags;
+ PyArray_DTypeMeta **dtypes;
+ PyType_Slot *slots;
+} PyArrayMethod_Spec;
+
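+/*
+ * Purely illustrative sketch (hypothetical names) of filling a spec for a
+ * one-input, one-output method, using the slot IDs defined further below:
+ *
+ *     static PyType_Slot example_slots[] = {
+ *         {NPY_METH_strided_loop, &example_strided_loop},
+ *         {0, NULL},
+ *     };
+ *     static PyArray_DTypeMeta *example_dtypes[2] = {&ExampleDType, &ExampleDType};
+ *     PyArrayMethod_Spec example_spec = {
+ *         .name = "example_cast", .nin = 1, .nout = 1,
+ *         .casting = NPY_SAFE_CASTING, .flags = 0,
+ *         .dtypes = example_dtypes, .slots = example_slots,
+ *     };
+ */
+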
+
+/*
+ * Structure of the ArrayMethod. This structure should probably not be made
+ * public. If necessary, we can make certain operations on it public
+ * (e.g. to allow users indirect access to `get_strided_loop`).
+ *
+ * NOTE: In some cases, it may not be clear whether information should be
+ * stored here or on the bound version. E.g. `nin` and `nout` (and in the
+ * future the gufunc `signature`) are already stored on the ufunc, so
+ * storing them here duplicates the information.
+ */
+typedef struct PyArrayMethodObject_tag {
+ PyObject_HEAD
+ char *name;
+ int nin, nout;
+ /* Casting is normally "safe" for functions, but is important for casts */
+ NPY_CASTING casting;
+ /* default flags. The get_strided_loop function can override these */
+ NPY_ARRAYMETHOD_FLAGS flags;
+ resolve_descriptors_function *resolve_descriptors;
+ get_loop_function *get_strided_loop;
+ /* Typical loop functions (contiguous ones are used in current casts) */
+ PyArray_StridedUnaryOp *strided_loop;
+ PyArray_StridedUnaryOp *contiguous_loop;
+ PyArray_StridedUnaryOp *unaligned_strided_loop;
+ PyArray_StridedUnaryOp *unaligned_contiguous_loop;
+} PyArrayMethodObject;
+
+
+/*
+ * We will sometimes have to create an ArrayMethod and allow passing it around,
+ * similar to `instance.method` returning a bound method, e.g. a function like
+ * `ufunc.resolve()` can return a bound object.
+ * The current main purpose of the BoundArrayMethod is that it holds on to the
+ * `dtypes` (the classes), so that the `ArrayMethod` (e.g. for casts) will
+ * not create reference cycles. In principle, it could hold any information
+ * which is also stored on the ufunc (and thus does not need to be repeated
+ * on the `ArrayMethod` itself).
+ */
+typedef struct {
+ PyObject_HEAD
+ PyArray_DTypeMeta **dtypes;
+ PyArrayMethodObject *method;
+} PyBoundArrayMethodObject;
+
+
+extern NPY_NO_EXPORT PyTypeObject PyArrayMethod_Type;
+extern NPY_NO_EXPORT PyTypeObject PyBoundArrayMethod_Type;
+
+/*
+ * Slot IDs for ArrayMethod creation; once public, the IDs are fixed.
+ * TODO: Before making it public, consider adding a large constant to private
+ * slots.
+ */
+#define NPY_METH_resolve_descriptors 1
+#define NPY_METH_get_loop 2
+#define NPY_METH_strided_loop 3
+#define NPY_METH_contiguous_loop 4
+#define NPY_METH_unaligned_strided_loop 5
+#define NPY_METH_unaligned_contiguous_loop 6
+
+
+NPY_NO_EXPORT PyBoundArrayMethodObject *
+PyArrayMethod_FromSpec_int(PyArrayMethod_Spec *spec, int private);
+
+#endif /*_NPY_ARRAY_METHOD_H*/
diff --git a/numpy/core/src/multiarray/arrayfunction_override.c b/numpy/core/src/multiarray/arrayfunction_override.c
index 9ea8efdd9..2c07cdebc 100644
--- a/numpy/core/src/multiarray/arrayfunction_override.c
+++ b/numpy/core/src/multiarray/arrayfunction_override.c
@@ -26,7 +26,6 @@ static PyObject *
get_array_function(PyObject *obj)
{
static PyObject *ndarray_array_function = NULL;
- PyObject *array_function;
if (ndarray_array_function == NULL) {
ndarray_array_function = get_ndarray_array_function();
@@ -38,7 +37,7 @@ get_array_function(PyObject *obj)
return ndarray_array_function;
}
- array_function = PyArray_LookupSpecial(obj, "__array_function__");
+ PyObject *array_function = PyArray_LookupSpecial(obj, "__array_function__");
if (array_function == NULL && PyErr_Occurred()) {
PyErr_Clear(); /* TODO[gh-14801]: propagate crashes during attribute access? */
}
@@ -53,9 +52,7 @@ get_array_function(PyObject *obj)
static void
pyobject_array_insert(PyObject **array, int length, int index, PyObject *item)
{
- int j;
-
- for (j = length; j > index; j--) {
+ for (int j = length; j > index; j--) {
array[j] = array[j - 1];
}
array[index] = item;
@@ -74,18 +71,16 @@ get_implementing_args_and_methods(PyObject *relevant_args,
PyObject **methods)
{
int num_implementing_args = 0;
- Py_ssize_t i;
- int j;
PyObject **items = PySequence_Fast_ITEMS(relevant_args);
Py_ssize_t length = PySequence_Fast_GET_SIZE(relevant_args);
- for (i = 0; i < length; i++) {
+ for (Py_ssize_t i = 0; i < length; i++) {
int new_class = 1;
PyObject *argument = items[i];
/* Have we seen this type before? */
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
if (Py_TYPE(argument) == Py_TYPE(implementing_args[j])) {
new_class = 0;
break;
@@ -109,7 +104,7 @@ get_implementing_args_and_methods(PyObject *relevant_args,
/* "subclasses before superclasses, otherwise left to right" */
arg_index = num_implementing_args;
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
PyObject *other_type;
other_type = (PyObject *)Py_TYPE(implementing_args[j]);
if (PyObject_IsInstance(argument, other_type)) {
@@ -129,7 +124,7 @@ get_implementing_args_and_methods(PyObject *relevant_args,
return num_implementing_args;
fail:
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
Py_DECREF(implementing_args[j]);
Py_DECREF(methods[j]);
}
@@ -161,13 +156,10 @@ NPY_NO_EXPORT PyObject *
array_function_method_impl(PyObject *func, PyObject *types, PyObject *args,
PyObject *kwargs)
{
- Py_ssize_t j;
- PyObject *implementation, *result;
-
PyObject **items = PySequence_Fast_ITEMS(types);
Py_ssize_t length = PySequence_Fast_GET_SIZE(types);
- for (j = 0; j < length; j++) {
+ for (Py_ssize_t j = 0; j < length; j++) {
int is_subclass = PyObject_IsSubclass(
items[j], (PyObject *)&PyArray_Type);
if (is_subclass == -1) {
@@ -179,11 +171,11 @@ array_function_method_impl(PyObject *func, PyObject *types, PyObject *args,
}
}
- implementation = PyObject_GetAttr(func, npy_ma_str_implementation);
+ PyObject *implementation = PyObject_GetAttr(func, npy_ma_str_implementation);
if (implementation == NULL) {
return NULL;
}
- result = PyObject_Call(implementation, args, kwargs);
+ PyObject *result = PyObject_Call(implementation, args, kwargs);
Py_DECREF(implementation);
return result;
}
@@ -208,32 +200,32 @@ call_array_function(PyObject* argument, PyObject* method,
}
-/*
- * Implements the __array_function__ protocol for a function, as described in
- * in NEP-18. See numpy.core.overrides for a full docstring.
+/**
+ * Internal handler for the array-function dispatching. The helper returns
+ * either the result, or NotImplemented (as a borrowed reference).
+ *
+ * @param public_api The public API symbol used for dispatching
+ * @param relevant_args Arguments which may implement __array_function__
+ * @param args Original arguments
+ * @param kwargs Original keyword arguments
+ *
+ * @returns The result of the dispatched version, or a borrowed reference
+ * to NotImplemented to indicate the default implementation should
+ * be used.
*/
NPY_NO_EXPORT PyObject *
-array_implement_array_function(
- PyObject *NPY_UNUSED(dummy), PyObject *positional_args)
+array_implement_array_function_internal(
+ PyObject *public_api, PyObject *relevant_args,
+ PyObject *args, PyObject *kwargs)
{
- PyObject *implementation, *public_api, *relevant_args, *args, *kwargs;
-
- PyObject *types = NULL;
PyObject *implementing_args[NPY_MAXARGS];
PyObject *array_function_methods[NPY_MAXARGS];
+ PyObject *types = NULL;
- int j, any_overrides;
- int num_implementing_args = 0;
PyObject *result = NULL;
static PyObject *errmsg_formatter = NULL;
- if (!PyArg_UnpackTuple(
- positional_args, "implement_array_function", 5, 5,
- &implementation, &public_api, &relevant_args, &args, &kwargs)) {
- return NULL;
- }
-
relevant_args = PySequence_Fast(
relevant_args,
"dispatcher for __array_function__ did not return an iterable");
@@ -242,7 +234,7 @@ array_implement_array_function(
}
/* Collect __array_function__ implementations */
- num_implementing_args = get_implementing_args_and_methods(
+ int num_implementing_args = get_implementing_args_and_methods(
relevant_args, implementing_args, array_function_methods);
if (num_implementing_args == -1) {
goto cleanup;
@@ -254,15 +246,19 @@ array_implement_array_function(
* arguments implement __array_function__ at all (e.g., if they are all
* built-in types).
*/
- any_overrides = 0;
- for (j = 0; j < num_implementing_args; j++) {
+ int any_overrides = 0;
+ for (int j = 0; j < num_implementing_args; j++) {
if (!is_default_array_function(array_function_methods[j])) {
any_overrides = 1;
break;
}
}
if (!any_overrides) {
- result = PyObject_Call(implementation, args, kwargs);
+ /*
+ * When the default implementation should be called, return
+ * `Py_NotImplemented` to indicate this.
+ */
+ result = Py_NotImplemented;
goto cleanup;
}
@@ -275,14 +271,14 @@ array_implement_array_function(
if (types == NULL) {
goto cleanup;
}
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
PyObject *arg_type = (PyObject *)Py_TYPE(implementing_args[j]);
Py_INCREF(arg_type);
PyTuple_SET_ITEM(types, j, arg_type);
}
/* Call __array_function__ methods */
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
PyObject *argument = implementing_args[j];
PyObject *method = array_function_methods[j];
@@ -319,7 +315,7 @@ array_implement_array_function(
}
cleanup:
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
Py_DECREF(implementing_args[j]);
Py_DECREF(array_function_methods[j]);
}
@@ -330,6 +326,109 @@ cleanup:
/*
+ * Implements the __array_function__ protocol for a Python function, as
+ * described in NEP-18. See numpy.core.overrides for a full docstring.
+ */
+NPY_NO_EXPORT PyObject *
+array_implement_array_function(
+ PyObject *NPY_UNUSED(dummy), PyObject *positional_args)
+{
+ PyObject *implementation, *public_api, *relevant_args, *args, *kwargs;
+
+ if (!PyArg_UnpackTuple(
+ positional_args, "implement_array_function", 5, 5,
+ &implementation, &public_api, &relevant_args, &args, &kwargs)) {
+ return NULL;
+ }
+
+ /* Remove `like=` kwarg, which is NumPy-exclusive and thus not present
+ * in downstream libraries. If `like=` is specified but its value doesn't
+ * implement `__array_function__`, raise a `TypeError`.
+ */
+ if (kwargs != NULL && PyDict_Contains(kwargs, npy_ma_str_like)) {
+ PyObject *like_arg = PyDict_GetItem(kwargs, npy_ma_str_like);
+ if (like_arg && !get_array_function(like_arg)) {
+ return PyErr_Format(PyExc_TypeError,
+ "The `like` argument must be an array-like that implements "
+ "the `__array_function__` protocol.");
+ }
+ PyDict_DelItem(kwargs, npy_ma_str_like);
+ }
+
+ PyObject *res = array_implement_array_function_internal(
+ public_api, relevant_args, args, kwargs);
+
+ if (res == Py_NotImplemented) {
+ return PyObject_Call(implementation, args, kwargs);
+ }
+ return res;
+}
+
+
+/*
+ * Implements the __array_function__ protocol for C array creation functions
+ * only. Added as an extension to NEP-18 in an effort to bring NEP-35 to
+ * life with minimal dispatch overhead.
+ */
+NPY_NO_EXPORT PyObject *
+array_implement_c_array_function_creation(
+ const char *function_name, PyObject *args, PyObject *kwargs)
+{
+ if (kwargs == NULL) {
+ return Py_NotImplemented;
+ }
+
+ /* Remove `like=` kwarg, which is NumPy-exclusive and thus not present
+ * in downstream libraries. If that key isn't present, return NotImplemented
+ * and let the originating call continue. If the key is present but its
+ * value doesn't implement `__array_function__`, raise a `TypeError`.
+ */
+ if (!PyDict_Contains(kwargs, npy_ma_str_like)) {
+ return Py_NotImplemented;
+ }
+
+ PyObject *like_arg = PyDict_GetItem(kwargs, npy_ma_str_like);
+ if (like_arg == NULL) {
+ return NULL;
+ }
+ else if (!get_array_function(like_arg)) {
+ return PyErr_Format(PyExc_TypeError,
+ "The `like` argument must be an array-like that implements "
+ "the `__array_function__` protocol.");
+ }
+ PyObject *relevant_args = PyTuple_Pack(1, like_arg);
+ PyDict_DelItem(kwargs, npy_ma_str_like);
+
+ PyObject *numpy_module = PyImport_Import(npy_ma_str_numpy);
+ if (numpy_module == NULL) {
+ Py_DECREF(relevant_args);
+ return NULL;
+ }
+
+ PyObject *public_api = PyObject_GetAttrString(numpy_module, function_name);
+ Py_DECREF(numpy_module);
+ if (public_api == NULL) {
+ Py_DECREF(relevant_args);
+ return NULL;
+ }
+ if (!PyCallable_Check(public_api)) {
+ Py_DECREF(relevant_args);
+ Py_DECREF(public_api);
+ return PyErr_Format(PyExc_RuntimeError,
+ "numpy.%s is not callable.",
+ function_name);
+ }
+
+ PyObject* result = array_implement_array_function_internal(
+ public_api, relevant_args, args, kwargs);
+
+ Py_DECREF(relevant_args);
+ Py_DECREF(public_api);
+ return result;
+}
+
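+/*
+ * Illustrative flow (hedged, not an exhaustive list of callers): a C-level
+ * creation function such as `np.arange(3, like=duck_array)` calls this with
+ * its own name. `relevant_args` becomes `(duck_array,)`, the public
+ * `numpy.<function_name>` symbol is looked up again, and dispatch proceeds
+ * so that the duck array's `__array_function__` receives that public
+ * function.
+ */
+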
+
+/*
* Python wrapper for get_implementing_args_and_methods, for testing purposes.
*/
NPY_NO_EXPORT PyObject *
@@ -337,8 +436,6 @@ array__get_implementing_args(
PyObject *NPY_UNUSED(dummy), PyObject *positional_args)
{
PyObject *relevant_args;
- int j;
- int num_implementing_args = 0;
PyObject *implementing_args[NPY_MAXARGS];
PyObject *array_function_methods[NPY_MAXARGS];
PyObject *result = NULL;
@@ -355,7 +452,7 @@ array__get_implementing_args(
return NULL;
}
- num_implementing_args = get_implementing_args_and_methods(
+ int num_implementing_args = get_implementing_args_and_methods(
relevant_args, implementing_args, array_function_methods);
if (num_implementing_args == -1) {
goto cleanup;
@@ -366,14 +463,14 @@ array__get_implementing_args(
if (result == NULL) {
goto cleanup;
}
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
PyObject *argument = implementing_args[j];
Py_INCREF(argument);
PyList_SET_ITEM(result, j, argument);
}
cleanup:
- for (j = 0; j < num_implementing_args; j++) {
+ for (int j = 0; j < num_implementing_args; j++) {
Py_DECREF(implementing_args[j]);
Py_DECREF(array_function_methods[j]);
}
diff --git a/numpy/core/src/multiarray/arrayfunction_override.h b/numpy/core/src/multiarray/arrayfunction_override.h
index 0d224e2b6..fdcf1746d 100644
--- a/numpy/core/src/multiarray/arrayfunction_override.h
+++ b/numpy/core/src/multiarray/arrayfunction_override.h
@@ -10,6 +10,10 @@ array__get_implementing_args(
PyObject *NPY_UNUSED(dummy), PyObject *positional_args);
NPY_NO_EXPORT PyObject *
+array_implement_c_array_function_creation(
+ const char *function_name, PyObject *args, PyObject *kwargs);
+
+NPY_NO_EXPORT PyObject *
array_function_method_impl(PyObject *func, PyObject *types, PyObject *args,
PyObject *kwargs);
diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c
index 95c650674..a2474d79f 100644
--- a/numpy/core/src/multiarray/arrayobject.c
+++ b/numpy/core/src/multiarray/arrayobject.c
@@ -416,7 +416,7 @@ WARN_IN_DEALLOC(PyObject* warning, const char * msg) {
if (PyErr_WarnEx(warning, msg, 1) < 0) {
PyObject * s;
- s = PyUString_FromString("array_dealloc");
+ s = PyUnicode_FromString("array_dealloc");
if (s) {
PyErr_WriteUnraisable(s);
Py_DECREF(s);
@@ -434,7 +434,9 @@ array_dealloc(PyArrayObject *self)
{
PyArrayObject_fields *fa = (PyArrayObject_fields *)self;
- _dealloc_cached_buffer_info((PyObject*)self);
+ if (_buffer_info_free(fa->_buffer_info, (PyObject *)self) < 0) {
+ PyErr_WriteUnraisable(NULL);
+ }
if (fa->weakreflist != NULL) {
PyObject_ClearWeakRefs((PyObject *)self);
@@ -1745,7 +1747,7 @@ array_free(PyObject * v)
NPY_NO_EXPORT PyTypeObject PyArray_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
.tp_name = "numpy.ndarray",
- .tp_basicsize = NPY_SIZEOF_PYARRAYOBJECT,
+ .tp_basicsize = sizeof(PyArrayObject_fields),
/* methods */
.tp_dealloc = (destructor)array_dealloc,
.tp_repr = (reprfunc)array_repr,
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 1c93fa0ef..ecaca72a1 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -47,7 +47,7 @@ static NPY_INLINE npy_bool
PySequence_NoString_Check(PyObject *op) {
return
PySequence_Check(op) &&
- !PyString_Check(op) &&
+ !PyBytes_Check(op) &&
!PyUnicode_Check(op) &&
!PyArray_IsZeroDim(op);
}
@@ -175,7 +175,7 @@ MyPyLong_AsUnsigned@Type@ (PyObject *obj)
*
* #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, LONG, UINT, ULONG,
* LONGLONG, ULONGLONG, HALF, FLOAT, DOUBLE#
- * #func1 = PyBool_FromLong, PyInt_FromLong*6, PyLong_FromUnsignedLong*2,
+ * #func1 = PyBool_FromLong, PyLong_FromLong*6, PyLong_FromUnsignedLong*2,
* PyLong_FromLongLong, PyLong_FromUnsignedLongLong,
* MyPyFloat_FromHalf, PyFloat_FromDouble*2#
* #func2 = PyObject_IsTrue, MyPyLong_AsLong*6, MyPyLong_AsUnsignedLong*2,
@@ -302,9 +302,42 @@ static int
oop.real = NPY_NAN;
oop.imag = NPY_NAN;
}
+ else if (PyBytes_Check(op) || PyUnicode_Check(op)) {
+ /*
+ * Unlike most numeric conversion functions PyComplex_AsCComplex
+ * does not handle strings, so we have to use its constructor.
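+ * E.g., assigning the string "1+2j" to a complex array element now
+ * takes this branch and is parsed via the equivalent of
+ * complex("1+2j").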
+ */
+ PyObject *pycomplex, *args;
+ if (PyBytes_Check(op)) {
+ /* The complex constructor expects unicode */
+ PyObject *unicode;
+ unicode = PyUnicode_FromEncodedObject(op, NULL, NULL);
+ if (unicode == NULL) {
+ return -1;
+ }
+ args = PyTuple_Pack(1, unicode);
+ Py_DECREF(unicode);
+ }
+ else {
+ args = PyTuple_Pack(1, op);
+ }
+ if (args == NULL) {
+ return -1;
+ }
+ pycomplex = PyComplex_Type.tp_new(&PyComplex_Type, args, NULL);
+ Py_DECREF(args);
+ if (pycomplex == NULL) {
+ return -1;
+ }
+ oop = PyComplex_AsCComplex(pycomplex);
+ Py_DECREF(pycomplex);
+ if (error_converting(oop.real)) {
+ return -1;
+ }
+ }
else {
- oop = PyComplex_AsCComplex (op);
- if (PyErr_Occurred()) {
+ oop = PyComplex_AsCComplex(op);
+ if (error_converting(oop.real)) {
return -1;
}
}
@@ -615,7 +648,7 @@ static PyObject *
OBJECT_getitem(void *ip, void *NPY_UNUSED(ap))
{
PyObject *obj;
- NPY_COPY_PYOBJECT_PTR(&obj, ip);
+ memcpy(&obj, ip, sizeof(obj));
if (obj == NULL) {
Py_RETURN_NONE;
}
@@ -631,12 +664,12 @@ OBJECT_setitem(PyObject *op, void *ov, void *NPY_UNUSED(ap))
{
PyObject *obj;
- NPY_COPY_PYOBJECT_PTR(&obj, ov);
+ memcpy(&obj, ov, sizeof(obj));
Py_INCREF(op);
Py_XDECREF(obj);
- NPY_COPY_PYOBJECT_PTR(ov, &op);
+ memcpy(ov, &op, sizeof(op));
return PyErr_Occurred() ? -1 : 0;
}
@@ -832,7 +865,7 @@ VOID_setitem(PyObject *op, void *input, void *vap)
npy_intp names_size = PyTuple_GET_SIZE(descr->names);
if (names_size != PyTuple_Size(op)) {
- errmsg = PyUString_FromFormat(
+ errmsg = PyUnicode_FromFormat(
"could not assign tuple of length %zd to structure "
"with %" NPY_INTP_FMT " fields.",
PyTuple_Size(op), names_size);
@@ -2204,11 +2237,11 @@ OBJECT_copyswapn(PyObject **dst, npy_intp dstride, PyObject **src,
dstp = (unsigned char*)dst;
srcp = (unsigned char*)src;
for (i = 0; i < n; i++) {
- NPY_COPY_PYOBJECT_PTR(&tmp, srcp);
+ memcpy(&tmp, srcp, sizeof(tmp));
Py_XINCREF(tmp);
- NPY_COPY_PYOBJECT_PTR(&tmp, dstp);
+ memcpy(&tmp, dstp, sizeof(tmp));
Py_XDECREF(tmp);
- NPY_COPY_PYOBJECT_PTR(dstp, srcp);
+ memcpy(dstp, srcp, sizeof(tmp));
dstp += dstride;
srcp += sstride;
}
@@ -2232,11 +2265,11 @@ OBJECT_copyswap(PyObject **dst, PyObject **src, int NPY_UNUSED(swap),
}
else {
PyObject *tmp;
- NPY_COPY_PYOBJECT_PTR(&tmp, src);
+ memcpy(&tmp, src, sizeof(tmp));
Py_XINCREF(tmp);
- NPY_COPY_PYOBJECT_PTR(&tmp, dst);
+ memcpy(&tmp, dst, sizeof(tmp));
Py_XDECREF(tmp);
- NPY_COPY_PYOBJECT_PTR(dst, src);
+ memcpy(dst, src, sizeof(tmp));
}
}
}
@@ -2653,7 +2686,7 @@ OBJECT_nonzero (PyObject **ip, PyArrayObject *ap)
}
else {
PyObject *obj;
- NPY_COPY_PYOBJECT_PTR(&obj, ip);
+ memcpy(&obj, ip, sizeof(obj));
if (obj == NULL) {
return NPY_FALSE;
}
@@ -3105,8 +3138,8 @@ BOOL_argmax(npy_bool *ip, npy_intp n, npy_intp *max_ind,
#if defined(__ARM_NEON__) || defined (__ARM_NEON)
uint8x16_t zero = vdupq_n_u8(0);
for(; i < n - (n % 32); i+=32) {
- uint8x16_t d1 = vld1q_u8((char *)&ip[i]);
- uint8x16_t d2 = vld1q_u8((char *)&ip[i + 16]);
+ uint8x16_t d1 = vld1q_u8((uint8_t *)&ip[i]);
+ uint8x16_t d2 = vld1q_u8((uint8_t *)&ip[i + 16]);
d1 = vceqq_u8(d1, zero);
d2 = vceqq_u8(d2, zero);
if(_mm_movemask_epi8_neon(vminq_u8(d1, d2)) != 0xFFFF) {
@@ -4428,7 +4461,7 @@ set_typeinfo(PyObject *dict)
return -1;
}
}
- key = PyInt_FromLong(NPY_@name2@);
+ key = PyLong_FromLong(NPY_@name2@);
if (key == NULL) {
return -1;
}
diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
index 8b482dc03..813850224 100644
--- a/numpy/core/src/multiarray/buffer.c
+++ b/numpy/core/src/multiarray/buffer.c
@@ -267,7 +267,7 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str,
child = (PyArray_Descr*)PyTuple_GetItem(item, 0);
offset_obj = PyTuple_GetItem(item, 1);
- new_offset = PyInt_AsLong(offset_obj);
+ new_offset = PyLong_AsLong(offset_obj);
if (error_converting(new_offset)) {
return -1;
}
@@ -428,35 +428,27 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str,
/*
- * Global information about all active buffers
+ * Information about all active buffers is stored as a linked list on
+ * the ndarray. The initial pointer is currently tagged to have a chance of
+ * detecting incompatible subclasses.
*
* Note: because for backward compatibility we cannot define bf_releasebuffer,
* we must manually keep track of the additional data required by the buffers.
*/
/* Additional per-array data required for providing the buffer interface */
-typedef struct {
+typedef struct _buffer_info_t_tag {
char *format;
int ndim;
Py_ssize_t *strides;
Py_ssize_t *shape;
+ struct _buffer_info_t_tag *next;
} _buffer_info_t;
-/*
- * { id(array): [list of pointers to _buffer_info_t, the last one is latest] }
- *
- * Because shape, strides, and format can be different for different buffers,
- * we may need to keep track of multiple buffer infos for each array.
- *
- * However, when none of them has changed, the same buffer info may be reused.
- *
- * Thread-safety is provided by GIL.
- */
-static PyObject *_buffer_info_cache = NULL;
/* Fill in the info structure */
static _buffer_info_t*
-_buffer_info_new(PyObject *obj)
+_buffer_info_new(PyObject *obj, int flags)
{
/*
* Note that the buffer info is cached as PyLongObjects making them appear
@@ -474,18 +466,18 @@ _buffer_info_new(PyObject *obj)
PyErr_NoMemory();
goto fail;
}
+ info->ndim = 0;
+ info->shape = NULL;
+ info->strides = NULL;
+
descr = PyArray_DescrFromScalar(obj);
if (descr == NULL) {
goto fail;
}
- info->ndim = 0;
- info->shape = NULL;
- info->strides = NULL;
}
else {
assert(PyArray_Check(obj));
PyArrayObject * arr = (PyArrayObject *)obj;
- descr = PyArray_DESCR(arr);
info = PyObject_Malloc(sizeof(_buffer_info_t) +
sizeof(Py_ssize_t) * PyArray_NDIM(arr) * 2);
@@ -504,25 +496,67 @@ _buffer_info_new(PyObject *obj)
info->shape = (npy_intp *)((char *)info + sizeof(_buffer_info_t));
assert((size_t)info->shape % sizeof(npy_intp) == 0);
info->strides = info->shape + PyArray_NDIM(arr);
- for (k = 0; k < PyArray_NDIM(arr); ++k) {
- info->shape[k] = PyArray_DIMS(arr)[k];
- info->strides[k] = PyArray_STRIDES(arr)[k];
+
+#if NPY_RELAXED_STRIDES_CHECKING
+ /*
+ * When NPY_RELAXED_STRIDES_CHECKING is used, some buffer users
+ * may expect a contiguous buffer to have well-formed strides
+ * also when a dimension is 1, but we do not guarantee this
+ * internally. Thus, recalculate strides for contiguous arrays.
+ * (This is unnecessary, and has no effect, when
+ * NPY_RELAXED_STRIDES_CHECKING is disabled.)
+ */
+ int f_contiguous = (flags & PyBUF_F_CONTIGUOUS) == PyBUF_F_CONTIGUOUS;
+ if (PyArray_IS_C_CONTIGUOUS(arr) && !(
+ f_contiguous && PyArray_IS_F_CONTIGUOUS(arr))) {
+ Py_ssize_t sd = PyArray_ITEMSIZE(arr);
+ for (k = info->ndim-1; k >= 0; --k) {
+ info->shape[k] = PyArray_DIMS(arr)[k];
+ info->strides[k] = sd;
+ sd *= info->shape[k];
+ }
+ }
+ else if (PyArray_IS_F_CONTIGUOUS(arr)) {
+ Py_ssize_t sd = PyArray_ITEMSIZE(arr);
+ for (k = 0; k < info->ndim; ++k) {
+ info->shape[k] = PyArray_DIMS(arr)[k];
+ info->strides[k] = sd;
+ sd *= info->shape[k];
+ }
+ }
+ else {
+#else /* NPY_RELAXED_STRIDES_CHECKING */
+ /* We can always use the arrays strides directly */
+ {
+#endif
+
+ for (k = 0; k < PyArray_NDIM(arr); ++k) {
+ info->shape[k] = PyArray_DIMS(arr)[k];
+ info->strides[k] = PyArray_STRIDES(arr)[k];
+ }
}
}
+ descr = PyArray_DESCR(arr);
Py_INCREF(descr);
}
/* Fill in format */
- err = _buffer_format_string(descr, &fmt, obj, NULL, NULL);
- Py_DECREF(descr);
- if (err != 0) {
- goto fail;
+ if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) {
+ err = _buffer_format_string(descr, &fmt, obj, NULL, NULL);
+ Py_DECREF(descr);
+ if (err != 0) {
+ goto fail;
+ }
+ if (_append_char(&fmt, '\0') < 0) {
+ goto fail;
+ }
+ info->format = fmt.s;
}
- if (_append_char(&fmt, '\0') < 0) {
- goto fail;
+ else {
+ Py_DECREF(descr);
+ info->format = NULL;
}
- info->format = fmt.s;
-
+ info->next = NULL;
return info;
fail:
@@ -538,9 +572,10 @@ _buffer_info_cmp(_buffer_info_t *a, _buffer_info_t *b)
Py_ssize_t c;
int k;
- c = strcmp(a->format, b->format);
- if (c != 0) return c;
-
+ if (a->format != NULL && b->format != NULL) {
+ c = strcmp(a->format, b->format);
+ if (c != 0) return c;
+ }
c = a->ndim - b->ndim;
if (c != 0) return c;
@@ -554,113 +589,161 @@ _buffer_info_cmp(_buffer_info_t *a, _buffer_info_t *b)
return 0;
}
-static void
-_buffer_info_free(_buffer_info_t *info)
+
+/*
+ * Tag the buffer info pointer by adding 3 (unless it is NULL, to simplify
+ * object initialization). Because the allocation is at least pointer
+ * aligned, the low bits of an untagged pointer can never be 3, which is
+ * what the untag check relies on.
+ * The linked list of buffer-infos was appended to the array struct in
+ * NumPy 1.20. Tagging the pointer gives us a chance to raise/print
+ * a useful error message instead of crashing hard if a C-subclass uses
+ * the same field.
+ */
+static NPY_INLINE void *
+buffer_info_tag(void *buffer_info)
{
- if (info->format) {
- PyObject_Free(info->format);
+ if (buffer_info == NULL) {
+ return buffer_info;
+ }
+ else {
+ return (void *)((uintptr_t)buffer_info + 3);
}
- PyObject_Free(info);
}
-/* Get buffer info from the global dictionary */
-static _buffer_info_t*
-_buffer_get_info(PyObject *obj)
-{
- PyObject *key = NULL, *item_list = NULL, *item = NULL;
- _buffer_info_t *info = NULL, *old_info = NULL;
- if (_buffer_info_cache == NULL) {
- _buffer_info_cache = PyDict_New();
- if (_buffer_info_cache == NULL) {
- return NULL;
- }
- }
-
- /* Compute information */
- info = _buffer_info_new(obj);
- if (info == NULL) {
- return NULL;
+static NPY_INLINE int
+_buffer_info_untag(
+ void *tagged_buffer_info, _buffer_info_t **buffer_info, PyObject *obj)
+{
+ if (tagged_buffer_info == NULL) {
+ *buffer_info = NULL;
+ return 0;
}
-
- /* Check if it is identical with an old one; reuse old one, if yes */
- key = PyLong_FromVoidPtr((void*)obj);
- if (key == NULL) {
- goto fail;
+ if (NPY_UNLIKELY(((uintptr_t)tagged_buffer_info & 0x7) != 3)) {
+ PyErr_Format(PyExc_RuntimeError,
+ "Object of type %S appears to be a C-subclassed NumPy array, "
+ "void scalar, or allocated in a non-standard way. "
+ "NumPy reserves the right to change the size of these "
+ "structures. Projects are required to take this into account "
+ "by either recompiling against a specific NumPy version or "
+ "padding the struct and enforcing a maximum NumPy version.",
+ Py_TYPE(obj));
+ return -1;
}
- item_list = PyDict_GetItem(_buffer_info_cache, key);
+ *buffer_info = (void *)((uintptr_t)tagged_buffer_info - 3);
+ return 0;
+}
- if (item_list != NULL) {
- Py_INCREF(item_list);
- if (PyList_GET_SIZE(item_list) > 0) {
- item = PyList_GetItem(item_list, PyList_GET_SIZE(item_list) - 1);
- old_info = (_buffer_info_t*)PyLong_AsVoidPtr(item);
- if (_buffer_info_cmp(info, old_info) == 0) {
- _buffer_info_free(info);
- info = old_info;
- }
- }
- }
- else {
- item_list = PyList_New(0);
- if (item_list == NULL) {
- goto fail;
- }
- if (PyDict_SetItem(_buffer_info_cache, key, item_list) != 0) {
- goto fail;
+/*
+ * NOTE: for backward compatibility (esp. with PyArg_ParseTuple("s#", ...))
+ * we do *not* define bf_releasebuffer at all.
+ *
+ * Instead, any extra data allocated with the buffer is released only in
+ * array_dealloc.
+ *
+ * Ensuring that the buffer stays in place is taken care by refcounting;
+ * ndarrays do not reallocate if there are references to them, and a buffer
+ * view holds one reference.
+ *
+ * This is stored in the array's _buffer_info slot (currently as a void *).
+ */
+static void
+_buffer_info_free_untagged(void *_buffer_info)
+{
+ _buffer_info_t *next = _buffer_info;
+ while (next != NULL) {
+ _buffer_info_t *curr = next;
+ next = curr->next;
+ if (curr->format) {
+ PyObject_Free(curr->format);
}
+ /* Shape is allocated as part of info */
+ PyObject_Free(curr);
}
+}
- if (info != old_info) {
- /* Needs insertion */
- item = PyLong_FromVoidPtr((void*)info);
- if (item == NULL) {
- goto fail;
- }
- PyList_Append(item_list, item);
- Py_DECREF(item);
- }
- Py_DECREF(item_list);
- Py_DECREF(key);
- return info;
-
-fail:
- if (info != NULL && info != old_info) {
- _buffer_info_free(info);
+/*
+ * Checks whether the pointer is tagged, and then frees the cache list.
+ * (The tag check is only for transition due to changed structure size in 1.20)
+ */
+NPY_NO_EXPORT int
+_buffer_info_free(void *buffer_info, PyObject *obj)
+{
+ _buffer_info_t *untagged_buffer_info;
+ if (_buffer_info_untag(buffer_info, &untagged_buffer_info, obj) < 0) {
+ return -1;
}
- Py_XDECREF(item_list);
- Py_XDECREF(key);
- return NULL;
+ _buffer_info_free_untagged(untagged_buffer_info);
+ return 0;
}
-/* Clear buffer info from the global dictionary */
-static void
-_buffer_clear_info(PyObject *arr)
+
+/*
+ * Get the buffer info, returning either the old one (passed in) or a new
+ * buffer info which holds on to the old one and replaces it as the head of
+ * the stored list.
+ */
+static _buffer_info_t*
+_buffer_get_info(void **buffer_info_cache_ptr, PyObject *obj, int flags)
{
- PyObject *key, *item_list, *item;
- _buffer_info_t *info;
- int k;
+ _buffer_info_t *info = NULL;
+ _buffer_info_t *stored_info; /* First currently stored buffer info */
- if (_buffer_info_cache == NULL) {
- return;
+ if (_buffer_info_untag(*buffer_info_cache_ptr, &stored_info, obj) < 0) {
+ return NULL;
}
+ _buffer_info_t *old_info = stored_info;
- key = PyLong_FromVoidPtr((void*)arr);
- item_list = PyDict_GetItem(_buffer_info_cache, key);
- if (item_list != NULL) {
- for (k = 0; k < PyList_GET_SIZE(item_list); ++k) {
- item = PyList_GET_ITEM(item_list, k);
- info = (_buffer_info_t*)PyLong_AsVoidPtr(item);
- _buffer_info_free(info);
+ /* Compute information (it would be nice to skip this in simple cases) */
+ info = _buffer_info_new(obj, flags);
+ if (info == NULL) {
+ return NULL;
+ }
+
+ if (old_info != NULL && _buffer_info_cmp(info, old_info) != 0) {
+ _buffer_info_t *next_info = old_info->next;
+ old_info = NULL; /* Can't use this one, but possibly next */
+
+ if (info->ndim > 1 && next_info != NULL) {
+ /*
+ * Some arrays are C- and F-contiguous and if they have more
+ * than one dimension, the buffer-info may differ between
+ * the two due to RELAXED_STRIDES_CHECKING.
+ * If we export both buffers, the first stored one may be
+ * the one for the other contiguity, so check both.
+ * In all other cases a mismatch is very unlikely, since the first
+ * entry will match unless the array metadata was modified in-place
+ * (which is discouraged).
+ */
+ if (_buffer_info_cmp(info, next_info) == 0) {
+ old_info = next_info;
+ }
+ }
+ }
+ if (old_info != NULL) {
+ /*
+ * The two info->format are considered equal if one of them
+ * has no format set (meaning the format is arbitrary and can
+ * be modified). If the new info has a format, but we reuse
+ * the old one, this transfers the ownership to the old one.
+ */
+ if (old_info->format == NULL) {
+ old_info->format = info->format;
+ info->format = NULL;
}
- PyDict_DelItem(_buffer_info_cache, key);
+ _buffer_info_free_untagged(info);
+ info = old_info;
+ }
+ else {
+ /* Insert new info as first item in the linked buffer-info list. */
+ info->next = stored_info;
+ *buffer_info_cache_ptr = buffer_info_tag(info);
}
- Py_DECREF(key);
+ return info;
}
+
/*
* Retrieving buffers for ndarray
*/
@@ -705,8 +788,9 @@ array_getbuffer(PyObject *obj, Py_buffer *view, int flags)
goto fail;
}
- /* Fill in information */
- info = _buffer_get_info(obj);
+ /* Fill in information (and add it to _buffer_info if necessary) */
+ info = _buffer_get_info(
+ &((PyArrayObject_fields *)self)->_buffer_info, obj, flags);
if (info == NULL) {
goto fail;
}
@@ -742,35 +826,6 @@ array_getbuffer(PyObject *obj, Py_buffer *view, int flags)
}
if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) {
view->strides = info->strides;
-
-#ifdef NPY_RELAXED_STRIDES_CHECKING
- /*
- * If NPY_RELAXED_STRIDES_CHECKING is on, the array may be
- * contiguous, but it won't look that way to Python when it
- * tries to determine contiguity by looking at the strides
- * (since one of the elements may be -1). In that case, just
- * regenerate strides from shape.
- */
- if (PyArray_CHKFLAGS(self, NPY_ARRAY_C_CONTIGUOUS) &&
- !((flags & PyBUF_F_CONTIGUOUS) == PyBUF_F_CONTIGUOUS)) {
- Py_ssize_t sd = view->itemsize;
- int i;
-
- for (i = view->ndim-1; i >= 0; --i) {
- view->strides[i] = sd;
- sd *= view->shape[i];
- }
- }
- else if (PyArray_CHKFLAGS(self, NPY_ARRAY_F_CONTIGUOUS)) {
- Py_ssize_t sd = view->itemsize;
- int i;
-
- for (i = 0; i < view->ndim; ++i) {
- view->strides[i] = sd;
- sd *= view->shape[i];
- }
- }
-#endif
}
else {
view->strides = NULL;
@@ -785,90 +840,48 @@ fail:
}
/*
- * Retrieving buffers for scalars
+ * Retrieving buffers for a void scalar (which can contain arbitrarily
+ * complex types); defined in buffer.c since it requires the format-string
+ * building logic.
*/
-int
+NPY_NO_EXPORT int
void_getbuffer(PyObject *self, Py_buffer *view, int flags)
{
- _buffer_info_t *info = NULL;
- PyArray_Descr *descr = NULL;
- int elsize;
+ PyVoidScalarObject *scalar = (PyVoidScalarObject *)self;
if (flags & PyBUF_WRITABLE) {
PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
- goto fail;
- }
-
- /* Fill in information */
- info = _buffer_get_info(self);
- if (info == NULL) {
- goto fail;
- }
-
- view->ndim = info->ndim;
- view->shape = info->shape;
- view->strides = info->strides;
-
- if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) {
- view->format = info->format;
- } else {
- view->format = NULL;
- }
-
- descr = PyArray_DescrFromScalar(self);
- view->buf = (void *)scalar_value(self, descr);
- elsize = descr->elsize;
- view->len = elsize;
- if (PyArray_IsScalar(self, Datetime) || PyArray_IsScalar(self, Timedelta)) {
- elsize = 1; /* descr->elsize,char is 8,'M', but we return 1,'B' */
+ return -1;
}
- view->itemsize = elsize;
-
- Py_DECREF(descr);
+ view->ndim = 0;
+ view->shape = NULL;
+ view->strides = NULL;
+ view->suboffsets = NULL;
+ view->len = scalar->descr->elsize;
+ view->itemsize = scalar->descr->elsize;
view->readonly = 1;
view->suboffsets = NULL;
- view->obj = self;
Py_INCREF(self);
- return 0;
-
-fail:
- view->obj = NULL;
- return -1;
-}
-
-/*
- * NOTE: for backward compatibility (esp. with PyArg_ParseTuple("s#", ...))
- * we do *not* define bf_releasebuffer at all.
- *
- * Instead, any extra data allocated with the buffer is released only in
- * array_dealloc.
- *
- * Ensuring that the buffer stays in place is taken care by refcounting;
- * ndarrays do not reallocate if there are references to them, and a buffer
- * view holds one reference.
- */
-
-NPY_NO_EXPORT void
-_dealloc_cached_buffer_info(PyObject *self)
-{
- int reset_error_state = 0;
- PyObject *ptype, *pvalue, *ptraceback;
-
- /* This function may be called when processing an exception --
- * we need to stash the error state to avoid confusing PyDict
- */
+ view->obj = self;
+ view->buf = scalar->obval;
- if (PyErr_Occurred()) {
- reset_error_state = 1;
- PyErr_Fetch(&ptype, &pvalue, &ptraceback);
+ if (((flags & PyBUF_FORMAT) != PyBUF_FORMAT)) {
+ /* It is unnecessary to find the correct format */
+ view->format = NULL;
+ return 0;
}
- _buffer_clear_info(self);
-
- if (reset_error_state) {
- PyErr_Restore(ptype, pvalue, ptraceback);
+ /*
+ * If a format is being exported, we need to use _buffer_get_info
+ * to find the correct format. This format must also be stored, since
+ * at least in theory it can change (in practice it should never change).
+ */
+ _buffer_info_t *info = _buffer_get_info(&scalar->_buffer_info, self, flags);
+ if (info == NULL) {
+ return -1;
}
+ view->format = info->format;
+ return 0;
}
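As a Python-level illustration of the buffer the new void_getbuffer exports (not part of the patch; the exact PEP 3118 format string depends on the dtype):

    import numpy as np

    # A structured (void) scalar taken out of a record array
    rec = np.zeros(2, dtype=[('a', '<i4'), ('b', '<f8')])[0]

    m = memoryview(rec)                      # goes through void_getbuffer
    assert m.ndim == 0                       # exported as a 0-d buffer
    assert m.readonly                        # scalar buffers are always read-only
    assert m.itemsize == rec.dtype.itemsize
    print(m.format)                          # structured format, only computed when
                                             # PyBUF_FORMAT was requested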
@@ -931,7 +944,7 @@ _descriptor_from_pep3118_format(char const *s)
}
*p = '\0';
- str = PyUString_FromStringAndSize(buf, strlen(buf));
+ str = PyUnicode_FromStringAndSize(buf, strlen(buf));
if (str == NULL) {
free(buf);
return NULL;
diff --git a/numpy/core/src/multiarray/calculation.c b/numpy/core/src/multiarray/calculation.c
index 92ab75053..43d88271b 100644
--- a/numpy/core/src/multiarray/calculation.c
+++ b/numpy/core/src/multiarray/calculation.c
@@ -392,7 +392,7 @@ __New_PyArray_Std(PyArrayObject *self, int axis, int rtype, PyArrayObject *out,
else {
val = PyArray_DIM(arrnew,i);
}
- PyTuple_SET_ITEM(newshape, i, PyInt_FromLong((long)val));
+ PyTuple_SET_ITEM(newshape, i, PyLong_FromLong((long)val));
}
arr2 = (PyArrayObject *)PyArray_Reshape(arr1, newshape);
Py_DECREF(arr1);
@@ -1023,7 +1023,7 @@ PyArray_Clip(PyArrayObject *self, PyObject *min, PyObject *max, PyArrayObject *o
if (min != NULL) {
if (PyArray_ISUNSIGNED(self)) {
int cmp;
- zero = PyInt_FromLong(0);
+ zero = PyLong_FromLong(0);
cmp = PyObject_RichCompareBool(min, zero, Py_LT);
if (cmp == -1) {
Py_DECREF(zero);
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index 2abc79167..841ed799d 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -12,7 +12,6 @@
#include "abstractdtypes.h"
#include "usertypes.h"
-#include "common.h"
#include "npy_buffer.h"
#include "get_attr_string.h"
@@ -127,26 +126,6 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims, PyArray_Descr **out_dtype)
return 0;
}
-
-/* new reference */
-NPY_NO_EXPORT PyArray_Descr *
-_array_typedescr_fromstr(char const *c_str)
-{
- PyArray_Descr *descr = NULL;
- PyObject *stringobj = PyString_FromString(c_str);
-
- if (stringobj == NULL) {
- return NULL;
- }
- if (PyArray_DescrConverter(stringobj, &descr) != NPY_SUCCEED) {
- Py_DECREF(stringobj);
- return NULL;
- }
- Py_DECREF(stringobj);
- return descr;
-}
-
-
NPY_NO_EXPORT char *
index2ptr(PyArrayObject *mp, npy_intp i)
{
@@ -169,7 +148,7 @@ NPY_NO_EXPORT int
_zerofill(PyArrayObject *ret)
{
if (PyDataType_REFCHK(PyArray_DESCR(ret))) {
- PyObject *zero = PyInt_FromLong(0);
+ PyObject *zero = PyLong_FromLong(0);
PyArray_FillObjectArray(ret, zero);
Py_DECREF(zero);
if (PyErr_Occurred()) {
@@ -254,7 +233,6 @@ NPY_NO_EXPORT PyObject *
convert_shape_to_string(npy_intp n, npy_intp const *vals, char *ending)
{
npy_intp i;
- PyObject *ret, *tmp;
/*
* Negative dimension indicates "newaxis", which can
@@ -264,40 +242,40 @@ convert_shape_to_string(npy_intp n, npy_intp const *vals, char *ending)
for (i = 0; i < n && vals[i] < 0; i++);
if (i == n) {
- return PyUString_FromFormat("()%s", ending);
- }
- else {
- ret = PyUString_FromFormat("(%" NPY_INTP_FMT, vals[i++]);
- if (ret == NULL) {
- return NULL;
- }
+ return PyUnicode_FromFormat("()%s", ending);
}
+ PyObject *ret = PyUnicode_FromFormat("%" NPY_INTP_FMT, vals[i++]);
+ if (ret == NULL) {
+ return NULL;
+ }
for (; i < n; ++i) {
+ PyObject *tmp;
+
if (vals[i] < 0) {
- tmp = PyUString_FromString(",newaxis");
+ tmp = PyUnicode_FromString(",newaxis");
}
else {
- tmp = PyUString_FromFormat(",%" NPY_INTP_FMT, vals[i]);
+ tmp = PyUnicode_FromFormat(",%" NPY_INTP_FMT, vals[i]);
}
if (tmp == NULL) {
Py_DECREF(ret);
return NULL;
}
- PyUString_ConcatAndDel(&ret, tmp);
+ Py_SETREF(ret, PyUnicode_Concat(ret, tmp));
+ Py_DECREF(tmp);
if (ret == NULL) {
return NULL;
}
}
if (i == 1) {
- tmp = PyUString_FromFormat(",)%s", ending);
+ Py_SETREF(ret, PyUnicode_FromFormat("(%S,)%s", ret, ending));
}
else {
- tmp = PyUString_FromFormat(")%s", ending);
+ Py_SETREF(ret, PyUnicode_FromFormat("(%S)%s", ret, ending));
}
- PyUString_ConcatAndDel(&ret, tmp);
return ret;
}
@@ -310,7 +288,7 @@ dot_alignment_error(PyArrayObject *a, int i, PyArrayObject *b, int j)
*shape1 = NULL, *shape2 = NULL,
*shape1_i = NULL, *shape2_j = NULL;
- format = PyUString_FromString("shapes %s and %s not aligned:"
+ format = PyUnicode_FromString("shapes %s and %s not aligned:"
" %d (dim %d) != %d (dim %d)");
shape1 = convert_shape_to_string(PyArray_NDIM(a), PyArray_DIMS(a), "");
@@ -333,7 +311,7 @@ dot_alignment_error(PyArrayObject *a, int i, PyArrayObject *b, int j)
goto end;
}
- errmsg = PyUString_Format(format, fmt_args);
+ errmsg = PyUnicode_Format(format, fmt_args);
if (errmsg != NULL) {
PyErr_SetObject(PyExc_ValueError, errmsg);
}
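The shape strings built by convert_shape_to_string end up, among other places, in the np.dot alignment error above; a quick illustrative check from Python:

    import numpy as np

    try:
        np.dot(np.ones((3, 4)), np.ones((2, 5)))
    except ValueError as exc:
        # e.g. "shapes (3,4) and (2,5) not aligned: 4 (dim 1) != 2 (dim 0)"
        print(exc)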
@@ -373,10 +351,7 @@ _unpack_field(PyObject *value, PyArray_Descr **descr, npy_intp *offset)
*descr = (PyArray_Descr *)PyTuple_GET_ITEM(value, 0);
off = PyTuple_GET_ITEM(value, 1);
- if (PyInt_Check(off)) {
- *offset = PyInt_AsSsize_t(off);
- }
- else if (PyLong_Check(off)) {
+ if (PyLong_Check(off)) {
*offset = PyLong_AsSsize_t(off);
}
else {
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 793cefaf8..ef9bc79da 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -2,7 +2,6 @@
#define _NPY_PRIVATE_COMMON_H_
#include "structmember.h"
#include <numpy/npy_common.h>
-#include <numpy/npy_cpu.h>
#include <numpy/ndarraytypes.h>
#include <limits.h>
#include "npy_import.h"
@@ -292,43 +291,6 @@ npy_memchr(char * haystack, char needle,
return p;
}
-/*
- * Convert NumPy stride to BLAS stride. Returns 0 if conversion cannot be done
- * (BLAS won't handle negative or zero strides the way we want).
- */
-static NPY_INLINE int
-blas_stride(npy_intp stride, unsigned itemsize)
-{
- /*
- * Should probably check pointer alignment also, but this may cause
- * problems if we require complex to be 16 byte aligned.
- */
- if (stride > 0 && npy_is_aligned((void *)stride, itemsize)) {
- stride /= itemsize;
-#ifndef HAVE_BLAS_ILP64
- if (stride <= INT_MAX) {
-#else
- if (stride <= NPY_MAX_INT64) {
-#endif
- return stride;
- }
- }
- return 0;
-}
-
-/*
- * Define a chunksize for CBLAS. CBLAS counts in integers.
- */
-#if NPY_MAX_INTP > INT_MAX
-# ifndef HAVE_BLAS_ILP64
-# define NPY_CBLAS_CHUNK (INT_MAX / 2 + 1)
-# else
-# define NPY_CBLAS_CHUNK (NPY_MAX_INT64 / 2 + 1)
-# endif
-#else
-# define NPY_CBLAS_CHUNK NPY_MAX_INTP
-#endif
-
#include "ucsnarrow.h"
/*
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index a8e4aa789..da857071b 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -249,7 +249,7 @@ arr__monotonicity(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
NPY_END_THREADS
Py_DECREF(arr_x);
- return PyInt_FromLong(monotonic);
+ return PyLong_FromLong(monotonic);
}
/*
@@ -1229,41 +1229,6 @@ arr_unravel_index(PyObject *self, PyObject *args, PyObject *kwds)
char *kwlist[] = {"indices", "shape", "order", NULL};
- /*
- * TODO: remove this in favor of warning raised in the dispatcher when
- * __array_function__ is enabled by default.
- */
-
- /*
- * Continue to support the older "dims" argument in place
- * of the "shape" argument. Issue an appropriate warning
- * if "dims" is detected in keywords, then replace it with
- * the new "shape" argument and continue processing as usual.
- */
- if (kwds) {
- PyObject *dims_item, *shape_item;
- dims_item = _PyDict_GetItemStringWithError(kwds, "dims");
- if (dims_item == NULL && PyErr_Occurred()){
- return NULL;
- }
- shape_item = _PyDict_GetItemStringWithError(kwds, "shape");
- if (shape_item == NULL && PyErr_Occurred()){
- return NULL;
- }
- if (dims_item != NULL && shape_item == NULL) {
- if (DEPRECATE("'shape' argument should be"
- " used instead of 'dims'") < 0) {
- return NULL;
- }
- if (PyDict_SetItemString(kwds, "shape", dims_item) < 0) {
- return NULL;
- }
- if (PyDict_DelItemString(kwds, "dims") < 0) {
- return NULL;
- }
- }
- }
-
if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&|O&:unravel_index",
kwlist,
&indices0,
@@ -1420,7 +1385,7 @@ arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args)
{
PyObject *obj;
PyObject *str;
- #if (PY_VERSION_HEX >= 0x030700A2)
+ #if PY_VERSION_HEX >= 0x030700A2 && (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM > 0x07030300)
const char *docstr;
#else
char *docstr;
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
index e41fdc8f1..dd18f71fd 100644
--- a/numpy/core/src/multiarray/conversion_utils.c
+++ b/numpy/core/src/multiarray/conversion_utils.c
@@ -6,7 +6,6 @@
#define _MULTIARRAYMODULE
#include "numpy/arrayobject.h"
#include "numpy/arrayscalars.h"
-#include "numpy/arrayobject.h"
#include "npy_config.h"
#include "npy_pycompat.h"
@@ -1152,7 +1151,7 @@ PyArray_IntTupleFromIntp(int len, npy_intp const *vals)
}
for (i = 0; i < len; i++) {
#if NPY_SIZEOF_INTP <= NPY_SIZEOF_LONG
- PyObject *o = PyInt_FromLong((long) vals[i]);
+ PyObject *o = PyLong_FromLong((long) vals[i]);
#else
PyObject *o = PyLong_FromLongLong((npy_longlong) vals[i]);
#endif
diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c
index e7cbeaa77..29a2bb0e8 100644
--- a/numpy/core/src/multiarray/convert.c
+++ b/numpy/core/src/multiarray/convert.c
@@ -8,9 +8,6 @@
#define _MULTIARRAYMODULE
#include "numpy/arrayobject.h"
#include "numpy/arrayscalars.h"
-
-#include "npy_config.h"
-
#include "npy_pycompat.h"
#include "common.h"
@@ -248,13 +245,13 @@ PyArray_ToFile(PyArrayObject *self, FILE *fp, char *sep, char *format)
return -1;
}
PyTuple_SET_ITEM(tupobj,0,obj);
- obj = PyUString_FromString((const char *)format);
+ obj = PyUnicode_FromString((const char *)format);
if (obj == NULL) {
Py_DECREF(tupobj);
Py_DECREF(it);
return -1;
}
- strobj = PyUString_Format(obj, tupobj);
+ strobj = PyUnicode_Format(obj, tupobj);
Py_DECREF(obj);
Py_DECREF(tupobj);
if (strobj == NULL) {
@@ -403,7 +400,7 @@ PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj)
}
}
/* Python integer */
- else if (PyLong_Check(obj) || PyInt_Check(obj)) {
+ else if (PyLong_Check(obj)) {
/* Try long long before unsigned long long */
npy_longlong ll_v = PyLong_AsLongLong(obj);
if (error_converting(ll_v)) {
diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c
index 94cd1e5fa..f9dd35a73 100644
--- a/numpy/core/src/multiarray/convert_datatype.c
+++ b/numpy/core/src/multiarray/convert_datatype.c
@@ -18,10 +18,13 @@
#include "dtypemeta.h"
#include "scalartypes.h"
#include "mapping.h"
+#include "legacy_dtype_implementation.h"
#include "convert_datatype.h"
#include "_datetime.h"
#include "datetime_strings.h"
+#include "array_method.h"
+#include "usertypes.h"
/*
@@ -35,6 +38,183 @@
*/
NPY_NO_EXPORT npy_intp REQUIRED_STR_LEN[] = {0, 3, 5, 10, 10, 20, 20, 20, 20};
+
+static PyObject *
+PyArray_GetGenericToVoidCastingImpl(void);
+
+static PyObject *
+PyArray_GetVoidToGenericCastingImpl(void);
+
+static PyObject *
+PyArray_GetGenericToObjectCastingImpl(void);
+
+static PyObject *
+PyArray_GetObjectToGenericCastingImpl(void);
+
+
+/**
+ * Fetch the casting implementation from one DType to another.
+ *
+ * @param from
+ * @param to
+ *
+ * @returns A castingimpl (PyArrayMethodObject *), None, or NULL with an
+ * error set.
+ */
+static PyObject *
+PyArray_GetCastingImpl(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to)
+{
+ PyObject *res = PyDict_GetItem(from->castingimpls, (PyObject *)to);
+ if (res != NULL || PyErr_Occurred()) {
+ Py_XINCREF(res);
+ return res;
+ }
+ /*
+ * The following code looks up CastingImpl based on the fact that anything
+ * can be cast to and from objects or structured (void) dtypes.
+ *
+ * The last part adds casts dynamically based on legacy definition
+ */
+ if (from->type_num == NPY_OBJECT) {
+ res = PyArray_GetObjectToGenericCastingImpl();
+ }
+ else if (to->type_num == NPY_OBJECT) {
+ res = PyArray_GetGenericToObjectCastingImpl();
+ }
+ else if (from->type_num == NPY_VOID) {
+ res = PyArray_GetVoidToGenericCastingImpl();
+ }
+ else if (to->type_num == NPY_VOID) {
+ res = PyArray_GetGenericToVoidCastingImpl();
+ }
+ else if (from->type_num < NPY_NTYPES && to->type_num < NPY_NTYPES) {
+ /* All builtin dtypes have their casts explicitly defined. */
+ PyErr_Format(PyExc_RuntimeError,
+                "builtin cast from %S to %S not found, this should not "
+ "be possible.", from, to);
+ return NULL;
+ }
+ else {
+ if (from->parametric || to->parametric) {
+ Py_RETURN_NONE;
+ }
+ /* Reject non-legacy dtypes (they need to use the new API) */
+ if (!from->legacy || !to->legacy) {
+ Py_RETURN_NONE;
+ }
+ if (from != to) {
+ /* A cast function must have been registered */
+ PyArray_VectorUnaryFunc *castfunc = PyArray_GetCastFunc(
+ from->singleton, to->type_num);
+ if (castfunc == NULL) {
+ PyErr_Clear();
+ /* Remember that this cast is not possible */
+ if (PyDict_SetItem(from->castingimpls, (PyObject *) to, Py_None) < 0) {
+ return NULL;
+ }
+ Py_RETURN_NONE;
+ }
+ }
+
+        /* PyArray_AddLegacyWrapping_CastingImpl finds the correct casting level: */
+ /*
+ * TODO: Possibly move this to the cast registration time. But if we do
+ * that, we have to also update the cast when the casting safety
+ * is registered.
+ */
+ if (PyArray_AddLegacyWrapping_CastingImpl(from, to, -1) < 0) {
+ return NULL;
+ }
+ return PyArray_GetCastingImpl(from, to);
+ }
+
+ if (res == NULL) {
+ return NULL;
+ }
+ if (PyDict_SetItem(from->castingimpls, (PyObject *)to, res) < 0) {
+ Py_DECREF(res);
+ return NULL;
+ }
+ return res;
+}
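The lookup above is a lazily filled per-DType cache, with Py_None as the sentinel for "no cast is possible". A minimal Python sketch of the same pattern (all names here are illustrative, not NumPy API):

    def get_casting_impl(cache, from_dtype, to_dtype, build_impl):
        """Return a cached casting implementation, or None if no cast exists."""
        if to_dtype in cache:                    # fast path: resolved before
            return cache[to_dtype]
        impl = build_impl(from_dtype, to_dtype)  # may legitimately return None
        cache[to_dtype] = impl                   # remember the result, even "no cast"
        return impl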
+
+
+/**
+ * Fetch the (bound) casting implementation from one DType to another.
+ *
+ * @param from
+ * @param to
+ *
+ * @returns A bound casting implementation or None (or NULL for error).
+ */
+static PyObject *
+PyArray_GetBoundCastingImpl(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to)
+{
+ PyObject *method = PyArray_GetCastingImpl(from, to);
+ if (method == NULL || method == Py_None) {
+ return method;
+ }
+
+ /* TODO: Create better way to wrap method into bound method */
+ PyBoundArrayMethodObject *res;
+ res = PyObject_New(PyBoundArrayMethodObject, &PyBoundArrayMethod_Type);
+ if (res == NULL) {
+ return NULL;
+ }
+ res->method = (PyArrayMethodObject *)method;
+ res->dtypes = PyMem_Malloc(2 * sizeof(PyArray_DTypeMeta *));
+ if (res->dtypes == NULL) {
+ Py_DECREF(res);
+ return NULL;
+ }
+ Py_INCREF(from);
+ res->dtypes[0] = from;
+ Py_INCREF(to);
+ res->dtypes[1] = to;
+
+ return (PyObject *)res;
+}
+
+
+NPY_NO_EXPORT PyObject *
+_get_castingimpl(PyObject *NPY_UNUSED(module), PyObject *args)
+{
+ PyArray_DTypeMeta *from, *to;
+ if (!PyArg_ParseTuple(args, "O!O!:_get_castingimpl",
+ &PyArrayDTypeMeta_Type, &from, &PyArrayDTypeMeta_Type, &to)) {
+ return NULL;
+ }
+ return PyArray_GetBoundCastingImpl(from, to);
+}
+
+
+/**
+ * Find the minimal cast safety level given two cast levels as input.
+ * Supports the _NPY_CAST_IS_VIEW flag, and should be preferred over direct
+ * comparisons so that cast levels can be extended if necessary.
+ * It is valid for either argument to be -1 (an error value); in that case
+ * -1 is propagated to the result.
+ *
+ * @param casting1
+ * @param casting2
+ * @return The minimal casting safety, i.e. the least safe of the two levels
+ *         (can be -1 on error).
+ */
+NPY_NO_EXPORT NPY_CASTING
+PyArray_MinCastSafety(NPY_CASTING casting1, NPY_CASTING casting2)
+{
+ if (casting1 < 0 || casting2 < 0) {
+ return -1;
+ }
+ NPY_CASTING view = casting1 & casting2 & _NPY_CAST_IS_VIEW;
+ casting1 = casting1 & ~_NPY_CAST_IS_VIEW;
+ casting2 = casting2 & ~_NPY_CAST_IS_VIEW;
+ /* larger casting values are less safe */
+ if (casting1 > casting2) {
+ return casting1 | view;
+ }
+ return casting2 | view;
+}
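PyArray_MinCastSafety keeps the less safe of the two levels and keeps the view flag only when both levels carry it. A Python restatement of that logic (the NPY_CASTING values follow the usual ordering; the _NPY_CAST_IS_VIEW bit value is an assumption for illustration only):

    NPY_NO_CASTING, NPY_EQUIV_CASTING, NPY_SAFE_CASTING = 0, 1, 2
    NPY_SAME_KIND_CASTING, NPY_UNSAFE_CASTING = 3, 4
    _NPY_CAST_IS_VIEW = 1 << 16            # assumed flag value, illustration only

    def min_cast_safety(casting1, casting2):
        if casting1 < 0 or casting2 < 0:
            return -1                       # propagate error values
        view = casting1 & casting2 & _NPY_CAST_IS_VIEW
        casting1 &= ~_NPY_CAST_IS_VIEW
        casting2 &= ~_NPY_CAST_IS_VIEW
        # larger casting values are less safe, so take the maximum
        return max(casting1, casting2) | view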
+
+
/*NUMPY_API
* For backward compatibility
*
@@ -92,11 +272,14 @@ PyArray_GetCastFunc(PyArray_Descr *descr, int type_num)
PyObject *key;
PyObject *cobj;
- key = PyInt_FromLong(type_num);
+ key = PyLong_FromLong(type_num);
cobj = PyDict_GetItem(obj, key);
Py_DECREF(key);
- if (cobj && NpyCapsule_Check(cobj)) {
- castfunc = NpyCapsule_AsVoidPtr(cobj);
+ if (cobj && PyCapsule_CheckExact(cobj)) {
+ castfunc = PyCapsule_GetPointer(cobj, NULL);
+ if (castfunc == NULL) {
+ return NULL;
+ }
}
}
}
@@ -129,170 +312,6 @@ PyArray_GetCastFunc(PyArray_Descr *descr, int type_num)
return NULL;
}
-/*
- * Legacy function to find the correct dtype when casting from any built-in
- * dtype to NPY_STRING, NPY_UNICODE, NPY_VOID, and NPY_DATETIME with generic
- * units.
- *
- * This function returns a dtype based on flex_dtype and the values in
- * data_dtype. It also calls Py_DECREF on the flex_dtype. If the
- * flex_dtype is not flexible, it returns it as-is.
- *
- * Usually, if data_obj is not an array, dtype should be the result
- * given by the PyArray_GetArrayParamsFromObject function.
- *
- * If *flex_dtype is NULL, returns immediately, without setting an
- * exception, leaving any previous error handling intact.
- */
-NPY_NO_EXPORT PyArray_Descr *
-PyArray_AdaptFlexibleDType(PyArray_Descr *data_dtype, PyArray_Descr *flex_dtype)
-{
- PyArray_DatetimeMetaData *meta;
- PyArray_Descr *retval = NULL;
- int flex_type_num;
-
- if (flex_dtype == NULL) {
- return retval;
- }
-
- flex_type_num = flex_dtype->type_num;
-
- /* Flexible types with expandable size */
- if (PyDataType_ISUNSIZED(flex_dtype)) {
- /* First replace the flex_dtype */
- retval = PyArray_DescrNew(flex_dtype);
- Py_DECREF(flex_dtype);
- if (retval == NULL) {
- return retval;
- }
-
- if (data_dtype->type_num == flex_type_num ||
- flex_type_num == NPY_VOID) {
- (retval)->elsize = data_dtype->elsize;
- }
- else if (flex_type_num == NPY_STRING || flex_type_num == NPY_UNICODE) {
- npy_intp size = 8;
-
- /*
- * Get a string-size estimate of the input. These
- * are generallly the size needed, rounded up to
- * a multiple of eight.
- */
- switch (data_dtype->type_num) {
- case NPY_BOOL:
- case NPY_UBYTE:
- case NPY_BYTE:
- case NPY_USHORT:
- case NPY_SHORT:
- case NPY_UINT:
- case NPY_INT:
- case NPY_ULONG:
- case NPY_LONG:
- case NPY_ULONGLONG:
- case NPY_LONGLONG:
- if (data_dtype->kind == 'b') {
- /* 5 chars needed for cast to 'True' or 'False' */
- size = 5;
- }
- else if (data_dtype->elsize > 8 ||
- data_dtype->elsize < 0) {
- /*
- * Element size should never be greater than 8 or
- * less than 0 for integer type, but just in case...
- */
- break;
- }
- else if (data_dtype->kind == 'u') {
- size = REQUIRED_STR_LEN[data_dtype->elsize];
- }
- else if (data_dtype->kind == 'i') {
- /* Add character for sign symbol */
- size = REQUIRED_STR_LEN[data_dtype->elsize] + 1;
- }
- break;
- case NPY_HALF:
- case NPY_FLOAT:
- case NPY_DOUBLE:
- size = 32;
- break;
- case NPY_LONGDOUBLE:
- size = 48;
- break;
- case NPY_CFLOAT:
- case NPY_CDOUBLE:
- size = 2 * 32;
- break;
- case NPY_CLONGDOUBLE:
- size = 2 * 48;
- break;
- case NPY_OBJECT:
- size = 64;
- break;
- case NPY_STRING:
- case NPY_VOID:
- size = data_dtype->elsize;
- break;
- case NPY_UNICODE:
- size = data_dtype->elsize / 4;
- break;
- case NPY_DATETIME:
- meta = get_datetime_metadata_from_dtype(data_dtype);
- if (meta == NULL) {
- Py_DECREF(retval);
- return NULL;
- }
- size = get_datetime_iso_8601_strlen(0, meta->base);
- break;
- case NPY_TIMEDELTA:
- size = 21;
- break;
- }
-
- if (flex_type_num == NPY_STRING) {
- retval->elsize = size;
- }
- else if (flex_type_num == NPY_UNICODE) {
- retval->elsize = size * 4;
- }
- }
- else {
- /*
- * We should never get here, but just in case someone adds
- * a new flex dtype...
- */
- PyErr_SetString(PyExc_TypeError,
- "don't know how to adapt flex dtype");
- Py_DECREF(retval);
- return NULL;
- }
- }
- /* Flexible type with generic time unit that adapts */
- else if (flex_type_num == NPY_DATETIME ||
- flex_type_num == NPY_TIMEDELTA) {
- meta = get_datetime_metadata_from_dtype(flex_dtype);
- retval = flex_dtype;
- if (meta == NULL) {
- return NULL;
- }
-
- if (meta->base == NPY_FR_GENERIC) {
- if (data_dtype->type_num == NPY_DATETIME ||
- data_dtype->type_num == NPY_TIMEDELTA) {
- meta = get_datetime_metadata_from_dtype(data_dtype);
- if (meta == NULL) {
- return NULL;
- }
-
- retval = create_datetime_dtype(flex_type_num, meta);
- Py_DECREF(flex_dtype);
- }
- }
- }
- else {
- retval = flex_dtype;
- }
- return retval;
-}
/*
* Must be broadcastable.
@@ -322,60 +341,115 @@ PyArray_CastAnyTo(PyArrayObject *out, PyArrayObject *mp)
return PyArray_CopyAnyInto(out, mp);
}
+
+/**
+ * Given two dtype instances, find the correct casting safety.
+ *
+ * Note that in many cases, it may be preferable to fetch the casting
+ * implementations fully to have them available for doing the actual cast
+ * later.
+ *
+ * @param from
+ * @param to The descriptor to cast to (may be NULL)
+ * @param to_dtype If `to` is NULL, must pass the to_dtype (otherwise this
+ * is ignored).
+ * @return NPY_CASTING or -1 on error or if the cast is not possible.
+ */
+NPY_NO_EXPORT NPY_CASTING
+PyArray_GetCastSafety(
+ PyArray_Descr *from, PyArray_Descr *to, PyArray_DTypeMeta *to_dtype)
+{
+ NPY_CASTING casting;
+ if (to != NULL) {
+ to_dtype = NPY_DTYPE(to);
+ }
+ PyObject *meth = PyArray_GetCastingImpl(NPY_DTYPE(from), to_dtype);
+ if (meth == NULL) {
+ return -1;
+ }
+ if (meth == Py_None) {
+ Py_DECREF(Py_None);
+ return -1;
+ }
+
+ PyArrayMethodObject *castingimpl = (PyArrayMethodObject *)meth;
+
+ PyArray_DTypeMeta *dtypes[2] = {NPY_DTYPE(from), to_dtype};
+ PyArray_Descr *descrs[2] = {from, to};
+ PyArray_Descr *out_descrs[2];
+
+ casting = castingimpl->resolve_descriptors(
+ castingimpl, dtypes, descrs, out_descrs);
+ Py_DECREF(meth);
+ if (casting < 0) {
+ return -1;
+ }
+ /* The returned descriptors may not match, requiring a second check */
+ if (out_descrs[0] != descrs[0]) {
+ NPY_CASTING from_casting = PyArray_GetCastSafety(
+ descrs[0], out_descrs[0], NULL);
+ casting = PyArray_MinCastSafety(casting, from_casting);
+ if (casting < 0) {
+ goto finish;
+ }
+ }
+ if (descrs[1] != NULL && out_descrs[1] != descrs[1]) {
+ NPY_CASTING from_casting = PyArray_GetCastSafety(
+ descrs[1], out_descrs[1], NULL);
+ casting = PyArray_MinCastSafety(casting, from_casting);
+ if (casting < 0) {
+ goto finish;
+ }
+ }
+
+ finish:
+ Py_DECREF(out_descrs[0]);
+ Py_DECREF(out_descrs[1]);
+ /* NPY_NO_CASTING has to be used for (NPY_EQUIV_CASTING|_NPY_CAST_IS_VIEW) */
+ assert(casting != (NPY_EQUIV_CASTING|_NPY_CAST_IS_VIEW));
+ return casting;
+}
+
+
/*NUMPY_API
*Check the type coercion rules.
*/
NPY_NO_EXPORT int
PyArray_CanCastSafely(int fromtype, int totype)
{
- PyArray_Descr *from;
-
- /* Fast table lookup for small type numbers */
- if ((unsigned int)fromtype < NPY_NTYPES &&
- (unsigned int)totype < NPY_NTYPES) {
- return _npy_can_cast_safely_table[fromtype][totype];
+#if NPY_USE_NEW_CASTINGIMPL
+ PyArray_DTypeMeta *from = PyArray_DTypeFromTypeNum(fromtype);
+ if (from == NULL) {
+ PyErr_WriteUnraisable(NULL);
+ return 0;
}
-
- /* Identity */
- if (fromtype == totype) {
- return 1;
+ PyArray_DTypeMeta *to = PyArray_DTypeFromTypeNum(totype);
+ if (to == NULL) {
+ PyErr_WriteUnraisable(NULL);
+ return 0;
}
- /* Special-cases for some types */
- switch (fromtype) {
- case NPY_DATETIME:
- case NPY_TIMEDELTA:
- case NPY_OBJECT:
- case NPY_VOID:
- return 0;
- case NPY_BOOL:
- return 1;
+ PyObject *castingimpl = PyArray_GetCastingImpl(from, to);
+ Py_DECREF(from);
+ Py_DECREF(to);
+
+ if (castingimpl == NULL) {
+ PyErr_WriteUnraisable(NULL);
+ return 0;
}
- switch (totype) {
- case NPY_BOOL:
- case NPY_DATETIME:
- case NPY_TIMEDELTA:
- return 0;
- case NPY_OBJECT:
- case NPY_VOID:
- return 1;
+ else if (castingimpl == Py_None) {
+ Py_DECREF(Py_None);
+ return 0;
}
+ NPY_CASTING safety = ((PyArrayMethodObject *)castingimpl)->casting;
+ int res = PyArray_MinCastSafety(safety, NPY_SAFE_CASTING) == NPY_SAFE_CASTING;
+ Py_DECREF(castingimpl);
+ return res;
+#else
+ return PyArray_LegacyCanCastSafely(fromtype, totype);
+#endif
+}
- from = PyArray_DescrFromType(fromtype);
- /*
- * cancastto is a NPY_NOTYPE terminated C-int-array of types that
- * the data-type can be cast to safely.
- */
- if (from->f->cancastto) {
- int *curtype = from->f->cancastto;
- while (*curtype != NPY_NOTYPE) {
- if (*curtype++ == totype) {
- return 1;
- }
- }
- }
- return 0;
-}
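The rewrite does not change the Python-visible answer for plain "safe" casting between builtin types, for example:

    import numpy as np

    assert np.can_cast(np.int32, np.int64)        # widening integer cast
    assert not np.can_cast(np.float64, np.int64)  # float -> int is not safe
    assert np.can_cast(np.bool_, np.float32)      # bool casts safely to numeric types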
/*NUMPY_API
* leaves reference count alone --- cannot be NULL
@@ -386,117 +460,16 @@ PyArray_CanCastSafely(int fromtype, int totype)
NPY_NO_EXPORT npy_bool
PyArray_CanCastTo(PyArray_Descr *from, PyArray_Descr *to)
{
- int from_type_num = from->type_num;
- int to_type_num = to->type_num;
- npy_bool ret;
-
- ret = (npy_bool) PyArray_CanCastSafely(from_type_num, to_type_num);
- if (ret) {
- /* Check String and Unicode more closely */
- if (from_type_num == NPY_STRING) {
- if (to_type_num == NPY_STRING) {
- ret = (from->elsize <= to->elsize);
- }
- else if (to_type_num == NPY_UNICODE) {
- ret = (from->elsize << 2 <= to->elsize);
- }
- }
- else if (from_type_num == NPY_UNICODE) {
- if (to_type_num == NPY_UNICODE) {
- ret = (from->elsize <= to->elsize);
- }
- }
- /*
- * For datetime/timedelta, only treat casts moving towards
- * more precision as safe.
- */
- else if (from_type_num == NPY_DATETIME && to_type_num == NPY_DATETIME) {
- PyArray_DatetimeMetaData *meta1, *meta2;
- meta1 = get_datetime_metadata_from_dtype(from);
- if (meta1 == NULL) {
- PyErr_Clear();
- return 0;
- }
- meta2 = get_datetime_metadata_from_dtype(to);
- if (meta2 == NULL) {
- PyErr_Clear();
- return 0;
- }
-
- return can_cast_datetime64_metadata(meta1, meta2,
- NPY_SAFE_CASTING);
- }
- else if (from_type_num == NPY_TIMEDELTA &&
- to_type_num == NPY_TIMEDELTA) {
- PyArray_DatetimeMetaData *meta1, *meta2;
- meta1 = get_datetime_metadata_from_dtype(from);
- if (meta1 == NULL) {
- PyErr_Clear();
- return 0;
- }
- meta2 = get_datetime_metadata_from_dtype(to);
- if (meta2 == NULL) {
- PyErr_Clear();
- return 0;
- }
-
- return can_cast_timedelta64_metadata(meta1, meta2,
- NPY_SAFE_CASTING);
- }
- /*
- * If to_type_num is STRING or unicode
- * see if the length is long enough to hold the
- * stringified value of the object.
- */
- else if (to_type_num == NPY_STRING || to_type_num == NPY_UNICODE) {
- /*
- * Boolean value cast to string type is 5 characters max
- * for string 'False'.
- */
- int char_size = 1;
- if (to_type_num == NPY_UNICODE) {
- char_size = 4;
- }
-
- ret = 0;
- if (PyDataType_ISUNSIZED(to)) {
- ret = 1;
- }
- /*
- * Need at least 5 characters to convert from boolean
- * to 'True' or 'False'.
- */
- else if (from->kind == 'b' && to->elsize >= 5 * char_size) {
- ret = 1;
- }
- else if (from->kind == 'u') {
- /* Guard against unexpected integer size */
- if (from->elsize > 8 || from->elsize < 0) {
- ret = 0;
- }
- else if (to->elsize >=
- REQUIRED_STR_LEN[from->elsize] * char_size) {
- ret = 1;
- }
- }
- else if (from->kind == 'i') {
- /* Guard against unexpected integer size */
- if (from->elsize > 8 || from->elsize < 0) {
- ret = 0;
- }
- /* Extra character needed for sign */
- else if (to->elsize >=
- (REQUIRED_STR_LEN[from->elsize] + 1) * char_size) {
- ret = 1;
- }
- }
- }
- }
- return ret;
+#if NPY_USE_NEW_CASTINGIMPL
+ return PyArray_CanCastTypeTo(from, to, NPY_SAFE_CASTING);
+#else
+ return PyArray_LegacyCanCastTo(from, to);
+#endif
}
+
/* Provides an ordering for the dtype 'kind' character codes */
-static int
+NPY_NO_EXPORT int
dtype_kind_to_ordering(char kind)
{
switch (kind) {
@@ -557,51 +530,6 @@ type_num_unsigned_to_signed(int type_num)
}
}
-/*
- * Compare two field dictionaries for castability.
- *
- * Return 1 if 'field1' can be cast to 'field2' according to the rule
- * 'casting', 0 if not.
- *
- * Castabiliy of field dictionaries is defined recursively: 'field1' and
- * 'field2' must have the same field names (possibly in different
- * orders), and the corresponding field types must be castable according
- * to the given casting rule.
- */
-static int
-can_cast_fields(PyObject *field1, PyObject *field2, NPY_CASTING casting)
-{
- Py_ssize_t ppos;
- PyObject *key;
- PyObject *tuple1, *tuple2;
-
- if (field1 == field2) {
- return 1;
- }
- if (field1 == NULL || field2 == NULL) {
- return 0;
- }
- if (PyDict_Size(field1) != PyDict_Size(field2)) {
- return 0;
- }
-
- /* Iterate over all the fields and compare for castability */
- ppos = 0;
- while (PyDict_Next(field1, &ppos, &key, &tuple1)) {
- if ((tuple2 = PyDict_GetItem(field2, key)) == NULL) {
- return 0;
- }
- /* Compare the dtype of the field for castability */
- if (!PyArray_CanCastTypeTo(
- (PyArray_Descr *)PyTuple_GET_ITEM(tuple1, 0),
- (PyArray_Descr *)PyTuple_GET_ITEM(tuple2, 0),
- casting)) {
- return 0;
- }
- }
-
- return 1;
-}
/*NUMPY_API
* Returns true if data of type 'from' may be cast to data of type
@@ -609,224 +537,41 @@ can_cast_fields(PyObject *field1, PyObject *field2, NPY_CASTING casting)
*/
NPY_NO_EXPORT npy_bool
PyArray_CanCastTypeTo(PyArray_Descr *from, PyArray_Descr *to,
- NPY_CASTING casting)
+ NPY_CASTING casting)
{
+#if NPY_USE_NEW_CASTINGIMPL
/*
- * Fast paths for equality and for basic types.
+     * NOTE: This code supports U and S; this is identical to the code
+ * in `ctors.c` which does not allow these dtypes to be attached
+ * to an array. Unlike the code for `np.array(..., dtype=)`
+ * which uses `PyArray_ExtractDTypeAndDescriptor` it rejects "m8"
+ * as a flexible dtype instance representing a DType.
*/
- if (from == to ||
- ((NPY_LIKELY(PyDataType_ISNUMBER(from)) ||
- PyDataType_ISOBJECT(from)) &&
- NPY_LIKELY(from->type_num == to->type_num) &&
- NPY_LIKELY(from->byteorder == to->byteorder))) {
- return 1;
- }
/*
- * Cases with subarrays and fields need special treatment.
+ * TODO: We should grow support for `np.can_cast("d", "S")` being
+ * different from `np.can_cast("d", "S0")` here, at least for
+     * the Python-side API.
*/
- if (PyDataType_HASFIELDS(from)) {
- /*
- * If from is a structured data type, then it can be cast to a simple
- * non-object one only for unsafe casting *and* if it has a single
- * field; recurse just in case the single field is itself structured.
- */
- if (!PyDataType_HASFIELDS(to) && !PyDataType_ISOBJECT(to)) {
- if (casting == NPY_UNSAFE_CASTING &&
- PyDict_Size(from->fields) == 1) {
- Py_ssize_t ppos = 0;
- PyObject *tuple;
- PyArray_Descr *field;
- PyDict_Next(from->fields, &ppos, NULL, &tuple);
- field = (PyArray_Descr *)PyTuple_GET_ITEM(tuple, 0);
- /*
- * For a subarray, we need to get the underlying type;
- * since we already are casting unsafely, we can ignore
- * the shape.
- */
- if (PyDataType_HASSUBARRAY(field)) {
- field = field->subarray->base;
- }
- return PyArray_CanCastTypeTo(field, to, casting);
- }
- else {
- return 0;
- }
- }
- /*
- * Casting from one structured data type to another depends on the fields;
- * we pass that case on to the EquivTypenums case below.
- *
- * TODO: move that part up here? Need to check whether equivalent type
- * numbers is an addition constraint that is needed.
- *
- * TODO/FIXME: For now, always allow structured to structured for unsafe
- * casting; this is not correct, but needed since the treatment in can_cast
- * below got out of sync with astype; see gh-13667.
- */
- if (casting == NPY_UNSAFE_CASTING) {
- return 1;
- }
+ NPY_CASTING safety;
+ if (PyDataType_ISUNSIZED(to) && to->subarray == NULL) {
+ safety = PyArray_GetCastSafety(from, NULL, NPY_DTYPE(to));
}
- else if (PyDataType_HASFIELDS(to)) {
- /*
- * If "from" is a simple data type and "to" has fields, then only
- * unsafe casting works (and that works always, even to multiple fields).
- */
- return casting == NPY_UNSAFE_CASTING;
- }
- /*
- * Everything else we consider castable for unsafe for now.
- * FIXME: ensure what we do here is consistent with "astype",
- * i.e., deal more correctly with subarrays and user-defined dtype.
- */
- else if (casting == NPY_UNSAFE_CASTING) {
- return 1;
- }
- /*
- * Equivalent simple types can be cast with any value of 'casting', but
- * we need to be careful about structured to structured.
- */
- if (PyArray_EquivTypenums(from->type_num, to->type_num)) {
- /* For complicated case, use EquivTypes (for now) */
- if (PyTypeNum_ISUSERDEF(from->type_num) ||
- from->subarray != NULL) {
- int ret;
-
- /* Only NPY_NO_CASTING prevents byte order conversion */
- if ((casting != NPY_NO_CASTING) &&
- (!PyArray_ISNBO(from->byteorder) ||
- !PyArray_ISNBO(to->byteorder))) {
- PyArray_Descr *nbo_from, *nbo_to;
-
- nbo_from = PyArray_DescrNewByteorder(from, NPY_NATIVE);
- nbo_to = PyArray_DescrNewByteorder(to, NPY_NATIVE);
- if (nbo_from == NULL || nbo_to == NULL) {
- Py_XDECREF(nbo_from);
- Py_XDECREF(nbo_to);
- PyErr_Clear();
- return 0;
- }
- ret = PyArray_EquivTypes(nbo_from, nbo_to);
- Py_DECREF(nbo_from);
- Py_DECREF(nbo_to);
- }
- else {
- ret = PyArray_EquivTypes(from, to);
- }
- return ret;
- }
-
- if (PyDataType_HASFIELDS(from)) {
- switch (casting) {
- case NPY_EQUIV_CASTING:
- case NPY_SAFE_CASTING:
- case NPY_SAME_KIND_CASTING:
- /*
- * `from' and `to' must have the same fields, and
- * corresponding fields must be (recursively) castable.
- */
- return can_cast_fields(from->fields, to->fields, casting);
-
- case NPY_NO_CASTING:
- default:
- return PyArray_EquivTypes(from, to);
- }
- }
-
- switch (from->type_num) {
- case NPY_DATETIME: {
- PyArray_DatetimeMetaData *meta1, *meta2;
- meta1 = get_datetime_metadata_from_dtype(from);
- if (meta1 == NULL) {
- PyErr_Clear();
- return 0;
- }
- meta2 = get_datetime_metadata_from_dtype(to);
- if (meta2 == NULL) {
- PyErr_Clear();
- return 0;
- }
-
- if (casting == NPY_NO_CASTING) {
- return PyArray_ISNBO(from->byteorder) ==
- PyArray_ISNBO(to->byteorder) &&
- can_cast_datetime64_metadata(meta1, meta2, casting);
- }
- else {
- return can_cast_datetime64_metadata(meta1, meta2, casting);
- }
- }
- case NPY_TIMEDELTA: {
- PyArray_DatetimeMetaData *meta1, *meta2;
- meta1 = get_datetime_metadata_from_dtype(from);
- if (meta1 == NULL) {
- PyErr_Clear();
- return 0;
- }
- meta2 = get_datetime_metadata_from_dtype(to);
- if (meta2 == NULL) {
- PyErr_Clear();
- return 0;
- }
-
- if (casting == NPY_NO_CASTING) {
- return PyArray_ISNBO(from->byteorder) ==
- PyArray_ISNBO(to->byteorder) &&
- can_cast_timedelta64_metadata(meta1, meta2, casting);
- }
- else {
- return can_cast_timedelta64_metadata(meta1, meta2, casting);
- }
- }
- default:
- switch (casting) {
- case NPY_NO_CASTING:
- return PyArray_EquivTypes(from, to);
- case NPY_EQUIV_CASTING:
- return (from->elsize == to->elsize);
- case NPY_SAFE_CASTING:
- return (from->elsize <= to->elsize);
- default:
- return 1;
- }
- break;
- }
+ else {
+ safety = PyArray_GetCastSafety(from, to, NPY_DTYPE(to));
}
- /* If safe or same-kind casts are allowed */
- else if (casting == NPY_SAFE_CASTING || casting == NPY_SAME_KIND_CASTING) {
- if (PyArray_CanCastTo(from, to)) {
- return 1;
- }
- else if(casting == NPY_SAME_KIND_CASTING) {
- /*
- * Also allow casting from lower to higher kinds, according
- * to the ordering provided by dtype_kind_to_ordering.
- * Some kinds, like datetime, don't fit in the hierarchy,
- * and are special cased as -1.
- */
- int from_order, to_order;
-
- from_order = dtype_kind_to_ordering(from->kind);
- to_order = dtype_kind_to_ordering(to->kind);
- if (to->kind == 'm') {
- /* both types being timedelta is already handled before. */
- int integer_order = dtype_kind_to_ordering('i');
- return (from_order != -1) && (from_order <= integer_order);
- }
-
- return (from_order != -1) && (from_order <= to_order);
- }
- else {
- return 0;
- }
- }
- /* NPY_NO_CASTING or NPY_EQUIV_CASTING was specified */
- else {
+ if (safety < 0) {
+ PyErr_Clear();
return 0;
}
+    /* The cast is allowed if `casting` is the less safe (or equal) of the two */
+ return PyArray_MinCastSafety(safety, casting) == casting;
+#else
+ return PyArray_LegacyCanCastTypeTo(from, to, casting);
+#endif
}
+
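With the new path the casting argument is compared against the resolved safety via PyArray_MinCastSafety, reproducing the familiar Python-level rules, e.g.:

    import numpy as np

    assert np.can_cast('f4', 'c8', casting='safe')       # float32 -> complex64
    assert np.can_cast('>f8', '<f8', casting='equiv')    # pure byte swap
    assert np.can_cast('i8', 'i4', casting='same_kind')  # demotion within a kind
    assert not np.can_cast('i8', 'i4', casting='safe')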
/* CanCastArrayTo needs this function */
static int min_scalar_type_num(char *valueptr, int type_num,
int *is_small_unsigned);
@@ -1035,7 +780,7 @@ promote_types(PyArray_Descr *type1, PyArray_Descr *type2,
* Returns a new reference to type if it is already NBO, otherwise
* returns a copy converted to NBO.
*/
-static PyArray_Descr *
+NPY_NO_EXPORT PyArray_Descr *
ensure_dtype_nbo(PyArray_Descr *type)
{
if (PyArray_ISNBO(type->byteorder)) {
@@ -1047,327 +792,178 @@ ensure_dtype_nbo(PyArray_Descr *type)
}
}
-/*NUMPY_API
- * Produces the smallest size and lowest kind type to which both
- * input types can be cast.
+
+/**
+ * This function should possibly become public API eventually. At this
+ * time it is implemented by falling back to `PyArray_AdaptFlexibleDType`.
+ * We will use `CastingImpl[from, to].resolve_descriptors(...)` to implement
+ * this logic.
+ * Before that, the API needs to be reviewed though.
+ *
+ * WARNING: This function currently does not guarantee that `descr` can
+ * actually be cast to the given DType.
+ *
+ * @param descr The dtype instance to adapt "cast"
+ * @param given_DType The DType class for which we wish to find an instance able
+ * to represent `descr`.
+ * @returns Instance of `given_DType`. If `given_DType` is parametric the
+ * descr may be adapted to hold it.
*/
NPY_NO_EXPORT PyArray_Descr *
-PyArray_PromoteTypes(PyArray_Descr *type1, PyArray_Descr *type2)
+PyArray_CastDescrToDType(PyArray_Descr *descr, PyArray_DTypeMeta *given_DType)
{
- int type_num1, type_num2, ret_type_num;
-
- /*
- * Fast path for identical dtypes.
- *
- * Non-native-byte-order types are converted to native ones below, so we
- * can't quit early.
- */
- if (type1 == type2 && PyArray_ISNBO(type1->byteorder)) {
- Py_INCREF(type1);
- return type1;
+ if (NPY_DTYPE(descr) == given_DType) {
+ Py_INCREF(descr);
+ return descr;
}
-
- type_num1 = type1->type_num;
- type_num2 = type2->type_num;
-
- /* If they're built-in types, use the promotion table */
- if (type_num1 < NPY_NTYPES && type_num2 < NPY_NTYPES) {
- ret_type_num = _npy_type_promotion_table[type_num1][type_num2];
+ if (!given_DType->parametric) {
/*
- * The table doesn't handle string/unicode/void/datetime/timedelta,
- * so check the result
+         * Don't actually do anything; the default descriptor is always
+         * the result of any cast.
*/
- if (ret_type_num >= 0) {
- return PyArray_DescrFromType(ret_type_num);
- }
+ return given_DType->default_descr(given_DType);
+ }
+ if (PyObject_TypeCheck((PyObject *)descr, (PyTypeObject *)given_DType)) {
+ Py_INCREF(descr);
+ return descr;
+ }
+
+#if NPY_USE_NEW_CASTINGIMPL
+ PyObject *tmp = PyArray_GetCastingImpl(NPY_DTYPE(descr), given_DType);
+ if (tmp == NULL || tmp == Py_None) {
+ Py_XDECREF(tmp);
+ goto error;
+ }
+ PyArray_DTypeMeta *dtypes[2] = {NPY_DTYPE(descr), given_DType};
+ PyArray_Descr *given_descrs[2] = {descr, NULL};
+ PyArray_Descr *loop_descrs[2];
+
+ PyArrayMethodObject *meth = (PyArrayMethodObject *)tmp;
+ NPY_CASTING casting = meth->resolve_descriptors(
+ meth, dtypes, given_descrs, loop_descrs);
+ Py_DECREF(tmp);
+ if (casting < 0) {
+ goto error;
+ }
+ Py_DECREF(loop_descrs[0]);
+ return loop_descrs[1];
+
+ error:; /* (; due to compiler limitations) */
+ PyObject *err_type = NULL, *err_value = NULL, *err_traceback = NULL;
+ PyErr_Fetch(&err_type, &err_value, &err_traceback);
+ PyErr_Format(PyExc_ValueError,
+ "cannot cast dtype %S to %S.", descr, given_DType);
+ npy_PyErr_ChainExceptions(err_type, err_value, err_traceback);
+ return NULL;
+
+#else /* NPY_USE_NEW_CASTS */
+ if (!given_DType->legacy) {
+ PyErr_SetString(PyExc_NotImplementedError,
+ "Must use casting to find the correct DType for a parametric "
+ "user DType. This is not yet implemented (this error should be "
+ "unreachable).");
+ return NULL;
}
- /* If one or both are user defined, calculate it */
- else {
- int skind1 = NPY_NOSCALAR, skind2 = NPY_NOSCALAR, skind;
-
- if (PyArray_CanCastTo(type2, type1)) {
- /* Promoted types are always native byte order */
- return ensure_dtype_nbo(type1);
- }
- else if (PyArray_CanCastTo(type1, type2)) {
- /* Promoted types are always native byte order */
- return ensure_dtype_nbo(type2);
- }
-
- /* Convert the 'kind' char into a scalar kind */
- switch (type1->kind) {
- case 'b':
- skind1 = NPY_BOOL_SCALAR;
- break;
- case 'u':
- skind1 = NPY_INTPOS_SCALAR;
- break;
- case 'i':
- skind1 = NPY_INTNEG_SCALAR;
- break;
- case 'f':
- skind1 = NPY_FLOAT_SCALAR;
- break;
- case 'c':
- skind1 = NPY_COMPLEX_SCALAR;
- break;
- }
- switch (type2->kind) {
- case 'b':
- skind2 = NPY_BOOL_SCALAR;
- break;
- case 'u':
- skind2 = NPY_INTPOS_SCALAR;
- break;
- case 'i':
- skind2 = NPY_INTNEG_SCALAR;
- break;
- case 'f':
- skind2 = NPY_FLOAT_SCALAR;
- break;
- case 'c':
- skind2 = NPY_COMPLEX_SCALAR;
- break;
- }
-
- /* If both are scalars, there may be a promotion possible */
- if (skind1 != NPY_NOSCALAR && skind2 != NPY_NOSCALAR) {
-
- /* Start with the larger scalar kind */
- skind = (skind1 > skind2) ? skind1 : skind2;
- ret_type_num = _npy_smallest_type_of_kind_table[skind];
-
- for (;;) {
-
- /* If there is no larger type of this kind, try a larger kind */
- if (ret_type_num < 0) {
- ++skind;
- /* Use -1 to signal no promoted type found */
- if (skind < NPY_NSCALARKINDS) {
- ret_type_num = _npy_smallest_type_of_kind_table[skind];
- }
- else {
- break;
- }
- }
- /* If we found a type to which we can promote both, done! */
- if (PyArray_CanCastSafely(type_num1, ret_type_num) &&
- PyArray_CanCastSafely(type_num2, ret_type_num)) {
- return PyArray_DescrFromType(ret_type_num);
- }
+ PyArray_Descr *flex_dtype = PyArray_DescrNew(given_DType->singleton);
+ return PyArray_AdaptFlexibleDType(descr, flex_dtype);
+#endif /* NPY_USE_NEW_CASTS */
+}
- /* Try the next larger type of this kind */
- ret_type_num = _npy_next_larger_type_table[ret_type_num];
- }
- }
+/**
+ * This function defines the common DType operator.
+ *
+ * Note that the common DType will not be "object" (unless one of the dtypes
+ * is object), even though object can technically represent all values
+ * correctly.
+ *
+ * TODO: Before exposure, we should review the return value (e.g. no error
+ * when no common DType is found).
+ *
+ * @param dtype1 DType class to find the common type for.
+ * @param dtype2 Second DType class.
+ * @return The common DType or NULL with an error set
+ */
+NPY_NO_EXPORT PyArray_DTypeMeta *
+PyArray_CommonDType(PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2)
+{
+ if (dtype1 == dtype2) {
+ Py_INCREF(dtype1);
+ return dtype1;
+ }
- PyErr_SetString(PyExc_TypeError,
- "invalid type promotion with custom data type");
+ PyArray_DTypeMeta *common_dtype;
+
+ common_dtype = dtype1->common_dtype(dtype1, dtype2);
+ if (common_dtype == (PyArray_DTypeMeta *)Py_NotImplemented) {
+ Py_DECREF(common_dtype);
+ common_dtype = dtype2->common_dtype(dtype2, dtype1);
+ }
+ if (common_dtype == NULL) {
return NULL;
}
+ if (common_dtype == (PyArray_DTypeMeta *)Py_NotImplemented) {
+ Py_DECREF(Py_NotImplemented);
+ PyErr_Format(PyExc_TypeError,
+ "The DTypes %S and %S do not have a common DType. "
+ "For example they cannot be stored in a single array unless "
+ "the dtype is `object`.", dtype1, dtype2);
+ return NULL;
+ }
+ return common_dtype;
+}
- switch (type_num1) {
- /* BOOL can convert to anything except datetime/void */
- case NPY_BOOL:
- if (type_num2 == NPY_STRING || type_num2 == NPY_UNICODE) {
- int char_size = 1;
- if (type_num2 == NPY_UNICODE) {
- char_size = 4;
- }
- if (type2->elsize < 5 * char_size) {
- PyArray_Descr *ret = NULL;
- PyArray_Descr *temp = PyArray_DescrNew(type2);
- ret = ensure_dtype_nbo(temp);
- ret->elsize = 5 * char_size;
- Py_DECREF(temp);
- return ret;
- }
- return ensure_dtype_nbo(type2);
- }
- else if (type_num2 != NPY_DATETIME && type_num2 != NPY_VOID) {
- return ensure_dtype_nbo(type2);
- }
- break;
- /* For strings and unicodes, take the larger size */
- case NPY_STRING:
- if (type_num2 == NPY_STRING) {
- if (type1->elsize > type2->elsize) {
- return ensure_dtype_nbo(type1);
- }
- else {
- return ensure_dtype_nbo(type2);
- }
- }
- else if (type_num2 == NPY_UNICODE) {
- if (type2->elsize >= type1->elsize * 4) {
- return ensure_dtype_nbo(type2);
- }
- else {
- PyArray_Descr *d = PyArray_DescrNewFromType(NPY_UNICODE);
- if (d == NULL) {
- return NULL;
- }
- d->elsize = type1->elsize * 4;
- return d;
- }
- }
- /* Allow NUMBER -> STRING */
- else if (PyTypeNum_ISNUMBER(type_num2)) {
- PyArray_Descr *ret = NULL;
- PyArray_Descr *temp = PyArray_DescrNew(type1);
- PyDataType_MAKEUNSIZED(temp);
- temp = PyArray_AdaptFlexibleDType(type2, temp);
- if (temp == NULL) {
- return NULL;
- }
- if (temp->elsize > type1->elsize) {
- ret = ensure_dtype_nbo(temp);
- }
- else {
- ret = ensure_dtype_nbo(type1);
- }
- Py_DECREF(temp);
- return ret;
- }
- break;
- case NPY_UNICODE:
- if (type_num2 == NPY_UNICODE) {
- if (type1->elsize > type2->elsize) {
- return ensure_dtype_nbo(type1);
- }
- else {
- return ensure_dtype_nbo(type2);
- }
- }
- else if (type_num2 == NPY_STRING) {
- if (type1->elsize >= type2->elsize * 4) {
- return ensure_dtype_nbo(type1);
- }
- else {
- PyArray_Descr *d = PyArray_DescrNewFromType(NPY_UNICODE);
- if (d == NULL) {
- return NULL;
- }
- d->elsize = type2->elsize * 4;
- return d;
- }
- }
- /* Allow NUMBER -> UNICODE */
- else if (PyTypeNum_ISNUMBER(type_num2)) {
- PyArray_Descr *ret = NULL;
- PyArray_Descr *temp = PyArray_DescrNew(type1);
- PyDataType_MAKEUNSIZED(temp);
- temp = PyArray_AdaptFlexibleDType(type2, temp);
- if (temp == NULL) {
- return NULL;
- }
- if (temp->elsize > type1->elsize) {
- ret = ensure_dtype_nbo(temp);
- }
- else {
- ret = ensure_dtype_nbo(type1);
- }
- Py_DECREF(temp);
- return ret;
- }
- break;
- case NPY_DATETIME:
- case NPY_TIMEDELTA:
- if (type_num2 == NPY_DATETIME || type_num2 == NPY_TIMEDELTA) {
- return datetime_type_promotion(type1, type2);
- }
- break;
+/*NUMPY_API
+ * Produces the smallest size and lowest kind type to which both
+ * input types can be cast.
+ */
+NPY_NO_EXPORT PyArray_Descr *
+PyArray_PromoteTypes(PyArray_Descr *type1, PyArray_Descr *type2)
+{
+ PyArray_DTypeMeta *common_dtype;
+ PyArray_Descr *res;
+
+ /* Fast path for identical inputs (NOTE: This path preserves metadata!) */
+ if (type1 == type2 && PyArray_ISNBO(type1->byteorder)) {
+ Py_INCREF(type1);
+ return type1;
}
- switch (type_num2) {
- /* BOOL can convert to almost anything */
- case NPY_BOOL:
- if (type_num2 == NPY_STRING || type_num2 == NPY_UNICODE) {
- int char_size = 1;
- if (type_num2 == NPY_UNICODE) {
- char_size = 4;
- }
- if (type2->elsize < 5 * char_size) {
- PyArray_Descr *ret = NULL;
- PyArray_Descr *temp = PyArray_DescrNew(type2);
- ret = ensure_dtype_nbo(temp);
- ret->elsize = 5 * char_size;
- Py_DECREF(temp);
- return ret;
- }
- return ensure_dtype_nbo(type2);
- }
- else if (type_num1 != NPY_DATETIME && type_num1 != NPY_TIMEDELTA &&
- type_num1 != NPY_VOID) {
- return ensure_dtype_nbo(type1);
- }
- break;
- case NPY_STRING:
- /* Allow NUMBER -> STRING */
- if (PyTypeNum_ISNUMBER(type_num1)) {
- PyArray_Descr *ret = NULL;
- PyArray_Descr *temp = PyArray_DescrNew(type2);
- PyDataType_MAKEUNSIZED(temp);
- temp = PyArray_AdaptFlexibleDType(type1, temp);
- if (temp == NULL) {
- return NULL;
- }
- if (temp->elsize > type2->elsize) {
- ret = ensure_dtype_nbo(temp);
- }
- else {
- ret = ensure_dtype_nbo(type2);
- }
- Py_DECREF(temp);
- return ret;
- }
- break;
- case NPY_UNICODE:
- /* Allow NUMBER -> UNICODE */
- if (PyTypeNum_ISNUMBER(type_num1)) {
- PyArray_Descr *ret = NULL;
- PyArray_Descr *temp = PyArray_DescrNew(type2);
- PyDataType_MAKEUNSIZED(temp);
- temp = PyArray_AdaptFlexibleDType(type1, temp);
- if (temp == NULL) {
- return NULL;
- }
- if (temp->elsize > type2->elsize) {
- ret = ensure_dtype_nbo(temp);
- }
- else {
- ret = ensure_dtype_nbo(type2);
- }
- Py_DECREF(temp);
- return ret;
- }
- break;
- case NPY_TIMEDELTA:
- if (PyTypeNum_ISSIGNED(type_num1)) {
- return ensure_dtype_nbo(type2);
- }
- break;
+ common_dtype = PyArray_CommonDType(NPY_DTYPE(type1), NPY_DTYPE(type2));
+ if (common_dtype == NULL) {
+ return NULL;
}
- /* For types equivalent up to endianness, can return either */
- if (PyArray_CanCastTypeTo(type1, type2, NPY_EQUIV_CASTING)) {
- return ensure_dtype_nbo(type1);
+ if (!common_dtype->parametric) {
+ res = common_dtype->default_descr(common_dtype);
+ Py_DECREF(common_dtype);
+ return res;
}
- /* TODO: Also combine fields, subarrays, strings, etc */
+ /* Cast the input types to the common DType if necessary */
+ type1 = PyArray_CastDescrToDType(type1, common_dtype);
+ if (type1 == NULL) {
+ Py_DECREF(common_dtype);
+ return NULL;
+ }
+ type2 = PyArray_CastDescrToDType(type2, common_dtype);
+ if (type2 == NULL) {
+ Py_DECREF(type1);
+ Py_DECREF(common_dtype);
+ return NULL;
+ }
/*
- printf("invalid type promotion: ");
- PyObject_Print(type1, stdout, 0);
- printf(" ");
- PyObject_Print(type2, stdout, 0);
- printf("\n");
- */
- PyErr_SetString(PyExc_TypeError, "invalid type promotion");
- return NULL;
+ * And find the common instance of the two inputs
+     * NOTE: Common instance preserves metadata (normally that of one input)
+ */
+ res = common_dtype->common_instance(type1, type2);
+ Py_DECREF(type1);
+ Py_DECREF(type2);
+ Py_DECREF(common_dtype);
+ return res;
}
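The new promotion path (common DType first, then common instance) reproduces the long-standing np.promote_types results, for instance:

    import numpy as np

    assert np.promote_types(np.uint8, np.int8) == np.int16  # numeric promotion
    assert np.promote_types('i8', 'f4') == np.float64
    assert np.promote_types('S5', 'U3') == np.dtype('U5')   # common instance of the string DType
    # dtypes without a common DType (other than object), such as
    # datetime64 and float64, raise TypeError instead.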
/*
@@ -1989,7 +1585,7 @@ PyArray_Zero(PyArrayObject *arr)
}
if (zero_obj == NULL) {
- zero_obj = PyInt_FromLong((long) 0);
+ zero_obj = PyLong_FromLong((long) 0);
if (zero_obj == NULL) {
return NULL;
}
@@ -2035,7 +1631,7 @@ PyArray_One(PyArrayObject *arr)
}
if (one_obj == NULL) {
- one_obj = PyInt_FromLong((long) 1);
+ one_obj = PyLong_FromLong((long) 1);
if (one_obj == NULL) {
return NULL;
}
@@ -2202,3 +1798,1108 @@ PyArray_ConvertToCommonType(PyObject *op, int *retn)
PyDataMem_FREE(mps);
return NULL;
}
+
+
+/**
+ * Private function to add a casting implementation by unwrapping a bound
+ * array method.
+ *
+ * @param meth
+ * @return 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_AddCastingImplmentation(PyBoundArrayMethodObject *meth)
+{
+ if (meth->method->nin != 1 || meth->method->nout != 1) {
+ PyErr_SetString(PyExc_TypeError,
+ "A cast must have one input and one output.");
+ return -1;
+ }
+ if (meth->dtypes[0] == meth->dtypes[1]) {
+ if (!(meth->method->flags & NPY_METH_SUPPORTS_UNALIGNED)) {
+ PyErr_Format(PyExc_TypeError,
+ "A cast where input and output DType (class) are identical "
+ "must currently support unaligned data. (method: %s)",
+ meth->method->name);
+ return -1;
+ }
+ if ((meth->method->casting & ~_NPY_CAST_IS_VIEW) != NPY_NO_CASTING) {
+ PyErr_Format(PyExc_TypeError,
+ "A cast where input and output DType (class) are identical "
+ "must signal `no-casting`. (method: %s)",
+ meth->method->name);
+ return -1;
+ }
+ }
+ if (PyDict_Contains(meth->dtypes[0]->castingimpls,
+ (PyObject *)meth->dtypes[1])) {
+ PyErr_Format(PyExc_RuntimeError,
+ "A cast was already added for %S -> %S. (method: %s)",
+ meth->dtypes[0], meth->dtypes[1], meth->method->name);
+ return -1;
+ }
+ if (PyDict_SetItem(meth->dtypes[0]->castingimpls,
+ (PyObject *)meth->dtypes[1], (PyObject *)meth->method) < 0) {
+ return -1;
+ }
+ return 0;
+}
+
+/**
+ * Add a new casting implementation using a PyArrayMethod_Spec.
+ *
+ * @param spec
+ * @param private If private, allow slots not publicly exposed.
+ * @return 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_AddCastingImplementation_FromSpec(PyArrayMethod_Spec *spec, int private)
+{
+ /* Create a bound method, unbind and store it */
+ PyBoundArrayMethodObject *meth = PyArrayMethod_FromSpec_int(spec, private);
+ if (meth == NULL) {
+ return -1;
+ }
+ int res = PyArray_AddCastingImplmentation(meth);
+ Py_DECREF(meth);
+ if (res < 0) {
+ return -1;
+ }
+ return 0;
+}
+
+
+NPY_NO_EXPORT NPY_CASTING
+legacy_same_dtype_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]),
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+
+ if (given_descrs[1] == NULL) {
+ loop_descrs[1] = ensure_dtype_nbo(loop_descrs[0]);
+ if (loop_descrs[1] == NULL) {
+ Py_DECREF(loop_descrs[0]);
+ return -1;
+ }
+ }
+ else {
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+
+ /* this function only makes sense for non-flexible legacy dtypes: */
+ assert(loop_descrs[0]->elsize == loop_descrs[1]->elsize);
+
+ /*
+ * Legacy dtypes (except datetime) only have byte-order and elsize as
+ * storage parameters.
+ */
+ if (PyDataType_ISNOTSWAPPED(loop_descrs[0]) ==
+ PyDataType_ISNOTSWAPPED(loop_descrs[1])) {
+ return NPY_NO_CASTING | _NPY_CAST_IS_VIEW;
+ }
+ return NPY_EQUIV_CASTING;
+}
+
+
+/*
+ * Simple dtype resolver for casting between two different (non-parametric)
+ * (legacy) dtypes.
+ */
+NPY_NO_EXPORT NPY_CASTING
+simple_cast_resolve_descriptors(
+ PyArrayMethodObject *self,
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ assert(dtypes[0]->legacy && dtypes[1]->legacy);
+
+ loop_descrs[0] = ensure_dtype_nbo(given_descrs[0]);
+ if (loop_descrs[0] == NULL) {
+ return -1;
+ }
+ if (given_descrs[1] != NULL) {
+ loop_descrs[1] = ensure_dtype_nbo(given_descrs[1]);
+ if (loop_descrs[1] == NULL) {
+ Py_DECREF(loop_descrs[0]);
+ return -1;
+ }
+ }
+ else {
+ loop_descrs[1] = dtypes[1]->default_descr(dtypes[1]);
+ }
+
+ if (self->casting != NPY_NO_CASTING) {
+ return self->casting;
+ }
+ if (PyDataType_ISNOTSWAPPED(loop_descrs[0]) ==
+ PyDataType_ISNOTSWAPPED(loop_descrs[1])) {
+ return NPY_NO_CASTING | _NPY_CAST_IS_VIEW;
+ }
+ return NPY_EQUIV_CASTING;
+}
+
+
+static int
+add_numeric_cast(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to)
+{
+ PyType_Slot slots[6];
+ PyArray_DTypeMeta *dtypes[2] = {from, to};
+ PyArrayMethod_Spec spec = {
+ .name = "numeric_cast",
+ .nin = 1,
+ .nout = 1,
+ .flags = NPY_METH_SUPPORTS_UNALIGNED,
+ .slots = slots,
+ .dtypes = dtypes,
+ };
+
+ npy_intp from_itemsize = dtypes[0]->singleton->elsize;
+ npy_intp to_itemsize = dtypes[1]->singleton->elsize;
+
+ slots[0].slot = NPY_METH_resolve_descriptors;
+ slots[0].pfunc = &simple_cast_resolve_descriptors;
+ /* Fetch the optimized loops (2<<10 is a non-contiguous stride) */
+ slots[1].slot = NPY_METH_strided_loop;
+ slots[1].pfunc = PyArray_GetStridedNumericCastFn(
+ 1, 2<<10, 2<<10, from->type_num, to->type_num);
+ slots[2].slot = NPY_METH_contiguous_loop;
+ slots[2].pfunc = PyArray_GetStridedNumericCastFn(
+ 1, from_itemsize, to_itemsize, from->type_num, to->type_num);
+ slots[3].slot = NPY_METH_unaligned_strided_loop;
+ slots[3].pfunc = PyArray_GetStridedNumericCastFn(
+ 0, 2<<10, 2<<10, from->type_num, to->type_num);
+ slots[4].slot = NPY_METH_unaligned_contiguous_loop;
+ slots[4].pfunc = PyArray_GetStridedNumericCastFn(
+ 0, from_itemsize, to_itemsize, from->type_num, to->type_num);
+ slots[5].slot = 0;
+ slots[5].pfunc = NULL;
+
+ assert(slots[1].pfunc && slots[2].pfunc && slots[3].pfunc && slots[4].pfunc);
+
+ /* Find the correct casting level, and special case no-cast */
+ if (dtypes[0]->kind == dtypes[1]->kind && from_itemsize == to_itemsize) {
+ spec.casting = NPY_NO_CASTING;
+
+ /* When there is no casting (equivalent C-types) use byteswap loops */
+ slots[0].slot = NPY_METH_resolve_descriptors;
+ slots[0].pfunc = &legacy_same_dtype_resolve_descriptors;
+ slots[1].slot = NPY_METH_get_loop;
+ slots[1].pfunc = NULL;
+ slots[2].slot = 0;
+ slots[2].pfunc = NULL;
+
+ spec.name = "numeric_copy_or_byteswap";
+ spec.flags |= NPY_METH_NO_FLOATINGPOINT_ERRORS;
+ }
+ else if (_npy_can_cast_safely_table[from->type_num][to->type_num]) {
+ spec.casting = NPY_SAFE_CASTING;
+ }
+ else if (dtype_kind_to_ordering(dtypes[0]->kind) <=
+ dtype_kind_to_ordering(dtypes[1]->kind)) {
+ spec.casting = NPY_SAME_KIND_CASTING;
+ }
+ else {
+ spec.casting = NPY_UNSAFE_CASTING;
+ }
+
+ /* Create a bound method, unbind and store it */
+ return PyArray_AddCastingImplementation_FromSpec(&spec, 1);
+}
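The classification chosen above (no-cast / safe / same-kind / unsafe) is what np.can_cast reports for numeric pairs; a few illustrative checks:

    import numpy as np

    assert np.can_cast('i4', 'i4', casting='no')             # same C type
    assert np.can_cast('i2', 'f8', casting='safe')           # lossless per the table
    assert np.can_cast('f8', 'f4', casting='same_kind')      # same kind, lower precision
    assert not np.can_cast('f8', 'i8', casting='same_kind')  # float -> int is unsafe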
+
+
+/*
+ * This registers the castingimpl for all casts between numeric types.
+ * Eventually, this function should likely be defined as part of a .c.src
+ * file to remove `PyArray_GetStridedNumericCastFn` entirely.
+ */
+static int
+PyArray_InitializeNumericCasts(void)
+{
+ for (int from = 0; from < NPY_NTYPES; from++) {
+ if (!PyTypeNum_ISNUMBER(from) && from != NPY_BOOL) {
+ continue;
+ }
+ PyArray_DTypeMeta *from_dt = PyArray_DTypeFromTypeNum(from);
+
+ for (int to = 0; to < NPY_NTYPES; to++) {
+ if (!PyTypeNum_ISNUMBER(to) && to != NPY_BOOL) {
+ continue;
+ }
+ PyArray_DTypeMeta *to_dt = PyArray_DTypeFromTypeNum(to);
+ int res = add_numeric_cast(from_dt, to_dt);
+ Py_DECREF(to_dt);
+ if (res < 0) {
+ Py_DECREF(from_dt);
+ return -1;
+ }
+ }
+ }
+ return 0;
+}
+
+
+static int
+cast_to_string_resolve_descriptors(
+ PyArrayMethodObject *self,
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ /*
+ * NOTE: The following code used to be part of PyArray_AdaptFlexibleDType
+ *
+ * Get a string-size estimate of the input. These
+     * are generally the size needed, rounded up to
+ * a multiple of eight.
+ */
+ npy_intp size = -1;
+ switch (dtypes[0]->type_num) {
+ case NPY_BOOL:
+ case NPY_UBYTE:
+ case NPY_BYTE:
+ case NPY_USHORT:
+ case NPY_SHORT:
+ case NPY_UINT:
+ case NPY_INT:
+ case NPY_ULONG:
+ case NPY_LONG:
+ case NPY_ULONGLONG:
+ case NPY_LONGLONG:
+ assert(dtypes[0]->singleton->elsize <= 8);
+ assert(dtypes[0]->singleton->elsize > 0);
+ if (dtypes[0]->kind == 'b') {
+ /* 5 chars needed for cast to 'True' or 'False' */
+ size = 5;
+ }
+ else if (dtypes[0]->kind == 'u') {
+ size = REQUIRED_STR_LEN[dtypes[0]->singleton->elsize];
+ }
+ else if (dtypes[0]->kind == 'i') {
+ /* Add character for sign symbol */
+ size = REQUIRED_STR_LEN[dtypes[0]->singleton->elsize] + 1;
+ }
+ break;
+ case NPY_HALF:
+ case NPY_FLOAT:
+ case NPY_DOUBLE:
+ size = 32;
+ break;
+ case NPY_LONGDOUBLE:
+ size = 48;
+ break;
+ case NPY_CFLOAT:
+ case NPY_CDOUBLE:
+ size = 2 * 32;
+ break;
+ case NPY_CLONGDOUBLE:
+ size = 2 * 48;
+ break;
+ case NPY_STRING:
+ case NPY_VOID:
+ size = given_descrs[0]->elsize;
+ break;
+ case NPY_UNICODE:
+ size = given_descrs[0]->elsize / 4;
+ break;
+ default:
+ PyErr_SetString(PyExc_SystemError,
+ "Impossible cast to string path requested.");
+ return -1;
+ }
+ if (dtypes[1]->type_num == NPY_UNICODE) {
+ size *= 4;
+ }
+
+ if (given_descrs[1] == NULL) {
+ loop_descrs[1] = PyArray_DescrNewFromType(dtypes[1]->type_num);
+ if (loop_descrs[1] == NULL) {
+ return -1;
+ }
+ loop_descrs[1]->elsize = size;
+ }
+ else {
+ /* The legacy loop can handle mismatching itemsizes */
+ loop_descrs[1] = ensure_dtype_nbo(given_descrs[1]);
+ if (loop_descrs[1] == NULL) {
+ return -1;
+ }
+ }
+
+ /* Set the input one as well (late for easier error management) */
+ loop_descrs[0] = ensure_dtype_nbo(given_descrs[0]);
+ if (loop_descrs[0] == NULL) {
+ return -1;
+ }
+
+ if (self->casting == NPY_UNSAFE_CASTING) {
+ assert(dtypes[0]->type_num == NPY_UNICODE &&
+ dtypes[1]->type_num == NPY_STRING);
+ return NPY_UNSAFE_CASTING;
+ }
+ assert(self->casting == NPY_SAFE_CASTING);
+
+ if (loop_descrs[1]->elsize >= size) {
+ return NPY_SAFE_CASTING;
+ }
+ return NPY_SAME_KIND_CASTING;
+}
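The size estimates above are what resolve the length of `.astype("S")` / `.astype("U")` when no explicit length is given; a small Python illustration of this inherited behaviour (sizes per the switch above):

    import numpy as np

    assert np.array([True]).astype("S").dtype == np.dtype("S5")                # "False"
    assert np.array([1], dtype=np.uint8).astype("S").dtype == np.dtype("S3")
    assert np.array([1], dtype=np.int64).astype("S").dtype == np.dtype("S21")  # 20 digits + sign
    assert np.array([1.0]).astype("S").dtype == np.dtype("S32")
    assert np.array([1.0]).astype("U").dtype == np.dtype("U32")                # size * 4 bytes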
+
+
+static int
+add_other_to_and_from_string_cast(
+ PyArray_DTypeMeta *string, PyArray_DTypeMeta *other)
+{
+ if (string == other) {
+ return 0;
+ }
+
+    /* Casting from string is always a simple legacy-style cast */
+ if (other->type_num != NPY_STRING && other->type_num != NPY_UNICODE) {
+ if (PyArray_AddLegacyWrapping_CastingImpl(
+ string, other, NPY_UNSAFE_CASTING) < 0) {
+ return -1;
+ }
+ }
+ /*
+     * Casting to strings is almost the same, but requires a custom resolver
+ * to define the correct string length. Right now we use a generic function
+ * for this.
+ */
+ PyArray_DTypeMeta *dtypes[2] = {other, string};
+ PyType_Slot slots[] = {
+ {NPY_METH_get_loop, NULL},
+ {NPY_METH_resolve_descriptors, &cast_to_string_resolve_descriptors},
+ {0, NULL}};
+ PyArrayMethod_Spec spec = {
+ .name = "legacy_cast_to_string",
+ .nin = 1,
+ .nout = 1,
+ .flags = NPY_METH_REQUIRES_PYAPI,
+ .dtypes = dtypes,
+ .slots = slots,
+ };
+ /* Almost everything can be safely cast to string (except unicode) */
+ if (other->type_num != NPY_UNICODE) {
+ spec.casting = NPY_SAFE_CASTING;
+ }
+ else {
+ spec.casting = NPY_UNSAFE_CASTING;
+ }
+
+ return PyArray_AddCastingImplementation_FromSpec(&spec, 1);
+}
+
+
+NPY_NO_EXPORT NPY_CASTING
+string_to_string_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]),
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+
+ if (given_descrs[1] == NULL) {
+ loop_descrs[1] = ensure_dtype_nbo(loop_descrs[0]);
+ if (loop_descrs[1] == NULL) {
+ return -1;
+ }
+ }
+ else {
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+
+ if (loop_descrs[0]->elsize == loop_descrs[1]->elsize) {
+ if (PyDataType_ISNOTSWAPPED(loop_descrs[0]) ==
+ PyDataType_ISNOTSWAPPED(loop_descrs[1])) {
+ return NPY_NO_CASTING | _NPY_CAST_IS_VIEW;
+ }
+ else {
+ return NPY_EQUIV_CASTING;
+ }
+ }
+ else if (loop_descrs[0]->elsize <= loop_descrs[1]->elsize) {
+ return NPY_SAFE_CASTING;
+ }
+ return NPY_SAME_KIND_CASTING;
+}
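A sketch of the user-visible consequences of the levels above (the exact `np.can_cast` answers for the borderline levels may depend on the NumPy version in use):

    import numpy as np

    assert np.can_cast("S5", "S5", casting="no")          # identical: view
    assert np.can_cast("S5", "S8", casting="safe")        # growing the length
    assert not np.can_cast("S8", "S5", casting="safe")    # truncation
    assert np.can_cast("S8", "S5", casting="same_kind")
    assert np.can_cast("<U5", ">U5", casting="equiv")     # only byte order differs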
+
+
+/*
+ * Add string casts. Right now all string casts are just legacy-wrapped ones
+ * (except string<->string and unicode<->unicode), but they do require
+ * custom type resolution for the string length.
+ *
+ * A bit like `object`, it could make sense to define a simpler protocol for
+ * string casts; however, we also need to remember that the itemsize of the
+ * output has to be found.
+ */
+static int
+PyArray_InitializeStringCasts(void)
+{
+ int result = -1;
+ PyArray_DTypeMeta *string = PyArray_DTypeFromTypeNum(NPY_STRING);
+ PyArray_DTypeMeta *unicode = PyArray_DTypeFromTypeNum(NPY_UNICODE);
+ PyArray_DTypeMeta *other_dt = NULL;
+
+ /* Add most casts as legacy ones */
+ for (int other = 0; other < NPY_NTYPES; other++) {
+ if (PyTypeNum_ISDATETIME(other) || other == NPY_VOID ||
+ other == NPY_OBJECT) {
+ continue;
+ }
+ other_dt = PyArray_DTypeFromTypeNum(other);
+
+        /* The function skips the string == other_dt and unicode == other_dt cases */
+ if (add_other_to_and_from_string_cast(string, other_dt) < 0) {
+ goto finish;
+ }
+ if (add_other_to_and_from_string_cast(unicode, other_dt) < 0) {
+ goto finish;
+ }
+
+ Py_SETREF(other_dt, NULL);
+ }
+
+ /* string<->string and unicode<->unicode have their own specialized casts */
+ PyArray_DTypeMeta *dtypes[2];
+ PyType_Slot slots[] = {
+ {NPY_METH_get_loop, NULL},
+ {NPY_METH_resolve_descriptors, &string_to_string_resolve_descriptors},
+ {0, NULL}};
+ PyArrayMethod_Spec spec = {
+ .name = "string_to_string_cast",
+ .casting = NPY_NO_CASTING,
+ .nin = 1,
+ .nout = 1,
+ .flags = (NPY_METH_REQUIRES_PYAPI |
+ NPY_METH_NO_FLOATINGPOINT_ERRORS |
+ NPY_METH_SUPPORTS_UNALIGNED),
+ .dtypes = dtypes,
+ .slots = slots,
+ };
+
+ dtypes[0] = string;
+ dtypes[1] = string;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto finish;
+ }
+
+ dtypes[0] = unicode;
+ dtypes[1] = unicode;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto finish;
+ }
+
+ result = 0;
+ finish:
+ Py_DECREF(string);
+ Py_DECREF(unicode);
+ Py_XDECREF(other_dt);
+ return result;
+}
+
+
+/*
+ * Small helper function to handle the case of `arr.astype(dtype="V")`.
+ * When the output descriptor is not passed, we always use `V<itemsize>`
+ * of the other dtype.
+ */
+static NPY_CASTING
+cast_to_void_dtype_class(
+ PyArray_Descr **given_descrs, PyArray_Descr **loop_descrs)
+{
+ /* `dtype="V"` means unstructured currently (compare final path) */
+ loop_descrs[1] = PyArray_DescrNewFromType(NPY_VOID);
+ if (loop_descrs[1] == NULL) {
+ return -1;
+ }
+ loop_descrs[1]->elsize = given_descrs[0]->elsize;
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+ return NPY_SAFE_CASTING | _NPY_CAST_IS_VIEW;
+}
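In user terms this is the `arr.astype("V")` case named above: the unstructured void dtype class resolves to `V<itemsize>` of the input, e.g. (illustrative, assuming a NumPy containing this code path):

    import numpy as np

    assert np.zeros(3, dtype="f8").astype("V").dtype == np.dtype("V8")
    assert np.zeros(3, dtype="i2").astype("V").dtype == np.dtype("V2")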
+
+
+static NPY_CASTING
+nonstructured_to_structured_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]),
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ NPY_CASTING casting;
+
+ if (given_descrs[1] == NULL) {
+ return cast_to_void_dtype_class(given_descrs, loop_descrs);
+ }
+
+ if (given_descrs[1]->subarray != NULL) {
+ /*
+ * We currently consider this at most a safe cast. It would be
+ * possible to allow a view if the field has exactly one element.
+ */
+ casting = NPY_SAFE_CASTING;
+ /* Subarray dtype */
+ NPY_CASTING base_casting = PyArray_GetCastSafety(
+ given_descrs[0], given_descrs[1]->subarray->base, NULL);
+ if (base_casting < 0) {
+ return -1;
+ }
+ casting = PyArray_MinCastSafety(casting, base_casting);
+ }
+ else if (given_descrs[1]->names != NULL) {
+ /* Structured dtype */
+ if (PyTuple_Size(given_descrs[1]->names) == 0) {
+            /* TODO: This retains the previous behaviour, but likely should be changed. */
+ casting = NPY_UNSAFE_CASTING;
+ }
+ else {
+ /* Considered at most unsafe casting (but this could be changed) */
+ casting = NPY_UNSAFE_CASTING;
+ if (PyTuple_Size(given_descrs[1]->names) == 1) {
+ /* A view may be acceptable */
+ casting |= _NPY_CAST_IS_VIEW;
+ }
+
+ Py_ssize_t pos = 0;
+ PyObject *key, *tuple;
+ while (PyDict_Next(given_descrs[1]->fields, &pos, &key, &tuple)) {
+ PyArray_Descr *field_descr = (PyArray_Descr *)PyTuple_GET_ITEM(tuple, 0);
+ NPY_CASTING field_casting = PyArray_GetCastSafety(
+ given_descrs[0], field_descr, NULL);
+ casting = PyArray_MinCastSafety(casting, field_casting);
+ if (casting < 0) {
+ return -1;
+ }
+ }
+ }
+ }
+ else {
+ /* Plain void type. This behaves much like a "view" */
+ if (given_descrs[0]->elsize == given_descrs[1]->elsize &&
+ !PyDataType_REFCHK(given_descrs[0])) {
+ /*
+ * A simple view, at the moment considered "safe" (the refcheck is
+             * probably not necessary, but more future proof)
+ */
+ casting = NPY_SAFE_CASTING | _NPY_CAST_IS_VIEW;
+ }
+ else if (given_descrs[0]->elsize <= given_descrs[1]->elsize) {
+ casting = NPY_SAFE_CASTING;
+ }
+ else {
+ casting = NPY_UNSAFE_CASTING;
+ }
+ }
+
+ /* Void dtypes always do the full cast. */
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+
+ return casting;
+}
+
+
+int give_bad_field_error(PyObject *key)
+{
+ if (!PyErr_Occurred()) {
+ PyErr_Format(PyExc_RuntimeError,
+ "Invalid or missing field %R, this should be impossible "
+ "and indicates a NumPy bug.", key);
+ }
+ return -1;
+}
+
+
+static PyObject *
+PyArray_GetGenericToVoidCastingImpl(void)
+{
+ static PyArrayMethodObject *method = NULL;
+
+ if (method != NULL) {
+ Py_INCREF(method);
+ return (PyObject *)method;
+ }
+
+ method = PyObject_New(PyArrayMethodObject, &PyArrayMethod_Type);
+ if (method == NULL) {
+ return PyErr_NoMemory();
+ }
+
+ method->name = "any_to_void_cast";
+ method->flags = NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_REQUIRES_PYAPI;
+ method->casting = NPY_SAFE_CASTING;
+ method->resolve_descriptors = &nonstructured_to_structured_resolve_descriptors;
+ method->get_strided_loop = NULL;
+
+ return (PyObject *)method;
+}
+
+
+static NPY_CASTING
+structured_to_nonstructured_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ PyArray_Descr *base_descr;
+
+ if (given_descrs[0]->subarray != NULL) {
+ base_descr = given_descrs[0]->subarray->base;
+ }
+ else if (given_descrs[0]->names != NULL) {
+ if (PyTuple_Size(given_descrs[0]->names) != 1) {
+ /* Only allow casting a single field */
+ return -1;
+ }
+ PyObject *key = PyTuple_GetItem(given_descrs[0]->names, 0);
+ PyObject *base_tup = PyDict_GetItem(given_descrs[0]->fields, key);
+ base_descr = (PyArray_Descr *)PyTuple_GET_ITEM(base_tup, 0);
+ }
+ else {
+ /*
+         * Unstructured voids are considered unsafe casts but are defined;
+         * at this time they fall back to legacy behaviour using getitem/setitem.
+ */
+ base_descr = NULL;
+ }
+
+ /*
+ * The cast is always considered unsafe, so the PyArray_GetCastSafety
+ * result currently does not matter.
+ */
+ if (base_descr != NULL && PyArray_GetCastSafety(
+ base_descr, given_descrs[1], dtypes[1]) < 0) {
+ return -1;
+ }
+
+ /* Void dtypes always do the full cast. */
+ if (given_descrs[1] == NULL) {
+ loop_descrs[1] = dtypes[1]->default_descr(dtypes[1]);
+ /*
+         * Special case strings here; it should be useless (and only actually
+ * work for empty arrays). Possibly this should simply raise for
+ * all parametric DTypes.
+ */
+ if (dtypes[1]->type_num == NPY_STRING) {
+ loop_descrs[1]->elsize = given_descrs[0]->elsize;
+ }
+ else if (dtypes[1]->type_num == NPY_UNICODE) {
+ loop_descrs[1]->elsize = given_descrs[0]->elsize * 4;
+ }
+ }
+ else {
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+
+ return NPY_UNSAFE_CASTING;
+}
+
+
+static PyObject *
+PyArray_GetVoidToGenericCastingImpl(void)
+{
+ static PyArrayMethodObject *method = NULL;
+
+ if (method != NULL) {
+ Py_INCREF(method);
+ return (PyObject *)method;
+ }
+
+ method = PyObject_New(PyArrayMethodObject, &PyArrayMethod_Type);
+ if (method == NULL) {
+ return PyErr_NoMemory();
+ }
+
+ method->name = "void_to_any_cast";
+ method->flags = NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_REQUIRES_PYAPI;
+ method->casting = NPY_UNSAFE_CASTING;
+ method->resolve_descriptors = &structured_to_nonstructured_resolve_descriptors;
+ method->get_strided_loop = NULL;
+
+ return (PyObject *)method;
+}
+
+
+/*
+ * Find the correct field casting safety. See the TODO note below; also in
+ * 1.20 (and later) this was based on field names rather than the field
+ * order which it should be using.
+ *
+ * NOTE: In theory it would be possible to cache all the field casting
+ * implementations on the dtype, to avoid duplicate work.
+ */
+static NPY_CASTING
+can_cast_fields_safety(PyArray_Descr *from, PyArray_Descr *to)
+{
+ NPY_CASTING casting = NPY_NO_CASTING | _NPY_CAST_IS_VIEW;
+
+ Py_ssize_t field_count = PyTuple_Size(from->names);
+ if (field_count != PyTuple_Size(to->names)) {
+ /* TODO: This should be rejected! */
+ return NPY_UNSAFE_CASTING;
+ }
+ for (Py_ssize_t i = 0; i < field_count; i++) {
+ PyObject *from_key = PyTuple_GET_ITEM(from->names, i);
+ PyObject *from_tup = PyDict_GetItemWithError(from->fields, from_key);
+ if (from_tup == NULL) {
+ return give_bad_field_error(from_key);
+ }
+ PyArray_Descr *from_base = (PyArray_Descr*)PyTuple_GET_ITEM(from_tup, 0);
+
+ /*
+         * TODO: This should use to_key (order), compare gh-15509 by
+         *       Allan Haldane. And raise an error on failure.
+         *       (Fixing that may also require fixing/changing promotion.)
+ */
+ PyObject *to_tup = PyDict_GetItem(to->fields, from_key);
+ if (to_tup == NULL) {
+ return NPY_UNSAFE_CASTING;
+ }
+ PyArray_Descr *to_base = (PyArray_Descr*)PyTuple_GET_ITEM(to_tup, 0);
+
+ NPY_CASTING field_casting = PyArray_GetCastSafety(from_base, to_base, NULL);
+ if (field_casting < 0) {
+ return -1;
+ }
+ casting = PyArray_MinCastSafety(casting, field_casting);
+ }
+ if (!(casting & _NPY_CAST_IS_VIEW)) {
+ assert((casting & ~_NPY_CAST_IS_VIEW) != NPY_NO_CASTING);
+ return casting;
+ }
+
+ /*
+     * If the itemsize (which includes padding at the end), fields, or names
+ * do not match, this cannot be a view and also not a "no" cast
+ * (identical dtypes).
+ * It may be possible that this can be relaxed in some cases.
+ */
+ if (from->elsize != to->elsize) {
+ /*
+ * The itemsize may mismatch even if all fields and formats match
+ * (due to additional padding).
+ */
+ return PyArray_MinCastSafety(casting, NPY_EQUIV_CASTING);
+ }
+
+ int cmp = PyObject_RichCompareBool(from->fields, to->fields, Py_EQ);
+ if (cmp != 1) {
+ if (cmp == -1) {
+ PyErr_Clear();
+ }
+ return PyArray_MinCastSafety(casting, NPY_EQUIV_CASTING);
+ }
+ cmp = PyObject_RichCompareBool(from->names, to->names, Py_EQ);
+ if (cmp != 1) {
+ if (cmp == -1) {
+ PyErr_Clear();
+ }
+ return PyArray_MinCastSafety(casting, NPY_EQUIV_CASTING);
+ }
+ return casting;
+}
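To make the field-wise reduction concrete, a small hypothetical example (no assertions, only the reasoning the loop above performs):

    import numpy as np

    a = np.dtype([("x", "i4"), ("y", "f8")])
    b = np.dtype([("x", "i8"), ("y", "f8")])
    # Field "x": i4 -> i8 resolves to "safe"; field "y": f8 -> f8 is a no-cast view.
    # The per-field results are combined with the *least* safe level, so a -> b is
    # at best a "safe" cast and not a view (the "x" field dtypes differ).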
+
+
+static NPY_CASTING
+void_to_void_resolve_descriptors(
+ PyArrayMethodObject *self,
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ NPY_CASTING casting;
+
+ if (given_descrs[1] == NULL) {
+ /* This is weird, since it doesn't return the original descr, but... */
+ return cast_to_void_dtype_class(given_descrs, loop_descrs);
+ }
+
+ if (given_descrs[0]->names != NULL && given_descrs[1]->names != NULL) {
+ /* From structured to structured, need to check fields */
+ casting = can_cast_fields_safety(given_descrs[0], given_descrs[1]);
+ }
+ else if (given_descrs[0]->names != NULL) {
+ return structured_to_nonstructured_resolve_descriptors(
+ self, dtypes, given_descrs, loop_descrs);
+ }
+ else if (given_descrs[1]->names != NULL) {
+ return nonstructured_to_structured_resolve_descriptors(
+ self, dtypes, given_descrs, loop_descrs);
+ }
+ else if (given_descrs[0]->subarray == NULL &&
+ given_descrs[1]->subarray == NULL) {
+ /* Both are plain void dtypes */
+ if (given_descrs[0]->elsize == given_descrs[1]->elsize) {
+ casting = NPY_NO_CASTING | _NPY_CAST_IS_VIEW;
+ }
+ else if (given_descrs[0]->elsize < given_descrs[1]->elsize) {
+ casting = NPY_SAFE_CASTING;
+ }
+ else {
+ casting = NPY_SAME_KIND_CASTING;
+ }
+ }
+ else {
+ /*
+         * At this point, at least one of the dtypes must be a subarray dtype;
+         * neither is a structured one.
+ */
+ PyArray_ArrayDescr *from_sub = given_descrs[0]->subarray;
+ PyArray_ArrayDescr *to_sub = given_descrs[1]->subarray;
+ assert(from_sub || to_sub);
+
+ /* If the shapes do not match, this is at most an unsafe cast */
+ casting = NPY_UNSAFE_CASTING;
+ if (from_sub && to_sub) {
+ int res = PyObject_RichCompareBool(from_sub->shape, to_sub->shape, Py_EQ);
+ if (res < 0) {
+ return -1;
+ }
+ else if (res) {
+ /* Both are subarrays and the shape matches */
+ casting = NPY_NO_CASTING | _NPY_CAST_IS_VIEW;
+ }
+ }
+ NPY_CASTING field_casting = PyArray_GetCastSafety(
+ given_descrs[0]->subarray->base, given_descrs[1]->subarray->base, NULL);
+ if (field_casting < 0) {
+ return -1;
+ }
+ casting = PyArray_MinCastSafety(casting, field_casting);
+ }
+
+ /* Void dtypes always do the full cast. */
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+
+ return casting;
+}
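Summarizing the plain (unstructured) branch above in a short illustrative snippet, with V<n> denoting an n-byte void:

    import numpy as np

    v4, v8 = np.dtype("V4"), np.dtype("V8")
    # Per the resolver above: v8 -> v8 is a no-cast view, v4 -> v8 is "safe",
    # and v8 -> v4 only "same_kind"; subarray operands additionally need
    # matching shapes for anything better than an unsafe cast, combined with
    # the cast safety of their base dtypes.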
+
+
+/*
+ * This initializes the void to void cast. Voids include structured dtypes,
+ * which means that they can cast from and to any other dtype and, in that
+ * sense, are special (similar to Object).
+ */
+static int
+PyArray_InitializeVoidToVoidCast(void)
+{
+ PyArray_DTypeMeta *Void = PyArray_DTypeFromTypeNum(NPY_VOID);
+ PyArray_DTypeMeta *dtypes[2] = {Void, Void};
+ PyType_Slot slots[] = {
+ {NPY_METH_get_loop, NULL},
+ {NPY_METH_resolve_descriptors, &void_to_void_resolve_descriptors},
+ {0, NULL}};
+ PyArrayMethod_Spec spec = {
+ .name = "void_to_void_cast",
+ .casting = NPY_NO_CASTING,
+ .nin = 1,
+ .nout = 1,
+ .flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_SUPPORTS_UNALIGNED,
+ .dtypes = dtypes,
+ .slots = slots,
+ };
+
+ int res = PyArray_AddCastingImplementation_FromSpec(&spec, 1);
+ Py_DECREF(Void);
+ return res;
+}
+
+
+/*
+ * The object to any casting implementation. Casting from object may
+ * require inspecting all array elements (for parametric dtypes), and
+ * the resolver will thus reject all parametric dtypes if the out dtype
+ * is not provided.
+ */
+static NPY_CASTING
+object_to_any_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ if (given_descrs[1] == NULL) {
+ /*
+ * This should not really be called, since object -> parametric casts
+ * require inspecting the object array. Allow legacy ones, the path
+ * here is that e.g. "M8" input is considered to be the DType class,
+ * and by allowing it here, we go back to the "M8" instance.
+ */
+ if (dtypes[1]->parametric) {
+ PyErr_Format(PyExc_TypeError,
+ "casting from object to the parametric DType %S requires "
+ "the specified output dtype instance. "
+ "This may be a NumPy issue, since the correct instance "
+ "should be discovered automatically, however.", dtypes[1]);
+ return -1;
+ }
+ loop_descrs[1] = dtypes[1]->default_descr(dtypes[1]);
+ if (loop_descrs[1] == NULL) {
+ return -1;
+ }
+ }
+ else {
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+ return NPY_UNSAFE_CASTING;
+}
+
+
+/*
+ * Casting from object is special since it is generic to all output dtypes.
+ */
+static PyObject *
+PyArray_GetObjectToGenericCastingImpl(void)
+{
+ static PyArrayMethodObject *method = NULL;
+
+ if (method != NULL) {
+ Py_INCREF(method);
+ return (PyObject *)method;
+ }
+
+ method = PyObject_New(PyArrayMethodObject, &PyArrayMethod_Type);
+ if (method == NULL) {
+ return PyErr_NoMemory();
+ }
+
+ method->nin = 1;
+ method->nout = 1;
+ method->name = "object_to_any_cast";
+ method->flags = NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_REQUIRES_PYAPI;
+ method->casting = NPY_UNSAFE_CASTING;
+ method->resolve_descriptors = &object_to_any_resolve_descriptors;
+ method->get_strided_loop = NULL;
+
+ return (PyObject *)method;
+}
+
+
+
+/* Casting anything to object is simple (could even use the default) */
+static NPY_CASTING
+any_to_object_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ if (given_descrs[1] == NULL) {
+ loop_descrs[1] = dtypes[1]->default_descr(dtypes[1]);
+ if (loop_descrs[1] == NULL) {
+ return -1;
+ }
+ }
+ else {
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+ return NPY_SAFE_CASTING;
+}
+
+
+/*
+ * Casting to object is special since it is generic to all input dtypes.
+ */
+static PyObject *
+PyArray_GetGenericToObjectCastingImpl(void)
+{
+ static PyArrayMethodObject *method = NULL;
+
+ if (method != NULL) {
+ Py_INCREF(method);
+ return (PyObject *)method;
+ }
+
+ method = PyObject_New(PyArrayMethodObject, &PyArrayMethod_Type);
+ if (method == NULL) {
+ return PyErr_NoMemory();
+ }
+
+ method->nin = 1;
+ method->nout = 1;
+ method->name = "any_to_object_cast";
+ method->flags = NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_REQUIRES_PYAPI;
+ method->casting = NPY_SAFE_CASTING;
+ method->resolve_descriptors = &any_to_object_resolve_descriptors;
+ method->get_strided_loop = NULL;
+
+ return (PyObject *)method;
+}
+
+
+static int
+PyArray_InitializeObjectToObjectCast(void)
+{
+ /*
+ * The object dtype does not support byte order changes, so its cast
+ * is always a direct view.
+ */
+ PyArray_DTypeMeta *Object = PyArray_DTypeFromTypeNum(NPY_OBJECT);
+ PyArray_DTypeMeta *dtypes[2] = {Object, Object};
+ PyType_Slot slots[] = {
+ {NPY_METH_get_loop, NULL},
+ {0, NULL}};
+ PyArrayMethod_Spec spec = {
+ .name = "object_to_object_cast",
+ .casting = NPY_NO_CASTING | _NPY_CAST_IS_VIEW,
+ .nin = 1,
+ .nout = 1,
+ .flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_SUPPORTS_UNALIGNED,
+ .dtypes = dtypes,
+ .slots = slots,
+ };
+
+ int res = PyArray_AddCastingImplementation_FromSpec(&spec, 1);
+ Py_DECREF(Object);
+ return res;
+}
+
+
+NPY_NO_EXPORT int
+PyArray_InitializeCasts()
+{
+ if (PyArray_InitializeNumericCasts() < 0) {
+ return -1;
+ }
+ if (PyArray_InitializeStringCasts() < 0) {
+ return -1;
+ }
+ if (PyArray_InitializeVoidToVoidCast() < 0) {
+ return -1;
+ }
+ if (PyArray_InitializeObjectToObjectCast() < 0) {
+ return -1;
+ }
+ /* Datetime casts are defined in datetime.c */
+ if (PyArray_InitializeDatetimeCasts() < 0) {
+ return -1;
+ }
+ return 0;
+}
diff --git a/numpy/core/src/multiarray/convert_datatype.h b/numpy/core/src/multiarray/convert_datatype.h
index 9b7f39db2..cc1930f77 100644
--- a/numpy/core/src/multiarray/convert_datatype.h
+++ b/numpy/core/src/multiarray/convert_datatype.h
@@ -1,6 +1,13 @@
#ifndef _NPY_ARRAY_CONVERT_DATATYPE_H_
#define _NPY_ARRAY_CONVERT_DATATYPE_H_
+#include "array_method.h"
+
+extern NPY_NO_EXPORT npy_intp REQUIRED_STR_LEN[];
+
+NPY_NO_EXPORT PyObject *
+_get_castingimpl(PyObject *NPY_UNUSED(module), PyObject *args);
+
NPY_NO_EXPORT PyArray_VectorUnaryFunc *
PyArray_GetCastFunc(PyArray_Descr *descr, int type_num);
@@ -10,14 +17,23 @@ PyArray_ObjectType(PyObject *op, int minimum_type);
NPY_NO_EXPORT PyArrayObject **
PyArray_ConvertToCommonType(PyObject *op, int *retn);
+NPY_NO_EXPORT PyArray_DTypeMeta *
+PyArray_CommonDType(PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2);
+
NPY_NO_EXPORT int
PyArray_ValidType(int type);
+NPY_NO_EXPORT int
+dtype_kind_to_ordering(char kind);
+
/* Like PyArray_CanCastArrayTo */
NPY_NO_EXPORT npy_bool
can_cast_scalar_to(PyArray_Descr *scal_type, char *scal_data,
PyArray_Descr *to, NPY_CASTING casting);
+NPY_NO_EXPORT PyArray_Descr *
+ensure_dtype_nbo(PyArray_Descr *type);
+
NPY_NO_EXPORT int
should_use_min_scalar(npy_intp narrs, PyArrayObject **arr,
npy_intp ndtypes, PyArray_Descr **dtypes);
@@ -30,23 +46,37 @@ npy_set_invalid_cast_error(
PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
NPY_CASTING casting, npy_bool scalar);
-/*
- * This function calls Py_DECREF on flex_dtype, and replaces it with
- * a new dtype that has been adapted based on the values in data_dtype
- * and data_obj. If the flex_dtype is not flexible, it returns it as-is.
- *
- * Usually, if data_obj is not an array, dtype should be the result
- * given by the PyArray_GetArrayParamsFromObject function.
- *
- * The data_obj may be NULL if just a dtype is known for the source.
- *
- * If *flex_dtype is NULL, returns immediately, without setting an
- * exception, leaving any previous error handling intact.
- *
- * The current flexible dtypes include NPY_STRING, NPY_UNICODE, NPY_VOID,
- * and NPY_DATETIME with generic units.
- */
NPY_NO_EXPORT PyArray_Descr *
-PyArray_AdaptFlexibleDType(PyArray_Descr *data_dtype, PyArray_Descr *flex_dtype);
+PyArray_CastDescrToDType(PyArray_Descr *descr, PyArray_DTypeMeta *given_DType);
+
+NPY_NO_EXPORT int
+PyArray_AddCastingImplmentation(PyBoundArrayMethodObject *meth);
+
+NPY_NO_EXPORT int
+PyArray_AddCastingImplementation_FromSpec(PyArrayMethod_Spec *spec, int private);
+
+NPY_NO_EXPORT NPY_CASTING
+PyArray_MinCastSafety(NPY_CASTING casting1, NPY_CASTING casting2);
+
+NPY_NO_EXPORT NPY_CASTING
+PyArray_GetCastSafety(
+ PyArray_Descr *from, PyArray_Descr *to, PyArray_DTypeMeta *to_dtype);
+
+NPY_NO_EXPORT NPY_CASTING
+legacy_same_dtype_resolve_descriptors(
+ PyArrayMethodObject *self,
+ PyArray_DTypeMeta **dtypes,
+ PyArray_Descr **given_descrs,
+ PyArray_Descr **loop_descrs);
+
+NPY_NO_EXPORT NPY_CASTING
+simple_cast_resolve_descriptors(
+ PyArrayMethodObject *self,
+ PyArray_DTypeMeta **dtypes,
+ PyArray_Descr **input_descrs,
+ PyArray_Descr **loop_descrs);
+
+NPY_NO_EXPORT int
+PyArray_InitializeCasts(void);
#endif
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index cb448756b..f6031e370 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -300,12 +300,12 @@ _update_descr_and_dimensions(PyArray_Descr **des, npy_intp *newdims,
}
if (tuple) {
for (i = 0; i < numnew; i++) {
- mydim[i] = (npy_intp) PyInt_AsLong(
+ mydim[i] = (npy_intp) PyLong_AsLong(
PyTuple_GET_ITEM(old->subarray->shape, i));
}
}
else {
- mydim[0] = (npy_intp) PyInt_AsLong(old->subarray->shape);
+ mydim[0] = (npy_intp) PyLong_AsLong(old->subarray->shape);
}
if (newstrides) {
@@ -610,6 +610,7 @@ PyArray_AssignFromCache(PyArrayObject *self, coercion_cache_obj *cache) {
PyErr_SetString(PyExc_RuntimeError,
"Inconsistent object during array creation? "
"Content of sequences changed (cache not consumed).");
+ npy_free_coercion_cache(cache);
return -1;
}
return 0;
@@ -755,9 +756,11 @@ PyArray_NewFromDescr_int(
Py_DECREF(descr);
return NULL;
}
+ fa->_buffer_info = NULL;
fa->nd = nd;
fa->dimensions = NULL;
fa->data = NULL;
+
if (data == NULL) {
fa->flags = NPY_ARRAY_DEFAULT;
if (flags) {
@@ -868,11 +871,14 @@ PyArray_NewFromDescr_int(
func = PyObject_GetAttr((PyObject *)fa, npy_ma_str_array_finalize);
if (func && func != Py_None) {
- if (NpyCapsule_Check(func)) {
+ if (PyCapsule_CheckExact(func)) {
/* A C-function is stored here */
PyArray_FinalizeFunc *cfunc;
- cfunc = NpyCapsule_AsVoidPtr(func);
+ cfunc = PyCapsule_GetPointer(func, NULL);
Py_DECREF(func);
+ if (cfunc == NULL) {
+ goto fail;
+ }
if (cfunc((PyArrayObject *)fa, obj) < 0) {
goto fail;
}
@@ -1364,6 +1370,160 @@ PyArray_GetArrayParamsFromObject(PyObject *NPY_UNUSED(op),
}
+/*
+ * This function is a legacy implementation to retain subarray dtype
+ * behaviour in array coercion. The behaviour here makes sense if tuples
+ * of matching dimensionality are being coerced. Because the result is
+ * ill-defined for lists of array-likes, this is deprecated.
+ *
+ * WARNING: Do not use this function; it exists purely to support a deprecated
+ * code path.
+ */
+static int
+setArrayFromSequence(PyArrayObject *a, PyObject *s,
+ int dim, PyArrayObject * dst)
+{
+ Py_ssize_t i, slen;
+ int res = -1;
+
+    /* on the first recursion the destination equals the full array (dst == a) */
+ if (dst == NULL)
+ dst = a;
+
+ /*
+ * This code is to ensure that the sequence access below will
+ * return a lower-dimensional sequence.
+ */
+
+ /* INCREF on entry DECREF on exit */
+ Py_INCREF(s);
+
+ PyObject *seq = NULL;
+
+ if (PyArray_Check(s)) {
+ if (!(PyArray_CheckExact(s))) {
+ /*
+ * make sure a base-class array is used so that the dimensionality
+ * reduction assumption is correct.
+ */
+ /* This will DECREF(s) if replaced */
+ s = PyArray_EnsureArray(s);
+ if (s == NULL) {
+ goto fail;
+ }
+ }
+
+ /* dst points to correct array subsection */
+ if (PyArray_CopyInto(dst, (PyArrayObject *)s) < 0) {
+ goto fail;
+ }
+
+ Py_DECREF(s);
+ return 0;
+ }
+
+ if (dim > PyArray_NDIM(a)) {
+ PyErr_Format(PyExc_ValueError,
+ "setArrayFromSequence: sequence/array dimensions mismatch.");
+ goto fail;
+ }
+
+ /* Try __array__ before using s as a sequence */
+ PyObject *tmp = _array_from_array_like(s, NULL, 0, NULL);
+ if (tmp == NULL) {
+ goto fail;
+ }
+ else if (tmp == Py_NotImplemented) {
+ Py_DECREF(tmp);
+ }
+ else {
+ int r = PyArray_CopyInto(dst, (PyArrayObject *)tmp);
+ Py_DECREF(tmp);
+ if (r < 0) {
+ goto fail;
+ }
+ Py_DECREF(s);
+ return 0;
+ }
+
+ seq = PySequence_Fast(s, "Could not convert object to sequence");
+ if (seq == NULL) {
+ goto fail;
+ }
+ slen = PySequence_Fast_GET_SIZE(seq);
+
+ /*
+ * Either the dimensions match, or the sequence has length 1 and can
+ * be broadcast to the destination.
+ */
+ if (slen != PyArray_DIMS(a)[dim] && slen != 1) {
+ PyErr_Format(PyExc_ValueError,
+ "cannot copy sequence with size %zd to array axis "
+ "with dimension %" NPY_INTP_FMT, slen, PyArray_DIMS(a)[dim]);
+ goto fail;
+ }
+
+ /* Broadcast the one element from the sequence to all the outputs */
+ if (slen == 1) {
+ PyObject *o = PySequence_Fast_GET_ITEM(seq, 0);
+ npy_intp alen = PyArray_DIM(a, dim);
+
+ for (i = 0; i < alen; i++) {
+ if ((PyArray_NDIM(a) - dim) > 1) {
+ PyArrayObject * tmp =
+ (PyArrayObject *)array_item_asarray(dst, i);
+ if (tmp == NULL) {
+ goto fail;
+ }
+
+ res = setArrayFromSequence(a, o, dim+1, tmp);
+ Py_DECREF(tmp);
+ }
+ else {
+ char * b = (PyArray_BYTES(dst) + i * PyArray_STRIDES(dst)[0]);
+ res = PyArray_SETITEM(dst, b, o);
+ }
+ if (res < 0) {
+ goto fail;
+ }
+ }
+ }
+ /* Copy element by element */
+ else {
+ for (i = 0; i < slen; i++) {
+ PyObject * o = PySequence_Fast_GET_ITEM(seq, i);
+ if ((PyArray_NDIM(a) - dim) > 1) {
+ PyArrayObject * tmp =
+ (PyArrayObject *)array_item_asarray(dst, i);
+ if (tmp == NULL) {
+ goto fail;
+ }
+
+ res = setArrayFromSequence(a, o, dim+1, tmp);
+ Py_DECREF(tmp);
+ }
+ else {
+ char * b = (PyArray_BYTES(dst) + i * PyArray_STRIDES(dst)[0]);
+ res = PyArray_SETITEM(dst, b, o);
+ }
+ if (res < 0) {
+ goto fail;
+ }
+ }
+ }
+
+ Py_DECREF(seq);
+ Py_DECREF(s);
+ return 0;
+
+ fail:
+ Py_XDECREF(seq);
+ Py_DECREF(s);
+ return res;
+}
+
+
+
/*NUMPY_API
* Does not check for NPY_ARRAY_ENSURECOPY and NPY_ARRAY_NOTSWAPPED in flags
* Steals a reference to newtype --- which can be NULL
@@ -1404,6 +1564,71 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
if (ndim < 0) {
return NULL;
}
+
+ if (NPY_UNLIKELY(fixed_descriptor != NULL && PyDataType_HASSUBARRAY(dtype))) {
+ /*
+         * When a subarray dtype was passed in, its dimensions are appended
+         * to the array dimensions (causing a dimension mismatch).
+         * This is a problem because when we coerce from non-arrays we
+         * correctly assign element by element (as defined by tuples), but for
+         * arrays we first append the dimensions and only then assign to the
+         * base dtype, which causes the problem.
+         *
+         * Thus, we check whether an array is included; in that case we
+         * give a FutureWarning.
+         * When the warning is removed, PyArray_Pack will have to ensure
+         * that it does not append the dimensions when creating the
+         * subarrays to assign `arr[0] = obj[0]`.
+ */
+ int includes_array = 0;
+ if (cache != NULL) {
+ /* This is not ideal, but it is a pretty special case */
+ coercion_cache_obj *next = cache;
+ while (next != NULL) {
+ if (!next->sequence) {
+ includes_array = 1;
+ break;
+ }
+ next = next->next;
+ }
+ }
+ if (includes_array) {
+ npy_free_coercion_cache(cache);
+
+ ret = (PyArrayObject *) PyArray_NewFromDescr(
+ &PyArray_Type, dtype, ndim, dims, NULL, NULL,
+ flags & NPY_ARRAY_F_CONTIGUOUS, NULL);
+ if (ret == NULL) {
+ return NULL;
+ }
+ assert(PyArray_NDIM(ret) != ndim);
+
+ /* NumPy 1.20, 2020-10-01 */
+ if (DEPRECATE_FUTUREWARNING(
+ "creating an array with a subarray dtype will behave "
+ "differently when the `np.array()` (or `asarray`, etc.) "
+ "call includes an array or array object.\n"
+ "If you are converting a single array or a list of arrays,"
+ "you can opt-in to the future behaviour using:\n"
+ " np.array(arr, dtype=np.dtype(['f', dtype]))['f']\n"
+ " np.array([arr1, arr2], dtype=np.dtype(['f', dtype]))['f']\n"
+ "\n"
+ "By including a new field and indexing it after the "
+ "conversion.\n"
+ "This may lead to a different result or to current failures "
+ "succeeding. (FutureWarning since NumPy 1.20)") < 0) {
+ Py_DECREF(ret);
+ return NULL;
+ }
+
+ if (setArrayFromSequence(ret, op, 0, NULL) < 0) {
+ Py_DECREF(ret);
+ return NULL;
+ }
+ return (PyObject *)ret;
+ }
+ }
+
if (dtype == NULL) {
dtype = PyArray_DescrFromType(NPY_DEFAULT_TYPE);
}
@@ -1457,6 +1682,31 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
((PyVoidScalarObject *)op)->flags,
NULL, op);
}
+ else if (cache == 0 && newtype != NULL &&
+ PyDataType_ISSIGNED(newtype) && PyArray_IsScalar(op, Generic)) {
+ assert(ndim == 0);
+ /*
+         * This is a (possible) inconsistency where:
+ *
+ * np.array(np.float64(np.nan), dtype=np.int64)
+ *
+ * behaves differently from:
+ *
+ * np.array([np.float64(np.nan)], dtype=np.int64)
+ * arr1d_int64[0] = np.float64(np.nan)
+ * np.array(np.array(np.nan), dtype=np.int64)
+ *
+         * by not raising an error but instead using typical casting.
+ * The error is desirable, but to always error seems like a
+ * larger change to be considered at some other time and it is
+ * undesirable that 0-D arrays behave differently from scalars.
+ * This retains the behaviour, largely due to issues in pandas
+ * which relied on a try/except (although hopefully that will
+ * have a better solution at some point):
+ * https://github.com/pandas-dev/pandas/issues/35481
+ */
+ return PyArray_FromScalar(op, dtype);
+ }
/* There was no array (or array-like) passed in directly. */
if ((flags & NPY_ARRAY_WRITEBACKIFCOPY) ||
@@ -1464,28 +1714,57 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
PyErr_SetString(PyExc_TypeError,
"WRITEBACKIFCOPY used for non-array input.");
Py_DECREF(dtype);
+ npy_free_coercion_cache(cache);
return NULL;
}
/* Create a new array and copy the data */
+ Py_INCREF(dtype); /* hold on in case of a subarray that is replaced */
ret = (PyArrayObject *)PyArray_NewFromDescr(
&PyArray_Type, dtype, ndim, dims, NULL, NULL,
flags&NPY_ARRAY_F_CONTIGUOUS, NULL);
if (ret == NULL) {
+ npy_free_coercion_cache(cache);
+ Py_DECREF(dtype);
return NULL;
}
+ if (ndim == PyArray_NDIM(ret)) {
+ /*
+ * Appending of dimensions did not occur, so use the actual dtype
+ * below. This is relevant for S0 or U0 which can be replaced with
+ * S1 or U1, although that should likely change.
+ */
+ Py_SETREF(dtype, PyArray_DESCR(ret));
+ Py_INCREF(dtype);
+ }
+
if (cache == NULL) {
/* This is a single item. Set it directly. */
assert(ndim == 0);
- if (PyArray_Pack(PyArray_DESCR(ret), PyArray_DATA(ret), op) < 0) {
+
+ if (PyArray_Pack(dtype, PyArray_BYTES(ret), op) < 0) {
+ Py_DECREF(dtype);
Py_DECREF(ret);
return NULL;
}
+ Py_DECREF(dtype);
return (PyObject *)ret;
}
assert(ndim != 0);
assert(op == cache->converted_obj);
- if (PyArray_AssignFromCache(ret, cache) < 0) {
+
+ /* Decrease the number of dimensions to the detected ones */
+ int out_ndim = PyArray_NDIM(ret);
+ PyArray_Descr *out_descr = PyArray_DESCR(ret);
+ ((PyArrayObject_fields *)ret)->nd = ndim;
+ ((PyArrayObject_fields *)ret)->descr = dtype;
+
+ int success = PyArray_AssignFromCache(ret, cache);
+
+ ((PyArrayObject_fields *)ret)->nd = out_ndim;
+ ((PyArrayObject_fields *)ret)->descr = out_descr;
+ Py_DECREF(dtype);
+ if (success < 0) {
Py_DECREF(ret);
return NULL;
}
@@ -1575,6 +1854,7 @@ PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int min_depth,
return obj;
}
+
/*NUMPY_API
* steals reference to newtype --- acc. NULL
*/
@@ -1733,10 +2013,8 @@ NPY_NO_EXPORT PyObject *
PyArray_FromStructInterface(PyObject *input)
{
PyArray_Descr *thetype = NULL;
- char buf[40];
PyArrayInterface *inter;
PyObject *attr;
- PyArrayObject *ret;
char endian = NPY_NATBYTE;
attr = PyArray_LookupSpecial_OnInstance(input, "__array_struct__");
@@ -1747,10 +2025,22 @@ PyArray_FromStructInterface(PyObject *input)
return Py_NotImplemented;
}
}
- if (!NpyCapsule_Check(attr)) {
+ if (!PyCapsule_CheckExact(attr)) {
+ if (PyType_Check(input) && PyObject_HasAttrString(attr, "__get__")) {
+ /*
+             * If the input is a class, `attr` should be a property-like object.
+             * This cannot be interpreted as an array, but is valid.
+ * (Needed due to the lookup being on the instance rather than type)
+ */
+ Py_DECREF(attr);
+ return Py_NotImplemented;
+ }
+ goto fail;
+ }
+ inter = PyCapsule_GetPointer(attr, NULL);
+ if (inter == NULL) {
goto fail;
}
- inter = NpyCapsule_AsVoidPtr(attr);
if (inter->two != 2) {
goto fail;
}
@@ -1767,20 +2057,26 @@ PyArray_FromStructInterface(PyObject *input)
}
if (thetype == NULL) {
- PyOS_snprintf(buf, sizeof(buf),
- "%c%c%d", endian, inter->typekind, inter->itemsize);
- if (!(thetype=_array_typedescr_fromstr(buf))) {
+ PyObject *type_str = PyUnicode_FromFormat(
+ "%c%c%d", endian, inter->typekind, inter->itemsize);
+ if (type_str == NULL) {
+ Py_DECREF(attr);
+ return NULL;
+ }
+ int ok = PyArray_DescrConverter(type_str, &thetype);
+ Py_DECREF(type_str);
+ if (ok != NPY_SUCCEED) {
Py_DECREF(attr);
return NULL;
}
}
- ret = (PyArrayObject *)PyArray_NewFromDescrAndBase(
+ PyObject *ret = PyArray_NewFromDescrAndBase(
&PyArray_Type, thetype,
inter->nd, inter->shape, inter->strides, inter->data,
inter->flags, NULL, input);
Py_DECREF(attr);
- return (PyObject *)ret;
+ return ret;
fail:
PyErr_SetString(PyExc_ValueError, "invalid __array_struct__");
@@ -1794,41 +2090,21 @@ PyArray_FromStructInterface(PyObject *input)
*/
NPY_NO_EXPORT int
_is_default_descr(PyObject *descr, PyObject *typestr) {
- PyObject *tuple, *name, *typestr2;
- PyObject *tmp = NULL;
- int ret = 0;
-
if (!PyList_Check(descr) || PyList_GET_SIZE(descr) != 1) {
return 0;
}
- tuple = PyList_GET_ITEM(descr, 0);
+ PyObject *tuple = PyList_GET_ITEM(descr, 0);
if (!(PyTuple_Check(tuple) && PyTuple_GET_SIZE(tuple) == 2)) {
return 0;
}
- name = PyTuple_GET_ITEM(tuple, 0);
+ PyObject *name = PyTuple_GET_ITEM(tuple, 0);
if (!(PyUnicode_Check(name) && PyUnicode_GetLength(name) == 0)) {
return 0;
}
- typestr2 = PyTuple_GET_ITEM(tuple, 1);
- /* Allow unicode type strings */
- if (PyUnicode_Check(typestr2)) {
- tmp = PyUnicode_AsASCIIString(typestr2);
- if (tmp == NULL) {
- return 0;
- }
- typestr2 = tmp;
- }
- if (PyBytes_Check(typestr2) &&
- PyObject_RichCompareBool(typestr, typestr2, Py_EQ)) {
- ret = 1;
- }
- Py_XDECREF(tmp);
-
- return ret;
+ PyObject *typestr2 = PyTuple_GET_ITEM(tuple, 1);
+ return PyObject_RichCompareBool(typestr, typestr2, Py_EQ);
}
-#define PyIntOrLong_Check(obj) (PyInt_Check(obj) || PyLong_Check(obj))
-
/*NUMPY_API*/
NPY_NO_EXPORT PyObject *
PyArray_FromInterface(PyObject *origin)
@@ -1840,12 +2116,12 @@ PyArray_FromInterface(PyObject *origin)
PyArray_Descr *dtype = NULL;
char *data = NULL;
Py_buffer view;
- int res, i, n;
+ int i, n;
npy_intp dims[NPY_MAXDIMS], strides[NPY_MAXDIMS];
int dataflags = NPY_ARRAY_BEHAVED;
- iface = PyArray_LookupSpecial_OnInstance(origin,
- "__array_interface__");
+ iface = PyArray_LookupSpecial_OnInstance(origin, "__array_interface__");
+
if (iface == NULL) {
if (PyErr_Occurred()) {
PyErr_Clear(); /* TODO[gh-14801]: propagate crashes during attribute access? */
@@ -1853,6 +2129,16 @@ PyArray_FromInterface(PyObject *origin)
return Py_NotImplemented;
}
if (!PyDict_Check(iface)) {
+ if (PyType_Check(origin) && PyObject_HasAttrString(iface, "__get__")) {
+ /*
+             * If the input is a class, `iface` should be a property-like object.
+             * This cannot be interpreted as an array, but is valid.
+ * (Needed due to the lookup being on the instance rather than type)
+ */
+ Py_DECREF(iface);
+ return Py_NotImplemented;
+ }
+
Py_DECREF(iface);
PyErr_SetString(PyExc_ValueError,
"Invalid __array_interface__ value, must be a dict");
@@ -1870,26 +2156,15 @@ PyArray_FromInterface(PyObject *origin)
return NULL;
}
- /* Allow unicode type strings */
- if (PyUnicode_Check(attr)) {
- PyObject *tmp = PyUnicode_AsASCIIString(attr);
- if (tmp == NULL) {
- goto fail;
- }
- attr = tmp;
- }
- else {
- Py_INCREF(attr);
- }
-
- if (!PyBytes_Check(attr)) {
+ /* allow bytes for backwards compatibility */
+ if (!PyBytes_Check(attr) && !PyUnicode_Check(attr)) {
PyErr_SetString(PyExc_TypeError,
"__array_interface__ typestr must be a string");
goto fail;
}
+
/* Get dtype from type string */
- dtype = _array_typedescr_fromstr(PyString_AS_STRING(attr));
- if (dtype == NULL) {
+ if (PyArray_DescrConverter(attr, &dtype) != NPY_SUCCEED) {
goto fail;
}
@@ -1903,16 +2178,24 @@ PyArray_FromInterface(PyObject *origin)
goto fail;
}
PyArray_Descr *new_dtype = NULL;
+ if (descr != NULL) {
+ int is_default = _is_default_descr(descr, attr);
+ if (is_default < 0) {
+ goto fail;
+ }
+ if (!is_default) {
+ if (PyArray_DescrConverter2(descr, &new_dtype) != NPY_SUCCEED) {
+ goto fail;
+ }
+ if (new_dtype != NULL) {
+ Py_DECREF(dtype);
+ dtype = new_dtype;
+ }
+ }
- if (descr != NULL && !_is_default_descr(descr, attr) &&
- PyArray_DescrConverter2(descr, &new_dtype) == NPY_SUCCEED &&
- new_dtype != NULL) {
- Py_DECREF(dtype);
- dtype = new_dtype;
}
- }
- Py_DECREF(attr); /* Pairs with the unicode handling above */
+ }
/* Get shape tuple from interface specification */
attr = _PyDict_GetItemStringWithError(iface, "shape");
@@ -1971,22 +2254,16 @@ PyArray_FromInterface(PyObject *origin)
goto fail;
}
dataptr = PyTuple_GET_ITEM(attr, 0);
- if (PyString_Check(dataptr)) {
- res = sscanf(PyString_AsString(dataptr),
- "%p", (void **)&data);
- if (res < 1) {
- PyErr_SetString(PyExc_TypeError,
- "__array_interface__ data string cannot be converted");
+ if (PyLong_Check(dataptr)) {
+ data = PyLong_AsVoidPtr(dataptr);
+ if (data == NULL && PyErr_Occurred()) {
goto fail;
}
}
- else if (PyIntOrLong_Check(dataptr)) {
- data = PyLong_AsVoidPtr(dataptr);
- }
else {
PyErr_SetString(PyExc_TypeError,
"first element of __array_interface__ data tuple "
- "must be integer or string.");
+ "must be an integer.");
goto fail;
}
if (PyObject_IsTrue(PyTuple_GET_ITEM(attr,1))) {
@@ -2119,6 +2396,16 @@ PyArray_FromArrayAttr(PyObject *op, PyArray_Descr *typecode, PyObject *context)
}
return Py_NotImplemented;
}
+ if (PyType_Check(op) && PyObject_HasAttrString(array_meth, "__get__")) {
+ /*
+         * If the input is a class, `array_meth` may be a property-like object.
+         * This cannot be interpreted as (or called to produce) an array, but is valid.
+         * Trying `array_meth.__call__()` on it would not be useful.
+ * (Needed due to the lookup being on the instance rather than type)
+ */
+ Py_DECREF(array_meth);
+ return Py_NotImplemented;
+ }
if (typecode == NULL) {
new = PyObject_CallFunction(array_meth, NULL);
}
@@ -2236,7 +2523,10 @@ PyArray_EnsureAnyArray(PyObject *op)
return PyArray_EnsureArray(op);
}
-/* TODO: Put the order parameter in PyArray_CopyAnyInto and remove this */
+/*
+ * Private implementation of PyArray_CopyAnyInto with an additional order
+ * parameter.
+ */
NPY_NO_EXPORT int
PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
{
@@ -2362,16 +2652,21 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
src_count = *src_countptr;
dst_data = dst_dataptr[0];
src_data = src_dataptr[0];
+ int res = 0;
for(;;) {
/* Transfer the biggest amount that fits both */
count = (src_count < dst_count) ? src_count : dst_count;
- stransfer(dst_data, dst_stride,
- src_data, src_stride,
- count, src_itemsize, transferdata);
+ if (stransfer(
+ dst_data, dst_stride, src_data, src_stride,
+ count, src_itemsize, transferdata) < 0) {
+ res = -1;
+ break;
+ }
/* If we exhausted the dst block, refresh it */
if (dst_count == count) {
- if (!dst_iternext(dst_iter)) {
+ res = dst_iternext(dst_iter);
+ if (!res) {
break;
}
dst_count = *dst_countptr;
@@ -2384,7 +2679,8 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
/* If we exhausted the src block, refresh it */
if (src_count == count) {
- if (!src_iternext(src_iter)) {
+ res = src_iternext(src_iter);
+ if (!res) {
break;
}
src_count = *src_countptr;
@@ -2401,8 +2697,11 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order)
NPY_AUXDATA_FREE(transferdata);
NpyIter_Deallocate(dst_iter);
NpyIter_Deallocate(src_iter);
-
- return PyErr_Occurred() ? -1 : 0;
+ if (res > 0) {
+ /* The iteration stopped successfully, do not report an error */
+ return 0;
+ }
+ return res;
}
/*NUMPY_API
@@ -2712,7 +3011,7 @@ _calc_length(PyObject *start, PyObject *stop, PyObject *step, PyObject **next, i
return -1;
}
- zero = PyInt_FromLong(0);
+ zero = PyLong_FromLong(0);
if (!zero) {
Py_DECREF(*next);
*next = NULL;
@@ -2857,14 +3156,14 @@ PyArray_ArangeObj(PyObject *start, PyObject *stop, PyObject *step, PyArray_Descr
Py_INCREF(dtype);
}
if (!step || step == Py_None) {
- step = PyInt_FromLong(1);
+ step = PyLong_FromLong(1);
}
else {
Py_XINCREF(step);
}
if (!stop || stop == Py_None) {
stop = start;
- start = PyInt_FromLong(0);
+ start = PyLong_FromLong(0);
}
else {
Py_INCREF(start);
@@ -2957,7 +3256,7 @@ PyArray_ArangeObj(PyObject *start, PyObject *stop, PyObject *step, PyArray_Descr
return NULL;
}
-/* This array creation function steals the reference to dtype. */
+/* This array creation function does not steal the reference to dtype. */
static PyArrayObject *
array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, npy_intp num, size_t *nread)
{
@@ -2985,7 +3284,6 @@ array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, npy_intp num, size_t *nrea
if (fail) {
PyErr_SetString(PyExc_IOError,
"could not seek in file");
- Py_DECREF(dtype);
return NULL;
}
num = numbytes / dtype->elsize;
@@ -2997,6 +3295,7 @@ array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, npy_intp num, size_t *nrea
*/
elsize = dtype->elsize;
+ Py_INCREF(dtype); /* do not steal the original dtype. */
r = (PyArrayObject *)PyArray_NewFromDescr(&PyArray_Type, dtype, 1, &num,
NULL, NULL, 0, NULL);
if (r == NULL) {
@@ -3012,7 +3311,7 @@ array_fromfile_binary(FILE *fp, PyArray_Descr *dtype, npy_intp num, size_t *nrea
/*
* Create an array by reading from the given stream, using the passed
* next_element and skip_separator functions.
- * As typical for array creation functions, it steals the reference to dtype.
+ * Does not steal the reference to dtype.
*/
#define FROM_BUFFER_SIZE 4096
static PyArrayObject *
@@ -3041,7 +3340,6 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char const *sep, size_t *nre
PyArray_NewFromDescr(&PyArray_Type, dtype, 1, &size,
NULL, NULL, 0, NULL);
if (r == NULL) {
- Py_DECREF(dtype);
return NULL;
}
@@ -3104,7 +3402,6 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char const *sep, size_t *nre
if (PyErr_Occurred()) {
/* If an error is already set (unlikely), do not create new one */
Py_DECREF(r);
- Py_DECREF(dtype);
return NULL;
}
/* 2019-09-12, NumPy 1.18 */
@@ -3116,7 +3413,6 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char const *sep, size_t *nre
}
fail:
- Py_DECREF(dtype);
if (err == 1) {
PyErr_NoMemory();
}
@@ -3182,20 +3478,26 @@ PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, npy_intp num, char *sep)
(skip_separator) fromfile_skip_separator, NULL);
}
if (ret == NULL) {
+ Py_DECREF(dtype);
return NULL;
}
if (((npy_intp) nread) < num) {
- /* Realloc memory for smaller number of elements */
- const size_t nsize = PyArray_MAX(nread,1)*PyArray_DESCR(ret)->elsize;
+ /*
+         * Realloc memory for the smaller number of elements; use the original
+         * dtype, which may include a subarray (and is used for `nread`).
+ */
+ const size_t nsize = PyArray_MAX(nread,1) * dtype->elsize;
char *tmp;
- if((tmp = PyDataMem_RENEW(PyArray_DATA(ret), nsize)) == NULL) {
+ if ((tmp = PyDataMem_RENEW(PyArray_DATA(ret), nsize)) == NULL) {
+ Py_DECREF(dtype);
Py_DECREF(ret);
return PyErr_NoMemory();
}
((PyArrayObject_fields *)ret)->data = tmp;
PyArray_DIMS(ret)[0] = nread;
}
+ Py_DECREF(dtype);
return (PyObject *)ret;
}
@@ -3406,6 +3708,7 @@ PyArray_FromString(char *data, npy_intp slen, PyArray_Descr *dtype,
(next_element) fromstr_next_element,
(skip_separator) fromstr_skip_separator,
end);
+ Py_DECREF(dtype);
}
return (PyObject *)ret;
}
diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c
index 8f3948c23..9c1b606bb 100644
--- a/numpy/core/src/multiarray/datetime.c
+++ b/numpy/core/src/multiarray/datetime.c
@@ -25,6 +25,9 @@
#include "_datetime.h"
#include "datetime_strings.h"
#include "convert_datatype.h"
+#include "array_method.h"
+#include "dtypemeta.h"
+#include "usertypes.h"
/*
* Computes the python `ret, d = divmod(d, unit)`.
@@ -1434,18 +1437,20 @@ raise_if_datetime64_metadata_cast_error(char *object_type,
return 0;
}
else {
- PyObject *errmsg;
- errmsg = PyUString_FromFormat("Cannot cast %s "
- "from metadata ", object_type);
- errmsg = append_metastr_to_string(src_meta, 0, errmsg);
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromString(" to "));
- errmsg = append_metastr_to_string(dst_meta, 0, errmsg);
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromFormat(" according to the rule %s",
- npy_casting_to_string(casting)));
- PyErr_SetObject(PyExc_TypeError, errmsg);
- Py_DECREF(errmsg);
+ PyObject *src = metastr_to_unicode(src_meta, 0);
+ if (src == NULL) {
+ return -1;
+ }
+ PyObject *dst = metastr_to_unicode(dst_meta, 0);
+ if (dst == NULL) {
+ Py_DECREF(src);
+ return -1;
+ }
+ PyErr_Format(PyExc_TypeError,
+ "Cannot cast %s from metadata %S to %S according to the rule %s",
+ object_type, src, dst, npy_casting_to_string(casting));
+ Py_DECREF(src);
+ Py_DECREF(dst);
return -1;
}
}
@@ -1466,18 +1471,20 @@ raise_if_timedelta64_metadata_cast_error(char *object_type,
return 0;
}
else {
- PyObject *errmsg;
- errmsg = PyUString_FromFormat("Cannot cast %s "
- "from metadata ", object_type);
- errmsg = append_metastr_to_string(src_meta, 0, errmsg);
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromString(" to "));
- errmsg = append_metastr_to_string(dst_meta, 0, errmsg);
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromFormat(" according to the rule %s",
- npy_casting_to_string(casting)));
- PyErr_SetObject(PyExc_TypeError, errmsg);
- Py_DECREF(errmsg);
+ PyObject *src = metastr_to_unicode(src_meta, 0);
+ if (src == NULL) {
+ return -1;
+ }
+ PyObject *dst = metastr_to_unicode(dst_meta, 0);
+ if (dst == NULL) {
+ Py_DECREF(src);
+ return -1;
+ }
+ PyErr_Format(PyExc_TypeError,
+ "Cannot cast %s from metadata %S to %S according to the rule %s",
+ object_type, src, dst, npy_casting_to_string(casting));
+ Py_DECREF(src);
+ Py_DECREF(dst);
return -1;
}
}
@@ -1600,32 +1607,38 @@ compute_datetime_metadata_greatest_common_divisor(
return 0;
incompatible_units: {
- PyObject *errmsg;
- errmsg = PyUString_FromString("Cannot get "
- "a common metadata divisor for "
- "NumPy datetime metadata ");
- errmsg = append_metastr_to_string(meta1, 0, errmsg);
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromString(" and "));
- errmsg = append_metastr_to_string(meta2, 0, errmsg);
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromString(" because they have "
- "incompatible nonlinear base time units"));
- PyErr_SetObject(PyExc_TypeError, errmsg);
- Py_DECREF(errmsg);
+ PyObject *umeta1 = metastr_to_unicode(meta1, 0);
+ if (umeta1 == NULL) {
+ return -1;
+ }
+ PyObject *umeta2 = metastr_to_unicode(meta2, 0);
+ if (umeta2 == NULL) {
+ Py_DECREF(umeta1);
+ return -1;
+ }
+ PyErr_Format(PyExc_TypeError,
+ "Cannot get a common metadata divisor for Numpy datatime "
+ "metadata %S and %S because they have incompatible nonlinear "
+ "base time units.", umeta1, umeta2);
+ Py_DECREF(umeta1);
+ Py_DECREF(umeta2);
return -1;
}
units_overflow: {
- PyObject *errmsg;
- errmsg = PyUString_FromString("Integer overflow "
- "getting a common metadata divisor for "
- "NumPy datetime metadata ");
- errmsg = append_metastr_to_string(meta1, 0, errmsg);
- PyUString_ConcatAndDel(&errmsg,
- PyUString_FromString(" and "));
- errmsg = append_metastr_to_string(meta2, 0, errmsg);
- PyErr_SetObject(PyExc_OverflowError, errmsg);
- Py_DECREF(errmsg);
+ PyObject *umeta1 = metastr_to_unicode(meta1, 0);
+ if (umeta1 == NULL) {
+ return -1;
+ }
+ PyObject *umeta2 = metastr_to_unicode(meta2, 0);
+ if (umeta2 == NULL) {
+ Py_DECREF(umeta1);
+ return -1;
+ }
+ PyErr_Format(PyExc_OverflowError,
+ "Integer overflow getting a common metadata divisor for "
+ "NumPy datetime metadata %S and %S.", umeta1, umeta2);
+ Py_DECREF(umeta1);
+ Py_DECREF(umeta2);
return -1;
}
}
@@ -1717,6 +1730,10 @@ parse_datetime_unit_from_string(char const *str, Py_ssize_t len, char const *met
return NPY_FR_as;
}
}
+ else if (len == 3 && !strncmp(str, "\xce\xbcs", 3)) {
+ /* greek small letter mu, utf8-encoded */
+ return NPY_FR_us;
+ }
else if (len == 7 && !strncmp(str, "generic", 7)) {
return NPY_FR_GENERIC;
}
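The new branch above makes the UTF-8 encoded Greek small letter mu an accepted spelling of microseconds in datetime unit strings, e.g. (on a NumPy containing this change):

    import numpy as np

    assert np.dtype("M8[\u03bcs]") == np.dtype("M8[us]")
    assert np.dtype("m8[\u03bcs]") == np.dtype("m8[us]")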
@@ -1747,9 +1764,9 @@ convert_datetime_metadata_to_tuple(PyArray_DatetimeMetaData *meta)
}
PyTuple_SET_ITEM(dt_tuple, 0,
- PyUString_FromString(_datetime_strings[meta->base]));
+ PyUnicode_FromString(_datetime_strings[meta->base]));
PyTuple_SET_ITEM(dt_tuple, 1,
- PyInt_FromLong(meta->num));
+ PyLong_FromLong(meta->num));
return dt_tuple;
}
@@ -1764,22 +1781,16 @@ convert_datetime_metadata_tuple_to_datetime_metadata(PyObject *tuple,
PyArray_DatetimeMetaData *out_meta,
npy_bool from_pickle)
{
- char *basestr = NULL;
- Py_ssize_t len = 0, tuple_size;
int den = 1;
- PyObject *unit_str = NULL;
if (!PyTuple_Check(tuple)) {
- PyObject *errmsg;
- errmsg = PyUString_FromString("Require tuple for tuple to NumPy "
- "datetime metadata conversion, not ");
- PyUString_ConcatAndDel(&errmsg, PyObject_Repr(tuple));
- PyErr_SetObject(PyExc_TypeError, errmsg);
- Py_DECREF(errmsg);
+ PyErr_Format(PyExc_TypeError,
+ "Require tuple for tuple to NumPy "
+ "datetime metadata conversion, not %R", tuple);
return -1;
}
- tuple_size = PyTuple_GET_SIZE(tuple);
+ Py_ssize_t tuple_size = PyTuple_GET_SIZE(tuple);
if (tuple_size < 2 || tuple_size > 4) {
PyErr_SetString(PyExc_TypeError,
"Require tuple of size 2 to 4 for "
@@ -1787,18 +1798,22 @@ convert_datetime_metadata_tuple_to_datetime_metadata(PyObject *tuple,
return -1;
}
- unit_str = PyTuple_GET_ITEM(tuple, 0);
- Py_INCREF(unit_str);
- if (PyUnicode_Check(unit_str)) {
- /* Allow unicode format strings: convert to bytes */
- PyObject *tmp = PyUnicode_AsASCIIString(unit_str);
- Py_DECREF(unit_str);
+ PyObject *unit_str = PyTuple_GET_ITEM(tuple, 0);
+ if (PyBytes_Check(unit_str)) {
+ /* Allow bytes format strings: convert to unicode */
+ PyObject *tmp = PyUnicode_FromEncodedObject(unit_str, NULL, NULL);
if (tmp == NULL) {
return -1;
}
unit_str = tmp;
}
- if (PyBytes_AsStringAndSize(unit_str, &basestr, &len) < 0) {
+ else {
+ Py_INCREF(unit_str);
+ }
+
+ Py_ssize_t len;
+ char const *basestr = PyUnicode_AsUTF8AndSize(unit_str, &len);
+ if (basestr == NULL) {
Py_DECREF(unit_str);
return -1;
}
@@ -1812,7 +1827,7 @@ convert_datetime_metadata_tuple_to_datetime_metadata(PyObject *tuple,
Py_DECREF(unit_str);
/* Convert the values to longs */
- out_meta->num = PyInt_AsLong(PyTuple_GET_ITEM(tuple, 1));
+ out_meta->num = PyLong_AsLong(PyTuple_GET_ITEM(tuple, 1));
if (error_converting(out_meta->num)) {
return -1;
}
@@ -1837,11 +1852,10 @@ convert_datetime_metadata_tuple_to_datetime_metadata(PyObject *tuple,
if (from_pickle) {
/* if (event == 1) */
PyObject *one = PyLong_FromLong(1);
- int equal_one;
if (one == NULL) {
return -1;
}
- equal_one = PyObject_RichCompareBool(event, one, Py_EQ);
+ int equal_one = PyObject_RichCompareBool(event, one, Py_EQ);
Py_DECREF(one);
if (equal_one == -1) {
return -1;
@@ -1868,7 +1882,7 @@ convert_datetime_metadata_tuple_to_datetime_metadata(PyObject *tuple,
return -1;
}
}
- den = PyInt_AsLong(PyTuple_GET_ITEM(tuple, 2));
+ den = PyLong_AsLong(PyTuple_GET_ITEM(tuple, 2));
if (error_converting(den)) {
return -1;
}
@@ -1900,26 +1914,23 @@ NPY_NO_EXPORT int
convert_pyobject_to_datetime_metadata(PyObject *obj,
PyArray_DatetimeMetaData *out_meta)
{
- PyObject *ascii = NULL;
- char *str = NULL;
- Py_ssize_t len = 0;
-
if (PyTuple_Check(obj)) {
return convert_datetime_metadata_tuple_to_datetime_metadata(
obj, out_meta, NPY_FALSE);
}
- /* Get an ASCII string */
- if (PyUnicode_Check(obj)) {
- /* Allow unicode format strings: convert to bytes */
- ascii = PyUnicode_AsASCIIString(obj);
- if (ascii == NULL) {
+ /* Get a UTF8 string */
+ PyObject *utf8 = NULL;
+ if (PyBytes_Check(obj)) {
+ /* Allow bytes format strings: convert to unicode */
+ utf8 = PyUnicode_FromEncodedObject(obj, NULL, NULL);
+ if (utf8 == NULL) {
return -1;
}
}
- else if (PyBytes_Check(obj)) {
- ascii = obj;
- Py_INCREF(ascii);
+ else if (PyUnicode_Check(obj)) {
+ utf8 = obj;
+ Py_INCREF(utf8);
}
else {
PyErr_SetString(PyExc_TypeError,
@@ -1927,58 +1938,52 @@ convert_pyobject_to_datetime_metadata(PyObject *obj,
return -1;
}
- if (PyBytes_AsStringAndSize(ascii, &str, &len) < 0) {
- Py_DECREF(ascii);
+ Py_ssize_t len = 0;
+ char const *str = PyUnicode_AsUTF8AndSize(utf8, &len);
+ if (str == NULL) {
+ Py_DECREF(utf8);
return -1;
}
if (len > 0 && str[0] == '[') {
int r = parse_datetime_metadata_from_metastr(str, len, out_meta);
- Py_DECREF(ascii);
+ Py_DECREF(utf8);
return r;
}
else {
if (parse_datetime_extended_unit_from_string(str, len,
NULL, out_meta) < 0) {
- Py_DECREF(ascii);
+ Py_DECREF(utf8);
return -1;
}
- Py_DECREF(ascii);
+ Py_DECREF(utf8);
return 0;
}
}
/*
- * 'ret' is a PyUString containing the datetime string, and this
- * function appends the metadata string to it.
+ * Return the datetime metadata as a Unicode object.
+ *
+ * Returns a new reference, or NULL on error.
*
* If 'skip_brackets' is true, skips the '[]'.
*
- * This function steals the reference 'ret'
*/
NPY_NO_EXPORT PyObject *
-append_metastr_to_string(PyArray_DatetimeMetaData *meta,
- int skip_brackets,
- PyObject *ret)
+metastr_to_unicode(PyArray_DatetimeMetaData *meta, int skip_brackets)
{
- PyObject *res;
int num;
char const *basestr;
- if (ret == NULL) {
- return NULL;
- }
-
if (meta->base == NPY_FR_GENERIC) {
/* Without brackets, give a string "generic" */
if (skip_brackets) {
- PyUString_ConcatAndDel(&ret, PyUString_FromString("generic"));
- return ret;
+ return PyUnicode_FromString("generic");
}
- /* But with brackets, append nothing */
+ /* But with brackets, return nothing */
else {
- return ret;
+ return PyUnicode_FromString("");
}
}
@@ -1994,25 +1999,23 @@ append_metastr_to_string(PyArray_DatetimeMetaData *meta,
if (num == 1) {
if (skip_brackets) {
- res = PyUString_FromFormat("%s", basestr);
+ return PyUnicode_FromFormat("%s", basestr);
}
else {
- res = PyUString_FromFormat("[%s]", basestr);
+ return PyUnicode_FromFormat("[%s]", basestr);
}
}
else {
if (skip_brackets) {
- res = PyUString_FromFormat("%d%s", num, basestr);
+ return PyUnicode_FromFormat("%d%s", num, basestr);
}
else {
- res = PyUString_FromFormat("[%d%s]", num, basestr);
+ return PyUnicode_FromFormat("[%d%s]", num, basestr);
}
}
-
- PyUString_ConcatAndDel(&ret, res);
- return ret;
}
+
/*
* Adjusts a datetimestruct based on a seconds offset. Assumes
* the current values are valid.
@@ -2108,7 +2111,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
if (tmp == NULL) {
return -1;
}
- out->year = PyInt_AsLong(tmp);
+ out->year = PyLong_AsLong(tmp);
if (error_converting(out->year)) {
Py_DECREF(tmp);
return -1;
@@ -2120,7 +2123,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
if (tmp == NULL) {
return -1;
}
- out->month = PyInt_AsLong(tmp);
+ out->month = PyLong_AsLong(tmp);
if (error_converting(out->month)) {
Py_DECREF(tmp);
return -1;
@@ -2132,7 +2135,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
if (tmp == NULL) {
return -1;
}
- out->day = PyInt_AsLong(tmp);
+ out->day = PyLong_AsLong(tmp);
if (error_converting(out->day)) {
Py_DECREF(tmp);
return -1;
@@ -2166,7 +2169,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
if (tmp == NULL) {
return -1;
}
- out->hour = PyInt_AsLong(tmp);
+ out->hour = PyLong_AsLong(tmp);
if (error_converting(out->hour)) {
Py_DECREF(tmp);
return -1;
@@ -2178,7 +2181,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
if (tmp == NULL) {
return -1;
}
- out->min = PyInt_AsLong(tmp);
+ out->min = PyLong_AsLong(tmp);
if (error_converting(out->min)) {
Py_DECREF(tmp);
return -1;
@@ -2190,7 +2193,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
if (tmp == NULL) {
return -1;
}
- out->sec = PyInt_AsLong(tmp);
+ out->sec = PyLong_AsLong(tmp);
if (error_converting(out->sec)) {
Py_DECREF(tmp);
return -1;
@@ -2202,7 +2205,7 @@ convert_pydatetime_to_datetimestruct(PyObject *obj, npy_datetimestruct *out,
if (tmp == NULL) {
return -1;
}
- out->us = PyInt_AsLong(tmp);
+ out->us = PyLong_AsLong(tmp);
if (error_converting(out->us)) {
Py_DECREF(tmp);
return -1;
@@ -2350,32 +2353,33 @@ convert_pyobject_to_datetime(PyArray_DatetimeMetaData *meta, PyObject *obj,
NPY_CASTING casting, npy_datetime *out)
{
if (PyBytes_Check(obj) || PyUnicode_Check(obj)) {
- PyObject *bytes = NULL;
- char *str = NULL;
- Py_ssize_t len = 0;
- npy_datetimestruct dts;
- NPY_DATETIMEUNIT bestunit = NPY_FR_ERROR;
+ PyObject *utf8 = NULL;
- /* Convert to an ASCII string for the date parser */
- if (PyUnicode_Check(obj)) {
- bytes = PyUnicode_AsASCIIString(obj);
- if (bytes == NULL) {
+        /* Convert to a UTF8 string for the date parser */
+ if (PyBytes_Check(obj)) {
+ utf8 = PyUnicode_FromEncodedObject(obj, NULL, NULL);
+ if (utf8 == NULL) {
return -1;
}
}
else {
- bytes = obj;
- Py_INCREF(bytes);
+ utf8 = obj;
+ Py_INCREF(utf8);
}
- if (PyBytes_AsStringAndSize(bytes, &str, &len) < 0) {
- Py_DECREF(bytes);
+
+ Py_ssize_t len = 0;
+ char const *str = PyUnicode_AsUTF8AndSize(utf8, &len);
+ if (str == NULL) {
+ Py_DECREF(utf8);
return -1;
}
/* Parse the ISO date */
+ npy_datetimestruct dts;
+ NPY_DATETIMEUNIT bestunit = NPY_FR_ERROR;
if (parse_iso_8601_datetime(str, len, meta->base, casting,
&dts, &bestunit, NULL) < 0) {
- Py_DECREF(bytes);
+ Py_DECREF(utf8);
return -1;
}
@@ -2386,15 +2390,15 @@ convert_pyobject_to_datetime(PyArray_DatetimeMetaData *meta, PyObject *obj,
}
if (convert_datetimestruct_to_datetime(meta, &dts, out) < 0) {
- Py_DECREF(bytes);
+ Py_DECREF(utf8);
return -1;
}
- Py_DECREF(bytes);
+ Py_DECREF(utf8);
return 0;
}
/* Do no conversion on raw integers */
- else if (PyInt_Check(obj) || PyLong_Check(obj)) {
+ else if (PyLong_Check(obj)) {
/* Don't allow conversion from an integer without specifying a unit */
if (meta->base == NPY_FR_ERROR || meta->base == NPY_FR_GENERIC) {
PyErr_SetString(PyExc_ValueError, "Converting an integer to a "
@@ -2544,24 +2548,25 @@ convert_pyobject_to_timedelta(PyArray_DatetimeMetaData *meta, PyObject *obj,
NPY_CASTING casting, npy_timedelta *out)
{
if (PyBytes_Check(obj) || PyUnicode_Check(obj)) {
- PyObject *bytes = NULL;
- char *str = NULL;
- Py_ssize_t len = 0;
+ PyObject *utf8 = NULL;
int succeeded = 0;
- /* Convert to an ASCII string for the date parser */
- if (PyUnicode_Check(obj)) {
- bytes = PyUnicode_AsASCIIString(obj);
- if (bytes == NULL) {
+        /* Convert to a UTF8 string for the date parser */
+ if (PyBytes_Check(obj)) {
+ utf8 = PyUnicode_FromEncodedObject(obj, NULL, NULL);
+ if (utf8 == NULL) {
return -1;
}
}
else {
- bytes = obj;
- Py_INCREF(bytes);
+ utf8 = obj;
+ Py_INCREF(utf8);
}
- if (PyBytes_AsStringAndSize(bytes, &str, &len) < 0) {
- Py_DECREF(bytes);
+
+ Py_ssize_t len = 0;
+ char const *str = PyUnicode_AsUTF8AndSize(utf8, &len);
+ if (str == NULL) {
+ Py_DECREF(utf8);
return -1;
}
@@ -2582,7 +2587,7 @@ convert_pyobject_to_timedelta(PyArray_DatetimeMetaData *meta, PyObject *obj,
succeeded = 1;
}
}
- Py_DECREF(bytes);
+ Py_DECREF(utf8);
if (succeeded) {
/* Use generic units if none was specified */
@@ -2595,7 +2600,7 @@ convert_pyobject_to_timedelta(PyArray_DatetimeMetaData *meta, PyObject *obj,
}
}
/* Do no conversion on raw integers */
- else if (PyInt_Check(obj) || PyLong_Check(obj)) {
+ else if (PyLong_Check(obj)) {
/* Use the default unit if none was specified */
if (meta->base == NPY_FR_ERROR) {
meta->base = NPY_DATETIME_DEFAULTUNIT;
@@ -2699,7 +2704,7 @@ convert_pyobject_to_timedelta(PyArray_DatetimeMetaData *meta, PyObject *obj,
if (tmp == NULL) {
return -1;
}
- seconds = PyInt_AsLong(tmp);
+ seconds = PyLong_AsLong(tmp);
if (error_converting(seconds)) {
Py_DECREF(tmp);
return -1;
@@ -2711,7 +2716,7 @@ convert_pyobject_to_timedelta(PyArray_DatetimeMetaData *meta, PyObject *obj,
if (tmp == NULL) {
return -1;
}
- useconds = PyInt_AsLong(tmp);
+ useconds = PyLong_AsLong(tmp);
if (error_converting(useconds)) {
Py_DECREF(tmp);
return -1;
@@ -3320,8 +3325,7 @@ datetime_arange(PyObject *start, PyObject *stop, PyObject *step,
type_nums[2] = NPY_TIMEDELTA;
}
else {
- if (PyInt_Check(objs[1]) ||
- PyLong_Check(objs[1]) ||
+ if (PyLong_Check(objs[1]) ||
PyArray_IsScalar(objs[1], Integer) ||
is_any_numpy_timedelta(objs[1])) {
type_nums[1] = NPY_TIMEDELTA;
@@ -3724,3 +3728,375 @@ find_object_datetime_type(PyObject *obj, int type_num)
return NULL;
}
}
+
+
+
+
+/*
+ * Describes casting within datetimes or timedelta
+ */
+static NPY_CASTING
+time_to_time_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]),
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ /* This is a within-dtype cast, which currently must handle byteswapping */
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+ if (given_descrs[1] == NULL) {
+ loop_descrs[1] = ensure_dtype_nbo(given_descrs[0]);
+ }
+ else {
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+
+ int is_timedelta = given_descrs[0]->type_num == NPY_TIMEDELTA;
+
+ if (given_descrs[0] == given_descrs[1]) {
+ return NPY_NO_CASTING | _NPY_CAST_IS_VIEW;
+ }
+
+ NPY_CASTING byteorder_may_allow_view = 0;
+ if (PyDataType_ISNOTSWAPPED(loop_descrs[0]) ==
+ PyDataType_ISNOTSWAPPED(loop_descrs[1])) {
+ byteorder_may_allow_view = _NPY_CAST_IS_VIEW;
+ }
+ PyArray_DatetimeMetaData *meta1, *meta2;
+ meta1 = get_datetime_metadata_from_dtype(loop_descrs[0]);
+ assert(meta1 != NULL);
+ meta2 = get_datetime_metadata_from_dtype(loop_descrs[1]);
+ assert(meta2 != NULL);
+
+ if (meta1->base == meta2->base && meta1->num == meta2->num) {
+ if (byteorder_may_allow_view) {
+ return NPY_NO_CASTING | byteorder_may_allow_view;
+ }
+ return NPY_EQUIV_CASTING;
+ }
+ else if (meta1->base == NPY_FR_GENERIC) {
+ return NPY_SAFE_CASTING | byteorder_may_allow_view;
+ }
+ else if (meta2->base == NPY_FR_GENERIC) {
+ /* TODO: This is actually an invalid cast (casting will error) */
+ return NPY_UNSAFE_CASTING;
+ }
+ else if (is_timedelta && (
+            /* jumping between time units and date units is unsafe for timedelta */
+ (meta1->base <= NPY_FR_M && meta2->base > NPY_FR_M) ||
+ (meta1->base > NPY_FR_M && meta2->base <= NPY_FR_M))) {
+ return NPY_UNSAFE_CASTING;
+ }
+ else if (meta1->base <= meta2->base) {
+ /* Casting to a more precise unit is currently considered safe */
+ if (datetime_metadata_divides(meta1, meta2, is_timedelta)) {
+ /* If it divides, we consider it to be a safe cast */
+ return NPY_SAFE_CASTING;
+ }
+ else {
+ return NPY_SAME_KIND_CASTING;
+ }
+ }
+ return NPY_SAME_KIND_CASTING;
+}
+
+
+/* Handles datetime<->timedelta type resolution (both directions) */
+static NPY_CASTING
+datetime_to_timedelta_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ loop_descrs[0] = ensure_dtype_nbo(given_descrs[0]);
+ if (loop_descrs[0] == NULL) {
+ return -1;
+ }
+ if (given_descrs[1] == NULL) {
+ PyArray_DatetimeMetaData *meta = get_datetime_metadata_from_dtype(given_descrs[0]);
+ assert(meta != NULL);
+ loop_descrs[1] = create_datetime_dtype(dtypes[1]->type_num, meta);
+ }
+ else {
+ loop_descrs[1] = ensure_dtype_nbo(given_descrs[1]);
+ }
+ if (loop_descrs[1] == NULL) {
+ Py_DECREF(loop_descrs[0]);
+ return -1;
+ }
+ /*
+     * In most cases NPY_UNSAFE_CASTING is not accurate here: the cast
+     * will simply fail.
+ * TODO: Once ufuncs use dtype specific promotion rules,
+ * this is likely unnecessary
+ */
+ return NPY_UNSAFE_CASTING;
+}
+
+
+/* In the current setup, both the string and unicode casts support all outputs */
+static NPY_CASTING
+time_to_string_resolve_descriptors(
+ PyArrayMethodObject *self,
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr **given_descrs,
+ PyArray_Descr **loop_descrs)
+{
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+ if (given_descrs[1] != NULL) {
+ /*
+ * At the time of writing, NumPy does not check the length here,
+ * but will error if filling fails.
+ */
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+ else {
+ /* Find the correct string length, possibly based on the unit */
+ int size;
+ if (given_descrs[0]->type_num == NPY_DATETIME) {
+ PyArray_DatetimeMetaData *meta = get_datetime_metadata_from_dtype(given_descrs[0]);
+ assert(meta != NULL);
+ size = get_datetime_iso_8601_strlen(0, meta->base);
+ }
+ else {
+ size = 21;
+ }
+ if (dtypes[1]->type_num == NPY_UNICODE) {
+ size *= 4;
+ }
+ loop_descrs[1] = PyArray_DescrNewFromType(dtypes[1]->type_num);
+ if (loop_descrs[1] == NULL) {
+ Py_DECREF(loop_descrs[0]);
+ return -1;
+ }
+ loop_descrs[1]->elsize = size;
+ }
+ assert(self->casting == NPY_UNSAFE_CASTING);
+ return NPY_UNSAFE_CASTING;
+}
+
+
+static NPY_CASTING
+string_to_datetime_cast_resolve_descriptors(
+ PyArrayMethodObject *NPY_UNUSED(self),
+ PyArray_DTypeMeta *dtypes[2],
+ PyArray_Descr *given_descrs[2],
+ PyArray_Descr *loop_descrs[2])
+{
+ /* We currently support byte-swapping, so any (unicode) string is OK */
+ Py_INCREF(given_descrs[0]);
+ loop_descrs[0] = given_descrs[0];
+
+ if (given_descrs[1] == NULL) {
+ /* NOTE: This doesn't actually work, and will error during the cast */
+ loop_descrs[1] = dtypes[1]->default_descr(dtypes[1]);
+ if (loop_descrs[1] == NULL) {
+ Py_DECREF(loop_descrs[0]);
+ return -1;
+ }
+ }
+ else {
+ Py_INCREF(given_descrs[1]);
+ loop_descrs[1] = given_descrs[1];
+ }
+
+ return NPY_UNSAFE_CASTING;
+}
+
+
+/*
+ * This registers the castingimpl for all datetime-related casts.
+ */
+NPY_NO_EXPORT int
+PyArray_InitializeDatetimeCasts()
+{
+ int result = -1;
+
+ PyType_Slot slots[3];
+ PyArray_DTypeMeta *dtypes[2];
+ PyArrayMethod_Spec spec = {
+ .name = "datetime_casts",
+ .nin = 1,
+ .nout = 1,
+ .casting = NPY_NO_CASTING,
+ .flags = NPY_METH_SUPPORTS_UNALIGNED,
+ .slots = slots,
+ .dtypes = dtypes,
+ };
+ slots[0].slot = NPY_METH_resolve_descriptors;
+ slots[0].pfunc = &time_to_time_resolve_descriptors;
+ slots[1].slot = NPY_METH_get_loop;
+ slots[1].pfunc = NULL;
+ slots[2].slot = 0;
+ slots[2].pfunc = NULL;
+
+ PyArray_DTypeMeta *datetime = PyArray_DTypeFromTypeNum(NPY_DATETIME);
+ PyArray_DTypeMeta *timedelta = PyArray_DTypeFromTypeNum(NPY_TIMEDELTA);
+ PyArray_DTypeMeta *string = PyArray_DTypeFromTypeNum(NPY_STRING);
+ PyArray_DTypeMeta *unicode = PyArray_DTypeFromTypeNum(NPY_UNICODE);
+ PyArray_DTypeMeta *tmp = NULL;
+
+ dtypes[0] = datetime;
+ dtypes[1] = datetime;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto fail;
+ }
+ dtypes[0] = timedelta;
+ dtypes[1] = timedelta;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto fail;
+ }
+
+ /*
+ * Casting between timedelta and datetime uses legacy casting loops, but
+ * custom dtype resolution (to handle copying of the time unit).
+ */
+ slots[0].slot = NPY_METH_resolve_descriptors;
+ slots[0].pfunc = &datetime_to_timedelta_resolve_descriptors;
+ slots[1].slot = NPY_METH_get_loop;
+ slots[1].pfunc = NULL;
+ slots[2].slot = 0;
+ slots[2].pfunc = NULL;
+
+ spec.name = "timedelta_and_datetime_cast";
+ dtypes[0] = timedelta;
+ dtypes[1] = datetime;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto fail;
+ }
+ spec.name = "datetime_to_timedelta_cast";
+ dtypes[0] = datetime;
+ dtypes[1] = timedelta;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto fail;
+ }
+
+ /*
+ * Cast from numeric types to times. These use the cast functions
+ * as stored on the datatype, which should be replaced at some point.
+ * Some of these casts can fail (casting to unitless datetime), but these
+ * are rather special.
+ */
+ for (int num = 0; num < NPY_NTYPES; num++) {
+ if (!PyTypeNum_ISNUMBER(num) && num != NPY_BOOL) {
+ continue;
+ }
+
+ Py_XSETREF(tmp, PyArray_DTypeFromTypeNum(num));
+
+ if (PyArray_AddLegacyWrapping_CastingImpl(
+ tmp, datetime, NPY_UNSAFE_CASTING) < 0) {
+ goto fail;
+ }
+ if (PyArray_AddLegacyWrapping_CastingImpl(
+ datetime, tmp, NPY_UNSAFE_CASTING) < 0) {
+ goto fail;
+ }
+
+ NPY_CASTING to_timedelta_casting = NPY_UNSAFE_CASTING;
+ if (PyTypeNum_ISINTEGER(num) || num == NPY_BOOL) {
+ /* timedelta casts like int64 right now... */
+ if (PyTypeNum_ISUNSIGNED(num) && tmp->singleton->elsize == 8) {
+ to_timedelta_casting = NPY_SAME_KIND_CASTING;
+ }
+ else {
+ to_timedelta_casting = NPY_SAFE_CASTING;
+ }
+ }
+ if (PyArray_AddLegacyWrapping_CastingImpl(
+ tmp, timedelta, to_timedelta_casting) < 0) {
+ goto fail;
+ }
+ if (PyArray_AddLegacyWrapping_CastingImpl(
+ timedelta, tmp, NPY_UNSAFE_CASTING) < 0) {
+ goto fail;
+ }
+ }
+
+ /*
+ * Cast times to string and unicode
+ */
+ spec.casting = NPY_UNSAFE_CASTING;
+ /*
+     * Casts can error and need the Python API (unicode needs it for string->unicode).
+ * Unicode handling is currently implemented via a legacy cast.
+ */
+ spec.flags = NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_REQUIRES_PYAPI;
+
+ slots[0].slot = NPY_METH_resolve_descriptors;
+ slots[0].pfunc = &time_to_string_resolve_descriptors;
+ slots[1].slot = NPY_METH_get_loop;
+ slots[1].pfunc = NULL;
+ slots[2].slot = 0;
+ slots[2].pfunc = NULL;
+
+ for (int num = NPY_DATETIME; num <= NPY_TIMEDELTA; num++) {
+ for (int str = NPY_STRING; str <= NPY_UNICODE; str++) {
+ dtypes[0] = PyArray_DTypeFromTypeNum(num);
+ dtypes[1] = PyArray_DTypeFromTypeNum(str);
+
+ int res = PyArray_AddCastingImplementation_FromSpec(&spec, 1);
+ Py_SETREF(dtypes[0], NULL);
+ Py_SETREF(dtypes[1], NULL);
+ if (res < 0) {
+ return -1;
+ }
+ }
+ }
+
+ /*
+     * Casts from strings to timedelta are currently only legacy casts.
+ */
+ if (PyArray_AddLegacyWrapping_CastingImpl(
+ string, timedelta, NPY_UNSAFE_CASTING) < 0) {
+ goto fail;
+ }
+ if (PyArray_AddLegacyWrapping_CastingImpl(
+ unicode, timedelta, NPY_UNSAFE_CASTING) < 0) {
+ goto fail;
+ }
+
+ /*
+ * Cast strings to datetime
+ */
+ dtypes[1] = datetime;
+ spec.casting = NPY_UNSAFE_CASTING;
+
+ /* The default type resolution should work fine. */
+ slots[0].slot = NPY_METH_resolve_descriptors;
+ slots[0].pfunc = &string_to_datetime_cast_resolve_descriptors;
+ slots[1].slot = NPY_METH_get_loop;
+ slots[1].pfunc = NULL;
+ slots[2].slot = 0;
+ slots[2].pfunc = NULL;
+
+ dtypes[0] = string;
+ spec.flags = NPY_METH_SUPPORTS_UNALIGNED;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto fail;
+ }
+
+ dtypes[0] = unicode;
+ /*
+ * Unicode handling is currently implemented via a legacy cast, which
+ * requires the Python API.
+ */
+ spec.flags = NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_REQUIRES_PYAPI;
+ if (PyArray_AddCastingImplementation_FromSpec(&spec, 1) < 0) {
+ goto fail;
+ }
+
+ result = 0;
+ fail:
+ Py_DECREF(datetime);
+ Py_DECREF(timedelta);
+ Py_DECREF(string);
+ Py_DECREF(unicode);
+ Py_XDECREF(tmp);
+ return result;
+}
+
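/*
 * Editor's note -- a minimal, hedged sketch (not part of the patch above)
 * of the string-handling pattern these hunks converge on: bytes input is
 * decoded to a unicode object first, and the parsers then read a UTF8 view
 * of that object rather than an ASCII-encoded bytes copy.  The helper name
 * `get_utf8_view` is hypothetical and only illustrates the pattern.
 */
#include <Python.h>

static int
get_utf8_view(PyObject *obj, PyObject **utf8_out,
              char const **str_out, Py_ssize_t *len_out)
{
    PyObject *utf8;
    if (PyBytes_Check(obj)) {
        /* accept bytes input: decode (UTF8 by default) to unicode */
        utf8 = PyUnicode_FromEncodedObject(obj, NULL, NULL);
        if (utf8 == NULL) {
            return -1;
        }
    }
    else {
        /* assume unicode input; keep a reference for the caller */
        Py_INCREF(obj);
        utf8 = obj;
    }
    /* the UTF8 buffer is owned by (and valid as long as) `utf8` */
    *str_out = PyUnicode_AsUTF8AndSize(utf8, len_out);
    if (*str_out == NULL) {
        Py_DECREF(utf8);
        return -1;
    }
    *utf8_out = utf8;  /* caller must Py_DECREF once done with *str_out */
    return 0;
}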
diff --git a/numpy/core/src/multiarray/datetime_busday.c b/numpy/core/src/multiarray/datetime_busday.c
index d3cce8a37..2cf157551 100644
--- a/numpy/core/src/multiarray/datetime_busday.c
+++ b/numpy/core/src/multiarray/datetime_busday.c
@@ -834,24 +834,23 @@ static int
PyArray_BusDayRollConverter(PyObject *roll_in, NPY_BUSDAY_ROLL *roll)
{
PyObject *obj = roll_in;
- char *str;
- Py_ssize_t len;
- /* Make obj into an ASCII string */
- Py_INCREF(obj);
- if (PyUnicode_Check(obj)) {
- /* accept unicode input */
- PyObject *obj_str;
- obj_str = PyUnicode_AsASCIIString(obj);
+    /* Make obj into a UTF8 string */
+ if (PyBytes_Check(obj)) {
+ /* accept bytes input */
+ PyObject *obj_str = PyUnicode_FromEncodedObject(obj, NULL, NULL);
if (obj_str == NULL) {
- Py_DECREF(obj);
return 0;
}
- Py_DECREF(obj);
obj = obj_str;
}
+ else {
+ Py_INCREF(obj);
+ }
- if (PyBytes_AsStringAndSize(obj, &str, &len) < 0) {
+ Py_ssize_t len;
+ char const *str = PyUnicode_AsUTF8AndSize(obj, &len);
+ if (str == NULL) {
Py_DECREF(obj);
return 0;
}
diff --git a/numpy/core/src/multiarray/datetime_busdaycal.c b/numpy/core/src/multiarray/datetime_busdaycal.c
index 6936a803f..d48141d4c 100644
--- a/numpy/core/src/multiarray/datetime_busdaycal.c
+++ b/numpy/core/src/multiarray/datetime_busdaycal.c
@@ -30,33 +30,31 @@ PyArray_WeekMaskConverter(PyObject *weekmask_in, npy_bool *weekmask)
{
PyObject *obj = weekmask_in;
- /* Make obj into an ASCII string if it is UNICODE */
- Py_INCREF(obj);
- if (PyUnicode_Check(obj)) {
- /* accept unicode input */
- PyObject *obj_str;
- obj_str = PyUnicode_AsASCIIString(obj);
+    /* Make obj into a UTF8 string */
+ if (PyBytes_Check(obj)) {
+ /* accept bytes input */
+ PyObject *obj_str = PyUnicode_FromEncodedObject(obj, NULL, NULL);
if (obj_str == NULL) {
- Py_DECREF(obj);
return 0;
}
- Py_DECREF(obj);
obj = obj_str;
}
+ else {
+ Py_INCREF(obj);
+ }
- if (PyBytes_Check(obj)) {
- char *str;
- Py_ssize_t len;
- int i;
- if (PyBytes_AsStringAndSize(obj, &str, &len) < 0) {
+ if (PyUnicode_Check(obj)) {
+ Py_ssize_t len;
+ char const *str = PyUnicode_AsUTF8AndSize(obj, &len);
+ if (str == NULL) {
Py_DECREF(obj);
return 0;
}
/* Length 7 is a string like "1111100" */
if (len == 7) {
- for (i = 0; i < 7; ++i) {
+ for (int i = 0; i < 7; ++i) {
switch(str[i]) {
case '0':
weekmask[i] = 0;
@@ -75,7 +73,7 @@ PyArray_WeekMaskConverter(PyObject *weekmask_in, npy_bool *weekmask)
general_weekmask_string:
/* a string like "SatSun" or "Mon Tue Wed" */
memset(weekmask, 0, 7);
- for (i = 0; i < len; i += 3) {
+ for (Py_ssize_t i = 0; i < len; i += 3) {
while (isspace(str[i]))
++i;
@@ -168,7 +166,7 @@ invalid_weekmask_string:
return 0;
}
- val = PyInt_AsLong(f);
+ val = PyLong_AsLong(f);
if (error_converting(val)) {
Py_DECREF(f);
Py_DECREF(obj);
diff --git a/numpy/core/src/multiarray/datetime_strings.c b/numpy/core/src/multiarray/datetime_strings.c
index f847c7ea8..360868568 100644
--- a/numpy/core/src/multiarray/datetime_strings.c
+++ b/numpy/core/src/multiarray/datetime_strings.c
@@ -1385,21 +1385,23 @@ array_datetime_as_string(PyObject *NPY_UNUSED(self), PyObject *args,
/* Parse the input unit if provided */
if (unit_in != NULL && unit_in != Py_None) {
PyObject *strobj;
- char *str = NULL;
- Py_ssize_t len = 0;
- if (PyUnicode_Check(unit_in)) {
- strobj = PyUnicode_AsASCIIString(unit_in);
- if (strobj == NULL) {
- goto fail;
+ if (PyBytes_Check(unit_in)) {
+ /* accept bytes input */
+ PyObject *obj_str = PyUnicode_FromEncodedObject(unit_in, NULL, NULL);
+ if (obj_str == NULL) {
+ return 0;
}
+ strobj = obj_str;
}
else {
+ Py_INCREF(unit_in);
strobj = unit_in;
- Py_INCREF(strobj);
}
- if (PyBytes_AsStringAndSize(strobj, &str, &len) < 0) {
+ Py_ssize_t len;
+ char const *str = PyUnicode_AsUTF8AndSize(strobj, &len);
+ if (str == NULL) {
Py_DECREF(strobj);
goto fail;
}
@@ -1434,24 +1436,27 @@ array_datetime_as_string(PyObject *NPY_UNUSED(self), PyObject *args,
/* Get the input time zone */
if (timezone_obj != NULL) {
- /* Convert to ASCII if it's unicode */
- if (PyUnicode_Check(timezone_obj)) {
- /* accept unicode input */
- PyObject *obj_str;
- obj_str = PyUnicode_AsASCIIString(timezone_obj);
+ PyObject *strobj;
+ if (PyBytes_Check(timezone_obj)) {
+ /* accept bytes input */
+ PyObject *obj_str = PyUnicode_FromEncodedObject(timezone_obj, NULL, NULL);
if (obj_str == NULL) {
goto fail;
}
- Py_DECREF(timezone_obj);
- timezone_obj = obj_str;
+ strobj = obj_str;
}
+ else {
+ Py_INCREF(timezone_obj);
+ strobj = timezone_obj;
+ }
+
+ Py_SETREF(timezone_obj, strobj);
/* Check for the supported string inputs */
- if (PyBytes_Check(timezone_obj)) {
- char *str;
+ if (PyUnicode_Check(timezone_obj)) {
Py_ssize_t len;
-
- if (PyBytes_AsStringAndSize(timezone_obj, &str, &len) < 0) {
+ char const *str = PyUnicode_AsUTF8AndSize(timezone_obj, &len);
+ if (str == NULL) {
goto fail;
}
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index 67d57975b..a8d575248 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -386,7 +386,7 @@ _convert_from_tuple(PyObject *obj, int align)
}
for (int i=0; i < shape.len; i++) {
PyTuple_SET_ITEM(newdescr->subarray->shape, i,
- PyInt_FromLong((long)shape.ptr[i]));
+ PyLong_FromLong((long)shape.ptr[i]));
if (PyTuple_GET_ITEM(newdescr->subarray->shape, i) == NULL) {
Py_DECREF(newdescr);
@@ -441,7 +441,7 @@ _convert_from_array_descr(PyObject *obj, int align)
}
PyObject *name = PyTuple_GET_ITEM(item, 0);
PyObject *title;
- if (PyBaseString_Check(name)) {
+ if (PyUnicode_Check(name)) {
title = NULL;
}
else if (PyTuple_Check(name)) {
@@ -454,7 +454,7 @@ _convert_from_array_descr(PyObject *obj, int align)
}
title = PyTuple_GET_ITEM(name, 0);
name = PyTuple_GET_ITEM(name, 1);
- if (!PyBaseString_Check(name)) {
+ if (!PyUnicode_Check(name)) {
PyErr_SetString(PyExc_TypeError, "Field name must be a str");
goto fail;
}
@@ -472,7 +472,7 @@ _convert_from_array_descr(PyObject *obj, int align)
if (PyUnicode_GetLength(name) == 0) {
Py_DECREF(name);
if (title == NULL) {
- name = PyUString_FromFormat("f%d", i);
+ name = PyUnicode_FromFormat("f%d", i);
if (name == NULL) {
goto fail;
}
@@ -512,7 +512,7 @@ _convert_from_array_descr(PyObject *obj, int align)
}
if ((PyDict_GetItemWithError(fields, name) != NULL)
|| (title
- && PyBaseString_Check(title)
+ && PyUnicode_Check(title)
&& (PyDict_GetItemWithError(fields, title) != NULL))) {
PyErr_Format(PyExc_ValueError,
"field %R occurs more than once", name);
@@ -537,7 +537,7 @@ _convert_from_array_descr(PyObject *obj, int align)
goto fail;
}
PyTuple_SET_ITEM(tup, 0, (PyObject *)conv);
- PyTuple_SET_ITEM(tup, 1, PyInt_FromLong((long) totalsize));
+ PyTuple_SET_ITEM(tup, 1, PyLong_FromLong((long) totalsize));
/*
* Title can be "meta-data". Only insert it
@@ -550,7 +550,7 @@ _convert_from_array_descr(PyObject *obj, int align)
if (PyDict_SetItem(fields, name, tup) < 0) {
goto fail;
}
- if (PyBaseString_Check(title)) {
+ if (PyUnicode_Check(title)) {
PyObject *existing = PyDict_GetItemWithError(fields, title);
if (existing == NULL && PyErr_Occurred()) {
goto fail;
@@ -660,7 +660,7 @@ _convert_from_list(PyObject *obj, int align)
}
maxalign = PyArray_MAX(maxalign, _align);
}
- PyObject *size_obj = PyInt_FromLong((long) totalsize);
+ PyObject *size_obj = PyLong_FromLong((long) totalsize);
if (!size_obj) {
Py_DECREF(conv);
goto fail;
@@ -673,7 +673,7 @@ _convert_from_list(PyObject *obj, int align)
}
PyTuple_SET_ITEM(tup, 0, (PyObject *)conv);
PyTuple_SET_ITEM(tup, 1, size_obj);
- PyObject *key = PyUString_FromFormat("f%d", i);
+ PyObject *key = PyUnicode_FromFormat("f%d", i);
if (!key) {
Py_DECREF(tup);
goto fail;
@@ -1112,7 +1112,7 @@ _convert_from_dict(PyObject *obj, int align)
/* Build item to insert (descr, offset, [title])*/
int len = 2;
PyObject *title = NULL;
- PyObject *ind = PyInt_FromLong(i);
+ PyObject *ind = PyLong_FromLong(i);
if (titles) {
title=PyObject_GetItem(titles, ind);
if (title && title != Py_None) {
@@ -1166,7 +1166,7 @@ _convert_from_dict(PyObject *obj, int align)
goto fail;
}
- PyTuple_SET_ITEM(tup, 1, PyInt_FromLong(offset));
+ PyTuple_SET_ITEM(tup, 1, PyLong_FromLong(offset));
/* Flag whether the fields are specified out of order */
if (offset < totalsize) {
has_out_of_order_fields = 1;
@@ -1190,7 +1190,7 @@ _convert_from_dict(PyObject *obj, int align)
if (align && _align > 1) {
totalsize = NPY_NEXT_ALIGNED_OFFSET(totalsize, _align);
}
- PyTuple_SET_ITEM(tup, 1, PyInt_FromLong(totalsize));
+ PyTuple_SET_ITEM(tup, 1, PyLong_FromLong(totalsize));
totalsize += newdescr->elsize;
}
if (len == 3) {
@@ -1202,7 +1202,7 @@ _convert_from_dict(PyObject *obj, int align)
Py_DECREF(tup);
goto fail;
}
- if (!PyBaseString_Check(name)) {
+ if (!PyUnicode_Check(name)) {
PyErr_SetString(PyExc_ValueError,
"field names must be strings");
Py_DECREF(tup);
@@ -1228,7 +1228,7 @@ _convert_from_dict(PyObject *obj, int align)
goto fail;
}
if (len == 3) {
- if (PyBaseString_Check(title)) {
+ if (PyUnicode_Check(title)) {
if (PyDict_GetItemWithError(fields, title) != NULL) {
PyErr_SetString(PyExc_ValueError,
"title already used as a name or title.");
@@ -1497,15 +1497,36 @@ _convert_from_any(PyObject *obj, int align)
}
else if (PyTuple_Check(obj)) {
/* or a tuple */
- return _convert_from_tuple(obj, align);
+ if (Py_EnterRecursiveCall(
+ " while trying to convert the given data type from"
+ " a tuple object" ) != 0) {
+ return NULL;
+ }
+ PyArray_Descr *ret = _convert_from_tuple(obj, align);
+ Py_LeaveRecursiveCall();
+ return ret;
}
else if (PyList_Check(obj)) {
/* or a list */
- return _convert_from_array_descr(obj, align);
+ if (Py_EnterRecursiveCall(
+ " while trying to convert the given data type from"
+ " a list object" ) != 0) {
+ return NULL;
+ }
+ PyArray_Descr *ret = _convert_from_array_descr(obj, align);
+ Py_LeaveRecursiveCall();
+ return ret;
}
else if (PyDict_Check(obj) || PyDictProxy_Check(obj)) {
/* or a dictionary */
- return _convert_from_dict(obj, align);
+ if (Py_EnterRecursiveCall(
+ " while trying to convert the given data type from"
+ " a dict object" ) != 0) {
+ return NULL;
+ }
+ PyArray_Descr *ret = _convert_from_dict(obj, align);
+ Py_LeaveRecursiveCall();
+ return ret;
}
else if (PyArray_Check(obj)) {
PyErr_SetString(PyExc_TypeError, "Cannot construct a dtype from an array");
@@ -1887,23 +1908,31 @@ arraydescr_protocol_typestr_get(PyArray_Descr *self)
size >>= 2;
}
if (self->type_num == NPY_OBJECT) {
- ret = PyUString_FromFormat("%c%c", endian, basic_);
+ ret = PyUnicode_FromFormat("%c%c", endian, basic_);
}
else {
- ret = PyUString_FromFormat("%c%c%d", endian, basic_, size);
+ ret = PyUnicode_FromFormat("%c%c%d", endian, basic_, size);
}
+ if (ret == NULL) {
+ return NULL;
+ }
+
if (PyDataType_ISDATETIME(self)) {
PyArray_DatetimeMetaData *meta;
-
meta = get_datetime_metadata_from_dtype(self);
if (meta == NULL) {
Py_DECREF(ret);
return NULL;
}
+ PyObject *umeta = metastr_to_unicode(meta, 0);
+ if (umeta == NULL) {
+ Py_DECREF(ret);
+ return NULL;
+ }
- ret = append_metastr_to_string(meta, 0, ret);
+ Py_SETREF(ret, PyUnicode_Concat(ret, umeta));
+ Py_DECREF(umeta);
}
-
return ret;
}
@@ -1950,7 +1979,7 @@ arraydescr_ndim_get(PyArray_Descr *self)
Py_ssize_t ndim;
if (!PyDataType_HASSUBARRAY(self)) {
- return PyInt_FromLong(0);
+ return PyLong_FromLong(0);
}
/*
@@ -1958,7 +1987,7 @@ arraydescr_ndim_get(PyArray_Descr *self)
* for tuple argument
*/
ndim = PyTuple_Size(self->subarray->shape);
- return PyInt_FromLong(ndim);
+ return PyLong_FromLong(ndim);
}
@@ -1974,7 +2003,7 @@ arraydescr_protocol_descr_get(PyArray_Descr *self)
if (dobj == NULL) {
return NULL;
}
- PyTuple_SET_ITEM(dobj, 0, PyUString_FromString(""));
+ PyTuple_SET_ITEM(dobj, 0, PyUnicode_FromString(""));
PyTuple_SET_ITEM(dobj, 1, arraydescr_protocol_typestr_get(self));
res = PyList_New(1);
if (res == NULL) {
@@ -2010,7 +2039,7 @@ arraydescr_isbuiltin_get(PyArray_Descr *self)
if (PyTypeNum_ISUSERDEF(self->type_num)) {
val = 2;
}
- return PyInt_FromLong(val);
+ return PyLong_FromLong(val);
}
static int
@@ -2153,7 +2182,7 @@ arraydescr_names_set(PyArray_Descr *self, PyObject *val)
PyObject *item;
int valid = 1;
item = PySequence_GetItem(val, i);
- valid = PyUString_Check(item);
+ valid = PyUnicode_Check(item);
Py_DECREF(item);
if (!valid) {
PyErr_Format(PyExc_ValueError,
@@ -2391,11 +2420,11 @@ _get_pickleabletype_from_datetime_metadata(PyArray_Descr *dtype)
PyTuple_SET_ITEM(dt_tuple, 0,
PyBytes_FromString(_datetime_strings[meta->base]));
PyTuple_SET_ITEM(dt_tuple, 1,
- PyInt_FromLong(meta->num));
+ PyLong_FromLong(meta->num));
PyTuple_SET_ITEM(dt_tuple, 2,
- PyInt_FromLong(1));
+ PyLong_FromLong(1));
PyTuple_SET_ITEM(dt_tuple, 3,
- PyInt_FromLong(1));
+ PyLong_FromLong(1));
PyTuple_SET_ITEM(ret, 1, dt_tuple);
@@ -2450,7 +2479,7 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args))
if (self->type_num == NPY_UNICODE) {
elsize >>= 2;
}
- obj = PyUString_FromFormat("%c%d",self->kind, elsize);
+ obj = PyUnicode_FromFormat("%c%d",self->kind, elsize);
}
PyTuple_SET_ITEM(ret, 1, Py_BuildValue("(NOO)", obj, Py_False, Py_True));
@@ -2468,7 +2497,7 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args))
if (PyDataType_ISDATETIME(self)) {
PyObject *newobj;
state = PyTuple_New(9);
- PyTuple_SET_ITEM(state, 0, PyInt_FromLong(version));
+ PyTuple_SET_ITEM(state, 0, PyLong_FromLong(version));
/*
* newobj is a tuple of the Python metadata dictionary
* and tuple of date_time info (str, num)
@@ -2483,16 +2512,16 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args))
}
else if (self->metadata) {
state = PyTuple_New(9);
- PyTuple_SET_ITEM(state, 0, PyInt_FromLong(version));
+ PyTuple_SET_ITEM(state, 0, PyLong_FromLong(version));
Py_INCREF(self->metadata);
PyTuple_SET_ITEM(state, 8, self->metadata);
}
else { /* Use version 3 pickle format */
state = PyTuple_New(8);
- PyTuple_SET_ITEM(state, 0, PyInt_FromLong(3));
+ PyTuple_SET_ITEM(state, 0, PyLong_FromLong(3));
}
- PyTuple_SET_ITEM(state, 1, PyUString_FromFormat("%c", endian));
+ PyTuple_SET_ITEM(state, 1, PyUnicode_FromFormat("%c", endian));
PyTuple_SET_ITEM(state, 2, arraydescr_subdescr_get(self));
if (PyDataType_HASFIELDS(self)) {
Py_INCREF(self->names);
@@ -2516,9 +2545,9 @@ arraydescr_reduce(PyArray_Descr *self, PyObject *NPY_UNUSED(args))
elsize = -1;
alignment = -1;
}
- PyTuple_SET_ITEM(state, 5, PyInt_FromLong(elsize));
- PyTuple_SET_ITEM(state, 6, PyInt_FromLong(alignment));
- PyTuple_SET_ITEM(state, 7, PyInt_FromLong(self->flags));
+ PyTuple_SET_ITEM(state, 5, PyLong_FromLong(elsize));
+ PyTuple_SET_ITEM(state, 6, PyLong_FromLong(alignment));
+ PyTuple_SET_ITEM(state, 7, PyLong_FromLong(self->flags));
PyTuple_SET_ITEM(ret, 2, state);
return ret;
@@ -2628,7 +2657,7 @@ arraydescr_setstate(PyArray_Descr *self, PyObject *args)
default:
/* raise an error */
if (PyTuple_GET_SIZE(PyTuple_GET_ITEM(args,0)) > 5) {
- version = PyInt_AsLong(PyTuple_GET_ITEM(args, 0));
+ version = PyLong_AsLong(PyTuple_GET_ITEM(args, 0));
}
else {
version = -1;
@@ -2651,7 +2680,7 @@ arraydescr_setstate(PyArray_Descr *self, PyObject *args)
if (version == 1 || version == 0) {
if (fields != Py_None) {
PyObject *key, *list;
- key = PyInt_FromLong(-1);
+ key = PyLong_FromLong(-1);
list = PyDict_GetItemWithError(fields, key);
if (!list) {
if (!PyErr_Occurred()) {
@@ -2788,7 +2817,7 @@ arraydescr_setstate(PyArray_Descr *self, PyObject *args)
for (i = 0; i < PyTuple_GET_SIZE(names); ++i) {
name = PyTuple_GET_ITEM(names, i);
- if (!PyUString_Check(name)) {
+ if (!PyUnicode_Check(name)) {
names_ok = 0;
break;
}
@@ -2890,14 +2919,13 @@ arraydescr_setstate(PyArray_Descr *self, PyObject *args)
}
if (PyDataType_ISDATETIME(self) && (metadata != NULL)) {
- PyObject *old_metadata, *errmsg;
+ PyObject *old_metadata;
PyArray_DatetimeMetaData temp_dt_data;
if ((! PyTuple_Check(metadata)) || (PyTuple_Size(metadata) != 2)) {
- errmsg = PyUString_FromString("Invalid datetime dtype (metadata, c_metadata): ");
- PyUString_ConcatAndDel(&errmsg, PyObject_Repr(metadata));
- PyErr_SetObject(PyExc_ValueError, errmsg);
- Py_DECREF(errmsg);
+ PyErr_Format(PyExc_ValueError,
+ "Invalid datetime dtype (metadata, c_metadata): %R",
+ metadata);
return NULL;
}
@@ -3020,7 +3048,7 @@ PyArray_DescrNewByteorder(PyArray_Descr *self, char newendian)
if (NPY_TITLE_KEY(key, value)) {
continue;
}
- if (!PyUString_Check(key) || !PyTuple_Check(value) ||
+ if (!PyUnicode_Check(key) || !PyTuple_Check(value) ||
((len=PyTuple_GET_SIZE(value)) < 2)) {
continue;
}
@@ -3321,7 +3349,7 @@ _is_list_of_strings(PyObject *obj)
seqlen = PyList_GET_SIZE(obj);
for (i = 0; i < seqlen; i++) {
PyObject *item = PyList_GET_ITEM(obj, i);
- if (!PyBaseString_Check(item)) {
+ if (!PyUnicode_Check(item)) {
return NPY_FALSE;
}
}
@@ -3393,7 +3421,7 @@ arraydescr_field_subset_view(PyArray_Descr *self, PyObject *ind)
/* disallow duplicate field indices */
if (PyDict_Contains(fields, name)) {
PyObject *msg = NULL;
- PyObject *fmt = PyUString_FromString(
+ PyObject *fmt = PyUnicode_FromString(
"duplicate field of name {!r}");
if (fmt != NULL) {
msg = PyObject_CallMethod(fmt, "format", "O", name);
@@ -3431,7 +3459,7 @@ descr_subscript(PyArray_Descr *self, PyObject *op)
return NULL;
}
- if (PyBaseString_Check(op)) {
+ if (PyUnicode_Check(op)) {
return _subscript_by_name(self, op);
}
else if (_is_list_of_strings(op)) {
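/*
 * Editor's note -- a minimal sketch (not part of the patch above) of the
 * recursion guard added to _convert_from_any in descriptor.c.  The helper
 * `do_convert` is hypothetical and stands in for the tuple/list/dict
 * conversion routines.
 */
#include <Python.h>

static PyObject *do_convert(PyObject *obj);  /* hypothetical helper */

static PyObject *
convert_with_guard(PyObject *obj)
{
    if (Py_EnterRecursiveCall(" while converting a data type") != 0) {
        /* a RecursionError has already been set */
        return NULL;
    }
    PyObject *ret = do_convert(obj);
    /* always pair the Enter call with a Leave call */
    Py_LeaveRecursiveCall();
    return ret;
}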
diff --git a/numpy/core/src/multiarray/dragon4.c b/numpy/core/src/multiarray/dragon4.c
index 553d0effb..a7b252a77 100644
--- a/numpy/core/src/multiarray/dragon4.c
+++ b/numpy/core/src/multiarray/dragon4.c
@@ -3093,7 +3093,7 @@ Dragon4_Positional_##Type##_opt(npy_type *val, Dragon4_Options *opt)\
free_dragon4_bigint_scratch(scratch);\
return NULL;\
}\
- ret = PyUString_FromString(scratch->repr);\
+ ret = PyUnicode_FromString(scratch->repr);\
free_dragon4_bigint_scratch(scratch);\
return ret;\
}\
@@ -3130,7 +3130,7 @@ Dragon4_Scientific_##Type##_opt(npy_type *val, Dragon4_Options *opt)\
free_dragon4_bigint_scratch(scratch);\
return NULL;\
}\
- ret = PyUString_FromString(scratch->repr);\
+ ret = PyUnicode_FromString(scratch->repr);\
free_dragon4_bigint_scratch(scratch);\
return ret;\
}\
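/*
 * Editor's note -- a minimal sketch (not part of the patch) of the
 * convention the dtype_transfer.c hunks below adopt: strided transfer
 * loops now return int instead of void, reporting failure by returning -1
 * with a Python error set.  The function below is a hypothetical example
 * of a loop following that convention, not an actual NumPy transfer loop.
 */
#include <Python.h>
#include <string.h>

static int
example_strided_copy(char *dst, Py_ssize_t dst_stride,
                     char *src, Py_ssize_t src_stride,
                     Py_ssize_t N, Py_ssize_t itemsize)
{
    while (N > 0) {
        memcpy(dst, src, itemsize);
        /* a real loop would return -1 here if an element failed to convert */
        dst += dst_stride;
        src += src_stride;
        --N;
    }
    return 0;   /* success */
}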
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index 3a58b5849..630bd76f3 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -17,7 +17,6 @@
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE
#include <numpy/arrayobject.h>
-#include <numpy/npy_cpu.h>
#include "npy_pycompat.h"
@@ -106,7 +105,7 @@ get_bool_setdstone_transfer_function(npy_intp dst_stride,
/*************************** COPY REFERENCES *******************************/
/* Moves references from src to dst */
-static void
+static int
_strided_to_strided_move_references(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -114,27 +113,28 @@ _strided_to_strided_move_references(char *dst, npy_intp dst_stride,
{
PyObject *src_ref = NULL, *dst_ref = NULL;
while (N > 0) {
- NPY_COPY_PYOBJECT_PTR(&src_ref, src);
- NPY_COPY_PYOBJECT_PTR(&dst_ref, dst);
+ memcpy(&src_ref, src, sizeof(src_ref));
+ memcpy(&dst_ref, dst, sizeof(dst_ref));
/* Release the reference in dst */
NPY_DT_DBG_REFTRACE("dec dst ref", dst_ref);
Py_XDECREF(dst_ref);
/* Move the reference */
NPY_DT_DBG_REFTRACE("move src ref", src_ref);
- NPY_COPY_PYOBJECT_PTR(dst, &src_ref);
+ memcpy(dst, &src_ref, sizeof(src_ref));
/* Set the source reference to NULL */
src_ref = NULL;
- NPY_COPY_PYOBJECT_PTR(src, &src_ref);
+ memcpy(src, &src_ref, sizeof(src_ref));
src += src_stride;
dst += dst_stride;
--N;
}
+ return 0;
}
/* Copies references from src to dst */
-static void
+static int
_strided_to_strided_copy_references(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -142,12 +142,12 @@ _strided_to_strided_copy_references(char *dst, npy_intp dst_stride,
{
PyObject *src_ref = NULL, *dst_ref = NULL;
while (N > 0) {
- NPY_COPY_PYOBJECT_PTR(&src_ref, src);
- NPY_COPY_PYOBJECT_PTR(&dst_ref, dst);
+ memcpy(&src_ref, src, sizeof(src_ref));
+ memcpy(&dst_ref, dst, sizeof(dst_ref));
/* Copy the reference */
NPY_DT_DBG_REFTRACE("copy src ref", src_ref);
- NPY_COPY_PYOBJECT_PTR(dst, &src_ref);
+ memcpy(dst, &src_ref, sizeof(src_ref));
/* Claim the reference */
Py_XINCREF(src_ref);
/* Release the reference in dst */
@@ -158,6 +158,7 @@ _strided_to_strided_copy_references(char *dst, npy_intp dst_stride,
dst += dst_stride;
--N;
}
+ return 0;
}
@@ -188,7 +189,7 @@ static NpyAuxData *_strided_zero_pad_data_clone(NpyAuxData *data)
* Does a strided to strided zero-padded copy for the case where
* dst_itemsize > src_itemsize
*/
-static void
+static int
_strided_to_strided_zero_pad_copy(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -205,13 +206,14 @@ _strided_to_strided_zero_pad_copy(char *dst, npy_intp dst_stride,
dst += dst_stride;
--N;
}
+ return 0;
}
/*
* Does a strided to strided zero-padded copy for the case where
* dst_itemsize < src_itemsize
*/
-static void
+static int
_strided_to_strided_truncate_copy(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -226,13 +228,14 @@ _strided_to_strided_truncate_copy(char *dst, npy_intp dst_stride,
dst += dst_stride;
--N;
}
+ return 0;
}
/*
* Does a strided to strided zero-padded or truncated copy for the case where
* unicode swapping is needed.
*/
-static void
+static int
_strided_to_strided_unicode_copyswap(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -260,6 +263,7 @@ _strided_to_strided_unicode_copyswap(char *dst, npy_intp dst_stride,
dst += dst_stride;
--N;
}
+ return 0;
}
@@ -379,7 +383,7 @@ static NpyAuxData *_align_wrap_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_strided_to_strided_contig_align_wrap(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -395,47 +399,50 @@ _strided_to_strided_contig_align_wrap(char *dst, npy_intp dst_stride,
*todata = d->todata,
*fromdata = d->fromdata;
char *bufferin = d->bufferin, *bufferout = d->bufferout;
- npy_bool init_dest = d->init_dest, out_needs_api = d->out_needs_api;
+ npy_bool init_dest = d->init_dest;
for(;;) {
- /*
- * The caller does not know if a previous call resulted in a Python
- * exception. Much of the Python API is unsafe while an exception is in
- * flight, so just skip all the work. Someone higher in the call stack
- * will check for errors and propagate them.
- */
- if (out_needs_api && PyErr_Occurred()) {
- return;
- }
if (N > NPY_LOWLEVEL_BUFFER_BLOCKSIZE) {
- tobuffer(bufferin, inner_src_itemsize, src, src_stride,
- NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
- src_itemsize, todata);
+ if (tobuffer(
+ bufferin, inner_src_itemsize, src, src_stride,
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE, src_itemsize, todata) < 0) {
+ return -1;
+ }
if (init_dest) {
memset(bufferout, 0,
- dst_itemsize*NPY_LOWLEVEL_BUFFER_BLOCKSIZE);
+ dst_itemsize*NPY_LOWLEVEL_BUFFER_BLOCKSIZE);
+ }
+ if (wrapped(bufferout, dst_itemsize, bufferin, inner_src_itemsize,
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
+ inner_src_itemsize, wrappeddata) < 0) {
+ return -1;
+ }
+ if (frombuffer(dst, dst_stride, bufferout, dst_itemsize,
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
+ dst_itemsize, fromdata) < 0) {
+ return -1;
}
- wrapped(bufferout, dst_itemsize, bufferin, inner_src_itemsize,
- NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
- inner_src_itemsize, wrappeddata);
- frombuffer(dst, dst_stride, bufferout, dst_itemsize,
- NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
- dst_itemsize, fromdata);
N -= NPY_LOWLEVEL_BUFFER_BLOCKSIZE;
src += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*src_stride;
dst += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*dst_stride;
}
else {
- tobuffer(bufferin, inner_src_itemsize, src, src_stride, N,
- src_itemsize, todata);
+ if (tobuffer(bufferin, inner_src_itemsize, src, src_stride,
+ N, src_itemsize, todata) < 0) {
+ return -1;
+ }
if (init_dest) {
memset(bufferout, 0, dst_itemsize*N);
}
- wrapped(bufferout, dst_itemsize, bufferin, inner_src_itemsize, N,
- inner_src_itemsize, wrappeddata);
- frombuffer(dst, dst_stride, bufferout, dst_itemsize, N,
- dst_itemsize, fromdata);
- return;
+ if (wrapped(bufferout, dst_itemsize, bufferin, inner_src_itemsize,
+ N, inner_src_itemsize, wrappeddata) < 0) {
+ return -1;
+ }
+ if (frombuffer(dst, dst_stride, bufferout, dst_itemsize,
+ N, dst_itemsize, fromdata) < 0) {
+ return -1;
+ }
+ return 0;
}
}
}
@@ -538,7 +545,7 @@ static NpyAuxData *_wrap_copy_swap_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_strided_to_strided_wrap_copy_swap(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -546,7 +553,9 @@ _strided_to_strided_wrap_copy_swap(char *dst, npy_intp dst_stride,
{
_wrap_copy_swap_data *d = (_wrap_copy_swap_data *)data;
+ /* We assume that d->copyswapn should not be able to error. */
d->copyswapn(dst, dst_stride, src, src_stride, N, d->swap, d->arr);
+ return 0;
}
/* This only gets used for custom data types and for Unicode when swapping */
@@ -603,6 +612,7 @@ typedef struct {
NpyAuxData base;
PyArray_VectorUnaryFunc *castfunc;
PyArrayObject *aip, *aop;
+ npy_bool needs_api;
} _strided_cast_data;
/* strided cast data free function */
@@ -630,7 +640,7 @@ static NpyAuxData *_strided_cast_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_aligned_strided_to_strided_cast(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -639,17 +649,29 @@ _aligned_strided_to_strided_cast(char *dst, npy_intp dst_stride,
_strided_cast_data *d = (_strided_cast_data *)data;
PyArray_VectorUnaryFunc *castfunc = d->castfunc;
PyArrayObject *aip = d->aip, *aop = d->aop;
+ npy_bool needs_api = d->needs_api;
while (N > 0) {
castfunc(src, dst, 1, aip, aop);
+ /*
+         * Error handling in ufuncs is not ideal: at the time of writing
+         * this, an error could already be set before this function is
+         * called. For most of NumPy's history these checks were completely
+         * missing, so this is hopefully OK for the time being (until
+         * ufuncs are fixed).
+ */
+ if (needs_api && PyErr_Occurred()) {
+ return -1;
+ }
dst += dst_stride;
src += src_stride;
--N;
}
+ return 0;
}
/* This one requires src be of type NPY_OBJECT */
-static void
+static int
_aligned_strided_to_strided_cast_decref_src(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -658,31 +680,49 @@ _aligned_strided_to_strided_cast_decref_src(char *dst, npy_intp dst_stride,
_strided_cast_data *d = (_strided_cast_data *)data;
PyArray_VectorUnaryFunc *castfunc = d->castfunc;
PyArrayObject *aip = d->aip, *aop = d->aop;
+ npy_bool needs_api = d->needs_api;
PyObject *src_ref;
while (N > 0) {
castfunc(src, dst, 1, aip, aop);
-
- /* After casting, decrement the source ref */
- NPY_COPY_PYOBJECT_PTR(&src_ref, src);
- NPY_DT_DBG_REFTRACE("dec src ref (cast object -> not object)", src_ref);
+ /*
+         * See the comment in `_aligned_strided_to_strided_cast`: an error could
+ * in principle be set before `castfunc` is called.
+ */
+ if (needs_api && PyErr_Occurred()) {
+ return -1;
+ }
+ /* After casting, decrement the source ref and set it to NULL */
+ memcpy(&src_ref, src, sizeof(src_ref));
Py_XDECREF(src_ref);
+ memset(src, 0, sizeof(PyObject *));
+ NPY_DT_DBG_REFTRACE("dec src ref (cast object -> not object)", src_ref);
dst += dst_stride;
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_aligned_contig_to_contig_cast(char *dst, npy_intp NPY_UNUSED(dst_stride),
char *src, npy_intp NPY_UNUSED(src_stride),
npy_intp N, npy_intp NPY_UNUSED(itemsize),
NpyAuxData *data)
{
_strided_cast_data *d = (_strided_cast_data *)data;
+ npy_bool needs_api = d->needs_api;
d->castfunc(src, dst, N, d->aip, d->aop);
+ /*
+     * See the comment in `_aligned_strided_to_strided_cast`: an error could
+ * in principle be set before `castfunc` is called.
+ */
+ if (needs_api && PyErr_Occurred()) {
+ return -1;
+ }
+ return 0;
}
static int
@@ -777,7 +817,7 @@ static NpyAuxData *_strided_datetime_cast_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_strided_to_strided_datetime_general_cast(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -792,12 +832,12 @@ _strided_to_strided_datetime_general_cast(char *dst, npy_intp dst_stride,
if (convert_datetime_to_datetimestruct(&d->src_meta,
dt, &dts) < 0) {
- dt = NPY_DATETIME_NAT;
+ return -1;
}
else {
if (convert_datetimestruct_to_datetime(&d->dst_meta,
&dts, &dt) < 0) {
- dt = NPY_DATETIME_NAT;
+ return -1;
}
}
@@ -807,9 +847,10 @@ _strided_to_strided_datetime_general_cast(char *dst, npy_intp dst_stride,
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_strided_to_strided_datetime_cast(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -838,9 +879,10 @@ _strided_to_strided_datetime_cast(char *dst, npy_intp dst_stride,
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_aligned_strided_to_strided_datetime_cast(char *dst,
npy_intp dst_stride,
char *src, npy_intp src_stride,
@@ -870,9 +912,10 @@ _aligned_strided_to_strided_datetime_cast(char *dst,
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_strided_to_strided_datetime_to_string(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -888,28 +931,26 @@ _strided_to_strided_datetime_to_string(char *dst, npy_intp dst_stride,
if (convert_datetime_to_datetimestruct(&d->src_meta,
dt, &dts) < 0) {
- /* For an error, produce a 'NaT' string */
- dts.year = NPY_DATETIME_NAT;
+ return -1;
}
/* Initialize the destination to all zeros */
memset(dst, 0, dst_itemsize);
- /*
- * This may also raise an error, but the caller needs
- * to use PyErr_Occurred().
- */
- make_iso_8601_datetime(&dts, dst, dst_itemsize,
+ if (make_iso_8601_datetime(&dts, dst, dst_itemsize,
0, 0, d->src_meta.base, -1,
- NPY_UNSAFE_CASTING);
+ NPY_UNSAFE_CASTING) < 0) {
+ return -1;
+ }
dst += dst_stride;
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_strided_to_strided_string_to_datetime(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -934,7 +975,7 @@ _strided_to_strided_string_to_datetime(char *dst, npy_intp dst_stride,
if (parse_iso_8601_datetime(tmp_buffer, src_itemsize,
d->dst_meta.base, NPY_SAME_KIND_CASTING,
&dts, NULL, NULL) < 0) {
- dt = NPY_DATETIME_NAT;
+ return -1;
}
}
/* Otherwise parse the data in place */
@@ -942,7 +983,7 @@ _strided_to_strided_string_to_datetime(char *dst, npy_intp dst_stride,
if (parse_iso_8601_datetime(src, tmp - src,
d->dst_meta.base, NPY_SAME_KIND_CASTING,
&dts, NULL, NULL) < 0) {
- dt = NPY_DATETIME_NAT;
+ return -1;
}
}
@@ -950,7 +991,7 @@ _strided_to_strided_string_to_datetime(char *dst, npy_intp dst_stride,
if (dt != NPY_DATETIME_NAT &&
convert_datetimestruct_to_datetime(&d->dst_meta,
&dts, &dt) < 0) {
- dt = NPY_DATETIME_NAT;
+ return -1;
}
memcpy(dst, &dt, sizeof(dt));
@@ -959,14 +1000,14 @@ _strided_to_strided_string_to_datetime(char *dst, npy_intp dst_stride,
src += src_stride;
--N;
}
+ return 0;
}
/*
* Assumes src_dtype and dst_dtype are both datetimes or both timedeltas
*/
-static int
+NPY_NO_EXPORT int
get_nbo_cast_datetime_transfer_function(int aligned,
- npy_intp src_stride, npy_intp dst_stride,
PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
PyArray_StridedUnaryOp **out_stransfer,
NpyAuxData **out_transferdata)
@@ -1040,12 +1081,10 @@ get_nbo_cast_datetime_transfer_function(int aligned,
return NPY_SUCCEED;
}
-static int
-get_nbo_datetime_to_string_transfer_function(int aligned,
- npy_intp src_stride, npy_intp dst_stride,
- PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
- PyArray_StridedUnaryOp **out_stransfer,
- NpyAuxData **out_transferdata)
+NPY_NO_EXPORT int
+get_nbo_datetime_to_string_transfer_function(
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ PyArray_StridedUnaryOp **out_stransfer, NpyAuxData **out_transferdata)
{
PyArray_DatetimeMetaData *src_meta;
_strided_datetime_cast_data *data;
@@ -1085,7 +1124,7 @@ get_nbo_datetime_to_string_transfer_function(int aligned,
return NPY_SUCCEED;
}
-static int
+NPY_NO_EXPORT int
get_datetime_to_unicode_transfer_function(int aligned,
npy_intp src_stride, npy_intp dst_stride,
PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
@@ -1098,8 +1137,8 @@ get_datetime_to_unicode_transfer_function(int aligned,
PyArray_Descr *str_dtype;
/* Get an ASCII string data type, adapted to match the UNICODE one */
- str_dtype = PyArray_DescrFromType(NPY_STRING);
- str_dtype = PyArray_AdaptFlexibleDType(dst_dtype, str_dtype);
+ str_dtype = PyArray_DescrNewFromType(NPY_STRING);
+ str_dtype->elsize = dst_dtype->elsize / 4;
if (str_dtype == NULL) {
return NPY_FAIL;
}
@@ -1114,10 +1153,9 @@ get_datetime_to_unicode_transfer_function(int aligned,
}
/* Get the NBO datetime to string aligned contig function */
- if (get_nbo_datetime_to_string_transfer_function(1,
- src_dtype->elsize, str_dtype->elsize,
- src_dtype, str_dtype,
- &caststransfer, &castdata) != NPY_SUCCEED) {
+ if (get_nbo_datetime_to_string_transfer_function(
+ src_dtype, str_dtype,
+ &caststransfer, &castdata) != NPY_SUCCEED) {
Py_DECREF(str_dtype);
NPY_AUXDATA_FREE(todata);
return NPY_FAIL;
@@ -1156,12 +1194,10 @@ get_datetime_to_unicode_transfer_function(int aligned,
return NPY_SUCCEED;
}
-static int
-get_nbo_string_to_datetime_transfer_function(int aligned,
- npy_intp src_stride, npy_intp dst_stride,
- PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
- PyArray_StridedUnaryOp **out_stransfer,
- NpyAuxData **out_transferdata)
+NPY_NO_EXPORT int
+get_nbo_string_to_datetime_transfer_function(
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ PyArray_StridedUnaryOp **out_stransfer, NpyAuxData **out_transferdata)
{
PyArray_DatetimeMetaData *dst_meta;
_strided_datetime_cast_data *data;
@@ -1208,7 +1244,7 @@ get_nbo_string_to_datetime_transfer_function(int aligned,
return NPY_SUCCEED;
}
-static int
+NPY_NO_EXPORT int
get_unicode_to_datetime_transfer_function(int aligned,
npy_intp src_stride, npy_intp dst_stride,
PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
@@ -1221,11 +1257,12 @@ get_unicode_to_datetime_transfer_function(int aligned,
PyArray_Descr *str_dtype;
/* Get an ASCII string data type, adapted to match the UNICODE one */
- str_dtype = PyArray_DescrFromType(NPY_STRING);
- str_dtype = PyArray_AdaptFlexibleDType(src_dtype, str_dtype);
+ str_dtype = PyArray_DescrNewFromType(NPY_STRING);
if (str_dtype == NULL) {
return NPY_FAIL;
}
+ assert(src_dtype->type_num == NPY_UNICODE);
+ str_dtype->elsize = src_dtype->elsize / 4;
/* Get the cast operation from src */
if (PyArray_GetDTypeTransferFunction(aligned,
@@ -1239,10 +1276,9 @@ get_unicode_to_datetime_transfer_function(int aligned,
}
/* Get the string to NBO datetime aligned contig function */
- if (get_nbo_string_to_datetime_transfer_function(1,
- str_dtype->elsize, dst_dtype->elsize,
- str_dtype, dst_dtype,
- &caststransfer, &castdata) != NPY_SUCCEED) {
+ if (get_nbo_string_to_datetime_transfer_function(
+ str_dtype, dst_dtype,
+ &caststransfer, &castdata) != NPY_SUCCEED) {
Py_DECREF(str_dtype);
NPY_AUXDATA_FREE(todata);
return NPY_FAIL;
@@ -1280,95 +1316,21 @@ get_unicode_to_datetime_transfer_function(int aligned,
return NPY_SUCCEED;
}
-static int
-get_nbo_cast_transfer_function(int aligned,
- npy_intp src_stride, npy_intp dst_stride,
- PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
- int move_references,
- PyArray_StridedUnaryOp **out_stransfer,
- NpyAuxData **out_transferdata,
- int *out_needs_api,
- int *out_needs_wrap)
+
+NPY_NO_EXPORT int
+get_legacy_dtype_cast_function(
+ int aligned, npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ int move_references,
+ PyArray_StridedUnaryOp **out_stransfer, NpyAuxData **out_transferdata,
+ int *out_needs_api, int *out_needs_wrap)
{
_strided_cast_data *data;
PyArray_VectorUnaryFunc *castfunc;
PyArray_Descr *tmp_dtype;
- npy_intp shape = 1, src_itemsize = src_dtype->elsize,
- dst_itemsize = dst_dtype->elsize;
-
- if (PyTypeNum_ISNUMBER(src_dtype->type_num) &&
- PyTypeNum_ISNUMBER(dst_dtype->type_num)) {
- *out_needs_wrap = !PyArray_ISNBO(src_dtype->byteorder) ||
- !PyArray_ISNBO(dst_dtype->byteorder);
- return get_nbo_cast_numeric_transfer_function(aligned,
- src_stride, dst_stride,
- src_dtype->type_num, dst_dtype->type_num,
- out_stransfer, out_transferdata);
- }
-
- if (src_dtype->type_num == NPY_DATETIME ||
- src_dtype->type_num == NPY_TIMEDELTA ||
- dst_dtype->type_num == NPY_DATETIME ||
- dst_dtype->type_num == NPY_TIMEDELTA) {
- /* A parameterized type, datetime->datetime sometimes needs casting */
- if ((src_dtype->type_num == NPY_DATETIME &&
- dst_dtype->type_num == NPY_DATETIME) ||
- (src_dtype->type_num == NPY_TIMEDELTA &&
- dst_dtype->type_num == NPY_TIMEDELTA)) {
- *out_needs_wrap = !PyArray_ISNBO(src_dtype->byteorder) ||
- !PyArray_ISNBO(dst_dtype->byteorder);
- return get_nbo_cast_datetime_transfer_function(aligned,
- src_stride, dst_stride,
- src_dtype, dst_dtype,
- out_stransfer, out_transferdata);
- }
-
- /*
- * Datetime <-> string conversions can be handled specially.
- * The functions may raise an error if the strings have no
- * space, or can't be parsed properly.
- */
- if (src_dtype->type_num == NPY_DATETIME) {
- switch (dst_dtype->type_num) {
- case NPY_STRING:
- *out_needs_api = 1;
- *out_needs_wrap = !PyArray_ISNBO(src_dtype->byteorder);
- return get_nbo_datetime_to_string_transfer_function(
- aligned,
- src_stride, dst_stride,
- src_dtype, dst_dtype,
- out_stransfer, out_transferdata);
-
- case NPY_UNICODE:
- return get_datetime_to_unicode_transfer_function(
- aligned,
- src_stride, dst_stride,
- src_dtype, dst_dtype,
- out_stransfer, out_transferdata,
- out_needs_api);
- }
- }
- else if (dst_dtype->type_num == NPY_DATETIME) {
- switch (src_dtype->type_num) {
- case NPY_STRING:
- *out_needs_api = 1;
- *out_needs_wrap = !PyArray_ISNBO(dst_dtype->byteorder);
- return get_nbo_string_to_datetime_transfer_function(
- aligned,
- src_stride, dst_stride,
- src_dtype, dst_dtype,
- out_stransfer, out_transferdata);
-
- case NPY_UNICODE:
- return get_unicode_to_datetime_transfer_function(
- aligned,
- src_stride, dst_stride,
- src_dtype, dst_dtype,
- out_stransfer, out_transferdata,
- out_needs_api);
- }
- }
- }
+ npy_intp shape = 1;
+ npy_intp src_itemsize = src_dtype->elsize;
+ npy_intp dst_itemsize = dst_dtype->elsize;
*out_needs_wrap = !aligned ||
!PyArray_ISNBO(src_dtype->byteorder) ||
@@ -1422,6 +1384,7 @@ get_nbo_cast_transfer_function(int aligned,
data->base.free = &_strided_cast_data_free;
data->base.clone = &_strided_cast_data_clone;
data->castfunc = castfunc;
+ data->needs_api = *out_needs_api;
/*
* TODO: This is a hack so the cast functions have an array.
* The cast functions shouldn't need that. Also, since we
@@ -1500,6 +1463,162 @@ get_nbo_cast_transfer_function(int aligned,
return NPY_SUCCEED;
}
+
+static int
+get_nbo_cast_transfer_function(int aligned,
+ npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ int move_references,
+ PyArray_StridedUnaryOp **out_stransfer,
+ NpyAuxData **out_transferdata,
+ int *out_needs_api,
+ int *out_needs_wrap)
+{
+ if (PyTypeNum_ISNUMBER(src_dtype->type_num) &&
+ PyTypeNum_ISNUMBER(dst_dtype->type_num)) {
+ *out_needs_wrap = !PyArray_ISNBO(src_dtype->byteorder) ||
+ !PyArray_ISNBO(dst_dtype->byteorder);
+ return get_nbo_cast_numeric_transfer_function(aligned,
+ src_stride, dst_stride,
+ src_dtype->type_num, dst_dtype->type_num,
+ out_stransfer, out_transferdata);
+ }
+
+ if (src_dtype->type_num == NPY_DATETIME ||
+ src_dtype->type_num == NPY_TIMEDELTA ||
+ dst_dtype->type_num == NPY_DATETIME ||
+ dst_dtype->type_num == NPY_TIMEDELTA) {
+ /* A parameterized type, datetime->datetime sometimes needs casting */
+ if ((src_dtype->type_num == NPY_DATETIME &&
+ dst_dtype->type_num == NPY_DATETIME) ||
+ (src_dtype->type_num == NPY_TIMEDELTA &&
+ dst_dtype->type_num == NPY_TIMEDELTA)) {
+ *out_needs_wrap = !PyArray_ISNBO(src_dtype->byteorder) ||
+ !PyArray_ISNBO(dst_dtype->byteorder);
+ return get_nbo_cast_datetime_transfer_function(aligned,
+ src_dtype, dst_dtype,
+ out_stransfer, out_transferdata);
+ }
+
+ /*
+ * Datetime <-> string conversions can be handled specially.
+ * The functions may raise an error if the strings have no
+ * space, or can't be parsed properly.
+ */
+ if (src_dtype->type_num == NPY_DATETIME) {
+ switch (dst_dtype->type_num) {
+ case NPY_STRING:
+ *out_needs_api = 1;
+ *out_needs_wrap = !PyArray_ISNBO(src_dtype->byteorder);
+ return get_nbo_datetime_to_string_transfer_function(
+ src_dtype, dst_dtype,
+ out_stransfer, out_transferdata);
+
+ case NPY_UNICODE:
+ return get_datetime_to_unicode_transfer_function(
+ aligned,
+ src_stride, dst_stride,
+ src_dtype, dst_dtype,
+ out_stransfer, out_transferdata,
+ out_needs_api);
+ }
+ }
+ else if (dst_dtype->type_num == NPY_DATETIME) {
+ switch (src_dtype->type_num) {
+ case NPY_STRING:
+ *out_needs_api = 1;
+ *out_needs_wrap = !PyArray_ISNBO(dst_dtype->byteorder);
+ return get_nbo_string_to_datetime_transfer_function(
+ src_dtype, dst_dtype,
+ out_stransfer, out_transferdata);
+
+ case NPY_UNICODE:
+ return get_unicode_to_datetime_transfer_function(
+ aligned,
+ src_stride, dst_stride,
+ src_dtype, dst_dtype,
+ out_stransfer, out_transferdata,
+ out_needs_api);
+ }
+ }
+ }
+
+ return get_legacy_dtype_cast_function(
+ aligned, src_stride, dst_stride, src_dtype, dst_dtype,
+ move_references, out_stransfer, out_transferdata,
+ out_needs_api, out_needs_wrap);
+}
+
+
+NPY_NO_EXPORT int
+wrap_aligned_contig_transfer_function_with_copyswapn(
+ int aligned, npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ PyArray_StridedUnaryOp **out_stransfer, NpyAuxData **out_transferdata,
+ int *out_needs_api,
+ PyArray_StridedUnaryOp *caststransfer, NpyAuxData *castdata)
+{
+ NpyAuxData *todata = NULL, *fromdata = NULL;
+ PyArray_StridedUnaryOp *tobuffer = NULL, *frombuffer = NULL;
+ npy_intp src_itemsize = src_dtype->elsize;
+ npy_intp dst_itemsize = dst_dtype->elsize;
+
+ /* Get the copy/swap operation from src */
+ PyArray_GetDTypeCopySwapFn(
+ aligned, src_stride, src_itemsize, src_dtype, &tobuffer, &todata);
+
+ if (!PyDataType_REFCHK(dst_dtype)) {
+ /* Copying from buffer is a simple copy/swap operation */
+ PyArray_GetDTypeCopySwapFn(
+ aligned, dst_itemsize, dst_stride, dst_dtype,
+ &frombuffer, &fromdata);
+ }
+ else {
+ /*
+ * Since the buffer is initialized to NULL, need to move the
+ * references in order to DECREF the existing data.
+ */
+ /* Object types cannot be byte swapped */
+ assert(PyDataType_ISNOTSWAPPED(dst_dtype));
+ /* The loop already needs the python api if this is reached */
+ assert(*out_needs_api);
+
+ if (PyArray_GetDTypeTransferFunction(
+ aligned, dst_itemsize, dst_stride,
+ dst_dtype, dst_dtype, 1,
+ &frombuffer, &fromdata, out_needs_api) != NPY_SUCCEED) {
+ return NPY_FAIL;
+ }
+ }
+
+ if (frombuffer == NULL || tobuffer == NULL) {
+ NPY_AUXDATA_FREE(castdata);
+ NPY_AUXDATA_FREE(todata);
+ NPY_AUXDATA_FREE(fromdata);
+ return NPY_FAIL;
+ }
+
+ *out_stransfer = caststransfer;
+
+ /* Wrap it all up in a new transfer function + data */
+ if (wrap_aligned_contig_transfer_function(
+ src_itemsize, dst_itemsize,
+ tobuffer, todata,
+ frombuffer, fromdata,
+ caststransfer, castdata,
+ PyDataType_FLAGCHK(dst_dtype, NPY_NEEDS_INIT),
+ *out_needs_api,
+ out_stransfer, out_transferdata) != NPY_SUCCEED) {
+ NPY_AUXDATA_FREE(castdata);
+ NPY_AUXDATA_FREE(todata);
+ NPY_AUXDATA_FREE(fromdata);
+ return NPY_FAIL;
+ }
+
+ return NPY_SUCCEED;
+}
+
+
static int
get_cast_transfer_function(int aligned,
npy_intp src_stride, npy_intp dst_stride,
@@ -1510,10 +1629,8 @@ get_cast_transfer_function(int aligned,
int *out_needs_api)
{
PyArray_StridedUnaryOp *caststransfer;
- NpyAuxData *castdata, *todata = NULL, *fromdata = NULL;
+ NpyAuxData *castdata;
int needs_wrap = 0;
- npy_intp src_itemsize = src_dtype->elsize,
- dst_itemsize = dst_dtype->elsize;
if (get_nbo_cast_transfer_function(aligned,
src_stride, dst_stride,
@@ -1538,64 +1655,10 @@ get_cast_transfer_function(int aligned,
}
/* Otherwise, we have to copy and/or swap to aligned temporaries */
else {
- PyArray_StridedUnaryOp *tobuffer, *frombuffer;
-
- /* Get the copy/swap operation from src */
- PyArray_GetDTypeCopySwapFn(aligned,
- src_stride, src_itemsize,
- src_dtype,
- &tobuffer, &todata);
-
- if (!PyDataType_REFCHK(dst_dtype)) {
- /* Copying from buffer is a simple copy/swap operation */
- PyArray_GetDTypeCopySwapFn(aligned,
- dst_itemsize, dst_stride,
- dst_dtype,
- &frombuffer, &fromdata);
- }
- else {
- /*
- * Since the buffer is initialized to NULL, need to move the
- * references in order to DECREF the existing data.
- */
- /* Object types cannot be byte swapped */
- assert(PyDataType_ISNOTSWAPPED(dst_dtype));
- /* The loop already needs the python api if this is reached */
- assert(*out_needs_api);
-
- if (PyArray_GetDTypeTransferFunction(
- aligned, dst_itemsize, dst_stride,
- dst_dtype, dst_dtype, 1,
- &frombuffer, &fromdata, out_needs_api) != NPY_SUCCEED) {
- return NPY_FAIL;
- }
- }
-
- if (frombuffer == NULL || tobuffer == NULL) {
- NPY_AUXDATA_FREE(castdata);
- NPY_AUXDATA_FREE(todata);
- NPY_AUXDATA_FREE(fromdata);
- return NPY_FAIL;
- }
-
- *out_stransfer = caststransfer;
-
- /* Wrap it all up in a new transfer function + data */
- if (wrap_aligned_contig_transfer_function(
- src_itemsize, dst_itemsize,
- tobuffer, todata,
- frombuffer, fromdata,
- caststransfer, castdata,
- PyDataType_FLAGCHK(dst_dtype, NPY_NEEDS_INIT),
- *out_needs_api,
- out_stransfer, out_transferdata) != NPY_SUCCEED) {
- NPY_AUXDATA_FREE(castdata);
- NPY_AUXDATA_FREE(todata);
- NPY_AUXDATA_FREE(fromdata);
- return NPY_FAIL;
- }
-
- return NPY_SUCCEED;
+ return wrap_aligned_contig_transfer_function_with_copyswapn(
+ aligned, src_stride, dst_stride, src_dtype, dst_dtype,
+ out_stransfer, out_transferdata, out_needs_api,
+ caststransfer, castdata);
}
}
@@ -1652,7 +1715,7 @@ static NpyAuxData *_one_to_n_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_strided_to_strided_one_to_n(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -1664,18 +1727,19 @@ _strided_to_strided_one_to_n(char *dst, npy_intp dst_stride,
npy_intp subN = d->N, dst_itemsize = d->dst_itemsize;
while (N > 0) {
- subtransfer(dst, dst_itemsize,
- src, 0,
- subN, src_itemsize,
- subdata);
+ if (subtransfer(
+ dst, dst_itemsize, src, 0, subN, src_itemsize, subdata) < 0) {
+ return -1;
+ }
src += src_stride;
dst += dst_stride;
--N;
}
+ return 0;
}
-static void
+static int
_strided_to_strided_one_to_n_with_finish(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -1688,21 +1752,21 @@ _strided_to_strided_one_to_n_with_finish(char *dst, npy_intp dst_stride,
npy_intp subN = d->N, dst_itemsize = d->dst_itemsize;
while (N > 0) {
- subtransfer(dst, dst_itemsize,
- src, 0,
- subN, src_itemsize,
- subdata);
-
+ if (subtransfer(
+ dst, dst_itemsize, src, 0, subN, src_itemsize, subdata) < 0) {
+ return -1;
+ }
- stransfer_finish_src(NULL, 0,
- src, 0,
- 1, src_itemsize,
- data_finish_src);
+ if (stransfer_finish_src(
+ NULL, 0, src, 0, 1, src_itemsize, data_finish_src) < 0) {
+ return -1;
+ }
src += src_stride;
dst += dst_stride;
--N;
}
+ return 0;
}
/*
@@ -1846,7 +1910,7 @@ static NpyAuxData *_n_to_n_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_strided_to_strided_n_to_n(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -1859,18 +1923,19 @@ _strided_to_strided_n_to_n(char *dst, npy_intp dst_stride,
dst_subitemsize = d->dst_itemsize;
while (N > 0) {
- subtransfer(dst, dst_subitemsize,
- src, src_subitemsize,
- subN, src_subitemsize,
- subdata);
-
+ if (subtransfer(
+ dst, dst_subitemsize, src, src_subitemsize,
+ subN, src_subitemsize, subdata) < 0) {
+ return -1;
+ }
src += src_stride;
dst += dst_stride;
--N;
}
+ return 0;
}
-static void
+static int
_contig_to_contig_n_to_n(char *dst, npy_intp NPY_UNUSED(dst_stride),
char *src, npy_intp NPY_UNUSED(src_stride),
npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -1882,10 +1947,12 @@ _contig_to_contig_n_to_n(char *dst, npy_intp NPY_UNUSED(dst_stride),
npy_intp subN = d->N, src_subitemsize = d->src_itemsize,
dst_subitemsize = d->dst_itemsize;
- subtransfer(dst, dst_subitemsize,
- src, src_subitemsize,
- subN*N, src_subitemsize,
- subdata);
+ if (subtransfer(
+ dst, dst_subitemsize, src, src_subitemsize,
+ subN*N, src_subitemsize, subdata) < 0) {
+ return -1;
+ }
+ return 0;
}
/*
@@ -2049,7 +2116,7 @@ static NpyAuxData *_subarray_broadcast_data_clone( NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_strided_to_strided_subarray_broadcast(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -2072,10 +2139,11 @@ _strided_to_strided_subarray_broadcast(char *dst, npy_intp dst_stride,
count = offsetruns[run].count;
dst_ptr = dst + loop_index*dst_subitemsize;
if (offset != -1) {
- subtransfer(dst_ptr, dst_subitemsize,
- src + offset, src_subitemsize,
- count, src_subitemsize,
- subdata);
+ if (subtransfer(
+ dst_ptr, dst_subitemsize, src + offset, src_subitemsize,
+ count, src_subitemsize, subdata) < 0) {
+ return -1;
+ }
}
else {
memset(dst_ptr, 0, count*dst_subitemsize);
@@ -2087,10 +2155,11 @@ _strided_to_strided_subarray_broadcast(char *dst, npy_intp dst_stride,
dst += dst_stride;
--N;
}
+ return 0;
}
-static void
+static int
_strided_to_strided_subarray_broadcast_withrefs(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -2118,16 +2187,19 @@ _strided_to_strided_subarray_broadcast_withrefs(char *dst, npy_intp dst_stride,
count = offsetruns[run].count;
dst_ptr = dst + loop_index*dst_subitemsize;
if (offset != -1) {
- subtransfer(dst_ptr, dst_subitemsize,
- src + offset, src_subitemsize,
- count, src_subitemsize,
- subdata);
+ if (subtransfer(
+ dst_ptr, dst_subitemsize, src + offset, src_subitemsize,
+ count, src_subitemsize, subdata) < 0) {
+ return -1;
+ }
}
else {
if (stransfer_decdstref != NULL) {
- stransfer_decdstref(NULL, 0, dst_ptr, dst_subitemsize,
- count, dst_subitemsize,
- data_decdstref);
+ if (stransfer_decdstref(
+ NULL, 0, dst_ptr, dst_subitemsize,
+ count, dst_subitemsize, data_decdstref) < 0) {
+ return -1;
+ }
}
memset(dst_ptr, 0, count*dst_subitemsize);
}
@@ -2135,15 +2207,18 @@ _strided_to_strided_subarray_broadcast_withrefs(char *dst, npy_intp dst_stride,
}
if (stransfer_decsrcref != NULL) {
- stransfer_decsrcref(NULL, 0, src, src_subitemsize,
- src_subN, src_subitemsize,
- data_decsrcref);
+ if (stransfer_decsrcref(
+ NULL, 0, src, src_subitemsize,
+ src_subN, src_subitemsize, data_decsrcref) < 0) {
+ return -1;
+ }
}
src += src_stride;
dst += dst_stride;
--N;
}
+ return 0;
}
@@ -2500,7 +2575,7 @@ static NpyAuxData *_field_transfer_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_strided_to_strided_field_transfer(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp NPY_UNUSED(src_itemsize),
@@ -2515,11 +2590,13 @@ _strided_to_strided_field_transfer(char *dst, npy_intp dst_stride,
field = &d->fields;
if (N > NPY_LOWLEVEL_BUFFER_BLOCKSIZE) {
for (i = 0; i < field_count; ++i, ++field) {
- field->stransfer(dst + field->dst_offset, dst_stride,
- src + field->src_offset, src_stride,
- NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
- field->src_itemsize,
- field->data);
+ if (field->stransfer(
+ dst + field->dst_offset, dst_stride,
+ src + field->src_offset, src_stride,
+ NPY_LOWLEVEL_BUFFER_BLOCKSIZE,
+ field->src_itemsize, field->data) < 0) {
+ return -1;
+ }
}
N -= NPY_LOWLEVEL_BUFFER_BLOCKSIZE;
src += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*src_stride;
@@ -2527,13 +2604,15 @@ _strided_to_strided_field_transfer(char *dst, npy_intp dst_stride,
}
else {
for (i = 0; i < field_count; ++i, ++field) {
- field->stransfer(dst + field->dst_offset, dst_stride,
- src + field->src_offset, src_stride,
- N,
- field->src_itemsize,
- field->data);
+ if (field->stransfer(
+ dst + field->dst_offset, dst_stride,
+ src + field->src_offset, src_stride,
+ N,
+ field->src_itemsize, field->data) < 0) {
+ return -1;
+ }
}
- return;
+ return 0;
}
}
}
@@ -2947,7 +3026,8 @@ static NpyAuxData *_masked_wrapper_transfer_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void _strided_masked_wrapper_decsrcref_transfer_function(
+static int
+_strided_masked_wrapper_decsrcref_transfer_function(
char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_bool *mask, npy_intp mask_stride,
@@ -2969,8 +3049,11 @@ static void _strided_masked_wrapper_decsrcref_transfer_function(
/* Skip masked values, still calling decsrcref for move_references */
mask = (npy_bool*)npy_memchr((char *)mask, 0, mask_stride, N,
&subloopsize, 1);
- decsrcref_stransfer(NULL, 0, src, src_stride,
- subloopsize, src_itemsize, decsrcref_transferdata);
+ if (decsrcref_stransfer(
+ NULL, 0, src, src_stride,
+ subloopsize, src_itemsize, decsrcref_transferdata) < 0) {
+ return -1;
+ }
dst += subloopsize * dst_stride;
src += subloopsize * src_stride;
N -= subloopsize;
@@ -2981,15 +3064,20 @@ static void _strided_masked_wrapper_decsrcref_transfer_function(
/* Process unmasked values */
mask = (npy_bool*)npy_memchr((char *)mask, 0, mask_stride, N,
&subloopsize, 0);
- unmasked_stransfer(dst, dst_stride, src, src_stride,
- subloopsize, src_itemsize, unmasked_transferdata);
+ if (unmasked_stransfer(
+ dst, dst_stride, src, src_stride,
+ subloopsize, src_itemsize, unmasked_transferdata) < 0) {
+ return -1;
+ }
dst += subloopsize * dst_stride;
src += subloopsize * src_stride;
N -= subloopsize;
}
+ return 0;
}
-static void _strided_masked_wrapper_transfer_function(
+static int
+_strided_masked_wrapper_transfer_function(
char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_bool *mask, npy_intp mask_stride,
@@ -3020,18 +3108,22 @@ static void _strided_masked_wrapper_transfer_function(
/* Process unmasked values */
mask = (npy_bool*)npy_memchr((char *)mask, 0, mask_stride, N,
&subloopsize, 0);
- unmasked_stransfer(dst, dst_stride, src, src_stride,
- subloopsize, src_itemsize, unmasked_transferdata);
+ if (unmasked_stransfer(
+ dst, dst_stride, src, src_stride,
+ subloopsize, src_itemsize, unmasked_transferdata) < 0) {
+ return -1;
+ }
dst += subloopsize * dst_stride;
src += subloopsize * src_stride;
N -= subloopsize;
}
+ return 0;
}
/************************* DEST BOOL SETONE *******************************/
-static void
+static int
_null_to_strided_set_bool_one(char *dst,
npy_intp dst_stride,
char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
@@ -3046,9 +3138,10 @@ _null_to_strided_set_bool_one(char *dst,
dst += dst_stride;
--N;
}
+ return 0;
}
-static void
+static int
_null_to_contig_set_bool_one(char *dst,
npy_intp NPY_UNUSED(dst_stride),
char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
@@ -3058,6 +3151,7 @@ _null_to_contig_set_bool_one(char *dst,
/* bool type is one byte, so can just use the char */
memset(dst, 1, N);
+ return 0;
}
/* Only for the bool type, sets the destination to 1 */
@@ -3101,7 +3195,7 @@ static NpyAuxData *_dst_memset_zero_data_clone(NpyAuxData *data)
return (NpyAuxData *)newdata;
}
-static void
+static int
_null_to_strided_memset_zero(char *dst,
npy_intp dst_stride,
char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
@@ -3116,9 +3210,10 @@ _null_to_strided_memset_zero(char *dst,
dst += dst_stride;
--N;
}
+ return 0;
}
-static void
+static int
_null_to_contig_memset_zero(char *dst,
npy_intp dst_stride,
char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
@@ -3129,9 +3224,10 @@ _null_to_contig_memset_zero(char *dst,
npy_intp dst_itemsize = d->dst_itemsize;
memset(dst, 0, N*dst_itemsize);
+ return 0;
}
-static void
+static int
_null_to_strided_reference_setzero(char *dst,
npy_intp dst_stride,
char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
@@ -3141,19 +3237,17 @@ _null_to_strided_reference_setzero(char *dst,
PyObject *dst_ref = NULL;
while (N > 0) {
- NPY_COPY_PYOBJECT_PTR(&dst_ref, dst);
+ memcpy(&dst_ref, dst, sizeof(dst_ref));
- /* Release the reference in dst */
+ /* Release the reference in dst and set it to NULL */
NPY_DT_DBG_REFTRACE("dec dest ref (to set zero)", dst_ref);
Py_XDECREF(dst_ref);
-
- /* Set it to zero */
- dst_ref = NULL;
- NPY_COPY_PYOBJECT_PTR(dst, &dst_ref);
+ memset(dst, 0, sizeof(PyObject *));
dst += dst_stride;
--N;
}
+ return 0;
}
NPY_NO_EXPORT int
@@ -3250,7 +3344,7 @@ get_setdstzero_transfer_function(int aligned,
return NPY_SUCCEED;
}
-static void
+static int
_dec_src_ref_nop(char *NPY_UNUSED(dst),
npy_intp NPY_UNUSED(dst_stride),
char *NPY_UNUSED(src), npy_intp NPY_UNUSED(src_stride),
@@ -3259,9 +3353,10 @@ _dec_src_ref_nop(char *NPY_UNUSED(dst),
NpyAuxData *NPY_UNUSED(data))
{
/* NOP */
+ return 0;
}
-static void
+static int
_strided_to_null_dec_src_ref_reference(char *NPY_UNUSED(dst),
npy_intp NPY_UNUSED(dst_stride),
char *src, npy_intp src_stride,
@@ -3271,15 +3366,16 @@ _strided_to_null_dec_src_ref_reference(char *NPY_UNUSED(dst),
{
PyObject *src_ref = NULL;
while (N > 0) {
- NPY_COPY_PYOBJECT_PTR(&src_ref, src);
-
- /* Release the reference in src */
+ /* Release the reference in src and set it to NULL */
NPY_DT_DBG_REFTRACE("dec src ref (null dst)", src_ref);
+ memcpy(&src_ref, src, sizeof(src_ref));
Py_XDECREF(src_ref);
+ memset(src, 0, sizeof(PyObject *));
src += src_stride;
--N;
}
+ return 0;
}
@@ -3661,6 +3757,53 @@ PyArray_GetDTypeTransferFunction(int aligned,
out_needs_api);
}
+
+/*
+ * Basic version of PyArray_GetDTypeTransferFunction for legacy dtype
+ * support.
+ * It supports only wrapping the copyswapn functions and the legacy
+ * cast functions registered with `PyArray_RegisterCastFunc`.
+ * This function takes the easy way out: it does not wrap the result in
+ * additional copy/swap steps for unaligned or byte-swapped data. The
+ * `needs_wrap` flag is ignored here; any such wrapping is handled by a
+ * separate cast step.
+ */
+NPY_NO_EXPORT int
+PyArray_GetLegacyDTypeTransferFunction(int aligned,
+ npy_intp src_stride, npy_intp dst_stride,
+ PyArray_Descr *src_dtype, PyArray_Descr *dst_dtype,
+ int move_references,
+ PyArray_StridedUnaryOp **out_stransfer,
+ NpyAuxData **out_transferdata,
+ int *out_needs_api)
+{
+ /* Note: We ignore `needs_wrap`; needs-wrap is handled by another cast */
+ int needs_wrap = 0;
+
+ if (src_dtype->type_num == dst_dtype->type_num) {
+ /*
+ * This is a cast within the same dtype. For legacy user-dtypes,
+ * it is always valid to handle this using the copy swap function.
+ */
+ return wrap_copy_swap_function(aligned,
+ src_stride, dst_stride,
+ src_dtype,
+ PyArray_ISNBO(src_dtype->byteorder) !=
+ PyArray_ISNBO(dst_dtype->byteorder),
+ out_stransfer, out_transferdata);
+ }
+
+ if (get_legacy_dtype_cast_function(aligned,
+ src_stride, dst_stride,
+ src_dtype, dst_dtype,
+ move_references,
+ out_stransfer,
+ out_transferdata,
+ out_needs_api,
+ &needs_wrap) != NPY_SUCCEED) {
+ return NPY_FAIL;
+ }
+ return NPY_SUCCEED;
+}
+
+
NPY_NO_EXPORT int
PyArray_GetMaskedDTypeTransferFunction(int aligned,
npy_intp src_stride,
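
The hunks above convert the strided transfer functions in dtype_transfer.c from returning void to returning int (0 on success, -1 once a Python error is set), and every call site now checks and propagates that result. Below is a minimal sketch of the calling convention, not part of the patch; the helper name is hypothetical and it assumes the updated PyArray_StridedUnaryOp signature used throughout this series:

    /*
     * Sketch only: forward a failure from a strided transfer function
     * using the new int return convention shown in the hunks above.
     */
    static int
    apply_transfer_example(PyArray_StridedUnaryOp *stransfer, NpyAuxData *data,
                           char *dst, npy_intp dst_stride,
                           char *src, npy_intp src_stride,
                           npy_intp N, npy_intp src_itemsize)
    {
        if (stransfer(dst, dst_stride, src, src_stride,
                      N, src_itemsize, data) < 0) {
            /* The transfer function has already set a Python exception. */
            return -1;
        }
        return 0;
    }
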
diff --git a/numpy/core/src/multiarray/dtypemeta.c b/numpy/core/src/multiarray/dtypemeta.c
index 3026e68e9..4c11723e7 100644
--- a/numpy/core/src/multiarray/dtypemeta.c
+++ b/numpy/core/src/multiarray/dtypemeta.c
@@ -15,6 +15,9 @@
#include "dtypemeta.h"
#include "_datetime.h"
#include "array_coercion.h"
+#include "scalartypes.h"
+#include "convert_datatype.h"
+#include "usertypes.h"
static void
@@ -24,6 +27,7 @@ dtypemeta_dealloc(PyArray_DTypeMeta *self) {
Py_XDECREF(self->scalar_type);
Py_XDECREF(self->singleton);
+ Py_XDECREF(self->castingimpls);
PyType_Type.tp_dealloc((PyObject *) self);
}
@@ -194,7 +198,39 @@ discover_datetime_and_timedelta_from_pyobject(
static PyArray_Descr *
-flexible_default_descr(PyArray_DTypeMeta *cls)
+nonparametric_default_descr(PyArray_DTypeMeta *cls)
+{
+ Py_INCREF(cls->singleton);
+ return cls->singleton;
+}
+
+
+/* Ensure a copy of the singleton (just in case we do adapt it somewhere) */
+static PyArray_Descr *
+datetime_and_timedelta_default_descr(PyArray_DTypeMeta *cls)
+{
+ return PyArray_DescrNew(cls->singleton);
+}
+
+
+static PyArray_Descr *
+void_default_descr(PyArray_DTypeMeta *cls)
+{
+ PyArray_Descr *res = PyArray_DescrNew(cls->singleton);
+ if (res == NULL) {
+ return NULL;
+ }
+ /*
+ * The legacy behaviour for `np.array([], dtype="V")` is to use "V8".
+ * This is because `[]` uses `float64` as dtype, and then that is used
+ * for the size of the requested void.
+ */
+ res->elsize = 8;
+ return res;
+}
+
+static PyArray_Descr *
+string_and_unicode_default_descr(PyArray_DTypeMeta *cls)
{
PyArray_Descr *res = PyArray_DescrNewFromType(cls->type_num);
if (res == NULL) {
@@ -208,6 +244,43 @@ flexible_default_descr(PyArray_DTypeMeta *cls)
}
+static PyArray_Descr *
+string_unicode_common_instance(PyArray_Descr *descr1, PyArray_Descr *descr2)
+{
+ if (descr1->elsize >= descr2->elsize) {
+ return ensure_dtype_nbo(descr1);
+ }
+ else {
+ return ensure_dtype_nbo(descr2);
+ }
+}
+
+
+static PyArray_Descr *
+void_common_instance(PyArray_Descr *descr1, PyArray_Descr *descr2)
+{
+ /*
+ * We currently do not support promotion of void types unless they
+ * are equivalent.
+ */
+ if (!PyArray_CanCastTypeTo(descr1, descr2, NPY_EQUIV_CASTING)) {
+ if (descr1->subarray == NULL && descr1->names == NULL &&
+ descr2->subarray == NULL && descr2->names == NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "Invalid type promotion with void datatypes of different "
+ "lengths. Use the `np.bytes_` datatype instead to pad the "
+ "shorter value with trailing zero bytes.");
+ }
+ else {
+ PyErr_SetString(PyExc_TypeError,
+ "invalid type promotion with structured datatype(s).");
+ }
+ return NULL;
+ }
+ Py_INCREF(descr1);
+ return descr1;
+}
+
static int
python_builtins_are_known_scalar_types(
PyArray_DTypeMeta *NPY_UNUSED(cls), PyTypeObject *pytype)
@@ -242,6 +315,18 @@ python_builtins_are_known_scalar_types(
static int
+signed_integers_is_known_scalar_types(
+ PyArray_DTypeMeta *cls, PyTypeObject *pytype)
+{
+ if (python_builtins_are_known_scalar_types(cls, pytype)) {
+ return 1;
+ }
+ /* Convert our scalars (raise on too large unsigned and NaN, etc.) */
+ return PyType_IsSubtype(pytype, &PyGenericArrType_Type);
+}
+
+
+static int
datetime_known_scalar_types(
PyArray_DTypeMeta *cls, PyTypeObject *pytype)
{
@@ -253,7 +338,7 @@ datetime_known_scalar_types(
* must take charge. Otherwise we would attempt casting which does not
* truly support this. Only object arrays are special cased in this way.
*/
- return (PyType_IsSubtype(pytype, &PyString_Type) ||
+ return (PyType_IsSubtype(pytype, &PyBytes_Type) ||
PyType_IsSubtype(pytype, &PyUnicode_Type));
}
@@ -281,6 +366,86 @@ string_known_scalar_types(
}
+/*
+ * The following set of functions define the common dtype operator for
+ * the builtin types.
+ */
+static PyArray_DTypeMeta *
+default_builtin_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other)
+{
+ assert(cls->type_num < NPY_NTYPES);
+ if (!other->legacy || other->type_num > cls->type_num) {
+ /* Let the more generic (larger type number) DType handle this */
+ Py_INCREF(Py_NotImplemented);
+ return (PyArray_DTypeMeta *)Py_NotImplemented;
+ }
+
+ /*
+ * Note: The use of the promotion table should probably be revised at
+ * some point. It may be most useful to remove it entirely and then
+ * consider adding a fast path/cache `PyArray_CommonDType()` itself.
+ */
+ int common_num = _npy_type_promotion_table[cls->type_num][other->type_num];
+ if (common_num < 0) {
+ Py_INCREF(Py_NotImplemented);
+ return (PyArray_DTypeMeta *)Py_NotImplemented;
+ }
+ return PyArray_DTypeFromTypeNum(common_num);
+}
+
+
+static PyArray_DTypeMeta *
+string_unicode_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other)
+{
+ assert(cls->type_num < NPY_NTYPES);
+ if (!other->legacy || other->type_num > cls->type_num ||
+ other->type_num == NPY_OBJECT) {
+ /* Let the more generic (larger type number) DType handle this */
+ Py_INCREF(Py_NotImplemented);
+ return (PyArray_DTypeMeta *)Py_NotImplemented;
+ }
+ /*
+ * The builtin types are ordered by complexity (aside from object) here.
+ * Arguably, we should not consider numbers and strings "common", but
+ * we currently do.
+ */
+ Py_INCREF(cls);
+ return cls;
+}
+
+static PyArray_DTypeMeta *
+datetime_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other)
+{
+ if (cls->type_num == NPY_DATETIME && other->type_num == NPY_TIMEDELTA) {
+ /*
+ * TODO: We actually currently do allow promotion here. This is
+ * currently relied on within `np.add(datetime, timedelta)`,
+ * while for concatenation the cast step will fail.
+ */
+ Py_INCREF(cls);
+ return cls;
+ }
+ return default_builtin_common_dtype(cls, other);
+}
+
+
+
+static PyArray_DTypeMeta *
+object_common_dtype(
+ PyArray_DTypeMeta *cls, PyArray_DTypeMeta *NPY_UNUSED(other))
+{
+ /*
+ * The object DType is special in that it can represent everything,
+ * including all potential user DTypes.
+ * One reason to defer (or error) here might be if the other DType
+ * does not support scalars so that e.g. `arr1d[0]` returns a 0-D array
+ * and `arr.astype(object)` would fail. But object casts are special.
+ */
+ Py_INCREF(cls);
+ return cls;
+}
+
+
/**
* This function takes a PyArray_Descr and replaces its base class with
* a newly created dtype subclass (DTypeMeta instances).
@@ -312,10 +477,28 @@ string_known_scalar_types(
NPY_NO_EXPORT int
dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
{
- if (Py_TYPE(descr) != &PyArrayDescr_Type) {
+ int has_type_set = Py_TYPE(descr) == &PyArrayDescr_Type;
+
+ if (!has_type_set) {
+ /* Accept if the type was filled in from an existing builtin dtype */
+ for (int i = 0; i < NPY_NTYPES; i++) {
+ PyArray_Descr *builtin = PyArray_DescrFromType(i);
+ has_type_set = Py_TYPE(descr) == Py_TYPE(builtin);
+ Py_DECREF(builtin);
+ if (has_type_set) {
+ break;
+ }
+ }
+ }
+ if (!has_type_set) {
PyErr_Format(PyExc_RuntimeError,
"During creation/wrapping of legacy DType, the original class "
- "was not PyArrayDescr_Type (it is replaced in this step).");
+                "was not a PyArrayDescr_Type (it is replaced in this step). "
+                "The extension creating a custom DType for type %S must be "
+                "modified to ensure `Py_TYPE(descr) == &PyArrayDescr_Type` or "
+                "that of an existing dtype (with the assumption it is just "
+                "copied over and can be replaced).",
+                descr->typeobj);
return -1;
}
@@ -383,6 +566,12 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
/* Let python finish the initialization (probably unnecessary) */
if (PyType_Ready((PyTypeObject *)dtype_class) < 0) {
+ Py_DECREF(dtype_class);
+ return -1;
+ }
+ dtype_class->castingimpls = PyDict_New();
+ if (dtype_class->castingimpls == NULL) {
+ Py_DECREF(dtype_class);
return -1;
}
@@ -398,36 +587,54 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr)
dtype_class->f = descr->f;
dtype_class->kind = descr->kind;
- /* Strings and voids have (strange) logic around scalars. */
+ /* Set default functions (correct for most dtypes, override below) */
+ dtype_class->default_descr = nonparametric_default_descr;
+ dtype_class->discover_descr_from_pyobject = (
+ nonparametric_discover_descr_from_pyobject);
dtype_class->is_known_scalar_type = python_builtins_are_known_scalar_types;
+ dtype_class->common_dtype = default_builtin_common_dtype;
+ dtype_class->common_instance = NULL;
+
+ if (PyTypeNum_ISSIGNED(dtype_class->type_num)) {
+ /* Convert our scalars (raise on too large unsigned and NaN, etc.) */
+ dtype_class->is_known_scalar_type = signed_integers_is_known_scalar_types;
+ }
- if (PyTypeNum_ISDATETIME(descr->type_num)) {
+ if (PyTypeNum_ISUSERDEF(descr->type_num)) {
+ dtype_class->common_dtype = legacy_userdtype_common_dtype_function;
+ }
+ else if (descr->type_num == NPY_OBJECT) {
+ dtype_class->common_dtype = object_common_dtype;
+ }
+ else if (PyTypeNum_ISDATETIME(descr->type_num)) {
/* Datetimes are flexible, but were not considered previously */
dtype_class->parametric = NPY_TRUE;
+ dtype_class->default_descr = datetime_and_timedelta_default_descr;
dtype_class->discover_descr_from_pyobject = (
discover_datetime_and_timedelta_from_pyobject);
+ dtype_class->common_dtype = datetime_common_dtype;
+ dtype_class->common_instance = datetime_type_promotion;
if (descr->type_num == NPY_DATETIME) {
dtype_class->is_known_scalar_type = datetime_known_scalar_types;
}
}
else if (PyTypeNum_ISFLEXIBLE(descr->type_num)) {
dtype_class->parametric = NPY_TRUE;
- dtype_class->default_descr = flexible_default_descr;
if (descr->type_num == NPY_VOID) {
+ dtype_class->default_descr = void_default_descr;
dtype_class->discover_descr_from_pyobject = (
void_discover_descr_from_pyobject);
+ dtype_class->common_instance = void_common_instance;
}
else {
+ dtype_class->default_descr = string_and_unicode_default_descr;
dtype_class->is_known_scalar_type = string_known_scalar_types;
dtype_class->discover_descr_from_pyobject = (
string_discover_descr_from_pyobject);
+ dtype_class->common_dtype = string_unicode_common_dtype;
+ dtype_class->common_instance = string_unicode_common_instance;
}
}
- else {
- /* nonparametric case */
- dtype_class->discover_descr_from_pyobject = (
- nonparametric_discover_descr_from_pyobject);
- }
if (_PyArray_MapPyTypeToDType(dtype_class, descr->typeobj,
PyTypeNum_ISUSERDEF(dtype_class->type_num)) < 0) {
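
The dtypemeta.c changes above give each DType class common_dtype and common_instance slots, where common_dtype may return Py_NotImplemented to defer to the other DType (see default_builtin_common_dtype). As a rough sketch of that protocol, not part of the patch and with a hypothetical helper name, a symmetric lookup could be written as:

    /*
     * Sketch only: ask `a` for a common DType and fall back to `b` when it
     * returns Py_NotImplemented, mirroring how the slots above defer to the
     * more generic DType.
     */
    static PyArray_DTypeMeta *
    common_dtype_example(PyArray_DTypeMeta *a, PyArray_DTypeMeta *b)
    {
        PyArray_DTypeMeta *res = a->common_dtype(a, b);
        if ((PyObject *)res == Py_NotImplemented) {
            Py_DECREF(res);
            res = b->common_dtype(b, a);
            if ((PyObject *)res == Py_NotImplemented) {
                Py_DECREF(res);
                PyErr_SetString(PyExc_TypeError,
                        "no common DType exists for the given DTypes");
                return NULL;
            }
        }
        return res;
    }
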
diff --git a/numpy/core/src/multiarray/dtypemeta.h b/numpy/core/src/multiarray/dtypemeta.h
index e0909a7eb..83cf7c07e 100644
--- a/numpy/core/src/multiarray/dtypemeta.h
+++ b/numpy/core/src/multiarray/dtypemeta.h
@@ -2,6 +2,22 @@
#define _NPY_DTYPEMETA_H
#define NPY_DTYPE(descr) ((PyArray_DTypeMeta *)Py_TYPE(descr))
+/*
+ * This function will hopefully be phased out or replaced, but was convenient
+ * for incremental implementation of new DTypes based on DTypeMeta.
+ * (Error checking is not required for DescrFromType, assuming that the
+ * type is valid.)
+ */
+static NPY_INLINE PyArray_DTypeMeta *
+PyArray_DTypeFromTypeNum(int typenum)
+{
+ PyArray_Descr *descr = PyArray_DescrFromType(typenum);
+ PyArray_DTypeMeta *dtype = NPY_DTYPE(descr);
+ Py_INCREF(dtype);
+ Py_DECREF(descr);
+ return dtype;
+}
+
NPY_NO_EXPORT int
dtypemeta_wrap_legacy_descriptor(PyArray_Descr *dtypem);
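
The static inline helper added to dtypemeta.h above borrows the DTypeMeta class attached to a builtin descriptor and hands back a new reference. A short usage sketch, not part of the patch and only meaningful for in-tree code that can include dtypemeta.h (the function name is hypothetical):

    /*
     * Sketch only: fetch the DTypeMeta for a builtin type number; the
     * returned reference must be released by the caller.
     */
    static int
    dtypemeta_lookup_example(void)
    {
        PyArray_DTypeMeta *dt = PyArray_DTypeFromTypeNum(NPY_DOUBLE);
        /* dt->singleton is the canonical float64 descriptor instance. */
        assert(dt->singleton->type_num == NPY_DOUBLE);
        Py_DECREF(dt);
        return 0;
    }
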
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index 2538e05c6..6ad375f67 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -16,7 +16,6 @@
#define _MULTIARRAYMODULE
#include <numpy/npy_common.h>
#include <numpy/arrayobject.h>
-#include <numpy/halffloat.h>
#include <npy_pycompat.h>
#include <ctype.h>
@@ -25,1898 +24,8 @@
#include "common.h"
#include "ctors.h"
-#ifdef NPY_HAVE_SSE_INTRINSICS
-#define EINSUM_USE_SSE1 1
-#else
-#define EINSUM_USE_SSE1 0
-#endif
-
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-#define EINSUM_USE_SSE2 1
-#else
-#define EINSUM_USE_SSE2 0
-#endif
-
-#if EINSUM_USE_SSE1
-#include <xmmintrin.h>
-#endif
-
-#if EINSUM_USE_SSE2
-#include <emmintrin.h>
-#endif
-
-#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0)
-
-/********** PRINTF DEBUG TRACING **************/
-#define NPY_EINSUM_DBG_TRACING 0
-
-#if NPY_EINSUM_DBG_TRACING
-#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s);
-#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1);
-#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2);
-#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s);
-#else
-#define NPY_EINSUM_DBG_PRINT(s)
-#define NPY_EINSUM_DBG_PRINT1(s, p1)
-#define NPY_EINSUM_DBG_PRINT2(s, p1, p2)
-#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3)
-#endif
-/**********************************************/
-
-/**begin repeat
- * #name = byte, short, int, long, longlong,
- * ubyte, ushort, uint, ulong, ulonglong,
- * half, float, double, longdouble,
- * cfloat, cdouble, clongdouble#
- * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
- * npy_half, npy_float, npy_double, npy_longdouble,
- * npy_cfloat, npy_cdouble, npy_clongdouble#
- * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
- * npy_float, npy_float, npy_double, npy_longdouble,
- * npy_float, npy_double, npy_longdouble#
- * #to = ,,,,,
- * ,,,,,
- * npy_float_to_half,,,,
- * ,,#
- * #from = ,,,,,
- * ,,,,,
- * npy_half_to_float,,,,
- * ,,#
- * #complex = 0*5,
- * 0*5,
- * 0*4,
- * 1*3#
- * #float32 = 0*5,
- * 0*5,
- * 0,1,0,0,
- * 0*3#
- * #float64 = 0*5,
- * 0*5,
- * 0,0,1,0,
- * 0*3#
- */
-
-/**begin repeat1
- * #nop = 1, 2, 3, 1000#
- * #noplabel = one, two, three, any#
- */
-static void
-@name@_sum_of_products_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
- char *data0 = dataptr[0];
- npy_intp stride0 = strides[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3) && !@complex@
- char *data1 = dataptr[1];
- npy_intp stride1 = strides[1];
-#endif
-#if (@nop@ == 3) && !@complex@
- char *data2 = dataptr[2];
- npy_intp stride2 = strides[2];
-#endif
-#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
- char *data_out = dataptr[@nop@];
- npy_intp stride_out = strides[@nop@];
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count);
-
- while (count--) {
-#if !@complex@
-# if @nop@ == 1
- *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) +
- @from@(*(@type@ *)data_out));
- data0 += stride0;
- data_out += stride_out;
-# elif @nop@ == 2
- *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
- @from@(*(@type@ *)data1) +
- @from@(*(@type@ *)data_out));
- data0 += stride0;
- data1 += stride1;
- data_out += stride_out;
-# elif @nop@ == 3
- *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
- @from@(*(@type@ *)data1) *
- @from@(*(@type@ *)data2) +
- @from@(*(@type@ *)data_out));
- data0 += stride0;
- data1 += stride1;
- data2 += stride2;
- data_out += stride_out;
-# else
- @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
- int i;
- for (i = 1; i < nop; ++i) {
- temp *= @from@(*(@type@ *)dataptr[i]);
- }
- *(@type@ *)dataptr[nop] = @to@(temp +
- @from@(*(@type@ *)dataptr[i]));
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += strides[i];
- }
-# endif
-#else /* complex */
-# if @nop@ == 1
- ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] +
- ((@temptype@ *)data_out)[0];
- ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] +
- ((@temptype@ *)data_out)[1];
- data0 += stride0;
- data_out += stride_out;
-# else
-# if @nop@ <= 3
-#define _SUMPROD_NOP @nop@
-# else
-#define _SUMPROD_NOP nop
-# endif
- @temptype@ re, im, tmp;
- int i;
- re = ((@temptype@ *)dataptr[0])[0];
- im = ((@temptype@ *)dataptr[0])[1];
- for (i = 1; i < _SUMPROD_NOP; ++i) {
- tmp = re * ((@temptype@ *)dataptr[i])[0] -
- im * ((@temptype@ *)dataptr[i])[1];
- im = re * ((@temptype@ *)dataptr[i])[1] +
- im * ((@temptype@ *)dataptr[i])[0];
- re = tmp;
- }
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
-
- for (i = 0; i <= _SUMPROD_NOP; ++i) {
- dataptr[i] += strides[i];
- }
-#undef _SUMPROD_NOP
-# endif
-#endif
- }
-}
-
-#if @nop@ == 1
-
-static void
-@name@_sum_of_products_contig_one(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @type@ *data_out = (@type@ *)dataptr[1];
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
-#if !@complex@
- data_out[@i@] = @to@(@from@(data0[@i@]) +
- @from@(data_out[@i@]));
-#else
- ((@temptype@ *)data_out + 2*@i@)[0] =
- ((@temptype@ *)data0 + 2*@i@)[0] +
- ((@temptype@ *)data_out + 2*@i@)[0];
- ((@temptype@ *)data_out + 2*@i@)[1] =
- ((@temptype@ *)data0 + 2*@i@)[1] +
- ((@temptype@ *)data_out + 2*@i@)[1];
-#endif
-/**end repeat2**/
- case 0:
- return;
- }
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-#if !@complex@
- data_out[@i@] = @to@(@from@(data0[@i@]) +
- @from@(data_out[@i@]));
-#else /* complex */
- ((@temptype@ *)data_out + 2*@i@)[0] =
- ((@temptype@ *)data0 + 2*@i@)[0] +
- ((@temptype@ *)data_out + 2*@i@)[0];
- ((@temptype@ *)data_out + 2*@i@)[1] =
- ((@temptype@ *)data0 + 2*@i@)[1] +
- ((@temptype@ *)data_out + 2*@i@)[1];
-#endif
-/**end repeat2**/
- data0 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-#elif @nop@ == 2 && !@complex@
-
-static void
-@name@_sum_of_products_contig_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @type@ *data1 = (@type@ *)dataptr[1];
- @type@ *data_out = (@type@ *)dataptr[2];
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, b;
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, b;
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- data_out[@i@] = @to@(@from@(data0[@i@]) *
- @from@(data1[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
- case 0:
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
- EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
- b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
- _mm_store_ps(data_out+@i@, b);
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
- EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
- b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
- _mm_store_pd(data_out+@i@, b);
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
- b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
- _mm_storeu_ps(data_out+@i@, b);
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
- b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
- _mm_storeu_pd(data_out+@i@, b);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- data_out[@i@] = @to@(@from@(data0[@i@]) *
- @from@(data1[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
-#endif
- data0 += 8;
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-/* Some extra specializations for the two operand case */
-static void
-@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
- @type@ *data1 = (@type@ *)dataptr[1];
- @type@ *data_out = (@type@ *)dataptr[2];
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, b, value0_sse;
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, b, value0_sse;
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- data_out[@i@] = @to@(value0 *
- @from@(data1[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
- case 0:
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- value0_sse = _mm_set_ps1(value0);
-
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@));
- b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
- _mm_store_ps(data_out+@i@, b);
-/**end repeat2**/
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- if (count > 0) {
- goto finish_after_unrolled_loop;
- }
- else {
- return;
- }
- }
-#elif EINSUM_USE_SSE2 && @float64@
- value0_sse = _mm_set1_pd(value0);
-
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@));
- b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
- _mm_store_pd(data_out+@i@, b);
-/**end repeat2**/
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- if (count > 0) {
- goto finish_after_unrolled_loop;
- }
- else {
- return;
- }
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@));
- b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
- _mm_storeu_ps(data_out+@i@, b);
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@));
- b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
- _mm_storeu_pd(data_out+@i@, b);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- data_out[@i@] = @to@(value0 *
- @from@(data1[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
-#endif
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- if (count > 0) {
- goto finish_after_unrolled_loop;
- }
-}
-
-static void
-@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
- @type@ *data_out = (@type@ *)dataptr[2];
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, b, value1_sse;
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, b, value1_sse;
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- data_out[@i@] = @to@(@from@(data0[@i@])*
- value1 +
- @from@(data_out[@i@]));
-/**end repeat2**/
- case 0:
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- value1_sse = _mm_set_ps1(value1);
-
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse);
- b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
- _mm_store_ps(data_out+@i@, b);
-/**end repeat2**/
- data0 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- value1_sse = _mm_set1_pd(value1);
-
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse);
- b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
- _mm_store_pd(data_out+@i@, b);
-/**end repeat2**/
- data0 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse);
- b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
- _mm_storeu_ps(data_out+@i@, b);
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse);
- b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
- _mm_storeu_pd(data_out+@i@, b);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- data_out[@i@] = @to@(@from@(data0[@i@])*
- value1 +
- @from@(data_out[@i@]));
-/**end repeat2**/
-#endif
- data0 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-static void
-@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @type@ *data1 = (@type@ *)dataptr[1];
- @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- accum += @from@(data0[@i@]) * @from@(data1[@i@]);
-/**end repeat2**/
- case 0:
- *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
- accum_sse = _mm_add_ps(accum_sse, a);
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- }
-
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
- accum_sse = _mm_add_pd(accum_sse, a);
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- }
-
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
- accum_sse = _mm_add_ps(accum_sse, a);
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
- accum_sse = _mm_add_pd(accum_sse, a);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- accum += @from@(data0[@i@]) * @from@(data1[@i@]);
-/**end repeat2**/
-#endif
- data0 += 8;
- data1 += 8;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-static void
-@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
- @type@ *data1 = (@type@ *)dataptr[1];
- @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- accum += @from@(data1[@i@]);
-/**end repeat2**/
- case 0:
- *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@));
-/**end repeat2**/
- data1 += 8;
- }
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@));
-/**end repeat2**/
- data1 += 8;
- }
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- accum += @from@(data1[@i@]);
-/**end repeat2**/
-#endif
- data1 += 8;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-static void
-@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
- @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- accum += @from@(data0[@i@]);
-/**end repeat2**/
- case 0:
- *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1);
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- accum += @from@(data0[@i@]);
-/**end repeat2**/
-#endif
- data0 += 8;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-#elif @nop@ == 3 && !@complex@
-
-static void
-@name@_sum_of_products_contig_three(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @type@ *data1 = (@type@ *)dataptr[1];
- @type@ *data2 = (@type@ *)dataptr[2];
- @type@ *data_out = (@type@ *)dataptr[3];
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- data_out[@i@] = @to@(@from@(data0[@i@]) *
- @from@(data1[@i@]) *
- @from@(data2[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- data2 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
-
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- if (count-- == 0) {
- return;
- }
- data_out[@i@] = @to@(@from@(data0[@i@]) *
- @from@(data1[@i@]) *
- @from@(data2[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
-}
-
-#else /* @nop@ > 3 || @complex */
-
-static void
-@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n",
- (int)count);
-
- while (count--) {
-#if !@complex@
- @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
- int i;
- for (i = 1; i < nop; ++i) {
- temp *= @from@(*(@type@ *)dataptr[i]);
- }
- *(@type@ *)dataptr[nop] = @to@(temp +
- @from@(*(@type@ *)dataptr[i]));
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += sizeof(@type@);
- }
-#else /* complex */
-# if @nop@ <= 3
-# define _SUMPROD_NOP @nop@
-# else
-# define _SUMPROD_NOP nop
-# endif
- @temptype@ re, im, tmp;
- int i;
- re = ((@temptype@ *)dataptr[0])[0];
- im = ((@temptype@ *)dataptr[0])[1];
- for (i = 1; i < _SUMPROD_NOP; ++i) {
- tmp = re * ((@temptype@ *)dataptr[i])[0] -
- im * ((@temptype@ *)dataptr[i])[1];
- im = re * ((@temptype@ *)dataptr[i])[1] +
- im * ((@temptype@ *)dataptr[i])[0];
- re = tmp;
- }
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
-
- for (i = 0; i <= _SUMPROD_NOP; ++i) {
- dataptr[i] += sizeof(@type@);
- }
-# undef _SUMPROD_NOP
-#endif
- }
-}
-
-#endif /* functions for various @nop@ */
-
-#if @nop@ == 1
-
-static void
-@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if @complex@
- @temptype@ accum_re = 0, accum_im = 0;
- @temptype@ *data0 = (@temptype@ *)dataptr[0];
-#else
- @temptype@ accum = 0;
- @type@ *data0 = (@type@ *)dataptr[0];
-#endif
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
-#if !@complex@
- accum += @from@(data0[@i@]);
-#else /* complex */
- accum_re += data0[2*@i@+0];
- accum_im += data0[2*@i@+1];
-#endif
-/**end repeat2**/
- case 0:
-#if @complex@
- ((@temptype@ *)dataptr[1])[0] += accum_re;
- ((@temptype@ *)dataptr[1])[1] += accum_im;
-#else
- *((@type@ *)dataptr[1]) = @to@(accum +
- @from@(*((@type@ *)dataptr[1])));
-#endif
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
-
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
-
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-# if !@complex@
- accum += @from@(data0[@i@]);
-# else /* complex */
- accum_re += data0[2*@i@+0];
- accum_im += data0[2*@i@+1];
-# endif
-/**end repeat2**/
-#endif
-
-#if !@complex@
- data0 += 8;
-#else
- data0 += 8*2;
-#endif
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-#endif /* @nop@ == 1 */
-
-static void
-@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if @complex@
- @temptype@ accum_re = 0, accum_im = 0;
-#else
- @temptype@ accum = 0;
-#endif
-
-#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
- char *data0 = dataptr[0];
- npy_intp stride0 = strides[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3) && !@complex@
- char *data1 = dataptr[1];
- npy_intp stride1 = strides[1];
-#endif
-#if (@nop@ == 3) && !@complex@
- char *data2 = dataptr[2];
- npy_intp stride2 = strides[2];
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n",
- (int)count);
-
- while (count--) {
-#if !@complex@
-# if @nop@ == 1
- accum += @from@(*(@type@ *)data0);
- data0 += stride0;
-# elif @nop@ == 2
- accum += @from@(*(@type@ *)data0) *
- @from@(*(@type@ *)data1);
- data0 += stride0;
- data1 += stride1;
-# elif @nop@ == 3
- accum += @from@(*(@type@ *)data0) *
- @from@(*(@type@ *)data1) *
- @from@(*(@type@ *)data2);
- data0 += stride0;
- data1 += stride1;
- data2 += stride2;
-# else
- @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
- int i;
- for (i = 1; i < nop; ++i) {
- temp *= @from@(*(@type@ *)dataptr[i]);
- }
- accum += temp;
- for (i = 0; i < nop; ++i) {
- dataptr[i] += strides[i];
- }
-# endif
-#else /* complex */
-# if @nop@ == 1
- accum_re += ((@temptype@ *)data0)[0];
- accum_im += ((@temptype@ *)data0)[1];
- data0 += stride0;
-# else
-# if @nop@ <= 3
-#define _SUMPROD_NOP @nop@
-# else
-#define _SUMPROD_NOP nop
-# endif
- @temptype@ re, im, tmp;
- int i;
- re = ((@temptype@ *)dataptr[0])[0];
- im = ((@temptype@ *)dataptr[0])[1];
- for (i = 1; i < _SUMPROD_NOP; ++i) {
- tmp = re * ((@temptype@ *)dataptr[i])[0] -
- im * ((@temptype@ *)dataptr[i])[1];
- im = re * ((@temptype@ *)dataptr[i])[1] +
- im * ((@temptype@ *)dataptr[i])[0];
- re = tmp;
- }
- accum_re += re;
- accum_im += im;
- for (i = 0; i < _SUMPROD_NOP; ++i) {
- dataptr[i] += strides[i];
- }
-#undef _SUMPROD_NOP
-# endif
-#endif
- }
-
-#if @complex@
-# if @nop@ <= 3
- ((@temptype@ *)dataptr[@nop@])[0] += accum_re;
- ((@temptype@ *)dataptr[@nop@])[1] += accum_im;
-# else
- ((@temptype@ *)dataptr[nop])[0] += accum_re;
- ((@temptype@ *)dataptr[nop])[1] += accum_im;
-# endif
-#else
-# if @nop@ <= 3
- *((@type@ *)dataptr[@nop@]) = @to@(accum +
- @from@(*((@type@ *)dataptr[@nop@])));
-# else
- *((@type@ *)dataptr[nop]) = @to@(accum +
- @from@(*((@type@ *)dataptr[nop])));
-# endif
-#endif
-
-}
-
-/**end repeat1**/
-
-/**end repeat**/
-
-
-/* Do OR of ANDs for the boolean type */
-
-/**begin repeat
- * #nop = 1, 2, 3, 1000#
- * #noplabel = one, two, three, any#
- */
-
-static void
-bool_sum_of_products_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if (@nop@ <= 3)
- char *data0 = dataptr[0];
- npy_intp stride0 = strides[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3)
- char *data1 = dataptr[1];
- npy_intp stride1 = strides[1];
-#endif
-#if (@nop@ == 3)
- char *data2 = dataptr[2];
- npy_intp stride2 = strides[2];
-#endif
-#if (@nop@ <= 3)
- char *data_out = dataptr[@nop@];
- npy_intp stride_out = strides[@nop@];
-#endif
-
- while (count--) {
-#if @nop@ == 1
- *(npy_bool *)data_out = *(npy_bool *)data0 ||
- *(npy_bool *)data_out;
- data0 += stride0;
- data_out += stride_out;
-#elif @nop@ == 2
- *(npy_bool *)data_out = (*(npy_bool *)data0 &&
- *(npy_bool *)data1) ||
- *(npy_bool *)data_out;
- data0 += stride0;
- data1 += stride1;
- data_out += stride_out;
-#elif @nop@ == 3
- *(npy_bool *)data_out = (*(npy_bool *)data0 &&
- *(npy_bool *)data1 &&
- *(npy_bool *)data2) ||
- *(npy_bool *)data_out;
- data0 += stride0;
- data1 += stride1;
- data2 += stride2;
- data_out += stride_out;
-#else
- npy_bool temp = *(npy_bool *)dataptr[0];
- int i;
- for (i = 1; i < nop; ++i) {
- temp = temp && *(npy_bool *)dataptr[i];
- }
- *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += strides[i];
- }
-#endif
- }
-}
-
-static void
-bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if (@nop@ <= 3)
- char *data0 = dataptr[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3)
- char *data1 = dataptr[1];
-#endif
-#if (@nop@ == 3)
- char *data2 = dataptr[2];
-#endif
-#if (@nop@ <= 3)
- char *data_out = dataptr[@nop@];
-#endif
-
-#if (@nop@ <= 3)
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat1
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
-# if @nop@ == 1
- ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] ||
- ((npy_bool *)data_out)[@i@];
-# elif @nop@ == 2
- ((npy_bool *)data_out)[@i@] =
- (((npy_bool *)data0)[@i@] &&
- ((npy_bool *)data1)[@i@]) ||
- ((npy_bool *)data_out)[@i@];
-# elif @nop@ == 3
- ((npy_bool *)data_out)[@i@] =
- (((npy_bool *)data0)[@i@] &&
- ((npy_bool *)data1)[@i@] &&
- ((npy_bool *)data2)[@i@]) ||
- ((npy_bool *)data_out)[@i@];
-# endif
-/**end repeat1**/
- case 0:
- return;
- }
-#endif
-
-/* Unroll the loop by 8 for fixed-size nop */
-#if (@nop@ <= 3)
- while (count >= 8) {
- count -= 8;
-#else
- while (count--) {
-#endif
-
-# if @nop@ == 1
-/**begin repeat1
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) ||
- (*((npy_bool *)data_out + @i@));
-/**end repeat1**/
- data0 += 8*sizeof(npy_bool);
- data_out += 8*sizeof(npy_bool);
-# elif @nop@ == 2
-/**begin repeat1
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- *((npy_bool *)data_out + @i@) =
- ((*((npy_bool *)data0 + @i@)) &&
- (*((npy_bool *)data1 + @i@))) ||
- (*((npy_bool *)data_out + @i@));
-/**end repeat1**/
- data0 += 8*sizeof(npy_bool);
- data1 += 8*sizeof(npy_bool);
- data_out += 8*sizeof(npy_bool);
-# elif @nop@ == 3
-/**begin repeat1
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- *((npy_bool *)data_out + @i@) =
- ((*((npy_bool *)data0 + @i@)) &&
- (*((npy_bool *)data1 + @i@)) &&
- (*((npy_bool *)data2 + @i@))) ||
- (*((npy_bool *)data_out + @i@));
-/**end repeat1**/
- data0 += 8*sizeof(npy_bool);
- data1 += 8*sizeof(npy_bool);
- data2 += 8*sizeof(npy_bool);
- data_out += 8*sizeof(npy_bool);
-# else
- npy_bool temp = *(npy_bool *)dataptr[0];
- int i;
- for (i = 1; i < nop; ++i) {
- temp = temp && *(npy_bool *)dataptr[i];
- }
- *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += sizeof(npy_bool);
- }
-# endif
- }
-
- /* If the loop was unrolled, we need to finish it off */
-#if (@nop@ <= 3)
- goto finish_after_unrolled_loop;
-#endif
-}
-
-static void
-bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
- npy_bool accum = 0;
-
-#if (@nop@ <= 3)
- char *data0 = dataptr[0];
- npy_intp stride0 = strides[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3)
- char *data1 = dataptr[1];
- npy_intp stride1 = strides[1];
-#endif
-#if (@nop@ == 3)
- char *data2 = dataptr[2];
- npy_intp stride2 = strides[2];
-#endif
-
- while (count--) {
-#if @nop@ == 1
- accum = *(npy_bool *)data0 || accum;
- data0 += stride0;
-#elif @nop@ == 2
- accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum;
- data0 += stride0;
- data1 += stride1;
-#elif @nop@ == 3
- accum = (*(npy_bool *)data0 &&
- *(npy_bool *)data1 &&
- *(npy_bool *)data2) || accum;
- data0 += stride0;
- data1 += stride1;
- data2 += stride2;
-#else
- npy_bool temp = *(npy_bool *)dataptr[0];
- int i;
- for (i = 1; i < nop; ++i) {
- temp = temp && *(npy_bool *)dataptr[i];
- }
- accum = temp || accum;
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += strides[i];
- }
-#endif
- }
-
-# if @nop@ <= 3
- *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]);
-# else
- *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]);
-# endif
-}
-
-/**end repeat**/
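
For the boolean kernels above, the product and sum degenerate to AND and OR, so einsum over bools computes an OR of ANDs into the output. A minimal standalone sketch of the two-operand semantics (plain C, hypothetical helper name, not part of this patch):

    #include <stdio.h>
    #include <stdbool.h>

    /* out[i] = (a[i] && b[i]) || out[i], mirroring bool_sum_of_products_two */
    static void bool_or_of_ands_two(const bool *a, const bool *b, bool *out, int n)
    {
        for (int i = 0; i < n; ++i) {
            out[i] = (a[i] && b[i]) || out[i];
        }
    }

    int main(void)
    {
        bool a[4]   = {true, true,  false, false};
        bool b[4]   = {true, false, true,  false};
        bool out[4] = {false, false, false, true};
        bool_or_of_ands_two(a, b, out, 4);
        for (int i = 0; i < 4; ++i) {
            printf("%d ", out[i]);    /* prints: 1 0 0 1 */
        }
        printf("\n");
        return 0;
    }
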
-
-typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp);
-
-/* These tables need to match up with the type enum */
-static sum_of_products_fn
-_contig_outstride0_unary_specialization_table[NPY_NTYPES] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 0,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 1, 1, 1,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
- &@name@_sum_of_products_contig_outstride0_one,
-#else
- NULL,
-#endif
-/**end repeat**/
-}; /* End of _contig_outstride0_unary_specialization_table */
-
-static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 0,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 0, 0, 0,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
-{
- &@name@_sum_of_products_stride0_contig_outstride0_two,
- &@name@_sum_of_products_stride0_contig_outcontig_two,
- &@name@_sum_of_products_contig_stride0_outstride0_two,
- &@name@_sum_of_products_contig_stride0_outcontig_two,
- &@name@_sum_of_products_contig_contig_outstride0_two,
-},
-#else
- {NULL, NULL, NULL, NULL, NULL},
-#endif
-/**end repeat**/
-}; /* End of _binary_specialization_table */
-
-static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 1, 1, 1,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
-{
- &@name@_sum_of_products_outstride0_any,
- &@name@_sum_of_products_outstride0_one,
- &@name@_sum_of_products_outstride0_two,
- &@name@_sum_of_products_outstride0_three
-},
-#else
- {NULL, NULL, NULL, NULL},
-#endif
-/**end repeat**/
-}; /* End of _outstride0_specialized_table */
-
-static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 1, 1, 1,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
-{
- &@name@_sum_of_products_contig_any,
- &@name@_sum_of_products_contig_one,
- &@name@_sum_of_products_contig_two,
- &@name@_sum_of_products_contig_three
-},
-#else
- {NULL, NULL, NULL, NULL},
-#endif
-/**end repeat**/
-}; /* End of _allcontig_specialized_table */
-
-static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 1, 1, 1,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
-{
- &@name@_sum_of_products_any,
- &@name@_sum_of_products_one,
- &@name@_sum_of_products_two,
- &@name@_sum_of_products_three
-},
-#else
- {NULL, NULL, NULL, NULL},
-#endif
-/**end repeat**/
-}; /* End of _unnspecialized_table */
-
-static sum_of_products_fn
-get_sum_of_products_function(int nop, int type_num,
- npy_intp itemsize, npy_intp const *fixed_strides)
-{
- int iop;
-
- if (type_num >= NPY_NTYPES) {
- return NULL;
- }
-
- /* contiguous reduction */
- if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) {
- sum_of_products_fn ret =
- _contig_outstride0_unary_specialization_table[type_num];
- if (ret != NULL) {
- return ret;
- }
- }
-
- /* nop of 2 has more specializations */
- if (nop == 2) {
- /* Encode the zero/contiguous strides */
- int code;
- code = (fixed_strides[0] == 0) ? 0 :
- (fixed_strides[0] == itemsize) ? 2*2*1 : 8;
- code += (fixed_strides[1] == 0) ? 0 :
- (fixed_strides[1] == itemsize) ? 2*1 : 8;
- code += (fixed_strides[2] == 0) ? 0 :
- (fixed_strides[2] == itemsize) ? 1 : 8;
- if (code >= 2 && code < 7) {
- sum_of_products_fn ret =
- _binary_specialization_table[type_num][code-2];
- if (ret != NULL) {
- return ret;
- }
- }
- }
-
- /* Inner loop with an output stride of 0 */
- if (fixed_strides[nop] == 0) {
- return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0];
- }
-
- /* Check for all contiguous */
- for (iop = 0; iop < nop + 1; ++iop) {
- if (fixed_strides[iop] != itemsize) {
- break;
- }
- }
-
- /* Contiguous loop */
- if (iop == nop + 1) {
- return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0];
- }
-
- /* None of the above specializations caught it, general loops */
- return _unspecialized_table[type_num][nop <= 3 ? nop : 0];
-}
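
The dispatcher removed above (moved into einsum_sumprod.c.src by this patch) keys its two-operand specializations on a small stride code: each of the three pointers (operand 0, operand 1, output) contributes 0 if its stride is zero, its weight (4, 2, or 1) if it is contiguous, and 8 otherwise, so only the zero/contiguous combinations land in the 2..6 range that indexes _binary_specialization_table. A standalone sketch of the encoding with one worked case (helper name is hypothetical):

    #include <stdio.h>

    static int two_operand_stride_code(long s0, long s1, long sout, long itemsize)
    {
        int code;
        code  = (s0   == 0) ? 0 : (s0   == itemsize) ? 4 : 8;
        code += (s1   == 0) ? 0 : (s1   == itemsize) ? 2 : 8;
        code += (sout == 0) ? 0 : (sout == itemsize) ? 1 : 8;
        return code;
    }

    int main(void)
    {
        /* double (itemsize 8): broadcast scalar * contiguous -> contiguous out */
        int code = two_operand_stride_code(0, 8, 8, 8);
        /* code == 3, so table index code - 2 == 1 selects
         * @name@_sum_of_products_stride0_contig_outcontig_two */
        printf("code=%d, table index=%d\n", code, code - 2);
        return 0;
    }
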
+#include "einsum_sumprod.h"
+#include "einsum_debug.h"
/*
diff --git a/numpy/core/src/multiarray/einsum_debug.h b/numpy/core/src/multiarray/einsum_debug.h
new file mode 100644
index 000000000..9aa81fcbd
--- /dev/null
+++ b/numpy/core/src/multiarray/einsum_debug.h
@@ -0,0 +1,28 @@
+/*
+ * This file provides debug macros used by the other einsum files.
+ *
+ * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com)
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+#ifndef _NPY_MULTIARRAY_EINSUM_DEBUG_H
+#define _NPY_MULTIARRAY_EINSUM_DEBUG_H
+
+/********** PRINTF DEBUG TRACING **************/
+#define NPY_EINSUM_DBG_TRACING 0
+
+#if NPY_EINSUM_DBG_TRACING
+#include <stdio.h>
+#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s);
+#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1);
+#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2);
+#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s, p1, p2, p3);
+#else
+#define NPY_EINSUM_DBG_PRINT(s)
+#define NPY_EINSUM_DBG_PRINT1(s, p1)
+#define NPY_EINSUM_DBG_PRINT2(s, p1, p2)
+#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3)
+#endif
+
+#endif
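
The header keeps tracing compiled out by default: kernels call the macros unconditionally, and they expand to printf only when NPY_EINSUM_DBG_TRACING is flipped to 1. A hedged usage sketch (the kernel name is made up):

    #include "einsum_debug.h"

    static void example_einsum_loop(long count)
    {
        /* No-op in normal builds; becomes a printf of the count when
         * NPY_EINSUM_DBG_TRACING is set to 1 in einsum_debug.h. */
        NPY_EINSUM_DBG_PRINT1("example_einsum_loop (%d)\n", (int)count);
        while (count--) {
            /* ... kernel body ... */
        }
    }
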
diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
new file mode 100644
index 000000000..caba0e00a
--- /dev/null
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -0,0 +1,1878 @@
+/*
+ * This file provides optimized sum of product implementations used internally
+ * by einsum.
+ *
+ * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com)
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+
+#include <numpy/npy_common.h>
+#include <numpy/ndarraytypes.h> /* for NPY_NTYPES */
+#include <numpy/halffloat.h>
+
+#include "einsum_sumprod.h"
+#include "einsum_debug.h"
+#include "simd/simd.h"
+#include "common.h"
+
+#ifdef NPY_HAVE_SSE_INTRINSICS
+#define EINSUM_USE_SSE1 1
+#else
+#define EINSUM_USE_SSE1 0
+#endif
+
+#ifdef NPY_HAVE_SSE2_INTRINSICS
+#define EINSUM_USE_SSE2 1
+#else
+#define EINSUM_USE_SSE2 0
+#endif
+
+#if EINSUM_USE_SSE1
+#include <xmmintrin.h>
+#endif
+
+#if EINSUM_USE_SSE2
+#include <emmintrin.h>
+#endif
+
+#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0)
+
+// ARM/NEON has no separate aligned load/store instructions, so skip the aligned fast path
+#ifdef NPY_HAVE_NEON
+ #define EINSUM_IS_ALIGNED(x) 0
+#else
+ #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH)
+#endif
+
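
EINSUM_IS_SSE_ALIGNED above simply tests the low four address bits, i.e. 16-byte alignment, while EINSUM_IS_ALIGNED is forced to 0 on NEON so those kernels never take an aligned-only path. A standalone illustration of the bit test (assumes a C11 compiler for alignas):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdalign.h>

    /* Same test as EINSUM_IS_SSE_ALIGNED: low 4 bits of the address are zero. */
    static int is_16_byte_aligned(const void *p)
    {
        return ((uintptr_t)p & 0xf) == 0;
    }

    int main(void)
    {
        alignas(16) double buf[4];
        printf("%d %d\n",
               is_16_byte_aligned(buf),              /* 1 */
               is_16_byte_aligned((char *)buf + 8)); /* 0 */
        return 0;
    }
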
+/**********************************************/
+
+/**begin repeat
+ * #name = byte, short, int, long, longlong,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_float, npy_float, npy_double, npy_longdouble,
+ * npy_float, npy_double, npy_longdouble#
+ * #sfx = s8, s16, s32, long, s64,
+ * u8, u16, u32, ulong, u64,
+ * half, f32, f64, longdouble,
+ * f32, f64, clongdouble#
+ * #to = ,,,,,
+ * ,,,,,
+ * npy_float_to_half,,,,
+ * ,,#
+ * #from = ,,,,,
+ * ,,,,,
+ * npy_half_to_float,,,,
+ * ,,#
+ * #complex = 0*5,
+ * 0*5,
+ * 0*4,
+ * 1*3#
+ * #float32 = 0*5,
+ * 0*5,
+ * 0,1,0,0,
+ * 0*3#
+ * #float64 = 0*5,
+ * 0*5,
+ * 0,0,1,0,
+ * 0*3#
+ * #NPYV_CHK = 0*5,
+ * 0*5,
+ * 0, NPY_SIMD, NPY_SIMD_F64, 0,
+ * 0*3#
+ */
+
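
The /**begin repeat ... **/ blocks are expanded by NumPy's conv_template preprocessor for .c.src files: every @token@ is substituted per column of the lists above, so one template body yields one kernel per dtype. As an illustration only, the scalar tail of the two-operand contiguous kernel, hand-expanded for the 'half' row (it relies on the npy_common.h/halffloat.h includes at the top of this file; the function name is made up):

    /* @type@ -> npy_half, @temptype@ -> npy_float,
     * @from@ -> npy_half_to_float, @to@ -> npy_float_to_half */
    static void half_contig_two_tail_example(npy_half *data0, npy_half *data1,
                                             npy_half *data_out, long count)
    {
        for (; count > 0; --count, ++data0, ++data1, ++data_out) {
            const npy_float a = npy_half_to_float(*data0);
            const npy_float b = npy_half_to_float(*data1);
            const npy_float c = npy_half_to_float(*data_out);
            *data_out = npy_float_to_half(a * b + c);
        }
    }
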
+/**begin repeat1
+ * #nop = 1, 2, 3, 1000#
+ * #noplabel = one, two, three, any#
+ */
+static void
+@name@_sum_of_products_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3) && !@complex@
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3) && !@complex@
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+ char *data_out = dataptr[@nop@];
+ npy_intp stride_out = strides[@nop@];
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count);
+
+ while (count--) {
+#if !@complex@
+# if @nop@ == 1
+ *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) +
+ @from@(*(@type@ *)data_out));
+ data0 += stride0;
+ data_out += stride_out;
+# elif @nop@ == 2
+ *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1) +
+ @from@(*(@type@ *)data_out));
+ data0 += stride0;
+ data1 += stride1;
+ data_out += stride_out;
+# elif @nop@ == 3
+ *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1) *
+ @from@(*(@type@ *)data2) +
+ @from@(*(@type@ *)data_out));
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+ data_out += stride_out;
+# else
+ @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp *= @from@(*(@type@ *)dataptr[i]);
+ }
+ *(@type@ *)dataptr[nop] = @to@(temp +
+ @from@(*(@type@ *)dataptr[i]));
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+# endif
+#else /* complex */
+# if @nop@ == 1
+ ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] +
+ ((@temptype@ *)data_out)[0];
+ ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] +
+ ((@temptype@ *)data_out)[1];
+ data0 += stride0;
+ data_out += stride_out;
+# else
+# if @nop@ <= 3
+#define _SUMPROD_NOP @nop@
+# else
+#define _SUMPROD_NOP nop
+# endif
+ @temptype@ re, im, tmp;
+ int i;
+ re = ((@temptype@ *)dataptr[0])[0];
+ im = ((@temptype@ *)dataptr[0])[1];
+ for (i = 1; i < _SUMPROD_NOP; ++i) {
+ tmp = re * ((@temptype@ *)dataptr[i])[0] -
+ im * ((@temptype@ *)dataptr[i])[1];
+ im = re * ((@temptype@ *)dataptr[i])[1] +
+ im * ((@temptype@ *)dataptr[i])[0];
+ re = tmp;
+ }
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
+
+ for (i = 0; i <= _SUMPROD_NOP; ++i) {
+ dataptr[i] += strides[i];
+ }
+#undef _SUMPROD_NOP
+# endif
+#endif
+ }
+}
+
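
The @complex@ branches above accumulate the running product with the usual complex multiply, (re + i*im)*(c + i*d) = (re*c - im*d) + i*(re*d + im*c); the tmp variable is needed because the new imaginary part still reads the old real part. A standalone numeric check (plain C, not part of the patch):

    #include <stdio.h>

    int main(void)
    {
        /* (1 + 2i) * (3 + 4i) * (0 + 1i) = -10 - 5i */
        double ops[3][2] = {{1, 2}, {3, 4}, {0, 1}};
        double re = ops[0][0], im = ops[0][1], tmp;

        for (int i = 1; i < 3; ++i) {
            tmp = re * ops[i][0] - im * ops[i][1];
            im  = re * ops[i][1] + im * ops[i][0];
            re  = tmp;
        }
        printf("re=%g im=%g\n", re, im);   /* re=-10 im=-5 */
        return 0;
    }
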
+#if @nop@ == 1
+
+static void
+@name@_sum_of_products_contig_one(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data_out = (@type@ *)dataptr[1];
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+#if !@complex@
+ data_out[@i@] = @to@(@from@(data0[@i@]) +
+ @from@(data_out[@i@]));
+#else
+ ((@temptype@ *)data_out + 2*@i@)[0] =
+ ((@temptype@ *)data0 + 2*@i@)[0] +
+ ((@temptype@ *)data_out + 2*@i@)[0];
+ ((@temptype@ *)data_out + 2*@i@)[1] =
+ ((@temptype@ *)data0 + 2*@i@)[1] +
+ ((@temptype@ *)data_out + 2*@i@)[1];
+#endif
+/**end repeat2**/
+ case 0:
+ return;
+ }
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+#if !@complex@
+ data_out[@i@] = @to@(@from@(data0[@i@]) +
+ @from@(data_out[@i@]));
+#else /* complex */
+ ((@temptype@ *)data_out + 2*@i@)[0] =
+ ((@temptype@ *)data0 + 2*@i@)[0] +
+ ((@temptype@ *)data_out + 2*@i@)[0];
+ ((@temptype@ *)data_out + 2*@i@)[1] =
+ ((@temptype@ *)data0 + 2*@i@)[1] +
+ ((@temptype@ *)data_out + 2*@i@)[1];
+#endif
+/**end repeat2**/
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
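
The finish_after_unrolled_loop label sits before the main loop on purpose: small counts fall straight into the switch, whose cases fall through from 7 down to 0, and the unrolled loop jumps back to the same switch to dispose of the 0-7 leftover elements. A compact standalone sketch of the same control flow (4-way unroll, not NumPy code):

    #include <stdio.h>

    static double sum_unrolled(const double *p, long count)
    {
        double accum = 0;

    finish_after_unrolled_loop:
        switch (count) {
            case 3: accum += p[2];   /* fall through */
            case 2: accum += p[1];   /* fall through */
            case 1: accum += p[0];   /* fall through */
            case 0: return accum;
        }

        /* Unroll the loop by 4 */
        while (count >= 4) {
            count -= 4;
            accum += p[0] + p[1] + p[2] + p[3];
            p += 4;
        }
        /* 0-3 elements remain; finish through the switch above */
        goto finish_after_unrolled_loop;
    }

    int main(void)
    {
        double data[7] = {1, 2, 3, 4, 5, 6, 7};
        printf("%g\n", sum_unrolled(data, 7));   /* 28 */
        return 0;
    }
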
+#elif @nop@ == 2 && !@complex@
+
+static void
+@name@_sum_of_products_contig_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @type@ *data_out = (@type@ *)dataptr[2];
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
+ (int)count);
+ // NPYV check for @type@
+#if @NPYV_CHK@
+ /* Use aligned instructions if possible */
+ const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+ EINSUM_IS_ALIGNED(data_out);
+ const int vstep = npyv_nlanes_@sfx@;
+
+ /**begin repeat2
+ * #cond = if(is_aligned), else#
+ * #ld = loada, load#
+ * #st = storea, store#
+ */
+ @cond@ {
+ const npy_intp vstepx4 = vstep * 4;
+ for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+ /**begin repeat3
+ * #i = 0, 1, 2, 3#
+ */
+ npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@);
+ npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@);
+ npyv_@sfx@ c@i@ = npyv_@ld@_@sfx@(data_out + vstep * @i@);
+ /**end repeat3**/
+ /**begin repeat3
+ * #i = 0, 1, 2, 3#
+ */
+ npyv_@sfx@ abc@i@ = npyv_muladd_@sfx@(a@i@, b@i@, c@i@);
+ /**end repeat3**/
+ /**begin repeat3
+ * #i = 0, 1, 2, 3#
+ */
+ npyv_@st@_@sfx@(data_out + vstep * @i@, abc@i@);
+ /**end repeat3**/
+ }
+ }
+ /**end repeat2**/
+ for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+ npyv_@sfx@ a = npyv_load_tillz_@sfx@(data0, count);
+ npyv_@sfx@ b = npyv_load_tillz_@sfx@(data1, count);
+ npyv_@sfx@ c = npyv_load_tillz_@sfx@(data_out, count);
+ npyv_store_till_@sfx@(data_out, count, npyv_muladd_@sfx@(a, b, c));
+ }
+ npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+ for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+ /**begin repeat2
+ * #i = 0, 1, 2, 3#
+ */
+        const @temptype@ a@i@ = @from@(data0[@i@]);
+        const @temptype@ b@i@ = @from@(data1[@i@]);
+        const @temptype@ c@i@ = @from@(data_out[@i@]);
+ /**end repeat2**/
+ /**begin repeat2
+ * #i = 0, 1, 2, 3#
+ */
+        const @temptype@ abc@i@ = a@i@ * b@i@ + c@i@;
+ /**end repeat2**/
+ /**begin repeat2
+ * #i = 0, 1, 2, 3#
+ */
+ data_out[@i@] = @to@(abc@i@);
+ /**end repeat2**/
+ }
+#endif // !NPY_DISABLE_OPTIMIZATION
+ for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const @temptype@ a = @from@(*data0);
+        const @temptype@ b = @from@(*data1);
+        const @temptype@ c = @from@(*data_out);
+ *data_out = @to@(a * b + c);
+ }
+#endif // NPYV check for @type@
+
+}
+
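
This is the first kernel in the file to use NumPy's universal intrinsics: when @NPYV_CHK@ is non-zero it runs a main loop of four full vectors per iteration through npyv_muladd_@sfx@ (a*b + c per lane, fused where the target supports it), then drains the remainder with the partial npyv_load_tillz_/npyv_store_till_ accessors so no scalar tail loop is needed. A scalar sketch of that blocking shape, with vstep standing in for npyv_nlanes_@sfx@ (illustration only):

    /* Main loop: 4*vstep elements per trip.  Drain loop: at most vstep
     * elements at a time, the way load_tillz/store_till handle a short
     * final vector. */
    static void fma_blocked_sketch(const double *a, const double *b,
                                   double *out, long count)
    {
        const long vstep = 4;               /* stand-in for npyv_nlanes_f64 */
        const long vstepx4 = vstep * 4;

        for (; count >= vstepx4; count -= vstepx4,
                                 a += vstepx4, b += vstepx4, out += vstepx4) {
            for (long i = 0; i < vstepx4; ++i) {
                out[i] = a[i] * b[i] + out[i];
            }
        }
        for (; count > 0; count -= vstep, a += vstep, b += vstep, out += vstep) {
            long n = count < vstep ? count : vstep;
            for (long i = 0; i < n; ++i) {
                out[i] = a[i] * b[i] + out[i];
            }
        }
    }
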
+/* Some extra specializations for the two operand case */
+static void
+@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @type@ *data_out = (@type@ *)dataptr[2];
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, b, value0_sse;
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, b, value0_sse;
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ data_out[@i@] = @to@(value0 *
+ @from@(data1[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+ case 0:
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ value0_sse = _mm_set_ps1(value0);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@));
+ b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
+ _mm_store_ps(data_out+@i@, b);
+/**end repeat2**/
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ if (count > 0) {
+ goto finish_after_unrolled_loop;
+ }
+ else {
+ return;
+ }
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ value0_sse = _mm_set1_pd(value0);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@));
+ b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
+ _mm_store_pd(data_out+@i@, b);
+/**end repeat2**/
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ if (count > 0) {
+ goto finish_after_unrolled_loop;
+ }
+ else {
+ return;
+ }
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@));
+ b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
+ _mm_storeu_ps(data_out+@i@, b);
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@));
+ b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
+ _mm_storeu_pd(data_out+@i@, b);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ data_out[@i@] = @to@(value0 *
+ @from@(data1[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+#endif
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ if (count > 0) {
+ goto finish_after_unrolled_loop;
+ }
+}
+
+static void
+@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
+ @type@ *data_out = (@type@ *)dataptr[2];
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, b, value1_sse;
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, b, value1_sse;
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ data_out[@i@] = @to@(@from@(data0[@i@])*
+ value1 +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+ case 0:
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ value1_sse = _mm_set_ps1(value1);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse);
+ b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
+ _mm_store_ps(data_out+@i@, b);
+/**end repeat2**/
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ value1_sse = _mm_set1_pd(value1);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse);
+ b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
+ _mm_store_pd(data_out+@i@, b);
+/**end repeat2**/
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse);
+ b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
+ _mm_storeu_ps(data_out+@i@, b);
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse);
+ b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
+ _mm_storeu_pd(data_out+@i@, b);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ data_out[@i@] = @to@(@from@(data0[@i@])*
+ value1 +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+#endif
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+static void
+@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @temptype@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ accum += @from@(data0[@i@]) * @from@(data1[@i@]);
+/**end repeat2**/
+ case 0:
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
+ accum_sse = _mm_add_ps(accum_sse, a);
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ }
+
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
+ accum_sse = _mm_add_pd(accum_sse, a);
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ }
+
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
+ accum_sse = _mm_add_ps(accum_sse, a);
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
+ accum_sse = _mm_add_pd(accum_sse, a);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ accum += @from@(data0[@i@]) * @from@(data1[@i@]);
+/**end repeat2**/
+#endif
+ data0 += 8;
+ data1 += 8;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
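
The shuffle/add pairs that close every SSE block above are a horizontal reduction: the first shuffle swaps adjacent lanes so the add forms pairwise sums, the second swaps the two 64-bit halves so the final add leaves the total in every lane, and _mm_store_ss extracts it. Because the four lanes accumulate independently and are only combined here, the summation order differs from the scalar loop, which is what the repeated NOTE comments warn about. A standalone example (compile with SSE enabled):

    #include <stdio.h>
    #include <xmmintrin.h>

    int main(void)
    {
        __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
        __m128 a;
        float total;

        a = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));  /* 2 1 4 3 */
        v = _mm_add_ps(a, v);                               /* 3 3 7 7 */
        a = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 3, 2));  /* 7 7 3 3 */
        v = _mm_add_ps(a, v);                               /* 10 10 10 10 */
        _mm_store_ss(&total, v);

        printf("%g\n", total);   /* 10 */
        return 0;
    }
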
+static void
+@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @temptype@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ accum += @from@(data1[@i@]);
+/**end repeat2**/
+ case 0:
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@));
+/**end repeat2**/
+ data1 += 8;
+ }
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@));
+/**end repeat2**/
+ data1 += 8;
+ }
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@));
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ accum += @from@(data1[@i@]);
+/**end repeat2**/
+#endif
+ data1 += 8;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+static void
+@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
+ @temptype@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ accum += @from@(data0[@i@]);
+/**end repeat2**/
+ case 0:
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1);
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ accum += @from@(data0[@i@]);
+/**end repeat2**/
+#endif
+ data0 += 8;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+#elif @nop@ == 3 && !@complex@
+
+static void
+@name@_sum_of_products_contig_three(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @type@ *data2 = (@type@ *)dataptr[2];
+ @type@ *data_out = (@type@ *)dataptr[3];
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ data_out[@i@] = @to@(@from@(data0[@i@]) *
+ @from@(data1[@i@]) *
+ @from@(data2[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ data2 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ if (count-- == 0) {
+ return;
+ }
+ data_out[@i@] = @to@(@from@(data0[@i@]) *
+ @from@(data1[@i@]) *
+ @from@(data2[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+}
+
+#else /* @nop@ > 3 || @complex@ */
+
+static void
+@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n",
+ (int)count);
+
+ while (count--) {
+#if !@complex@
+ @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp *= @from@(*(@type@ *)dataptr[i]);
+ }
+ *(@type@ *)dataptr[nop] = @to@(temp +
+ @from@(*(@type@ *)dataptr[i]));
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += sizeof(@type@);
+ }
+#else /* complex */
+# if @nop@ <= 3
+# define _SUMPROD_NOP @nop@
+# else
+# define _SUMPROD_NOP nop
+# endif
+ @temptype@ re, im, tmp;
+ int i;
+ re = ((@temptype@ *)dataptr[0])[0];
+ im = ((@temptype@ *)dataptr[0])[1];
+ for (i = 1; i < _SUMPROD_NOP; ++i) {
+ tmp = re * ((@temptype@ *)dataptr[i])[0] -
+ im * ((@temptype@ *)dataptr[i])[1];
+ im = re * ((@temptype@ *)dataptr[i])[1] +
+ im * ((@temptype@ *)dataptr[i])[0];
+ re = tmp;
+ }
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
+
+ for (i = 0; i <= _SUMPROD_NOP; ++i) {
+ dataptr[i] += sizeof(@type@);
+ }
+# undef _SUMPROD_NOP
+#endif
+ }
+}
+
+#endif /* functions for various @nop@ */
+
+#if @nop@ == 1
+
+static void
+@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if @complex@
+ @temptype@ accum_re = 0, accum_im = 0;
+ @temptype@ *data0 = (@temptype@ *)dataptr[0];
+#else
+ @temptype@ accum = 0;
+ @type@ *data0 = (@type@ *)dataptr[0];
+#endif
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
+#endif
+
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+#if !@complex@
+ accum += @from@(data0[@i@]);
+#else /* complex */
+ accum_re += data0[2*@i@+0];
+ accum_im += data0[2*@i@+1];
+#endif
+/**end repeat2**/
+ case 0:
+#if @complex@
+ ((@temptype@ *)dataptr[1])[0] += accum_re;
+ ((@temptype@ *)dataptr[1])[1] += accum_im;
+#else
+ *((@type@ *)dataptr[1]) = @to@(accum +
+ @from@(*((@type@ *)dataptr[1])));
+#endif
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+# if !@complex@
+ accum += @from@(data0[@i@]);
+# else /* complex */
+ accum_re += data0[2*@i@+0];
+ accum_im += data0[2*@i@+1];
+# endif
+/**end repeat2**/
+#endif
+
+#if !@complex@
+ data0 += 8;
+#else
+ data0 += 8*2;
+#endif
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+#endif /* @nop@ == 1 */
+
+static void
+@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if @complex@
+ @temptype@ accum_re = 0, accum_im = 0;
+#else
+ @temptype@ accum = 0;
+#endif
+
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3) && !@complex@
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3) && !@complex@
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n",
+ (int)count);
+
+ while (count--) {
+#if !@complex@
+# if @nop@ == 1
+ accum += @from@(*(@type@ *)data0);
+ data0 += stride0;
+# elif @nop@ == 2
+ accum += @from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1);
+ data0 += stride0;
+ data1 += stride1;
+# elif @nop@ == 3
+ accum += @from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1) *
+ @from@(*(@type@ *)data2);
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+# else
+ @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp *= @from@(*(@type@ *)dataptr[i]);
+ }
+ accum += temp;
+ for (i = 0; i < nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+# endif
+#else /* complex */
+# if @nop@ == 1
+ accum_re += ((@temptype@ *)data0)[0];
+ accum_im += ((@temptype@ *)data0)[1];
+ data0 += stride0;
+# else
+# if @nop@ <= 3
+#define _SUMPROD_NOP @nop@
+# else
+#define _SUMPROD_NOP nop
+# endif
+ @temptype@ re, im, tmp;
+ int i;
+ re = ((@temptype@ *)dataptr[0])[0];
+ im = ((@temptype@ *)dataptr[0])[1];
+ for (i = 1; i < _SUMPROD_NOP; ++i) {
+ tmp = re * ((@temptype@ *)dataptr[i])[0] -
+ im * ((@temptype@ *)dataptr[i])[1];
+ im = re * ((@temptype@ *)dataptr[i])[1] +
+ im * ((@temptype@ *)dataptr[i])[0];
+ re = tmp;
+ }
+ accum_re += re;
+ accum_im += im;
+ for (i = 0; i < _SUMPROD_NOP; ++i) {
+ dataptr[i] += strides[i];
+ }
+#undef _SUMPROD_NOP
+# endif
+#endif
+ }
+
+#if @complex@
+# if @nop@ <= 3
+ ((@temptype@ *)dataptr[@nop@])[0] += accum_re;
+ ((@temptype@ *)dataptr[@nop@])[1] += accum_im;
+# else
+ ((@temptype@ *)dataptr[nop])[0] += accum_re;
+ ((@temptype@ *)dataptr[nop])[1] += accum_im;
+# endif
+#else
+# if @nop@ <= 3
+ *((@type@ *)dataptr[@nop@]) = @to@(accum +
+ @from@(*((@type@ *)dataptr[@nop@])));
+# else
+ *((@type@ *)dataptr[nop]) = @to@(accum +
+ @from@(*((@type@ *)dataptr[nop])));
+# endif
+#endif
+
+}
+
+/**end repeat1**/
+
+/**end repeat**/
+
+
+/* Do OR of ANDs for the boolean type */
+
+/**begin repeat
+ * #nop = 1, 2, 3, 1000#
+ * #noplabel = one, two, three, any#
+ */
+
+static void
+bool_sum_of_products_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if (@nop@ <= 3)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3)
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+#if (@nop@ <= 3)
+ char *data_out = dataptr[@nop@];
+ npy_intp stride_out = strides[@nop@];
+#endif
+
+ while (count--) {
+#if @nop@ == 1
+ *(npy_bool *)data_out = *(npy_bool *)data0 ||
+ *(npy_bool *)data_out;
+ data0 += stride0;
+ data_out += stride_out;
+#elif @nop@ == 2
+ *(npy_bool *)data_out = (*(npy_bool *)data0 &&
+ *(npy_bool *)data1) ||
+ *(npy_bool *)data_out;
+ data0 += stride0;
+ data1 += stride1;
+ data_out += stride_out;
+#elif @nop@ == 3
+ *(npy_bool *)data_out = (*(npy_bool *)data0 &&
+ *(npy_bool *)data1 &&
+ *(npy_bool *)data2) ||
+ *(npy_bool *)data_out;
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+ data_out += stride_out;
+#else
+ npy_bool temp = *(npy_bool *)dataptr[0];
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp = temp && *(npy_bool *)dataptr[i];
+ }
+ *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+#endif
+ }
+}
+
+static void
+bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if (@nop@ <= 3)
+ char *data0 = dataptr[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+ char *data1 = dataptr[1];
+#endif
+#if (@nop@ == 3)
+ char *data2 = dataptr[2];
+#endif
+#if (@nop@ <= 3)
+ char *data_out = dataptr[@nop@];
+#endif
+
+#if (@nop@ <= 3)
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat1
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+# if @nop@ == 1
+ ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] ||
+ ((npy_bool *)data_out)[@i@];
+# elif @nop@ == 2
+ ((npy_bool *)data_out)[@i@] =
+ (((npy_bool *)data0)[@i@] &&
+ ((npy_bool *)data1)[@i@]) ||
+ ((npy_bool *)data_out)[@i@];
+# elif @nop@ == 3
+ ((npy_bool *)data_out)[@i@] =
+ (((npy_bool *)data0)[@i@] &&
+ ((npy_bool *)data1)[@i@] &&
+ ((npy_bool *)data2)[@i@]) ||
+ ((npy_bool *)data_out)[@i@];
+# endif
+/**end repeat1**/
+ case 0:
+ return;
+ }
+#endif
+
+/* Unroll the loop by 8 for fixed-size nop */
+#if (@nop@ <= 3)
+ while (count >= 8) {
+ count -= 8;
+#else
+ while (count--) {
+#endif
+
+# if @nop@ == 1
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) ||
+ (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+ data0 += 8*sizeof(npy_bool);
+ data_out += 8*sizeof(npy_bool);
+# elif @nop@ == 2
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ *((npy_bool *)data_out + @i@) =
+ ((*((npy_bool *)data0 + @i@)) &&
+ (*((npy_bool *)data1 + @i@))) ||
+ (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+ data0 += 8*sizeof(npy_bool);
+ data1 += 8*sizeof(npy_bool);
+ data_out += 8*sizeof(npy_bool);
+# elif @nop@ == 3
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ *((npy_bool *)data_out + @i@) =
+ ((*((npy_bool *)data0 + @i@)) &&
+ (*((npy_bool *)data1 + @i@)) &&
+ (*((npy_bool *)data2 + @i@))) ||
+ (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+ data0 += 8*sizeof(npy_bool);
+ data1 += 8*sizeof(npy_bool);
+ data2 += 8*sizeof(npy_bool);
+ data_out += 8*sizeof(npy_bool);
+# else
+ npy_bool temp = *(npy_bool *)dataptr[0];
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp = temp && *(npy_bool *)dataptr[i];
+ }
+ *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += sizeof(npy_bool);
+ }
+# endif
+ }
+
+ /* If the loop was unrolled, we need to finish it off */
+#if (@nop@ <= 3)
+ goto finish_after_unrolled_loop;
+#endif
+}
+
+static void
+bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+ npy_bool accum = 0;
+
+#if (@nop@ <= 3)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3)
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+
+ while (count--) {
+#if @nop@ == 1
+ accum = *(npy_bool *)data0 || accum;
+ data0 += stride0;
+#elif @nop@ == 2
+ accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum;
+ data0 += stride0;
+ data1 += stride1;
+#elif @nop@ == 3
+ accum = (*(npy_bool *)data0 &&
+ *(npy_bool *)data1 &&
+ *(npy_bool *)data2) || accum;
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+#else
+ npy_bool temp = *(npy_bool *)dataptr[0];
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp = temp && *(npy_bool *)dataptr[i];
+ }
+ accum = temp || accum;
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+#endif
+ }
+
+# if @nop@ <= 3
+ *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]);
+# else
+ *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]);
+# endif
+}
+
+/**end repeat**/
+
+/* These tables need to match up with the type enum */
+static sum_of_products_fn
+_contig_outstride0_unary_specialization_table[NPY_NTYPES] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 0,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+ &@name@_sum_of_products_contig_outstride0_one,
+#else
+ NULL,
+#endif
+/**end repeat**/
+}; /* End of _contig_outstride0_unary_specialization_table */
+
+static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 0,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_stride0_contig_outstride0_two,
+ &@name@_sum_of_products_stride0_contig_outcontig_two,
+ &@name@_sum_of_products_contig_stride0_outstride0_two,
+ &@name@_sum_of_products_contig_stride0_outcontig_two,
+ &@name@_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+ {NULL, NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _binary_specialization_table */
+
+static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_outstride0_any,
+ &@name@_sum_of_products_outstride0_one,
+ &@name@_sum_of_products_outstride0_two,
+ &@name@_sum_of_products_outstride0_three
+},
+#else
+ {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _outstride0_specialized_table */
+
+static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_contig_any,
+ &@name@_sum_of_products_contig_one,
+ &@name@_sum_of_products_contig_two,
+ &@name@_sum_of_products_contig_three
+},
+#else
+ {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _allcontig_specialized_table */
+
+static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_any,
+ &@name@_sum_of_products_one,
+ &@name@_sum_of_products_two,
+ &@name@_sum_of_products_three
+},
+#else
+ {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _unspecialized_table */
+
+NPY_VISIBILITY_HIDDEN sum_of_products_fn
+get_sum_of_products_function(int nop, int type_num,
+ npy_intp itemsize, npy_intp const *fixed_strides)
+{
+ int iop;
+
+ if (type_num >= NPY_NTYPES) {
+ return NULL;
+ }
+
+ /* contiguous reduction */
+ if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) {
+ sum_of_products_fn ret =
+ _contig_outstride0_unary_specialization_table[type_num];
+ if (ret != NULL) {
+ return ret;
+ }
+ }
+
+ /* nop of 2 has more specializations */
+ if (nop == 2) {
+ /* Encode the zero/contiguous strides */
+ int code;
+ code = (fixed_strides[0] == 0) ? 0 :
+ (fixed_strides[0] == itemsize) ? 2*2*1 : 8;
+ code += (fixed_strides[1] == 0) ? 0 :
+ (fixed_strides[1] == itemsize) ? 2*1 : 8;
+ code += (fixed_strides[2] == 0) ? 0 :
+ (fixed_strides[2] == itemsize) ? 1 : 8;
+ if (code >= 2 && code < 7) {
+ sum_of_products_fn ret =
+ _binary_specialization_table[type_num][code-2];
+ if (ret != NULL) {
+ return ret;
+ }
+ }
+ }
+
+ /* Inner loop with an output stride of 0 */
+ if (fixed_strides[nop] == 0) {
+ return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0];
+ }
+
+ /* Check for all contiguous */
+ for (iop = 0; iop < nop + 1; ++iop) {
+ if (fixed_strides[iop] != itemsize) {
+ break;
+ }
+ }
+
+ /* Contiguous loop */
+ if (iop == nop + 1) {
+ return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0];
+ }
+
+ /* None of the above specializations caught it, general loops */
+ return _unspecialized_table[type_num][nop <= 3 ? nop : 0];
+}
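For reference, the nop == 2 branch above packs each of the three fixed strides (operand 0, operand 1, output) into a small code: 0 for a zero stride, a power of two (4/2/1, written 2*2*1, 2*1, 1 in the source) for a contiguous stride, 8 otherwise. Only codes 2 through 6 map onto the five entries of _binary_specialization_table. The following standalone C sketch (not part of the patch) mirrors that encoding:

/* Mirror of the stride encoding in get_sum_of_products_function, nop == 2. */
#include <stdio.h>

static int
encode_strides(long s0, long s1, long s2, long itemsize)
{
    int code;
    code  = (s0 == 0) ? 0 : (s0 == itemsize) ? 4 : 8;
    code += (s1 == 0) ? 0 : (s1 == itemsize) ? 2 : 8;
    code += (s2 == 0) ? 0 : (s2 == itemsize) ? 1 : 8;
    return code;
}

int main(void)
{
    long itemsize = 8;   /* e.g. double */
    /* operand 0 broadcast, operand 1 contiguous, output stride 0 -> code 2 */
    printf("%d\n", encode_strides(0, itemsize, 0, itemsize));
    /* both operands contiguous, output stride 0                  -> code 6 */
    printf("%d\n", encode_strides(itemsize, itemsize, 0, itemsize));
    /* a non-contiguous operand falls outside 2..6: no specialization */
    printf("%d\n", encode_strides(3 * itemsize, itemsize, 0, itemsize));
    return 0;
}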
diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h
new file mode 100644
index 000000000..c6cf18ec6
--- /dev/null
+++ b/numpy/core/src/multiarray/einsum_sumprod.h
@@ -0,0 +1,12 @@
+#ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H
+#define _NPY_MULTIARRAY_EINSUM_SUMPROD_H
+
+#include <numpy/npy_common.h>
+
+typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp);
+
+NPY_VISIBILITY_HIDDEN sum_of_products_fn
+get_sum_of_products_function(int nop, int type_num,
+ npy_intp itemsize, npy_intp const *fixed_strides);
+
+#endif
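The header exposes all of the kernels through a single function-pointer type, and get_sum_of_products_function returns the best-matching entry from the specialization tables. The pattern itself, a table of loop kernels selected once up front and then called per inner loop, can be illustrated with a tiny standalone C program; this is only loosely analogous to the tables above and uses hypothetical names, not NumPy API:

#include <stdio.h>

typedef void (*sum_fn)(int nop, double **data, long count);

static void sum_one(int nop, double **data, long count)
{
    /* out += sum(a) */
    for (long i = 0; i < count; i++) data[1][0] += data[0][i];
}

static void sum_two(int nop, double **data, long count)
{
    /* out += sum(a * b) */
    for (long i = 0; i < count; i++) data[2][0] += data[0][i] * data[1][i];
}

/* Indexed by (nop - 1), loosely analogous to the nop index of the tables. */
static sum_fn table[2] = { sum_one, sum_two };

int main(void)
{
    double a[3] = {1, 2, 3}, b[3] = {4, 5, 6}, out = 0;
    double *ops[3] = {a, b, &out};
    table[1](2, ops, 3);            /* 1*4 + 2*5 + 3*6 = 32 */
    printf("%g\n", out);
    return 0;
}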
diff --git a/numpy/core/src/multiarray/flagsobject.c b/numpy/core/src/multiarray/flagsobject.c
index d5f24e75a..9b7d8deae 100644
--- a/numpy/core/src/multiarray/flagsobject.c
+++ b/numpy/core/src/multiarray/flagsobject.c
@@ -307,7 +307,7 @@ arrayflags_farray_get(PyArrayFlagsObject *self)
static PyObject *
arrayflags_num_get(PyArrayFlagsObject *self)
{
- return PyInt_FromLong(self->flags);
+ return PyLong_FromLong(self->flags);
}
/* relies on setflags order being write, align, uic */
@@ -711,7 +711,7 @@ arrayflags_print(PyArrayFlagsObject *self)
if (fl & NPY_ARRAY_WARN_ON_WRITE) {
_warn_on_write = " (with WARN_ON_WRITE=True)";
}
- return PyUString_FromFormat(
+ return PyUnicode_FromFormat(
" %s : %s\n %s : %s\n"
" %s : %s\n %s : %s%s\n"
" %s : %s\n %s : %s\n"
diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c
index 5405a25db..3575d6fad 100644
--- a/numpy/core/src/multiarray/getset.c
+++ b/numpy/core/src/multiarray/getset.c
@@ -28,7 +28,7 @@
static PyObject *
array_ndim_get(PyArrayObject *self)
{
- return PyInt_FromLong(PyArray_NDIM(self));
+ return PyLong_FromLong(PyArray_NDIM(self));
}
static PyObject *
@@ -217,7 +217,7 @@ array_protocol_descr_get(PyArrayObject *self)
if (dobj == NULL) {
return NULL;
}
- PyTuple_SET_ITEM(dobj, 0, PyString_FromString(""));
+ PyTuple_SET_ITEM(dobj, 0, PyUnicode_FromString(""));
PyTuple_SET_ITEM(dobj, 1, array_typestr_get(self));
res = PyList_New(1);
if (res == NULL) {
@@ -244,8 +244,9 @@ array_dataptr_get(PyArrayObject *self)
{
return Py_BuildValue("NO",
PyLong_FromVoidPtr(PyArray_DATA(self)),
- (PyArray_FLAGS(self) & NPY_ARRAY_WRITEABLE ? Py_False :
- Py_True));
+ ((PyArray_FLAGS(self) & NPY_ARRAY_WRITEABLE) &&
+ !(PyArray_FLAGS(self) & NPY_ARRAY_WARN_ON_WRITE)) ?
+ Py_False : Py_True);
}
static PyObject *
@@ -274,10 +275,6 @@ array_interface_get(PyArrayObject *self)
return NULL;
}
- if (array_might_be_written(self) < 0) {
- Py_DECREF(dict);
- return NULL;
- }
int ret;
/* dataptr */
@@ -321,7 +318,7 @@ array_interface_get(PyArrayObject *self)
return NULL;
}
- obj = PyInt_FromLong(3);
+ obj = PyLong_FromLong(3);
ret = PyDict_SetItemString(dict, "version", obj);
Py_DECREF(obj);
if (ret < 0) {
@@ -416,7 +413,7 @@ array_data_set(PyArrayObject *self, PyObject *op)
static PyObject *
array_itemsize_get(PyArrayObject *self)
{
- return PyInt_FromLong((long) PyArray_DESCR(self)->elsize);
+ return PyLong_FromLong((long) PyArray_DESCR(self)->elsize);
}
static PyObject *
@@ -424,13 +421,13 @@ array_size_get(PyArrayObject *self)
{
npy_intp size=PyArray_SIZE(self);
#if NPY_SIZEOF_INTP <= NPY_SIZEOF_LONG
- return PyInt_FromLong((long) size);
+ return PyLong_FromLong((long) size);
#else
if (size > NPY_MAX_LONG || size < NPY_MIN_LONG) {
return PyLong_FromLongLong(size);
}
else {
- return PyInt_FromLong((long) size);
+ return PyLong_FromLong((long) size);
}
#endif
}
@@ -440,13 +437,13 @@ array_nbytes_get(PyArrayObject *self)
{
npy_intp nbytes = PyArray_NBYTES(self);
#if NPY_SIZEOF_INTP <= NPY_SIZEOF_LONG
- return PyInt_FromLong((long) nbytes);
+ return PyLong_FromLong((long) nbytes);
#else
if (nbytes > NPY_MAX_LONG || nbytes < NPY_MIN_LONG) {
return PyLong_FromLongLong(nbytes);
}
else {
- return PyInt_FromLong((long) nbytes);
+ return PyLong_FromLong((long) nbytes);
}
#endif
}
@@ -624,13 +621,7 @@ static PyObject *
array_struct_get(PyArrayObject *self)
{
PyArrayInterface *inter;
- PyObject *ret;
- if (PyArray_ISWRITEABLE(self)) {
- if (array_might_be_written(self) < 0) {
- return NULL;
- }
- }
inter = (PyArrayInterface *)PyArray_malloc(sizeof(PyArrayInterface));
if (inter==NULL) {
return PyErr_NoMemory();
@@ -640,6 +631,11 @@ array_struct_get(PyArrayObject *self)
inter->typekind = PyArray_DESCR(self)->kind;
inter->itemsize = PyArray_DESCR(self)->elsize;
inter->flags = PyArray_FLAGS(self);
+ if (inter->flags & NPY_ARRAY_WARN_ON_WRITE) {
+ /* Export a warn-on-write array as read-only */
+ inter->flags = inter->flags & ~NPY_ARRAY_WARN_ON_WRITE;
+ inter->flags = inter->flags & ~NPY_ARRAY_WRITEABLE;
+ }
/* reset unused flags */
inter->flags &= ~(NPY_ARRAY_WRITEBACKIFCOPY | NPY_ARRAY_UPDATEIFCOPY |NPY_ARRAY_OWNDATA);
if (PyArray_ISNOTSWAPPED(self)) inter->flags |= NPY_ARRAY_NOTSWAPPED;
@@ -676,8 +672,14 @@ array_struct_get(PyArrayObject *self)
else {
inter->descr = NULL;
}
+ PyObject *ret = PyCapsule_New(inter, NULL, gentype_struct_free);
+ if (ret == NULL) {
+ return NULL;
+ }
Py_INCREF(self);
- ret = NpyCapsule_FromVoidPtrAndDesc(inter, self, gentype_struct_free);
+ if (PyCapsule_SetContext(ret, self) < 0) {
+ return NULL;
+ }
return ret;
}
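The getset.c hunk above replaces the NpyCapsule_FromVoidPtrAndDesc compatibility wrapper with direct PyCapsule calls, storing the owning array as the capsule context. A minimal sketch of that pattern follows; the payload and the function names here are hypothetical, while PyCapsule_New, PyCapsule_SetContext, PyCapsule_GetContext and PyCapsule_GetPointer are the real CPython API. It is an extension-module fragment, not the NumPy code itself:

#include <Python.h>
#include <stdlib.h>

static void
payload_free(PyObject *capsule)
{
    /* The context holds the owner that was Py_INCREF'd when exporting. */
    PyObject *owner = (PyObject *)PyCapsule_GetContext(capsule);
    free(PyCapsule_GetPointer(capsule, NULL));
    Py_XDECREF(owner);
}

static PyObject *
export_payload(PyObject *owner)
{
    void *payload = malloc(16);       /* stands in for PyArrayInterface */
    if (payload == NULL) {
        return PyErr_NoMemory();
    }
    PyObject *ret = PyCapsule_New(payload, NULL, payload_free);
    if (ret == NULL) {
        free(payload);
        return NULL;
    }
    Py_INCREF(owner);
    if (PyCapsule_SetContext(ret, owner) < 0) {
        Py_DECREF(owner);             /* context was never stored */
        Py_DECREF(ret);               /* destructor sees a NULL context */
        return NULL;
    }
    return ret;
}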
diff --git a/numpy/core/src/multiarray/hashdescr.c b/numpy/core/src/multiarray/hashdescr.c
index 0b23b6c21..e9a99cc8f 100644
--- a/numpy/core/src/multiarray/hashdescr.c
+++ b/numpy/core/src/multiarray/hashdescr.c
@@ -132,7 +132,7 @@ static int _array_descr_walk_fields(PyObject *names, PyObject* fields, PyObject*
"(Hash) names and fields inconsistent ???");
return -1;
}
- if (!PyUString_Check(key)) {
+ if (!PyUnicode_Check(key)) {
PyErr_SetString(PyExc_SystemError,
"(Hash) key of dtype dict not a string ???");
return -1;
@@ -165,7 +165,7 @@ static int _array_descr_walk_fields(PyObject *names, PyObject* fields, PyObject*
}
foffset = PyTuple_GET_ITEM(value, 1);
- if (!PyInt_Check(foffset)) {
+ if (!PyLong_Check(foffset)) {
PyErr_SetString(PyExc_SystemError,
"(Hash) Second item in compound dtype tuple not an int ???");
return -1;
@@ -208,7 +208,7 @@ static int _array_descr_walk_subarray(PyArray_ArrayDescr* adescr, PyObject *l)
PyList_Append(l, item);
}
}
- else if (PyInt_Check(adescr->shape)) {
+ else if (PyLong_Check(adescr->shape)) {
PyList_Append(l, adescr->shape);
}
else {
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 8052e24e4..b279ffc2f 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -576,6 +576,10 @@ PyArray_PutMask(PyArrayObject *self, PyObject* values0, PyObject* mask0)
return NULL;
}
+ if (PyArray_FailUnlessWriteable(self, "putmask: output array") < 0) {
+ return NULL;
+ }
+
mask = (PyArrayObject *)PyArray_FROM_OTF(mask0, NPY_BOOL,
NPY_ARRAY_CARRAY | NPY_ARRAY_FORCECAST);
if (mask == NULL) {
diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c
index ac5b90400..3ebd4c858 100644
--- a/numpy/core/src/multiarray/iterators.c
+++ b/numpy/core/src/multiarray/iterators.c
@@ -61,7 +61,7 @@ parse_index_entry(PyObject *op, npy_intp *step_size,
}
else if (PySlice_Check(op)) {
npy_intp stop;
- if (NpySlice_GetIndicesEx(op, max, &i, &stop, step_size, n_steps) < 0) {
+ if (PySlice_GetIndicesEx(op, max, &i, &stop, step_size, n_steps) < 0) {
goto fail;
}
if (*n_steps <= 0) {
@@ -597,7 +597,7 @@ iter_subscript(PyArrayIterObject *self, PyObject *ind)
}
/* Check for Integer or Slice */
- if (PyLong_Check(ind) || PyInt_Check(ind) || PySlice_Check(ind)) {
+ if (PyLong_Check(ind) || PySlice_Check(ind)) {
start = parse_index_entry(ind, &step_size, &n_steps,
self->size, 0, 1);
if (start == -1) {
@@ -1411,10 +1411,10 @@ static PyObject *
arraymultiter_size_get(PyArrayMultiIterObject *self)
{
#if NPY_SIZEOF_INTP <= NPY_SIZEOF_LONG
- return PyInt_FromLong((long) self->size);
+ return PyLong_FromLong((long) self->size);
#else
if (self->size < NPY_MAX_LONG) {
- return PyInt_FromLong((long) self->size);
+ return PyLong_FromLong((long) self->size);
}
else {
return PyLong_FromLongLong((npy_longlong) self->size);
@@ -1426,10 +1426,10 @@ static PyObject *
arraymultiter_index_get(PyArrayMultiIterObject *self)
{
#if NPY_SIZEOF_INTP <= NPY_SIZEOF_LONG
- return PyInt_FromLong((long) self->index);
+ return PyLong_FromLong((long) self->index);
#else
if (self->size < NPY_MAX_LONG) {
- return PyInt_FromLong((long) self->index);
+ return PyLong_FromLong((long) self->index);
}
else {
return PyLong_FromLongLong((npy_longlong) self->index);
diff --git a/numpy/core/src/multiarray/legacy_dtype_implementation.c b/numpy/core/src/multiarray/legacy_dtype_implementation.c
new file mode 100644
index 000000000..3ce4710fd
--- /dev/null
+++ b/numpy/core/src/multiarray/legacy_dtype_implementation.c
@@ -0,0 +1,716 @@
+/*
+ * This file hosts legacy implementations of certain functions for
+ * which alternatives exist, but the old functions are still required
+ * in certain code paths, or until the code transition is finalized.
+ *
+ * This code should typically not require modification; if it is modified,
+ * similar changes may be necessary in the new version.
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"
+#include "scalartypes.h"
+#include "_datetime.h"
+#include "datetime_strings.h"
+#include "convert_datatype.h"
+
+#include "legacy_dtype_implementation.h"
+
+
+/*
+ * Compare the field dictionaries for two types.
+ *
+ * Return 1 if the field types and field names of the two descrs are equal and
+ * in the same order, 0 if not.
+ */
+static int
+_equivalent_fields(PyArray_Descr *type1, PyArray_Descr *type2) {
+
+ int val;
+
+ if (type1->fields == type2->fields && type1->names == type2->names) {
+ return 1;
+ }
+ if (type1->fields == NULL || type2->fields == NULL) {
+ return 0;
+ }
+
+ val = PyObject_RichCompareBool(type1->fields, type2->fields, Py_EQ);
+ if (val != 1 || PyErr_Occurred()) {
+ PyErr_Clear();
+ return 0;
+ }
+
+ val = PyObject_RichCompareBool(type1->names, type2->names, Py_EQ);
+ if (val != 1 || PyErr_Occurred()) {
+ PyErr_Clear();
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Compare the subarray data for two types.
+ * Return 1 if they are the same, 0 if not.
+ */
+static int
+_equivalent_subarrays(PyArray_ArrayDescr *sub1, PyArray_ArrayDescr *sub2)
+{
+ int val;
+
+ if (sub1 == sub2) {
+ return 1;
+
+ }
+ if (sub1 == NULL || sub2 == NULL) {
+ return 0;
+ }
+
+ val = PyObject_RichCompareBool(sub1->shape, sub2->shape, Py_EQ);
+ if (val != 1 || PyErr_Occurred()) {
+ PyErr_Clear();
+ return 0;
+ }
+
+ return PyArray_EquivTypes(sub1->base, sub2->base);
+}
+
+
+NPY_NO_EXPORT unsigned char
+PyArray_LegacyEquivTypes(PyArray_Descr *type1, PyArray_Descr *type2)
+{
+ int type_num1, type_num2, size1, size2;
+
+ if (type1 == type2) {
+ return NPY_TRUE;
+ }
+
+ type_num1 = type1->type_num;
+ type_num2 = type2->type_num;
+ size1 = type1->elsize;
+ size2 = type2->elsize;
+
+ if (size1 != size2) {
+ return NPY_FALSE;
+ }
+ if (PyArray_ISNBO(type1->byteorder) != PyArray_ISNBO(type2->byteorder)) {
+ return NPY_FALSE;
+ }
+ if (type1->subarray || type2->subarray) {
+ return ((type_num1 == type_num2)
+ && _equivalent_subarrays(type1->subarray, type2->subarray));
+ }
+ if (type_num1 == NPY_VOID || type_num2 == NPY_VOID) {
+ return ((type_num1 == type_num2) && _equivalent_fields(type1, type2));
+ }
+ if (type_num1 == NPY_DATETIME
+ || type_num1 == NPY_TIMEDELTA
+ || type_num2 == NPY_DATETIME
+ || type_num2 == NPY_TIMEDELTA) {
+ return ((type_num1 == type_num2)
+ && has_equivalent_datetime_metadata(type1, type2));
+ }
+ return type1->kind == type2->kind;
+}
+
+
+NPY_NO_EXPORT unsigned char
+PyArray_LegacyEquivTypenums(int typenum1, int typenum2)
+{
+ PyArray_Descr *d1, *d2;
+ npy_bool ret;
+
+ if (typenum1 == typenum2) {
+ return NPY_SUCCEED;
+ }
+
+ d1 = PyArray_DescrFromType(typenum1);
+ d2 = PyArray_DescrFromType(typenum2);
+ ret = PyArray_LegacyEquivTypes(d1, d2);
+ Py_DECREF(d1);
+ Py_DECREF(d2);
+ return ret;
+}
+
+
+NPY_NO_EXPORT int
+PyArray_LegacyCanCastSafely(int fromtype, int totype)
+{
+ PyArray_Descr *from;
+
+ /* Fast table lookup for small type numbers */
+ if ((unsigned int)fromtype < NPY_NTYPES &&
+ (unsigned int)totype < NPY_NTYPES) {
+ return _npy_can_cast_safely_table[fromtype][totype];
+ }
+
+ /* Identity */
+ if (fromtype == totype) {
+ return 1;
+ }
+
+ from = PyArray_DescrFromType(fromtype);
+ /*
+ * cancastto is a NPY_NOTYPE terminated C-int-array of types that
+ * the data-type can be cast to safely.
+ */
+ if (from->f->cancastto) {
+ int *curtype = from->f->cancastto;
+
+ while (*curtype != NPY_NOTYPE) {
+ if (*curtype++ == totype) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+
+NPY_NO_EXPORT npy_bool
+PyArray_LegacyCanCastTo(PyArray_Descr *from, PyArray_Descr *to)
+{
+ int from_type_num = from->type_num;
+ int to_type_num = to->type_num;
+ npy_bool ret;
+
+ ret = (npy_bool) PyArray_LegacyCanCastSafely(from_type_num, to_type_num);
+ if (ret) {
+ /* Check String and Unicode more closely */
+ if (from_type_num == NPY_STRING) {
+ if (to_type_num == NPY_STRING) {
+ ret = (from->elsize <= to->elsize);
+ }
+ else if (to_type_num == NPY_UNICODE) {
+ ret = (from->elsize << 2 <= to->elsize);
+ }
+ }
+ else if (from_type_num == NPY_UNICODE) {
+ if (to_type_num == NPY_UNICODE) {
+ ret = (from->elsize <= to->elsize);
+ }
+ }
+ /*
+ * For datetime/timedelta, only treat casts moving towards
+ * more precision as safe.
+ */
+ else if (from_type_num == NPY_DATETIME && to_type_num == NPY_DATETIME) {
+ PyArray_DatetimeMetaData *meta1, *meta2;
+ meta1 = get_datetime_metadata_from_dtype(from);
+ if (meta1 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ meta2 = get_datetime_metadata_from_dtype(to);
+ if (meta2 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+
+ return can_cast_datetime64_metadata(meta1, meta2,
+ NPY_SAFE_CASTING);
+ }
+ else if (from_type_num == NPY_TIMEDELTA &&
+ to_type_num == NPY_TIMEDELTA) {
+ PyArray_DatetimeMetaData *meta1, *meta2;
+ meta1 = get_datetime_metadata_from_dtype(from);
+ if (meta1 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ meta2 = get_datetime_metadata_from_dtype(to);
+ if (meta2 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+
+ return can_cast_timedelta64_metadata(meta1, meta2,
+ NPY_SAFE_CASTING);
+ }
+ /*
+ * If to_type_num is STRING or unicode
+ * see if the length is long enough to hold the
+ * stringified value of the object.
+ */
+ else if (to_type_num == NPY_STRING || to_type_num == NPY_UNICODE) {
+ /*
+ * Boolean value cast to string type is 5 characters max
+ * for string 'False'.
+ */
+ int char_size = 1;
+ if (to_type_num == NPY_UNICODE) {
+ char_size = 4;
+ }
+
+ ret = 0;
+ if (PyDataType_ISUNSIZED(to)) {
+ ret = 1;
+ }
+ /*
+ * Need at least 5 characters to convert from boolean
+ * to 'True' or 'False'.
+ */
+ else if (from->kind == 'b' && to->elsize >= 5 * char_size) {
+ ret = 1;
+ }
+ else if (from->kind == 'u') {
+ /* Guard against unexpected integer size */
+ if (from->elsize > 8 || from->elsize < 0) {
+ ret = 0;
+ }
+ else if (to->elsize >=
+ REQUIRED_STR_LEN[from->elsize] * char_size) {
+ ret = 1;
+ }
+ }
+ else if (from->kind == 'i') {
+ /* Guard against unexpected integer size */
+ if (from->elsize > 8 || from->elsize < 0) {
+ ret = 0;
+ }
+ /* Extra character needed for sign */
+ else if (to->elsize >=
+ (REQUIRED_STR_LEN[from->elsize] + 1) * char_size) {
+ ret = 1;
+ }
+ }
+ }
+ }
+ return ret;
+}
+
+
+/*
+ * Compare two field dictionaries for castability.
+ *
+ * Return 1 if 'field1' can be cast to 'field2' according to the rule
+ * 'casting', 0 if not.
+ *
+ * Castability of field dictionaries is defined recursively: 'field1' and
+ * 'field2' must have the same field names (possibly in different
+ * orders), and the corresponding field types must be castable according
+ * to the given casting rule.
+ */
+static int
+can_cast_fields(PyObject *field1, PyObject *field2, NPY_CASTING casting)
+{
+ Py_ssize_t ppos;
+ PyObject *key;
+ PyObject *tuple1, *tuple2;
+
+ if (field1 == field2) {
+ return 1;
+ }
+ if (field1 == NULL || field2 == NULL) {
+ return 0;
+ }
+ if (PyDict_Size(field1) != PyDict_Size(field2)) {
+ return 0;
+ }
+
+ /* Iterate over all the fields and compare for castability */
+ ppos = 0;
+ while (PyDict_Next(field1, &ppos, &key, &tuple1)) {
+ if ((tuple2 = PyDict_GetItem(field2, key)) == NULL) {
+ return 0;
+ }
+ /* Compare the dtype of the field for castability */
+ if (!PyArray_CanCastTypeTo(
+ (PyArray_Descr *)PyTuple_GET_ITEM(tuple1, 0),
+ (PyArray_Descr *)PyTuple_GET_ITEM(tuple2, 0),
+ casting)) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+
+NPY_NO_EXPORT npy_bool
+PyArray_LegacyCanCastTypeTo(PyArray_Descr *from, PyArray_Descr *to,
+ NPY_CASTING casting)
+{
+ /*
+ * Fast paths for equality and for basic types.
+ */
+ if (from == to ||
+ ((NPY_LIKELY(PyDataType_ISNUMBER(from)) ||
+ PyDataType_ISOBJECT(from)) &&
+ NPY_LIKELY(from->type_num == to->type_num) &&
+ NPY_LIKELY(from->byteorder == to->byteorder))) {
+ return 1;
+ }
+ /*
+ * Cases with subarrays and fields need special treatment.
+ */
+ if (PyDataType_HASFIELDS(from)) {
+ /*
+ * If from is a structured data type, then it can be cast to a simple
+ * non-object one only for unsafe casting *and* if it has a single
+ * field; recurse just in case the single field is itself structured.
+ */
+ if (!PyDataType_HASFIELDS(to) && !PyDataType_ISOBJECT(to)) {
+ if (casting == NPY_UNSAFE_CASTING &&
+ PyDict_Size(from->fields) == 1) {
+ Py_ssize_t ppos = 0;
+ PyObject *tuple;
+ PyArray_Descr *field;
+ PyDict_Next(from->fields, &ppos, NULL, &tuple);
+ field = (PyArray_Descr *)PyTuple_GET_ITEM(tuple, 0);
+ /*
+ * For a subarray, we need to get the underlying type;
+ * since we already are casting unsafely, we can ignore
+ * the shape.
+ */
+ if (PyDataType_HASSUBARRAY(field)) {
+ field = field->subarray->base;
+ }
+ return PyArray_LegacyCanCastTypeTo(field, to, casting);
+ }
+ else {
+ return 0;
+ }
+ }
+ /*
+ * Casting from one structured data type to another depends on the fields;
+ * we pass that case on to the EquivTypenums case below.
+ *
+ * TODO: move that part up here? Need to check whether requiring equivalent
+ * type numbers is an additional constraint that is needed.
+ *
+ * TODO/FIXME: For now, always allow structured to structured for unsafe
+ * casting; this is not correct, but needed since the treatment in can_cast
+ * below got out of sync with astype; see gh-13667.
+ */
+ if (casting == NPY_UNSAFE_CASTING) {
+ return 1;
+ }
+ }
+ else if (PyDataType_HASFIELDS(to)) {
+ /*
+ * If "from" is a simple data type and "to" has fields, then only
+ * unsafe casting works (and that always works, even to multiple fields).
+ */
+ return casting == NPY_UNSAFE_CASTING;
+ }
+ /*
+ * Everything else we consider castable for unsafe casting for now.
+ * FIXME: ensure what we do here is consistent with "astype",
+ * i.e., deal more correctly with subarrays and user-defined dtype.
+ */
+ else if (casting == NPY_UNSAFE_CASTING) {
+ return 1;
+ }
+ /*
+ * Equivalent simple types can be cast with any value of 'casting', but
+ * we need to be careful about structured to structured.
+ */
+ if (PyArray_LegacyEquivTypenums(from->type_num, to->type_num)) {
+ /* For complicated case, use EquivTypes (for now) */
+ if (PyTypeNum_ISUSERDEF(from->type_num) ||
+ from->subarray != NULL) {
+ int ret;
+
+ /* Only NPY_NO_CASTING prevents byte order conversion */
+ if ((casting != NPY_NO_CASTING) &&
+ (!PyArray_ISNBO(from->byteorder) ||
+ !PyArray_ISNBO(to->byteorder))) {
+ PyArray_Descr *nbo_from, *nbo_to;
+
+ nbo_from = PyArray_DescrNewByteorder(from, NPY_NATIVE);
+ nbo_to = PyArray_DescrNewByteorder(to, NPY_NATIVE);
+ if (nbo_from == NULL || nbo_to == NULL) {
+ Py_XDECREF(nbo_from);
+ Py_XDECREF(nbo_to);
+ PyErr_Clear();
+ return 0;
+ }
+ ret = PyArray_LegacyEquivTypes(nbo_from, nbo_to);
+ Py_DECREF(nbo_from);
+ Py_DECREF(nbo_to);
+ }
+ else {
+ ret = PyArray_LegacyEquivTypes(from, to);
+ }
+ return ret;
+ }
+
+ if (PyDataType_HASFIELDS(from)) {
+ switch (casting) {
+ case NPY_EQUIV_CASTING:
+ case NPY_SAFE_CASTING:
+ case NPY_SAME_KIND_CASTING:
+ /*
+ * `from' and `to' must have the same fields, and
+ * corresponding fields must be (recursively) castable.
+ */
+ return can_cast_fields(from->fields, to->fields, casting);
+
+ case NPY_NO_CASTING:
+ default:
+ return PyArray_LegacyEquivTypes(from, to);
+ }
+ }
+
+ switch (from->type_num) {
+ case NPY_DATETIME: {
+ PyArray_DatetimeMetaData *meta1, *meta2;
+ meta1 = get_datetime_metadata_from_dtype(from);
+ if (meta1 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ meta2 = get_datetime_metadata_from_dtype(to);
+ if (meta2 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+
+ if (casting == NPY_NO_CASTING) {
+ return PyArray_ISNBO(from->byteorder) ==
+ PyArray_ISNBO(to->byteorder) &&
+ can_cast_datetime64_metadata(meta1, meta2, casting);
+ }
+ else {
+ return can_cast_datetime64_metadata(meta1, meta2, casting);
+ }
+ }
+ case NPY_TIMEDELTA: {
+ PyArray_DatetimeMetaData *meta1, *meta2;
+ meta1 = get_datetime_metadata_from_dtype(from);
+ if (meta1 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+ meta2 = get_datetime_metadata_from_dtype(to);
+ if (meta2 == NULL) {
+ PyErr_Clear();
+ return 0;
+ }
+
+ if (casting == NPY_NO_CASTING) {
+ return PyArray_ISNBO(from->byteorder) ==
+ PyArray_ISNBO(to->byteorder) &&
+ can_cast_timedelta64_metadata(meta1, meta2, casting);
+ }
+ else {
+ return can_cast_timedelta64_metadata(meta1, meta2, casting);
+ }
+ }
+ default:
+ switch (casting) {
+ case NPY_NO_CASTING:
+ return PyArray_LegacyEquivTypes(from, to);
+ case NPY_EQUIV_CASTING:
+ return (from->elsize == to->elsize);
+ case NPY_SAFE_CASTING:
+ return (from->elsize <= to->elsize);
+ default:
+ return 1;
+ }
+ break;
+ }
+ }
+ /* If safe or same-kind casts are allowed */
+ else if (casting == NPY_SAFE_CASTING || casting == NPY_SAME_KIND_CASTING) {
+ if (PyArray_LegacyCanCastTo(from, to)) {
+ return 1;
+ }
+ else if(casting == NPY_SAME_KIND_CASTING) {
+ /*
+ * Also allow casting from lower to higher kinds, according
+ * to the ordering provided by dtype_kind_to_ordering.
+ * Some kinds, like datetime, don't fit in the hierarchy,
+ * and are special cased as -1.
+ */
+ int from_order, to_order;
+
+ from_order = dtype_kind_to_ordering(from->kind);
+ to_order = dtype_kind_to_ordering(to->kind);
+
+ if (to->kind == 'm') {
+ /* both types being timedelta is already handled before. */
+ int integer_order = dtype_kind_to_ordering('i');
+ return (from_order != -1) && (from_order <= integer_order);
+ }
+
+ return (from_order != -1) && (from_order <= to_order);
+ }
+ else {
+ return 0;
+ }
+ }
+ /* NPY_NO_CASTING or NPY_EQUIV_CASTING was specified */
+ else {
+ return 0;
+ }
+}
+
+
+/*
+ * Legacy function to find the correct dtype when casting from any built-in
+ * dtype to NPY_STRING, NPY_UNICODE, NPY_VOID, and NPY_DATETIME with generic
+ * units.
+ *
+ * This function returns a dtype based on flex_dtype and the values in
+ * data_dtype. It also calls Py_DECREF on the flex_dtype. If the
+ * flex_dtype is not flexible, it returns it as-is.
+ *
+ * Usually, if data_obj is not an array, dtype should be the result
+ * given by the PyArray_GetArrayParamsFromObject function.
+ *
+ * If flex_dtype is NULL, returns immediately, without setting an
+ * exception, leaving any previous error handling intact.
+ */
+NPY_NO_EXPORT PyArray_Descr *
+PyArray_AdaptFlexibleDType(PyArray_Descr *data_dtype, PyArray_Descr *flex_dtype)
+{
+ PyArray_DatetimeMetaData *meta;
+ PyArray_Descr *retval = NULL;
+ int flex_type_num;
+
+ if (flex_dtype == NULL) {
+ return retval;
+ }
+
+ flex_type_num = flex_dtype->type_num;
+
+ /* Flexible types with expandable size */
+ if (PyDataType_ISUNSIZED(flex_dtype)) {
+ /* First replace the flex_dtype */
+ retval = PyArray_DescrNew(flex_dtype);
+ Py_DECREF(flex_dtype);
+ if (retval == NULL) {
+ return retval;
+ }
+
+ if (data_dtype->type_num == flex_type_num ||
+ flex_type_num == NPY_VOID) {
+ (retval)->elsize = data_dtype->elsize;
+ }
+ else if (flex_type_num == NPY_STRING || flex_type_num == NPY_UNICODE) {
+ npy_intp size = 8;
+
+ /*
+ * Get a string-size estimate of the input. These
+ * are generally the size needed, rounded up to
+ * a multiple of eight.
+ */
+ switch (data_dtype->type_num) {
+ case NPY_BOOL:
+ case NPY_UBYTE:
+ case NPY_BYTE:
+ case NPY_USHORT:
+ case NPY_SHORT:
+ case NPY_UINT:
+ case NPY_INT:
+ case NPY_ULONG:
+ case NPY_LONG:
+ case NPY_ULONGLONG:
+ case NPY_LONGLONG:
+ if (data_dtype->kind == 'b') {
+ /* 5 chars needed for cast to 'True' or 'False' */
+ size = 5;
+ }
+ else if (data_dtype->elsize > 8 ||
+ data_dtype->elsize < 0) {
+ /*
+ * Element size should never be greater than 8 or
+ * less than 0 for integer type, but just in case...
+ */
+ break;
+ }
+ else if (data_dtype->kind == 'u') {
+ size = REQUIRED_STR_LEN[data_dtype->elsize];
+ }
+ else if (data_dtype->kind == 'i') {
+ /* Add character for sign symbol */
+ size = REQUIRED_STR_LEN[data_dtype->elsize] + 1;
+ }
+ break;
+ case NPY_HALF:
+ case NPY_FLOAT:
+ case NPY_DOUBLE:
+ size = 32;
+ break;
+ case NPY_LONGDOUBLE:
+ size = 48;
+ break;
+ case NPY_CFLOAT:
+ case NPY_CDOUBLE:
+ size = 2 * 32;
+ break;
+ case NPY_CLONGDOUBLE:
+ size = 2 * 48;
+ break;
+ case NPY_OBJECT:
+ size = 64;
+ break;
+ case NPY_STRING:
+ case NPY_VOID:
+ size = data_dtype->elsize;
+ break;
+ case NPY_UNICODE:
+ size = data_dtype->elsize / 4;
+ break;
+ case NPY_DATETIME:
+ meta = get_datetime_metadata_from_dtype(data_dtype);
+ if (meta == NULL) {
+ Py_DECREF(retval);
+ return NULL;
+ }
+ size = get_datetime_iso_8601_strlen(0, meta->base);
+ break;
+ case NPY_TIMEDELTA:
+ size = 21;
+ break;
+ }
+
+ if (flex_type_num == NPY_STRING) {
+ retval->elsize = size;
+ }
+ else if (flex_type_num == NPY_UNICODE) {
+ retval->elsize = size * 4;
+ }
+ }
+ else {
+ /*
+ * We should never get here, but just in case someone adds
+ * a new flex dtype...
+ */
+ PyErr_SetString(PyExc_TypeError,
+ "don't know how to adapt flex dtype");
+ Py_DECREF(retval);
+ return NULL;
+ }
+ }
+ /* Flexible type with generic time unit that adapts */
+ else if (flex_type_num == NPY_DATETIME ||
+ flex_type_num == NPY_TIMEDELTA) {
+ meta = get_datetime_metadata_from_dtype(flex_dtype);
+ retval = flex_dtype;
+ if (meta == NULL) {
+ return NULL;
+ }
+
+ if (meta->base == NPY_FR_GENERIC) {
+ if (data_dtype->type_num == NPY_DATETIME ||
+ data_dtype->type_num == NPY_TIMEDELTA) {
+ meta = get_datetime_metadata_from_dtype(data_dtype);
+ if (meta == NULL) {
+ return NULL;
+ }
+
+ retval = create_datetime_dtype(flex_type_num, meta);
+ Py_DECREF(flex_dtype);
+ }
+ }
+ }
+ else {
+ retval = flex_dtype;
+ }
+ return retval;
+}
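For reference, these are the character-count estimates the switch in PyArray_AdaptFlexibleDType uses when an unsized string or unicode dtype is adapted to numeric input; a unicode target then stores 4 bytes per character. The trivial standalone program below (not part of the patch) simply restates those constants:

#include <stdio.h>

int main(void)
{
    /* Character-count estimates from the switch in PyArray_AdaptFlexibleDType */
    printf("bool              -> %d\n", 5);       /* "False" */
    printf("half/float/double -> %d\n", 32);
    printf("long double       -> %d\n", 48);
    printf("cfloat/cdouble    -> %d\n", 2 * 32);
    printf("clongdouble       -> %d\n", 2 * 48);
    printf("object            -> %d\n", 64);
    printf("timedelta         -> %d\n", 21);
    /* An NPY_UNICODE target multiplies the character count by 4 bytes. */
    printf("U dtype for a double: %d bytes\n", 4 * 32);
    return 0;
}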
diff --git a/numpy/core/src/multiarray/legacy_dtype_implementation.h b/numpy/core/src/multiarray/legacy_dtype_implementation.h
new file mode 100644
index 000000000..ca171d773
--- /dev/null
+++ b/numpy/core/src/multiarray/legacy_dtype_implementation.h
@@ -0,0 +1,40 @@
+#ifndef _NPY_LEGACY_DTYPE_IMPLEMENTATION_H
+#define _NPY_LEGACY_DTYPE_IMPLEMENTATION_H
+
+
+NPY_NO_EXPORT unsigned char
+PyArray_LegacyEquivTypes(PyArray_Descr *type1, PyArray_Descr *type2);
+
+NPY_NO_EXPORT unsigned char
+PyArray_LegacyEquivTypenums(int typenum1, int typenum2);
+
+NPY_NO_EXPORT int
+PyArray_LegacyCanCastSafely(int fromtype, int totype);
+
+NPY_NO_EXPORT npy_bool
+PyArray_LegacyCanCastTo(PyArray_Descr *from, PyArray_Descr *to);
+
+NPY_NO_EXPORT npy_bool
+PyArray_LegacyCanCastTypeTo(PyArray_Descr *from, PyArray_Descr *to,
+ NPY_CASTING casting);
+
+/*
+ * This function calls Py_DECREF on flex_dtype, and replaces it with
+ * a new dtype that has been adapted based on the values in data_dtype
+ * and data_obj. If the flex_dtype is not flexible, it returns it as-is.
+ *
+ * Usually, if data_obj is not an array, dtype should be the result
+ * given by the PyArray_GetArrayParamsFromObject function.
+ *
+ * The data_obj may be NULL if just a dtype is known for the source.
+ *
+ * If flex_dtype is NULL, returns immediately, without setting an
+ * exception, leaving any previous error handling intact.
+ *
+ * The current flexible dtypes include NPY_STRING, NPY_UNICODE, NPY_VOID,
+ * and NPY_DATETIME with generic units.
+ */
+NPY_NO_EXPORT PyArray_Descr *
+PyArray_AdaptFlexibleDType(PyArray_Descr *data_dtype, PyArray_Descr *flex_dtype);
+
+#endif /*_NPY_LEGACY_DTYPE_IMPLEMENTATION_H*/
diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
index d234c366c..0590558be 100644
--- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
+++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src
@@ -110,7 +110,7 @@
* if not it can decrease performance
* tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
*/
-static void
+static int
#if @is_aligned@ && @is_swap@ == 0 && @elsize@ <= NPY_SIZEOF_INTP
NPY_GCC_UNROLL_LOOPS
#endif
@@ -171,6 +171,7 @@ static void
--N;
}
+ return 0;
}
#endif
@@ -182,7 +183,7 @@ static void
* but it profits from vectorization enabled with -O3
*/
#if (@src_contig@ == 0) && @is_aligned@
-static NPY_GCC_OPT_3 void
+static NPY_GCC_OPT_3 int
@prefix@_@oper@_size@elsize@_srcstride0(char *dst,
npy_intp dst_stride,
char *src, npy_intp NPY_UNUSED(src_stride),
@@ -197,7 +198,7 @@ static NPY_GCC_OPT_3 void
npy_uint64 temp0, temp1;
#endif
if (N == 0) {
- return;
+ return 0;
}
#if @is_aligned@ && @elsize@ != 16
/* sanity check */
@@ -238,6 +239,7 @@ static NPY_GCC_OPT_3 void
--N;
}
#endif/* @elsize == 1 && @dst_contig@ -- else */
+ return 0;
}
#endif/* (@src_contig@ == 0) && @is_aligned@ */
@@ -247,7 +249,7 @@ static NPY_GCC_OPT_3 void
/**end repeat1**/
/**end repeat**/
-static void
+static int
_strided_to_strided(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -259,9 +261,10 @@ _strided_to_strided(char *dst, npy_intp dst_stride,
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_swap_strided_to_strided(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -284,9 +287,10 @@ _swap_strided_to_strided(char *dst, npy_intp dst_stride,
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_swap_pair_strided_to_strided(char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
npy_intp N, npy_intp src_itemsize,
@@ -319,15 +323,17 @@ _swap_pair_strided_to_strided(char *dst, npy_intp dst_stride,
src += src_stride;
--N;
}
+ return 0;
}
-static void
+static int
_contig_to_contig(char *dst, npy_intp NPY_UNUSED(dst_stride),
char *src, npy_intp NPY_UNUSED(src_stride),
npy_intp N, npy_intp src_itemsize,
NpyAuxData *NPY_UNUSED(data))
{
memmove(dst, src, src_itemsize*N);
+ return 0;
}
@@ -787,7 +793,7 @@ NPY_NO_EXPORT PyArray_StridedUnaryOp *
#endif
-static NPY_GCC_OPT_3 void
+static NPY_GCC_OPT_3 int
@prefix@_cast_@name1@_to_@name2@(
char *dst, npy_intp dst_stride,
char *src, npy_intp src_stride,
@@ -873,6 +879,7 @@ static NPY_GCC_OPT_3 void
src += src_stride;
#endif
}
+ return 0;
}
#undef _CONVERT_FN
@@ -989,10 +996,14 @@ PyArray_TransferNDimToStrided(npy_intp ndim,
src_stride0 = src_strides[0];
N = shape0 - coord0;
if (N >= count) {
- stransfer(dst, dst_stride, src, src_stride0, count, src_itemsize, data);
- return 0;
+ return stransfer(dst, dst_stride, src, src_stride0,
+ count, src_itemsize, data);
+ }
+ int res = stransfer(dst, dst_stride, src, src_stride0,
+ N, src_itemsize, data);
+ if (res < 0) {
+ return -1;
}
- stransfer(dst, dst_stride, src, src_stride0, N, src_itemsize, data);
count -= N;
/* If it's 1-dimensional, there's no more to copy */
@@ -1012,13 +1023,15 @@ PyArray_TransferNDimToStrided(npy_intp ndim,
N = shape0*M;
for (i = 0; i < M; ++i) {
if (shape0 >= count) {
- stransfer(dst, dst_stride, src, src_stride0,
- count, src_itemsize, data);
- return 0;
+ return stransfer(dst, dst_stride, src, src_stride0,
+ count, src_itemsize, data);
}
else {
- stransfer(dst, dst_stride, src, src_stride0,
- shape0, src_itemsize, data);
+ res = stransfer(dst, dst_stride, src, src_stride0,
+ shape0, src_itemsize, data);
+ if (res < 0) {
+ return -1;
+ }
}
count -= shape0;
src += src_stride1;
@@ -1073,13 +1086,15 @@ PyArray_TransferNDimToStrided(npy_intp ndim,
/* A loop for dimensions 0 and 1 */
for (i = 0; i < shape1; ++i) {
if (shape0 >= count) {
- stransfer(dst, dst_stride, src, src_stride0,
- count, src_itemsize, data);
- return 0;
+ return stransfer(dst, dst_stride, src, src_stride0,
+ count, src_itemsize, data);
}
else {
- stransfer(dst, dst_stride, src, src_stride0,
- shape0, src_itemsize, data);
+ res = stransfer(dst, dst_stride, src, src_stride0,
+ shape0, src_itemsize, data);
+ if (res < 0) {
+ return -1;
+ }
}
count -= shape0;
src += src_stride1;
@@ -1108,10 +1123,14 @@ PyArray_TransferStridedToNDim(npy_intp ndim,
dst_stride0 = dst_strides[0];
N = shape0 - coord0;
if (N >= count) {
- stransfer(dst, dst_stride0, src, src_stride, count, src_itemsize, data);
- return 0;
+ return stransfer(dst, dst_stride0, src, src_stride,
+ count, src_itemsize, data);
+ }
+ int res = stransfer(dst, dst_stride0, src, src_stride,
+ N, src_itemsize, data);
+ if (res < 0) {
+ return -1;
}
- stransfer(dst, dst_stride0, src, src_stride, N, src_itemsize, data);
count -= N;
/* If it's 1-dimensional, there's no more to copy */
@@ -1131,13 +1150,15 @@ PyArray_TransferStridedToNDim(npy_intp ndim,
N = shape0*M;
for (i = 0; i < M; ++i) {
if (shape0 >= count) {
- stransfer(dst, dst_stride0, src, src_stride,
- count, src_itemsize, data);
- return 0;
+ return stransfer(dst, dst_stride0, src, src_stride,
+ count, src_itemsize, data);
}
else {
- stransfer(dst, dst_stride0, src, src_stride,
- shape0, src_itemsize, data);
+ res = stransfer(dst, dst_stride0, src, src_stride,
+ shape0, src_itemsize, data);
+ if (res < 0) {
+ return -1;
+ }
}
count -= shape0;
dst += dst_stride1;
@@ -1192,13 +1213,15 @@ PyArray_TransferStridedToNDim(npy_intp ndim,
/* A loop for dimensions 0 and 1 */
for (i = 0; i < shape1; ++i) {
if (shape0 >= count) {
- stransfer(dst, dst_stride0, src, src_stride,
- count, src_itemsize, data);
- return 0;
+ return stransfer(dst, dst_stride0, src, src_stride,
+ count, src_itemsize, data);
}
else {
- stransfer(dst, dst_stride0, src, src_stride,
- shape0, src_itemsize, data);
+ res = stransfer(dst, dst_stride0, src, src_stride,
+ shape0, src_itemsize, data);
+ if (res < 0) {
+ return -1;
+ }
}
count -= shape0;
dst += dst_stride1;
@@ -1228,16 +1251,18 @@ PyArray_TransferMaskedStridedToNDim(npy_intp ndim,
dst_stride0 = dst_strides[0];
N = shape0 - coord0;
if (N >= count) {
- stransfer(dst, dst_stride0,
- src, src_stride,
- mask, mask_stride,
- count, src_itemsize, data);
- return 0;
- }
- stransfer(dst, dst_stride0,
- src, src_stride,
+ return stransfer(
+ dst, dst_stride0, src, src_stride,
mask, mask_stride,
- N, src_itemsize, data);
+ count, src_itemsize, data);
+ }
+ int res = stransfer(
+ dst, dst_stride0, src, src_stride,
+ mask, mask_stride,
+ N, src_itemsize, data);
+ if (res < 0) {
+ return -1;
+ }
count -= N;
/* If it's 1-dimensional, there's no more to copy */
@@ -1258,17 +1283,19 @@ PyArray_TransferMaskedStridedToNDim(npy_intp ndim,
N = shape0*M;
for (i = 0; i < M; ++i) {
if (shape0 >= count) {
- stransfer(dst, dst_stride0,
- src, src_stride,
- mask, mask_stride,
- count, src_itemsize, data);
- return 0;
+ return stransfer(
+ dst, dst_stride0, src, src_stride,
+ mask, mask_stride,
+ count, src_itemsize, data);
}
else {
- stransfer(dst, dst_stride0,
- src, src_stride,
- mask, mask_stride,
- shape0, src_itemsize, data);
+ int res = stransfer(
+ dst, dst_stride0, src, src_stride,
+ mask, mask_stride,
+ shape0, src_itemsize, data);
+ if (res < 0) {
+ return -1;
+ }
}
count -= shape0;
dst += dst_stride1;
@@ -1324,17 +1351,19 @@ PyArray_TransferMaskedStridedToNDim(npy_intp ndim,
/* A loop for dimensions 0 and 1 */
for (i = 0; i < shape1; ++i) {
if (shape0 >= count) {
- stransfer(dst, dst_stride0,
- src, src_stride,
- mask, mask_stride,
- count, src_itemsize, data);
- return 0;
+ return stransfer(
+ dst, dst_stride0, src, src_stride,
+ mask, mask_stride,
+ count, src_itemsize, data);
}
else {
- stransfer(dst, dst_stride0,
- src, src_stride,
- mask, mask_stride,
- shape0, src_itemsize, data);
+ res = stransfer(
+ dst, dst_stride0, src, src_stride,
+ mask, mask_stride,
+ shape0, src_itemsize, data);
+ if (res < 0) {
+ return -1;
+ }
}
count -= shape0;
dst += dst_stride1;
@@ -1760,13 +1789,23 @@ mapiter_@name@(PyArrayMapIterObject *mit)
do {
#if @isget@
- stransfer(subspace_ptrs[1], subspace_strides[1],
- subspace_ptrs[0], subspace_strides[0],
- *counter, src_itemsize, transferdata);
+ if (NPY_UNLIKELY(stransfer(
+ subspace_ptrs[1], subspace_strides[1],
+ subspace_ptrs[0], subspace_strides[0],
+ *counter, src_itemsize, transferdata) < 0)) {
+ NPY_END_THREADS;
+ NPY_AUXDATA_FREE(transferdata);
+ return -1;
+ }
#else
- stransfer(subspace_ptrs[0], subspace_strides[0],
- subspace_ptrs[1], subspace_strides[1],
- *counter, src_itemsize, transferdata);
+ if (NPY_UNLIKELY(stransfer(
+ subspace_ptrs[0], subspace_strides[0],
+ subspace_ptrs[1], subspace_strides[1],
+ *counter, src_itemsize, transferdata) < 0)) {
+ NPY_END_THREADS;
+ NPY_AUXDATA_FREE(transferdata);
+ return -1;
+ }
#endif
} while (mit->subspace_next(mit->subspace_iter));
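The lowlevel_strided_loops changes above convert the transfer loops from void to int so that a failing inner transfer (for example a cast that raises a Python exception) propagates -1 through the N-dimensional drivers instead of being dropped. A standalone C sketch of that error-propagation pattern, with all names hypothetical and no NumPy code involved:

#include <stdio.h>

typedef int (*transfer_fn)(char *dst, char *src, long n);

static int
copy_bytes(char *dst, char *src, long n)
{
    for (long i = 0; i < n; i++) {
        dst[i] = src[i];
    }
    return 0;                      /* success */
}

/* Driver analogous in spirit to PyArray_TransferNDimToStrided:
 * stop on the first error and report it to the caller. */
static int
transfer_rows(transfer_fn f, char *dst, char *src, long rows, long rowlen)
{
    for (long i = 0; i < rows; i++) {
        if (f(dst + i * rowlen, src + i * rowlen, rowlen) < 0) {
            return -1;             /* propagate the failure */
        }
    }
    return 0;
}

int main(void)
{
    char src[] = "abcdef", dst[7] = {0};
    int res = transfer_rows(copy_bytes, dst, src, 2, 3);
    printf("%d %.6s\n", res, dst);
    return 0;
}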
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index c27e0c391..d64962f87 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -233,7 +233,7 @@ unpack_indices(PyObject *index, PyObject **result, npy_intp result_n)
|| PySlice_Check(index)
|| PyArray_Check(index)
|| !PySequence_Check(index)
- || PyBaseString_Check(index)) {
+ || PyUnicode_Check(index)) {
return unpack_scalar(index, result, result_n);
}
@@ -539,22 +539,22 @@ prepare_index(PyArrayObject *self, PyObject *index,
/*
* There are two types of boolean indices (which are equivalent,
* for the most part though). A single boolean index of matching
- * dimensionality and size is a boolean index.
- * If this is not the case, it is instead expanded into (multiple)
- * integer array indices.
+ * shape is a boolean index. If this is not the case, it is
+ * instead expanded into (multiple) integer array indices.
*/
PyArrayObject *nonzero_result[NPY_MAXDIMS];
if ((index_ndim == 1) && allow_boolean) {
/*
- * If ndim and size match, this can be optimized as a single
- * boolean index. The size check is necessary only to support
- * old non-matching sizes by using fancy indexing instead.
- * The reason for that is that fancy indexing uses nonzero,
- * and only the result of nonzero is checked for legality.
+ * If shapes match exactly, this can be optimized as a single
+ * boolean index. When the dimensions are identical but the shapes are not,
+ * this is always an error. The check ensures that these errors are raised
+ * and match those of the generic path.
*/
if ((PyArray_NDIM(arr) == PyArray_NDIM(self))
- && PyArray_SIZE(arr) == PyArray_SIZE(self)) {
+ && PyArray_CompareLists(PyArray_DIMS(arr),
+ PyArray_DIMS(self),
+ PyArray_NDIM(arr))) {
index_type = HAS_BOOL;
indices[curr_idx].type = HAS_BOOL;
@@ -946,9 +946,9 @@ get_view_from_index(PyArrayObject *self, PyArrayObject **view,
}
break;
case HAS_SLICE:
- if (NpySlice_GetIndicesEx(indices[i].object,
- PyArray_DIMS(self)[orig_dim],
- &start, &stop, &step, &n_steps) < 0) {
+ if (PySlice_GetIndicesEx(indices[i].object,
+ PyArray_DIMS(self)[orig_dim],
+ &start, &stop, &step, &n_steps) < 0) {
return -1;
}
if (n_steps <= 0) {
@@ -1091,6 +1091,7 @@ array_boolean_subscript(PyArrayObject *self,
self_stride = innerstrides[0];
bmask_stride = innerstrides[1];
+ int res = 0;
do {
innersize = *NpyIter_GetInnerLoopSizePtr(iter);
self_data = dataptrs[0];
@@ -1105,8 +1106,11 @@ array_boolean_subscript(PyArrayObject *self,
/* Process unmasked values */
bmask_data = npy_memchr(bmask_data, 0, bmask_stride, innersize,
&subloopsize, 0);
- stransfer(ret_data, itemsize, self_data, self_stride,
- subloopsize, itemsize, transferdata);
+ res = stransfer(ret_data, itemsize, self_data, self_stride,
+ subloopsize, itemsize, transferdata);
+ if (res < 0) {
+ break;
+ }
innersize -= subloopsize;
self_data += subloopsize * self_stride;
ret_data += subloopsize * itemsize;
@@ -1115,8 +1119,15 @@ array_boolean_subscript(PyArrayObject *self,
NPY_END_THREADS;
- NpyIter_Deallocate(iter);
+ if (!NpyIter_Deallocate(iter)) {
+ res = -1;
+ }
NPY_AUXDATA_FREE(transferdata);
+ if (res < 0) {
+ /* Should be practically impossible, since there is no cast */
+ Py_DECREF(ret);
+ return NULL;
+ }
}
if (!PyArray_CheckExact(self)) {
@@ -1209,6 +1220,7 @@ array_assign_boolean_subscript(PyArrayObject *self,
v_data = PyArray_DATA(v);
/* Create an iterator for the data */
+ int res = 0;
if (size > 0) {
NpyIter *iter;
PyArrayObject *op[2] = {self, bmask};
@@ -1253,7 +1265,7 @@ array_assign_boolean_subscript(PyArrayObject *self,
/* Get a dtype transfer function */
NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
if (PyArray_GetDTypeTransferFunction(
- IsUintAligned(self) && IsAligned(self) &&
+ IsUintAligned(self) && IsAligned(self) &&
IsUintAligned(v) && IsAligned(v),
v_stride, fixed_strides[0],
PyArray_DESCR(v), PyArray_DESCR(self),
@@ -1282,8 +1294,11 @@ array_assign_boolean_subscript(PyArrayObject *self,
/* Process unmasked values */
bmask_data = npy_memchr(bmask_data, 0, bmask_stride, innersize,
&subloopsize, 0);
- stransfer(self_data, self_stride, v_data, v_stride,
- subloopsize, src_itemsize, transferdata);
+ res = stransfer(self_data, self_stride, v_data, v_stride,
+ subloopsize, src_itemsize, transferdata);
+ if (res < 0) {
+ break;
+ }
innersize -= subloopsize;
self_data += subloopsize * self_stride;
v_data += subloopsize * v_stride;
@@ -1295,22 +1310,12 @@ array_assign_boolean_subscript(PyArrayObject *self,
}
NPY_AUXDATA_FREE(transferdata);
- NpyIter_Deallocate(iter);
- }
-
- if (needs_api) {
- /*
- * FIXME?: most assignment operations stop after the first occurrence
- * of an error. Boolean does not currently, but should at least
- * report the error. (This is only relevant for things like str->int
- * casts which call into python)
- */
- if (PyErr_Occurred()) {
- return -1;
+ if (!NpyIter_Deallocate(iter)) {
+ res = -1;
}
}
- return 0;
+ return res;
}
@@ -1402,7 +1407,7 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
*view = NULL;
/* first check for a single field name */
- if (PyBaseString_Check(ind)) {
+ if (PyUnicode_Check(ind)) {
PyObject *tup;
PyArray_Descr *fieldtype;
npy_intp offset;
@@ -1413,10 +1418,7 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
return 0;
}
else if (tup == NULL){
- PyObject *errmsg = PyUString_FromString("no field of name ");
- PyUString_Concat(&errmsg, ind);
- PyErr_SetObject(PyExc_ValueError, errmsg);
- Py_DECREF(errmsg);
+ PyErr_Format(PyExc_ValueError, "no field of name %S", ind);
return 0;
}
if (_unpack_field(tup, &fieldtype, &offset) < 0) {
@@ -1466,7 +1468,7 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
PyErr_Clear();
return -1;
}
- is_string = PyBaseString_Check(item);
+ is_string = PyUnicode_Check(item);
Py_DECREF(item);
if (!is_string) {
return -1;
@@ -2340,7 +2342,6 @@ mapiter_fill_info(PyArrayMapIterObject *mit, npy_index_info *indices,
int consec_status = -1;
int axis, broadcast_axis;
npy_intp dimension;
- PyObject *errmsg, *tmp;
for (i = 0; i < mit->nd_fancy; i++) {
mit->dimensions[i] = 1;
@@ -2428,35 +2429,38 @@ mapiter_fill_info(PyArrayMapIterObject *mit, npy_index_info *indices,
return 0;
- broadcast_error:
+broadcast_error: ; // Declarations cannot follow labels, add empty statement.
/*
* Attempt to set a meaningful exception. Could also find out
* if a boolean index was converted.
*/
- errmsg = PyUString_FromString("shape mismatch: indexing arrays could not "
- "be broadcast together with shapes ");
+ PyObject *errmsg = PyUnicode_FromString("");
if (errmsg == NULL) {
return -1;
}
-
for (i = 0; i < index_num; i++) {
if (!(indices[i].type & HAS_FANCY)) {
continue;
}
- tmp = convert_shape_to_string(
- PyArray_NDIM((PyArrayObject *)indices[i].object),
- PyArray_SHAPE((PyArrayObject *)indices[i].object),
- " ");
+
+ int ndim = PyArray_NDIM((PyArrayObject *)indices[i].object);
+ npy_intp *shape = PyArray_SHAPE((PyArrayObject *)indices[i].object);
+ PyObject *tmp = convert_shape_to_string(ndim, shape, " ");
if (tmp == NULL) {
+ Py_DECREF(errmsg);
return -1;
}
- PyUString_ConcatAndDel(&errmsg, tmp);
+
+ Py_SETREF(errmsg, PyUnicode_Concat(errmsg, tmp));
+ Py_DECREF(tmp);
if (errmsg == NULL) {
return -1;
}
}
- PyErr_SetObject(PyExc_IndexError, errmsg);
+ PyErr_Format(PyExc_IndexError,
+ "shape mismatch: indexing arrays could not "
+ "be broadcast together with shapes %S", errmsg);
Py_DECREF(errmsg);
return -1;
}
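
The rewritten error paths above replace the removed PyUString_* helpers with two stock CPython idioms: Py_SETREF to swap in the result of PyUnicode_Concat while releasing the previous reference, and PyErr_Format's %S conversion to embed an object's str() in the message. A self-contained sketch of the same pattern, with hypothetical names (not part of this patch):

#define PY_SSIZE_T_CLEAN
#include <Python.h>

/* Join str() of every item in `seq` and raise IndexError with the result. */
static int
raise_shape_mismatch(PyObject *seq)
{
    PyObject *joined = PyUnicode_FromString("");
    if (joined == NULL) {
        return -1;
    }
    Py_ssize_t n = PySequence_Length(seq);
    if (n < 0) {
        Py_DECREF(joined);
        return -1;
    }
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *item = PySequence_GetItem(seq, i);
        PyObject *part = item ? PyObject_Str(item) : NULL;
        Py_XDECREF(item);
        if (part == NULL) {
            Py_DECREF(joined);
            return -1;
        }
        /* Py_SETREF stores the new reference and drops the old one. */
        Py_SETREF(joined, PyUnicode_Concat(joined, part));
        Py_DECREF(part);
        if (joined == NULL) {
            return -1;
        }
    }
    /* %S formats str() of the object, as in the broadcast_error block above. */
    PyErr_Format(PyExc_IndexError, "shape mismatch: %S", joined);
    Py_DECREF(joined);
    return -1;
}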
@@ -2648,7 +2652,6 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type,
npy_uint32 extra_op_flags, PyArrayObject *extra_op,
PyArray_Descr *extra_op_dtype)
{
- PyObject *errmsg, *tmp;
/* For shape reporting on error */
PyArrayObject *original_extra_op = extra_op;
@@ -3178,45 +3181,30 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type,
goto finish;
broadcast_error:
- errmsg = PyUString_FromString("shape mismatch: value array "
- "of shape ");
- if (errmsg == NULL) {
- goto finish;
- }
-
/* Report the shape of the original array if it exists */
if (original_extra_op == NULL) {
original_extra_op = extra_op;
}
- tmp = convert_shape_to_string(PyArray_NDIM(original_extra_op),
- PyArray_DIMS(original_extra_op), " ");
- if (tmp == NULL) {
- goto finish;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ int extra_ndim = PyArray_NDIM(original_extra_op);
+ npy_intp *extra_dims = PyArray_DIMS(original_extra_op);
+ PyObject *shape1 = convert_shape_to_string(extra_ndim, extra_dims, " ");
+ if (shape1 == NULL) {
goto finish;
}
- tmp = PyUString_FromString("could not be broadcast to indexing "
- "result of shape ");
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ PyObject *shape2 = convert_shape_to_string(mit->nd, mit->dimensions, "");
+ if (shape2 == NULL) {
+ Py_DECREF(shape1);
goto finish;
}
- tmp = convert_shape_to_string(mit->nd, mit->dimensions, "");
- if (tmp == NULL) {
- goto finish;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
- goto finish;
- }
+ PyErr_Format(PyExc_ValueError,
+ "shape mismatch: value array of shape %S could not be broadcast "
+ "to indexing result of shape %S", shape1, shape2);
- PyErr_SetObject(PyExc_ValueError, errmsg);
- Py_DECREF(errmsg);
+ Py_DECREF(shape1);
+ Py_DECREF(shape2);
finish:
Py_XDECREF(extra_op);
@@ -3315,7 +3303,7 @@ PyArray_MapIterArrayCopyIfOverlap(PyArrayObject * a, PyObject * index,
Py_XDECREF(a_copy);
Py_XDECREF(subspace);
Py_XDECREF((PyObject *)mit);
- for (i=0; i < index_num; i++) {
+ for (i = 0; i < index_num; i++) {
Py_XDECREF(indices[i].object);
}
return NULL;
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index a2db8042f..9c8bb4135 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -578,6 +578,28 @@ array_tostring(PyArrayObject *self, PyObject *args, PyObject *kwds)
return PyArray_ToString(self, order);
}
+/* Like PyArray_ToFile but takes the file as a python object */
+static int
+PyArray_ToFileObject(PyArrayObject *self, PyObject *file, char *sep, char *format)
+{
+ npy_off_t orig_pos = 0;
+ FILE *fd = npy_PyFile_Dup2(file, "wb", &orig_pos);
+
+ if (fd == NULL) {
+ return -1;
+ }
+
+ int write_ret = PyArray_ToFile(self, fd, sep, format);
+ PyObject *err_type, *err_value, *err_traceback;
+ PyErr_Fetch(&err_type, &err_value, &err_traceback);
+ int close_ret = npy_PyFile_DupClose2(file, fd, orig_pos);
+ npy_PyErr_ChainExceptions(err_type, err_value, err_traceback);
+
+ if (write_ret || close_ret) {
+ return -1;
+ }
+ return 0;
+}
/* This should grow an order= keyword to be consistent
*/
@@ -587,10 +609,8 @@ array_tofile(PyArrayObject *self, PyObject *args, PyObject *kwds)
{
int own;
PyObject *file;
- FILE *fd;
char *sep = "";
char *format = "";
- npy_off_t orig_pos = 0;
static char *kwlist[] = {"file", "sep", "format", NULL};
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|ss:tofile", kwlist,
@@ -615,25 +635,22 @@ array_tofile(PyArrayObject *self, PyObject *args, PyObject *kwds)
own = 0;
}
- fd = npy_PyFile_Dup2(file, "wb", &orig_pos);
- if (fd == NULL) {
- goto fail;
- }
- if (PyArray_ToFile(self, fd, sep, format) < 0) {
- goto fail;
- }
- if (npy_PyFile_DupClose2(file, fd, orig_pos) < 0) {
- goto fail;
- }
- if (own && npy_PyFile_CloseFile(file) < 0) {
- goto fail;
+ int file_ret = PyArray_ToFileObject(self, file, sep, format);
+ int close_ret = 0;
+
+ if (own) {
+ PyObject *err_type, *err_value, *err_traceback;
+ PyErr_Fetch(&err_type, &err_value, &err_traceback);
+ close_ret = npy_PyFile_CloseFile(file);
+ npy_PyErr_ChainExceptions(err_type, err_value, err_traceback);
}
- Py_DECREF(file);
- Py_RETURN_NONE;
-fail:
Py_DECREF(file);
- return NULL;
+
+ if (file_ret || close_ret) {
+ return NULL;
+ }
+ Py_RETURN_NONE;
}
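
The tofile rewrite above follows one rule throughout: when cleanup (closing the duplicated FILE* or the Python file object) can itself fail, stash any in-flight exception with PyErr_Fetch, run the cleanup, then chain the exceptions back so a close error never silently replaces a write error. A reduced sketch of that pattern, assuming only the npy_PyErr_ChainExceptions helper from numpy/npy_3kcompat.h used above:

#include <Python.h>
#include "numpy/npy_3kcompat.h"   /* npy_PyErr_ChainExceptions */

/* Run `cleanup(arg)` without letting its failure mask an earlier error. */
static int
cleanup_preserving_error(int (*cleanup)(void *), void *arg)
{
    PyObject *type, *value, *traceback;

    PyErr_Fetch(&type, &value, &traceback);   /* stash any live exception */
    int ret = cleanup(arg);                   /* may set a new exception  */
    npy_PyErr_ChainExceptions(type, value, traceback);  /* restore or chain */
    return ret;
}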
static PyObject *
@@ -844,6 +861,21 @@ array_astype(PyArrayObject *self, PyObject *args, PyObject *kwds)
if (ret == NULL) {
return NULL;
}
+ /* NumPy 1.20, 2020-10-01 */
+ if ((PyArray_NDIM(self) != PyArray_NDIM(ret)) &&
+ DEPRECATE_FUTUREWARNING(
+ "casting an array to a subarray dtype "
+ "will not using broadcasting in the future, but cast each "
+ "element to the new dtype and then append the dtype's shape "
+ "to the new array. You can opt-in to the new behaviour, by "
+ "additional field to the cast: "
+ "`arr.astype(np.dtype([('f', dtype)]))['f']`.\n"
+ "This may lead to a different result or to current failures "
+ "succeeding. "
+ "(FutureWarning since NumPy 1.20)") < 0) {
+ Py_DECREF(ret);
+ return NULL;
+ }
if (PyArray_CopyInto(ret, self) < 0) {
Py_DECREF(ret);
@@ -1508,14 +1540,14 @@ _deepcopy_call(char *iptr, char *optr, PyArray_Descr *dtype,
else {
PyObject *itemp, *otemp;
PyObject *res;
- NPY_COPY_PYOBJECT_PTR(&itemp, iptr);
- NPY_COPY_PYOBJECT_PTR(&otemp, optr);
+ memcpy(&itemp, iptr, sizeof(itemp));
+ memcpy(&otemp, optr, sizeof(otemp));
Py_XINCREF(itemp);
/* call deepcopy on this argument */
res = PyObject_CallFunctionObjArgs(deepcopy, itemp, visit, NULL);
Py_XDECREF(itemp);
Py_XDECREF(otemp);
- NPY_COPY_PYOBJECT_PTR(optr, &res);
+ memcpy(optr, &res, sizeof(res));
}
}
@@ -1616,7 +1648,7 @@ _getlist_pkl(PyArrayObject *self)
}
while (iter->index < iter->size) {
theobject = getitem(iter->dataptr, self);
- PyList_SET_ITEM(list, (int) iter->index, theobject);
+ PyList_SET_ITEM(list, iter->index, theobject);
PyArray_ITER_NEXT(iter);
}
Py_DECREF(iter);
@@ -1636,7 +1668,7 @@ _setlist_pkl(PyArrayObject *self, PyObject *list)
return -1;
}
while(iter->index < iter->size) {
- theobject = PyList_GET_ITEM(list, (int) iter->index);
+ theobject = PyList_GET_ITEM(list, iter->index);
setitem(theobject, iter->dataptr, self);
PyArray_ITER_NEXT(iter);
}
@@ -1676,7 +1708,7 @@ array_reduce(PyArrayObject *self, PyObject *NPY_UNUSED(args))
Py_BuildValue("ONc",
(PyObject *)Py_TYPE(self),
Py_BuildValue("(N)",
- PyInt_FromLong(0)),
+ PyLong_FromLong(0)),
/* dummy data-type */
'b'));
@@ -1701,7 +1733,7 @@ array_reduce(PyArrayObject *self, PyObject *NPY_UNUSED(args))
Py_DECREF(ret);
return NULL;
}
- PyTuple_SET_ITEM(state, 0, PyInt_FromLong(version));
+ PyTuple_SET_ITEM(state, 0, PyLong_FromLong(version));
PyTuple_SET_ITEM(state, 1, PyObject_GetAttrString((PyObject *)self,
"shape"));
descr = PyArray_DESCR(self);
@@ -1763,7 +1795,7 @@ array_reduce_ex_picklebuffer(PyArrayObject *self, int protocol)
#if PY_VERSION_HEX >= 0x03080000
/* we expect protocol 5 to be available in Python 3.8 */
pickle_module = PyImport_ImportModule("pickle");
-#elif PY_VERSION_HEX >= 0x03060000
+#else
pickle_module = PyImport_ImportModule("pickle5");
if (pickle_module == NULL) {
/* for protocol 5, raise a clear ImportError if pickle5 is not found
@@ -1772,10 +1804,6 @@ array_reduce_ex_picklebuffer(PyArrayObject *self, int protocol)
"requires the pickle5 module for Python >=3.6 and <3.8");
return NULL;
}
-#else
- PyErr_SetString(PyExc_ValueError, "pickle protocol 5 is not available "
- "for Python < 3.6");
- return NULL;
#endif
if (pickle_module == NULL){
return NULL;
@@ -2152,7 +2180,7 @@ static PyObject *
array_sizeof(PyArrayObject *self)
{
/* object + dimension and strides */
- Py_ssize_t nbytes = NPY_SIZEOF_PYARRAYOBJECT +
+ Py_ssize_t nbytes = Py_TYPE(self)->tp_basicsize +
PyArray_NDIM(self) * sizeof(npy_intp) * 2;
if (PyArray_CHKFLAGS(self, NPY_ARRAY_OWNDATA)) {
nbytes += PyArray_NBYTES(self);
@@ -2585,9 +2613,10 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args))
PyArrayObject *arr;
PyArray_Descr *dtype;
PyObject *c;
+
if (PyArray_SIZE(self) != 1) {
- PyErr_SetString(PyExc_TypeError, "only length-1 arrays can "\
- "be converted to Python scalars");
+ PyErr_SetString(PyExc_TypeError,
+ "only length-1 arrays can be converted to Python scalars");
return NULL;
}
@@ -2598,38 +2627,18 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args))
if (!PyArray_CanCastArrayTo(self, dtype, NPY_SAME_KIND_CASTING) &&
!(PyArray_TYPE(self) == NPY_OBJECT)) {
- PyObject *err, *msg_part;
+ PyObject *descr = (PyObject*)PyArray_DESCR(self);
+
Py_DECREF(dtype);
- err = PyString_FromString("unable to convert ");
- if (err == NULL) {
- return NULL;
- }
- msg_part = PyObject_Repr((PyObject*)PyArray_DESCR(self));
- if (msg_part == NULL) {
- Py_DECREF(err);
- return NULL;
- }
- PyString_ConcatAndDel(&err, msg_part);
- if (err == NULL) {
- return NULL;
- }
- msg_part = PyString_FromString(", to complex.");
- if (msg_part == NULL) {
- Py_DECREF(err);
- return NULL;
- }
- PyString_ConcatAndDel(&err, msg_part);
- if (err == NULL) {
- return NULL;
- }
- PyErr_SetObject(PyExc_TypeError, err);
- Py_DECREF(err);
+ PyErr_Format(PyExc_TypeError,
+ "Unable to convert %R to complex", descr);
return NULL;
}
if (PyArray_TYPE(self) == NPY_OBJECT) {
/* let python try calling __complex__ on the object. */
PyObject *args, *res;
+
Py_DECREF(dtype);
args = Py_BuildValue("(O)", *((PyObject**)PyArray_DATA(self)));
if (args == NULL) {
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 9a34685f4..32c5ac0dc 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -30,6 +30,8 @@
#include "npy_config.h"
#include "npy_pycompat.h"
#include "npy_import.h"
+#include "convert_datatype.h"
+#include "legacy_dtype_implementation.h"
NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
@@ -65,7 +67,6 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
#include "templ_common.h" /* for npy_mul_with_overflow_intp */
#include "compiled_base.h"
#include "mem_overlap.h"
-#include "alloc.h"
#include "typeinfo.h"
#include "get_attr_string.h"
@@ -363,7 +364,8 @@ PyArray_GetSubType(int narrays, PyArrayObject **arrays) {
*/
NPY_NO_EXPORT PyArrayObject *
PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
- PyArrayObject* ret)
+ PyArrayObject* ret, PyArray_Descr *dtype,
+ NPY_CASTING casting)
{
int iarrays, idim, ndim;
npy_intp shape[NPY_MAXDIMS];
@@ -427,6 +429,7 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
}
if (ret != NULL) {
+ assert(dtype == NULL);
if (PyArray_NDIM(ret) != ndim) {
PyErr_SetString(PyExc_ValueError,
"Output array has wrong dimensionality");
@@ -446,10 +449,16 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
/* Get the priority subtype for the array */
PyTypeObject *subtype = PyArray_GetSubType(narrays, arrays);
- /* Get the resulting dtype from combining all the arrays */
- PyArray_Descr *dtype = PyArray_ResultType(narrays, arrays, 0, NULL);
if (dtype == NULL) {
- return NULL;
+ /* Get the resulting dtype from combining all the arrays */
+ dtype = (PyArray_Descr *)PyArray_ResultType(
+ narrays, arrays, 0, NULL);
+ if (dtype == NULL) {
+ return NULL;
+ }
+ }
+ else {
+ Py_INCREF(dtype);
}
/*
@@ -495,7 +504,7 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
/* Copy the data for this array */
if (PyArray_AssignArray((PyArrayObject *)sliding_view, arrays[iarrays],
- NULL, NPY_SAME_KIND_CASTING) < 0) {
+ NULL, casting) < 0) {
Py_DECREF(sliding_view);
Py_DECREF(ret);
return NULL;
@@ -515,7 +524,9 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis,
*/
NPY_NO_EXPORT PyArrayObject *
PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
- NPY_ORDER order, PyArrayObject *ret)
+ NPY_ORDER order, PyArrayObject *ret,
+ PyArray_Descr *dtype, NPY_CASTING casting,
+ npy_bool casting_not_passed)
{
int iarrays;
npy_intp shape = 0;
@@ -542,7 +553,10 @@ PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
}
}
+ int out_passed = 0;
if (ret != NULL) {
+ assert(dtype == NULL);
+ out_passed = 1;
if (PyArray_NDIM(ret) != 1) {
PyErr_SetString(PyExc_ValueError,
"Output array must be 1D");
@@ -561,10 +575,16 @@ PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
/* Get the priority subtype for the array */
PyTypeObject *subtype = PyArray_GetSubType(narrays, arrays);
- /* Get the resulting dtype from combining all the arrays */
- PyArray_Descr *dtype = PyArray_ResultType(narrays, arrays, 0, NULL);
if (dtype == NULL) {
- return NULL;
+ /* Get the resulting dtype from combining all the arrays */
+ dtype = (PyArray_Descr *)PyArray_ResultType(
+ narrays, arrays, 0, NULL);
+ if (dtype == NULL) {
+ return NULL;
+ }
+ }
+ else {
+ Py_INCREF(dtype);
}
stride = dtype->elsize;
@@ -594,10 +614,37 @@ PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
return NULL;
}
+ int give_deprecation_warning = 1; /* Warn only once, even if several inputs would fail the check. */
for (iarrays = 0; iarrays < narrays; ++iarrays) {
/* Adjust the window dimensions for this array */
sliding_view->dimensions[0] = PyArray_SIZE(arrays[iarrays]);
+ if (!PyArray_CanCastArrayTo(
+ arrays[iarrays], PyArray_DESCR(ret), casting)) {
+ /* This should be an error, but was previously allowed here. */
+ if (casting_not_passed && out_passed) {
+ /* NumPy 1.20, 2020-09-03 */
+ if (give_deprecation_warning && DEPRECATE(
+ "concatenate() with `axis=None` will use same-kind "
+ "casting by default in the future. Please use "
+ "`casting='unsafe'` to retain the old behaviour. "
+ "In the future this will be a TypeError.") < 0) {
+ Py_DECREF(sliding_view);
+ Py_DECREF(ret);
+ return NULL;
+ }
+ give_deprecation_warning = 0;
+ }
+ else {
+ npy_set_invalid_cast_error(
+ PyArray_DESCR(arrays[iarrays]), PyArray_DESCR(ret),
+ casting, PyArray_NDIM(arrays[iarrays]) == 0);
+ Py_DECREF(sliding_view);
+ Py_DECREF(ret);
+ return NULL;
+ }
+ }
+
/* Copy the data for this array */
if (PyArray_CopyAsFlat((PyArrayObject *)sliding_view, arrays[iarrays],
order) < 0) {
@@ -615,8 +662,21 @@ PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays,
return ret;
}
+
+/**
+ * Implementation for np.concatenate
+ *
+ * @param op Sequence of arrays to concatenate
+ * @param axis Axis to concatenate along
+ * @param ret Output array to fill
+ * @param dtype Forced output array dtype (cannot be combined with ret)
+ * @param casting Casting mode used
+ * @param casting_not_passed Deprecation helper
+ */
NPY_NO_EXPORT PyObject *
-PyArray_ConcatenateInto(PyObject *op, int axis, PyArrayObject *ret)
+PyArray_ConcatenateInto(PyObject *op,
+ int axis, PyArrayObject *ret, PyArray_Descr *dtype,
+ NPY_CASTING casting, npy_bool casting_not_passed)
{
int iarrays, narrays;
PyArrayObject **arrays;
@@ -626,6 +686,12 @@ PyArray_ConcatenateInto(PyObject *op, int axis, PyArrayObject *ret)
"The first input argument needs to be a sequence");
return NULL;
}
+ if (ret != NULL && dtype != NULL) {
+ PyErr_SetString(PyExc_TypeError,
+ "concatenate() only takes `out` or `dtype` as an "
+ "argument, but both were provided.");
+ return NULL;
+ }
/* Convert the input list into arrays */
narrays = PySequence_Size(op);
@@ -652,10 +718,13 @@ PyArray_ConcatenateInto(PyObject *op, int axis, PyArrayObject *ret)
}
if (axis >= NPY_MAXDIMS) {
- ret = PyArray_ConcatenateFlattenedArrays(narrays, arrays, NPY_CORDER, ret);
+ ret = PyArray_ConcatenateFlattenedArrays(
+ narrays, arrays, NPY_CORDER, ret, dtype,
+ casting, casting_not_passed);
}
else {
- ret = PyArray_ConcatenateArrays(narrays, arrays, axis, ret);
+ ret = PyArray_ConcatenateArrays(
+ narrays, arrays, axis, ret, dtype, casting);
}
for (iarrays = 0; iarrays < narrays; ++iarrays) {
@@ -687,7 +756,16 @@ fail:
NPY_NO_EXPORT PyObject *
PyArray_Concatenate(PyObject *op, int axis)
{
- return PyArray_ConcatenateInto(op, axis, NULL);
+ /* retain legacy behaviour for casting */
+ NPY_CASTING casting;
+ if (axis >= NPY_MAXDIMS) {
+ casting = NPY_UNSAFE_CASTING;
+ }
+ else {
+ casting = NPY_SAME_KIND_CASTING;
+ }
+ return PyArray_ConcatenateInto(
+ op, axis, NULL, NULL, casting, 0);
}
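
For reference, the extended internal entry point is what array_concatenate now feeds from Python; a hypothetical in-module caller that forces a float64 result while keeping same-kind casting could look like the sketch below (PyArray_ConcatenateInto is NPY_NO_EXPORT, so this only illustrates the new signature and the borrowed dtype reference):

/* Concatenate the sequence `op` along axis 0 into a float64 result. */
static PyObject *
concat_as_double(PyObject *op)
{
    PyArray_Descr *dtype = PyArray_DescrFromType(NPY_DOUBLE);
    if (dtype == NULL) {
        return NULL;
    }
    PyObject *res = PyArray_ConcatenateInto(
            op, 0, NULL, dtype, NPY_SAME_KIND_CASTING,
            /* casting_not_passed */ 0);
    Py_DECREF(dtype);  /* the callee increfs dtype if it keeps it */
    return res;
}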
static int
@@ -1404,65 +1482,6 @@ array_putmask(PyObject *NPY_UNUSED(module), PyObject *args, PyObject *kwds)
return PyArray_PutMask((PyArrayObject *)array, values, mask);
}
-/*
- * Compare the field dictionaries for two types.
- *
- * Return 1 if the field types and field names of the two descrs are equal and
- * in the same order, 0 if not.
- */
-static int
-_equivalent_fields(PyArray_Descr *type1, PyArray_Descr *type2) {
-
- int val;
-
- if (type1->fields == type2->fields && type1->names == type2->names) {
- return 1;
- }
- if (type1->fields == NULL || type2->fields == NULL) {
- return 0;
- }
-
- val = PyObject_RichCompareBool(type1->fields, type2->fields, Py_EQ);
- if (val != 1 || PyErr_Occurred()) {
- PyErr_Clear();
- return 0;
- }
-
- val = PyObject_RichCompareBool(type1->names, type2->names, Py_EQ);
- if (val != 1 || PyErr_Occurred()) {
- PyErr_Clear();
- return 0;
- }
-
- return 1;
-}
-
-/*
- * Compare the subarray data for two types.
- * Return 1 if they are the same, 0 if not.
- */
-static int
-_equivalent_subarrays(PyArray_ArrayDescr *sub1, PyArray_ArrayDescr *sub2)
-{
- int val;
-
- if (sub1 == sub2) {
- return 1;
-
- }
- if (sub1 == NULL || sub2 == NULL) {
- return 0;
- }
-
- val = PyObject_RichCompareBool(sub1->shape, sub2->shape, Py_EQ);
- if (val != 1 || PyErr_Occurred()) {
- PyErr_Clear();
- return 0;
- }
-
- return PyArray_EquivTypes(sub1->base, sub2->base);
-}
-
/*NUMPY_API
*
@@ -1472,40 +1491,24 @@ _equivalent_subarrays(PyArray_ArrayDescr *sub1, PyArray_ArrayDescr *sub2)
NPY_NO_EXPORT unsigned char
PyArray_EquivTypes(PyArray_Descr *type1, PyArray_Descr *type2)
{
- int type_num1, type_num2, size1, size2;
-
- if (type1 == type2) {
- return NPY_TRUE;
- }
-
- type_num1 = type1->type_num;
- type_num2 = type2->type_num;
- size1 = type1->elsize;
- size2 = type2->elsize;
-
- if (size1 != size2) {
- return NPY_FALSE;
- }
- if (PyArray_ISNBO(type1->byteorder) != PyArray_ISNBO(type2->byteorder)) {
- return NPY_FALSE;
- }
- if (type1->subarray || type2->subarray) {
- return ((type_num1 == type_num2)
- && _equivalent_subarrays(type1->subarray, type2->subarray));
- }
- if (type_num1 == NPY_VOID || type_num2 == NPY_VOID) {
- return ((type_num1 == type_num2) && _equivalent_fields(type1, type2));
- }
- if (type_num1 == NPY_DATETIME
- || type_num1 == NPY_TIMEDELTA
- || type_num2 == NPY_DATETIME
- || type_num2 == NPY_TIMEDELTA) {
- return ((type_num1 == type_num2)
- && has_equivalent_datetime_metadata(type1, type2));
+#if NPY_USE_NEW_CASTINGIMPL
+ /*
+ * Do not use PyArray_CanCastTypeTo because it supports legacy flexible
+ * dtypes as input.
+ */
+ NPY_CASTING safety = PyArray_GetCastSafety(type1, type2, NULL);
+ if (safety < 0) {
+ PyErr_Clear();
+ return 0;
}
- return type1->kind == type2->kind;
+ /* If casting is "no casting", the dtypes are considered equivalent. */
+ return PyArray_MinCastSafety(safety, NPY_NO_CASTING) == NPY_NO_CASTING;
+#else
+ return PyArray_LegacyEquivTypes(type1, type2);
+#endif
}
+
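
Under the new-casting build, "equivalent" now means exactly that the resolved cast safety collapses to NPY_NO_CASTING. A small usage sketch with the public API (error handling omitted); the two int32 descriptors are byte-order equivalent, so they compare as equal under the new rule:

#include <assert.h>
#include <numpy/arrayobject.h>

static void
equiv_types_example(void)
{
    PyArray_Descr *a = PyArray_DescrFromType(NPY_INT32);
    PyArray_Descr *b = PyArray_DescrNewByteorder(a, NPY_NATIVE);

    /* Same kind, size and byte order: no cast is needed in either direction. */
    assert(PyArray_EquivTypes(a, b));
    /* Different item sizes can never be "no cast" equivalent. */
    assert(!PyArray_EquivTypenums(NPY_INT32, NPY_INT64));

    Py_DECREF(a);
    Py_DECREF(b);
}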
/*NUMPY_API*/
NPY_NO_EXPORT unsigned char
PyArray_EquivTypenums(int typenum1, int typenum2)
@@ -1582,13 +1585,16 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws)
npy_bool subok = NPY_FALSE;
npy_bool copy = NPY_TRUE;
int ndmin = 0, nd;
+ PyObject* like;
PyArray_Descr *type = NULL;
PyArray_Descr *oldtype = NULL;
NPY_ORDER order = NPY_KEEPORDER;
int flags = 0;
- static char *kwd[]= {"object", "dtype", "copy", "order", "subok",
- "ndmin", NULL};
+ PyObject* array_function_result = NULL;
+
+ static char *kwd[] = {"object", "dtype", "copy", "order", "subok",
+ "ndmin", "like", NULL};
if (PyTuple_GET_SIZE(args) > 2) {
PyErr_Format(PyExc_TypeError,
@@ -1597,6 +1603,12 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws)
return NULL;
}
+ array_function_result = array_implement_c_array_function_creation(
+ "array", args, kws);
+ if (array_function_result != Py_NotImplemented) {
+ return array_function_result;
+ }
+
/* super-fast path for ndarray argument calls */
if (PyTuple_GET_SIZE(args) == 0) {
goto full_path;
@@ -1674,13 +1686,14 @@ _array_fromobject(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws)
}
full_path:
- if (!PyArg_ParseTupleAndKeywords(args, kws, "O|O&O&O&O&i:array", kwd,
+ if (!PyArg_ParseTupleAndKeywords(args, kws, "O|O&O&O&O&i$O:array", kwd,
&op,
PyArray_DescrConverter2, &type,
PyArray_BoolConverter, &copy,
PyArray_OrderConverter, &order,
PyArray_BoolConverter, &subok,
- &ndmin)) {
+ &ndmin,
+ &like)) {
goto clean_type;
}
@@ -1817,20 +1830,29 @@ static PyObject *
array_empty(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
{
- static char *kwlist[] = {"shape", "dtype", "order", NULL};
+ static char *kwlist[] = {"shape", "dtype", "order", "like", NULL};
PyArray_Descr *typecode = NULL;
PyArray_Dims shape = {NULL, 0};
NPY_ORDER order = NPY_CORDER;
+ PyObject *like = NULL;
npy_bool is_f_order;
+ PyObject *array_function_result = NULL;
PyArrayObject *ret = NULL;
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&:empty", kwlist,
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&$O:empty", kwlist,
PyArray_IntpConverter, &shape,
PyArray_DescrConverter, &typecode,
- PyArray_OrderConverter, &order)) {
+ PyArray_OrderConverter, &order,
+ &like)) {
goto fail;
}
+ array_function_result = array_implement_c_array_function_creation(
+ "empty", args, kwds);
+ if (array_function_result != Py_NotImplemented) {
+ return array_function_result;
+ }
+
switch (order) {
case NPY_CORDER:
is_f_order = NPY_FALSE;
@@ -1908,20 +1930,41 @@ array_scalar(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
int alloc = 0;
void *dptr;
PyObject *ret;
-
+ PyObject *base = NULL;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!|O:scalar", kwlist,
&PyArrayDescr_Type, &typecode, &obj)) {
return NULL;
}
if (PyDataType_FLAGCHK(typecode, NPY_LIST_PICKLE)) {
- if (!PySequence_Check(obj)) {
- PyErr_SetString(PyExc_TypeError,
- "found non-sequence while unpickling scalar with "
- "NPY_LIST_PICKLE set");
+ if (typecode->type_num == NPY_OBJECT) {
+ /* Deprecated 2020-11-24, NumPy 1.20 */
+ if (DEPRECATE(
+ "Unpickling a scalar with object dtype is deprecated. "
+ "Object scalars should never be created. If this was a "
+ "properly created pickle, please open a NumPy issue. In "
+ "a best effort this returns the original object.") < 0) {
+ return NULL;
+ }
+ Py_INCREF(obj);
+ return obj;
+ }
+ /* We store the full array to unpack it here: */
+ if (!PyArray_CheckExact(obj)) {
+ /* We pickle structured voids as arrays currently */
+ PyErr_SetString(PyExc_RuntimeError,
+ "Unpickling NPY_LIST_PICKLE (structured void) scalar "
+ "requires an array. The pickle file may be corrupted?");
return NULL;
}
- dptr = &obj;
+ if (!PyArray_EquivTypes(PyArray_DESCR((PyArrayObject *)obj), typecode)) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "Pickled array is not compatible with requested scalar "
+ "dtype. The pickle file may be corrupted?");
+ return NULL;
+ }
+ base = obj;
+ dptr = PyArray_BYTES((PyArrayObject *)obj);
}
else if (PyDataType_FLAGCHK(typecode, NPY_ITEM_IS_POINTER)) {
@@ -1956,22 +1999,22 @@ array_scalar(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
return NULL;
}
}
- if (!PyString_Check(obj)) {
+ if (!PyBytes_Check(obj)) {
PyErr_SetString(PyExc_TypeError,
- "initializing object must be a string");
+ "initializing object must be a bytes object");
Py_XDECREF(tmpobj);
return NULL;
}
- if (PyString_GET_SIZE(obj) < typecode->elsize) {
+ if (PyBytes_GET_SIZE(obj) < typecode->elsize) {
PyErr_SetString(PyExc_ValueError,
"initialization string is too small");
Py_XDECREF(tmpobj);
return NULL;
}
- dptr = PyString_AS_STRING(obj);
+ dptr = PyBytes_AS_STRING(obj);
}
}
- ret = PyArray_Scalar(dptr, typecode, NULL);
+ ret = PyArray_Scalar(dptr, typecode, base);
/* free dptr which contains zeros */
if (alloc) {
@@ -1984,20 +2027,29 @@ array_scalar(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
static PyObject *
array_zeros(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
{
- static char *kwlist[] = {"shape", "dtype", "order", NULL};
+ static char *kwlist[] = {"shape", "dtype", "order", "like", NULL};
PyArray_Descr *typecode = NULL;
PyArray_Dims shape = {NULL, 0};
NPY_ORDER order = NPY_CORDER;
+ PyObject *like = NULL;
npy_bool is_f_order = NPY_FALSE;
+ PyObject *array_function_result = NULL;
PyArrayObject *ret = NULL;
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&:zeros", kwlist,
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&$O:zeros", kwlist,
PyArray_IntpConverter, &shape,
PyArray_DescrConverter, &typecode,
- PyArray_OrderConverter, &order)) {
+ PyArray_OrderConverter, &order,
+ &like)) {
goto fail;
}
+ array_function_result = array_implement_c_array_function_creation(
+ "zeros", args, kwds);
+ if (array_function_result != Py_NotImplemented) {
+ return array_function_result;
+ }
+
switch (order) {
case NPY_CORDER:
is_f_order = NPY_FALSE;
@@ -2050,16 +2102,24 @@ array_fromstring(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds
Py_ssize_t nin = -1;
char *sep = NULL;
Py_ssize_t s;
- static char *kwlist[] = {"string", "dtype", "count", "sep", NULL};
+ static char *kwlist[] = {"string", "dtype", "count", "sep", "like", NULL};
+ PyObject *like = NULL;
PyArray_Descr *descr = NULL;
+ PyObject *array_function_result = NULL;
if (!PyArg_ParseTupleAndKeywords(args, keywds,
- "s#|O&" NPY_SSIZE_T_PYFMT "s:fromstring", kwlist,
- &data, &s, PyArray_DescrConverter, &descr, &nin, &sep)) {
+ "s#|O&" NPY_SSIZE_T_PYFMT "s$O:fromstring", kwlist,
+ &data, &s, PyArray_DescrConverter, &descr, &nin, &sep, &like)) {
Py_XDECREF(descr);
return NULL;
}
+ array_function_result = array_implement_c_array_function_creation(
+ "fromstring", args, keywds);
+ if (array_function_result != Py_NotImplemented) {
+ return array_function_result;
+ }
+
/* binary mode, condition copied from PyArray_FromString */
if (sep == NULL || strlen(sep) == 0) {
/* Numpy 1.14, 2017-10-19 */
@@ -2082,19 +2142,27 @@ array_fromfile(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds)
PyObject *err_type = NULL, *err_value = NULL, *err_traceback = NULL;
char *sep = "";
Py_ssize_t nin = -1;
- static char *kwlist[] = {"file", "dtype", "count", "sep", "offset", NULL};
+ static char *kwlist[] = {"file", "dtype", "count", "sep", "offset", "like", NULL};
+ PyObject *like = NULL;
PyArray_Descr *type = NULL;
+ PyObject *array_function_result = NULL;
int own;
npy_off_t orig_pos = 0, offset = 0;
FILE *fp;
if (!PyArg_ParseTupleAndKeywords(args, keywds,
- "O|O&" NPY_SSIZE_T_PYFMT "s" NPY_OFF_T_PYFMT ":fromfile", kwlist,
- &file, PyArray_DescrConverter, &type, &nin, &sep, &offset)) {
+ "O|O&" NPY_SSIZE_T_PYFMT "s" NPY_OFF_T_PYFMT "$O:fromfile", kwlist,
+ &file, PyArray_DescrConverter, &type, &nin, &sep, &offset, &like)) {
Py_XDECREF(type);
return NULL;
}
+ array_function_result = array_implement_c_array_function_creation(
+ "fromfile", args, keywds);
+ if (array_function_result != Py_NotImplemented) {
+ return array_function_result;
+ }
+
file = NpyPath_PathlikeToFspath(file);
if (file == NULL) {
return NULL;
@@ -2106,7 +2174,7 @@ array_fromfile(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds)
Py_DECREF(file);
return NULL;
}
- if (PyString_Check(file) || PyUnicode_Check(file)) {
+ if (PyBytes_Check(file) || PyUnicode_Check(file)) {
Py_SETREF(file, npy_PyFile_OpenFile(file, "rb"));
if (file == NULL) {
Py_XDECREF(type);
@@ -2161,15 +2229,25 @@ array_fromiter(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds)
{
PyObject *iter;
Py_ssize_t nin = -1;
- static char *kwlist[] = {"iter", "dtype", "count", NULL};
+ static char *kwlist[] = {"iter", "dtype", "count", "like", NULL};
+ PyObject *like = NULL;
PyArray_Descr *descr = NULL;
+ PyObject *array_function_result = NULL;
if (!PyArg_ParseTupleAndKeywords(args, keywds,
- "OO&|" NPY_SSIZE_T_PYFMT ":fromiter", kwlist,
- &iter, PyArray_DescrConverter, &descr, &nin)) {
+ "OO&|" NPY_SSIZE_T_PYFMT "$O:fromiter", kwlist,
+ &iter, PyArray_DescrConverter, &descr, &nin, &like)) {
Py_XDECREF(descr);
return NULL;
}
+
+ array_function_result = array_implement_c_array_function_creation(
+ "fromiter", args, keywds);
+ if (array_function_result != Py_NotImplemented) {
+ Py_DECREF(descr);
+ return array_function_result;
+ }
+
return PyArray_FromIter(iter, descr, (npy_intp)nin);
}
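
All of these creation wrappers gain the same three steps: parse a keyword-only `like` argument via the `$` marker in the format string, hand the untouched args/kwargs to array_implement_c_array_function_creation(), and only fall through to the regular NumPy path when that dispatcher returns Py_NotImplemented. A condensed sketch of the shared shape (everything except the dispatcher name is hypothetical):

static PyObject *
creation_wrapper(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
{
    static char *kwlist[] = {"obj", "like", NULL};
    PyObject *obj = NULL;
    PyObject *like = NULL;

    /* `$` makes every following argument keyword-only. */
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|$O:creation_wrapper",
                                     kwlist, &obj, &like)) {
        return NULL;
    }

    /* Defer to like.__array_function__ when like= was passed. */
    PyObject *deferred = array_implement_c_array_function_creation(
            "creation_wrapper", args, kwds);
    if (deferred != Py_NotImplemented) {
        return deferred;
    }

    /* ... the regular NumPy implementation would run here ... */
    Py_RETURN_NONE;
}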
@@ -2178,15 +2256,24 @@ array_frombuffer(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds
{
PyObject *obj = NULL;
Py_ssize_t nin = -1, offset = 0;
- static char *kwlist[] = {"buffer", "dtype", "count", "offset", NULL};
+ static char *kwlist[] = {"buffer", "dtype", "count", "offset", "like", NULL};
+ PyObject *like = NULL;
PyArray_Descr *type = NULL;
+ PyObject *array_function_result = NULL;
if (!PyArg_ParseTupleAndKeywords(args, keywds,
- "O|O&" NPY_SSIZE_T_PYFMT NPY_SSIZE_T_PYFMT ":frombuffer", kwlist,
- &obj, PyArray_DescrConverter, &type, &nin, &offset)) {
+ "O|O&" NPY_SSIZE_T_PYFMT NPY_SSIZE_T_PYFMT "$O:frombuffer", kwlist,
+ &obj, PyArray_DescrConverter, &type, &nin, &offset, &like)) {
Py_XDECREF(type);
return NULL;
}
+
+ array_function_result = array_implement_c_array_function_creation(
+ "frombuffer", args, keywds);
+ if (array_function_result != Py_NotImplemented) {
+ return array_function_result;
+ }
+
if (type == NULL) {
type = PyArray_DescrFromType(NPY_DEFAULT_TYPE);
}
@@ -2198,11 +2285,27 @@ array_concatenate(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
{
PyObject *a0;
PyObject *out = NULL;
+ PyArray_Descr *dtype = NULL;
+ NPY_CASTING casting = NPY_SAME_KIND_CASTING;
+ PyObject *casting_obj = NULL;
+ PyObject *res;
int axis = 0;
- static char *kwlist[] = {"seq", "axis", "out", NULL};
-
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&O:concatenate", kwlist,
- &a0, PyArray_AxisConverter, &axis, &out)) {
+ static char *kwlist[] = {"seq", "axis", "out", "dtype", "casting", NULL};
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&O$O&O:concatenate", kwlist,
+ &a0, PyArray_AxisConverter, &axis, &out,
+ PyArray_DescrConverter2, &dtype, &casting_obj)) {
+ return NULL;
+ }
+ int casting_not_passed = 0;
+ if (casting_obj == NULL) {
+ /*
+ * Casting was not passed in; track this for the deprecation
+ * warning only. Simplify once the deprecation is finished.
+ */
+ casting_not_passed = 1;
+ }
+ else if (!PyArray_CastingConverter(casting_obj, &casting)) {
+ Py_XDECREF(dtype);
return NULL;
}
if (out != NULL) {
@@ -2211,10 +2314,14 @@ array_concatenate(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
}
else if (!PyArray_Check(out)) {
PyErr_SetString(PyExc_TypeError, "'out' must be an array");
+ Py_XDECREF(dtype);
return NULL;
}
}
- return PyArray_ConcatenateInto(a0, axis, (PyArrayObject *)out);
+ res = PyArray_ConcatenateInto(a0, axis, (PyArrayObject *)out, dtype,
+ casting, casting_not_passed);
+ Py_XDECREF(dtype);
+ return res;
}
static PyObject *
@@ -2635,7 +2742,7 @@ array_einsum(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
arg0 = PyTuple_GET_ITEM(args, 0);
/* einsum('i,j', a, b), einsum('i,j->ij', a, b) */
- if (PyString_Check(arg0) || PyUnicode_Check(arg0)) {
+ if (PyBytes_Check(arg0) || PyUnicode_Check(arg0)) {
nop = einsum_sub_op_from_str(args, &str_obj, &subscripts, op);
}
/* einsum(a, [0], b, [1]), einsum(a, [0], b, [1], [0,1]) */
@@ -2766,17 +2873,41 @@ array_correlate2(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
static PyObject *
array_arange(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kws) {
PyObject *o_start = NULL, *o_stop = NULL, *o_step = NULL, *range=NULL;
- static char *kwd[]= {"start", "stop", "step", "dtype", NULL};
+ PyObject *like = NULL;
+ PyObject *array_function_result = NULL;
+ static char *kwd[] = {"start", "stop", "step", "dtype", "like", NULL};
PyArray_Descr *typecode = NULL;
- if (!PyArg_ParseTupleAndKeywords(args, kws, "O|OOO&:arange", kwd,
+ if (!PyArg_ParseTupleAndKeywords(args, kws, "|OOOO&$O:arange", kwd,
&o_start,
&o_stop,
&o_step,
- PyArray_DescrConverter2, &typecode)) {
+ PyArray_DescrConverter2, &typecode,
+ &like)) {
Py_XDECREF(typecode);
return NULL;
}
+
+ if (o_stop == NULL) {
+ if (args == NULL || PyTuple_GET_SIZE(args) == 0){
+ PyErr_SetString(PyExc_TypeError,
+ "arange() requires stop to be specified.");
+ Py_XDECREF(typecode);
+ return NULL;
+ }
+ }
+ else if (o_start == NULL) {
+ o_start = o_stop;
+ o_stop = NULL;
+ }
+
+ array_function_result = array_implement_c_array_function_creation(
+ "arange", args, kws);
+ if (array_function_result != Py_NotImplemented) {
+ Py_XDECREF(typecode);
+ return array_function_result;
+ }
+
range = PyArray_ArangeObj(o_start, o_stop, o_step, typecode);
Py_XDECREF(typecode);
@@ -2810,7 +2941,7 @@ array__get_ndarray_c_version(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObje
if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist )) {
return NULL;
}
- return PyInt_FromLong( (long) PyArray_GetNDArrayCVersion() );
+ return PyLong_FromLong( (long) PyArray_GetNDArrayCVersion() );
}
/*NUMPY_API
@@ -3708,7 +3839,7 @@ _vec_string(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUSED(kw
}
if (PyArray_TYPE(char_array) == NPY_STRING) {
- method = PyObject_GetAttr((PyObject *)&PyString_Type, method_name);
+ method = PyObject_GetAttr((PyObject *)&PyBytes_Type, method_name);
}
else if (PyArray_TYPE(char_array) == NPY_UNICODE) {
method = PyObject_GetAttr((PyObject *)&PyUnicode_Type, method_name);
@@ -3804,36 +3935,6 @@ _PyArray_GetSigintBuf(void)
static PyObject *
-test_interrupt(PyObject *NPY_UNUSED(self), PyObject *args)
-{
- int kind = 0;
- int a = 0;
-
- if (!PyArg_ParseTuple(args, "|i:test_interrupt", &kind)) {
- return NULL;
- }
- if (kind) {
- Py_BEGIN_ALLOW_THREADS;
- while (a >= 0) {
- if ((a % 1000 == 0) && PyOS_InterruptOccurred()) {
- break;
- }
- a += 1;
- }
- Py_END_ALLOW_THREADS;
- }
- else {
- NPY_SIGINT_ON
- while(a >= 0) {
- a += 1;
- }
- NPY_SIGINT_OFF
- }
- return PyInt_FromLong(a);
-}
-
-
-static PyObject *
array_shares_memory_impl(PyObject *args, PyObject *kwds, Py_ssize_t default_max_work,
int raise_exceptions)
{
@@ -3980,7 +4081,7 @@ normalize_axis_index(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
return NULL;
}
- return PyInt_FromLong(axis);
+ return PyLong_FromLong(axis);
}
@@ -4128,9 +4229,6 @@ static struct PyMethodDef array_module_methods[] = {
{"_vec_string",
(PyCFunction)_vec_string,
METH_VARARGS | METH_KEYWORDS, NULL},
- {"test_interrupt",
- (PyCFunction)test_interrupt,
- METH_VARARGS, NULL},
{"_insert", (PyCFunction)arr_insert,
METH_VARARGS | METH_KEYWORDS,
"Insert vals sequentially into equivalent 1-d positions "
@@ -4162,6 +4260,8 @@ static struct PyMethodDef array_module_methods[] = {
METH_VARARGS, NULL},
{"_discover_array_parameters", (PyCFunction)_discover_array_parameters,
METH_VARARGS | METH_KEYWORDS, NULL},
+ {"_get_castingimpl", (PyCFunction)_get_castingimpl,
+ METH_VARARGS | METH_KEYWORDS, NULL},
/* from umath */
{"frompyfunc",
(PyCFunction) ufunc_frompyfunc,
@@ -4180,6 +4280,7 @@ static struct PyMethodDef array_module_methods[] = {
};
#include "__multiarray_api.c"
+#include "array_method.h"
/* Establish scalar-type hierarchy
*
@@ -4202,7 +4303,7 @@ setup_scalartypes(PyObject *NPY_UNUSED(dict))
if (PyType_Ready(&PyComplex_Type) < 0) {
return -1;
}
- if (PyType_Ready(&PyString_Type) < 0) {
+ if (PyType_Ready(&PyBytes_Type) < 0) {
return -1;
}
if (PyType_Ready(&PyUnicode_Type) < 0) {
@@ -4274,12 +4375,6 @@ setup_scalartypes(PyObject *NPY_UNUSED(dict))
/* Timedelta is an integer with an associated unit */
SINGLE_INHERIT(Timedelta, SignedInteger);
- /*
- fprintf(stderr,
- "tp_free = %p, PyObject_Del = %p, int_tp_free = %p, base.tp_free = %p\n",
- PyIntArrType_Type.tp_free, PyObject_Del, PyInt_Type.tp_free,
- PySignedIntegerArrType_Type.tp_free);
- */
SINGLE_INHERIT(UByte, UnsignedInteger);
SINGLE_INHERIT(UShort, UnsignedInteger);
SINGLE_INHERIT(UInt, UnsignedInteger);
@@ -4325,13 +4420,13 @@ set_flaginfo(PyObject *d)
newd = PyDict_New();
#define _addnew(key, val, one) \
- PyDict_SetItemString(newd, #key, s=PyInt_FromLong(val)); \
+ PyDict_SetItemString(newd, #key, s=PyLong_FromLong(val)); \
Py_DECREF(s); \
- PyDict_SetItemString(newd, #one, s=PyInt_FromLong(val)); \
+ PyDict_SetItemString(newd, #one, s=PyLong_FromLong(val)); \
Py_DECREF(s)
#define _addone(key, val) \
- PyDict_SetItemString(newd, #key, s=PyInt_FromLong(val)); \
+ PyDict_SetItemString(newd, #key, s=PyLong_FromLong(val)); \
Py_DECREF(s)
_addnew(OWNDATA, NPY_ARRAY_OWNDATA, O);
@@ -4364,28 +4459,33 @@ NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_dtype = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_ndmin = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_axis1 = NULL;
NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_axis2 = NULL;
+NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_like = NULL;
+NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_numpy = NULL;
static int
intern_strings(void)
{
- npy_ma_str_array = PyUString_InternFromString("__array__");
- npy_ma_str_array_prepare = PyUString_InternFromString("__array_prepare__");
- npy_ma_str_array_wrap = PyUString_InternFromString("__array_wrap__");
- npy_ma_str_array_finalize = PyUString_InternFromString("__array_finalize__");
- npy_ma_str_ufunc = PyUString_InternFromString("__array_ufunc__");
- npy_ma_str_implementation = PyUString_InternFromString("_implementation");
- npy_ma_str_order = PyUString_InternFromString("order");
- npy_ma_str_copy = PyUString_InternFromString("copy");
- npy_ma_str_dtype = PyUString_InternFromString("dtype");
- npy_ma_str_ndmin = PyUString_InternFromString("ndmin");
- npy_ma_str_axis1 = PyUString_InternFromString("axis1");
- npy_ma_str_axis2 = PyUString_InternFromString("axis2");
+ npy_ma_str_array = PyUnicode_InternFromString("__array__");
+ npy_ma_str_array_prepare = PyUnicode_InternFromString("__array_prepare__");
+ npy_ma_str_array_wrap = PyUnicode_InternFromString("__array_wrap__");
+ npy_ma_str_array_finalize = PyUnicode_InternFromString("__array_finalize__");
+ npy_ma_str_ufunc = PyUnicode_InternFromString("__array_ufunc__");
+ npy_ma_str_implementation = PyUnicode_InternFromString("_implementation");
+ npy_ma_str_order = PyUnicode_InternFromString("order");
+ npy_ma_str_copy = PyUnicode_InternFromString("copy");
+ npy_ma_str_dtype = PyUnicode_InternFromString("dtype");
+ npy_ma_str_ndmin = PyUnicode_InternFromString("ndmin");
+ npy_ma_str_axis1 = PyUnicode_InternFromString("axis1");
+ npy_ma_str_axis2 = PyUnicode_InternFromString("axis2");
+ npy_ma_str_like = PyUnicode_InternFromString("like");
+ npy_ma_str_numpy = PyUnicode_InternFromString("numpy");
return npy_ma_str_array && npy_ma_str_array_prepare &&
npy_ma_str_array_wrap && npy_ma_str_array_finalize &&
npy_ma_str_ufunc && npy_ma_str_implementation &&
npy_ma_str_order && npy_ma_str_copy && npy_ma_str_dtype &&
- npy_ma_str_ndmin && npy_ma_str_axis1 && npy_ma_str_axis2;
+ npy_ma_str_ndmin && npy_ma_str_axis1 && npy_ma_str_axis2 &&
+ npy_ma_str_like && npy_ma_str_numpy;
}
static struct PyModuleDef moduledef = {
@@ -4510,14 +4610,14 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
goto err;
}
- c_api = NpyCapsule_FromVoidPtr((void *)PyArray_API, NULL);
+ c_api = PyCapsule_New((void *)PyArray_API, NULL, NULL);
if (c_api == NULL) {
goto err;
}
PyDict_SetItemString(d, "_ARRAY_API", c_api);
Py_DECREF(c_api);
- c_api = NpyCapsule_FromVoidPtr((void *)PyUFunc_API, NULL);
+ c_api = PyCapsule_New((void *)PyUFunc_API, NULL, NULL);
if (c_api == NULL) {
goto err;
}
@@ -4535,11 +4635,11 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
*/
PyDict_SetItemString (d, "error", PyExc_Exception);
- s = PyInt_FromLong(NPY_TRACE_DOMAIN);
+ s = PyLong_FromLong(NPY_TRACE_DOMAIN);
PyDict_SetItemString(d, "tracemalloc_domain", s);
Py_DECREF(s);
- s = PyUString_FromString("3.1");
+ s = PyUnicode_FromString("3.1");
PyDict_SetItemString(d, "__version__", s);
Py_DECREF(s);
@@ -4573,7 +4673,7 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
}
Py_DECREF(s);
- s = NpyCapsule_FromVoidPtr((void *)_datetime_strings, NULL);
+ s = PyCapsule_New((void *)_datetime_strings, NULL, NULL);
if (s == NULL) {
goto err;
}
@@ -4581,7 +4681,7 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
Py_DECREF(s);
#define ADDCONST(NAME) \
- s = PyInt_FromLong(NPY_##NAME); \
+ s = PyLong_FromLong(NPY_##NAME); \
PyDict_SetItemString(d, #NAME, s); \
Py_DECREF(s)
@@ -4631,9 +4731,20 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
if (set_typeinfo(d) != 0) {
goto err;
}
+ if (PyType_Ready(&PyArrayMethod_Type) < 0) {
+ goto err;
+ }
+ if (PyType_Ready(&PyBoundArrayMethod_Type) < 0) {
+ goto err;
+ }
if (initialize_and_map_pytypes_to_dtypes() < 0) {
goto err;
}
+
+ if (PyArray_InitializeCasts() < 0) {
+ goto err;
+ }
+
if (initumath(m) != 0) {
goto err;
}
diff --git a/numpy/core/src/multiarray/multiarraymodule.h b/numpy/core/src/multiarray/multiarraymodule.h
index dd437e091..d3ee3337c 100644
--- a/numpy/core/src/multiarray/multiarraymodule.h
+++ b/numpy/core/src/multiarray/multiarraymodule.h
@@ -13,5 +13,7 @@ NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_dtype;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_ndmin;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_axis1;
NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_axis2;
+NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_like;
+NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_numpy;
#endif
diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c
index a5b5e5c51..059f2c437 100644
--- a/numpy/core/src/multiarray/nditer_api.c
+++ b/numpy/core/src/multiarray/nditer_api.c
@@ -229,13 +229,22 @@ NpyIter_EnableExternalLoop(NpyIter *iter)
return NpyIter_Reset(iter, NULL);
}
+
+static char *_reset_cast_error = (
+ "Iterator reset failed due to a casting failure. "
+ "This error is set as a Python error.");
+
/*NUMPY_API
* Resets the iterator to its initial state
*
+ * The use of errmsg is discouraged; it cannot be guaranteed that the GIL
+ * will not be grabbed on casting errors even when this is passed.
+ *
* If errmsg is non-NULL, it should point to a variable which will
* receive the error message, and no Python exception will be set.
* This is so that the function can be called from code not holding
- * the GIL.
+ * the GIL. Note that cast errors may still lead to the GIL being
+ * grabbed temporarily.
*/
NPY_NO_EXPORT int
NpyIter_Reset(NpyIter *iter, char **errmsg)
@@ -250,6 +259,9 @@ NpyIter_Reset(NpyIter *iter, char **errmsg)
/* If buffer allocation was delayed, do it now */
if (itflags&NPY_ITFLAG_DELAYBUF) {
if (!npyiter_allocate_buffers(iter, errmsg)) {
+ if (errmsg != NULL) {
+ *errmsg = _reset_cast_error;
+ }
return NPY_FAIL;
}
NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_DELAYBUF;
@@ -257,7 +269,7 @@ NpyIter_Reset(NpyIter *iter, char **errmsg)
else {
/*
* If the iterindex is already right, no need to
- * do anything
+ * do anything (and no cast error has previously occurred).
*/
bufferdata = NIT_BUFFERDATA(iter);
if (NIT_ITERINDEX(iter) == NIT_ITERSTART(iter) &&
@@ -265,9 +277,12 @@ NpyIter_Reset(NpyIter *iter, char **errmsg)
NBF_SIZE(bufferdata) > 0) {
return NPY_SUCCEED;
}
-
- /* Copy any data from the buffers back to the arrays */
- npyiter_copy_from_buffers(iter);
+ if (npyiter_copy_from_buffers(iter) < 0) {
+ if (errmsg != NULL) {
+ *errmsg = _reset_cast_error;
+ }
+ return NPY_FAIL;
+ }
}
}
@@ -275,7 +290,12 @@ NpyIter_Reset(NpyIter *iter, char **errmsg)
if (itflags&NPY_ITFLAG_BUFFER) {
/* Prepare the next buffers and set iterend/size */
- npyiter_copy_to_buffers(iter, NULL);
+ if (npyiter_copy_to_buffers(iter, NULL) < 0) {
+ if (errmsg != NULL) {
+ *errmsg = _reset_cast_error;
+ }
+ return NPY_FAIL;
+ }
}
return NPY_SUCCEED;
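
Callers that pass errmsg (typically because they run with the GIL released) now have to allow for a real Python exception having been raised by a buffered cast as well. A sketch of the calling convention, with hypothetical surrounding code:

/* Reset `iter` while temporarily releasing the GIL (sketch). */
static int
reset_iter_nogil(NpyIter *iter)
{
    char *errmsg = NULL;
    int ok;

    NPY_BEGIN_ALLOW_THREADS;
    ok = NpyIter_Reset(iter, &errmsg);
    NPY_END_ALLOW_THREADS;

    if (ok != NPY_SUCCEED) {
        /* A buffered cast failure may already have set a Python exception. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError, errmsg);
        }
        return -1;
    }
    return 0;
}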
@@ -288,7 +308,8 @@ NpyIter_Reset(NpyIter *iter, char **errmsg)
* If errmsg is non-NULL, it should point to a variable which will
* receive the error message, and no Python exception will be set.
* This is so that the function can be called from code not holding
- * the GIL.
+ * the GIL. Note that cast errors may still lead to the GIL being
+ * grabbed temporarily.
*/
NPY_NO_EXPORT int
NpyIter_ResetBasePointers(NpyIter *iter, char **baseptrs, char **errmsg)
@@ -309,8 +330,12 @@ NpyIter_ResetBasePointers(NpyIter *iter, char **baseptrs, char **errmsg)
NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_DELAYBUF;
}
else {
- /* Copy any data from the buffers back to the arrays */
- npyiter_copy_from_buffers(iter);
+ if (npyiter_copy_from_buffers(iter) < 0) {
+ if (errmsg != NULL) {
+ *errmsg = _reset_cast_error;
+ }
+ return NPY_FAIL;
+ }
}
}
@@ -323,7 +348,12 @@ NpyIter_ResetBasePointers(NpyIter *iter, char **baseptrs, char **errmsg)
if (itflags&NPY_ITFLAG_BUFFER) {
/* Prepare the next buffers and set iterend/size */
- npyiter_copy_to_buffers(iter, NULL);
+ if (npyiter_copy_to_buffers(iter, NULL) < 0) {
+ if (errmsg != NULL) {
+ *errmsg = _reset_cast_error;
+ }
+ return NPY_FAIL;
+ }
}
return NPY_SUCCEED;
@@ -335,7 +365,8 @@ NpyIter_ResetBasePointers(NpyIter *iter, char **baseptrs, char **errmsg)
* If errmsg is non-NULL, it should point to a variable which will
* receive the error message, and no Python exception will be set.
* This is so that the function can be called from code not holding
- * the GIL.
+ * the GIL. Note that cast errors may still lead to the GIL being
+ * grabbed temporarily.
*/
NPY_NO_EXPORT int
NpyIter_ResetToIterIndexRange(NpyIter *iter,
@@ -633,12 +664,16 @@ NpyIter_GotoIterIndex(NpyIter *iter, npy_intp iterindex)
/* Start the buffer at the provided iterindex */
else {
/* Write back to the arrays */
- npyiter_copy_from_buffers(iter);
+ if (npyiter_copy_from_buffers(iter) < 0) {
+ return NPY_FAIL;
+ }
npyiter_goto_iterindex(iter, iterindex);
/* Prepare the next buffers and set iterend/size */
- npyiter_copy_to_buffers(iter, NULL);
+ if (npyiter_copy_to_buffers(iter, NULL) < 0) {
+ return NPY_FAIL;
+ }
}
}
else {
@@ -1376,6 +1411,7 @@ NpyIter_GetInnerLoopSizePtr(NpyIter *iter)
}
}
+
/*NUMPY_API
* For debugging
*/
@@ -1828,7 +1864,7 @@ npyiter_goto_iterindex(NpyIter *iter, npy_intp iterindex)
* their data needs to be written back to the arrays. The multi-index
* must be positioned for the beginning of the buffer.
*/
-NPY_NO_EXPORT void
+NPY_NO_EXPORT int
npyiter_copy_from_buffers(NpyIter *iter)
{
npy_uint32 itflags = NIT_ITFLAGS(iter);
@@ -1861,7 +1897,7 @@ npyiter_copy_from_buffers(NpyIter *iter)
/* If we're past the end, nothing to copy */
if (NBF_SIZE(bufferdata) == 0) {
- return;
+ return 0;
}
NPY_IT_DBG_PRINT("Iterator: Copying buffers to outputs\n");
@@ -1968,7 +2004,7 @@ npyiter_copy_from_buffers(NpyIter *iter)
maskptr = (npy_bool *)ad_ptrs[maskop];
}
- PyArray_TransferMaskedStridedToNDim(ndim_transfer,
+ if (PyArray_TransferMaskedStridedToNDim(ndim_transfer,
ad_ptrs[iop], dst_strides, axisdata_incr,
buffer, src_stride,
maskptr, strides[maskop],
@@ -1976,18 +2012,22 @@ npyiter_copy_from_buffers(NpyIter *iter)
dst_shape, axisdata_incr,
op_transfersize, dtypes[iop]->elsize,
(PyArray_MaskedStridedUnaryOp *)stransfer,
- transferdata);
+ transferdata) < 0) {
+ return -1;
+ }
}
/* Regular operand */
else {
- PyArray_TransferStridedToNDim(ndim_transfer,
+ if (PyArray_TransferStridedToNDim(ndim_transfer,
ad_ptrs[iop], dst_strides, axisdata_incr,
buffer, src_stride,
dst_coords, axisdata_incr,
dst_shape, axisdata_incr,
op_transfersize, dtypes[iop]->elsize,
stransfer,
- transferdata);
+ transferdata) < 0) {
+ return -1;
+ }
}
}
/* If there's no copy back, we may have to decrement refs. In
@@ -2002,9 +2042,13 @@ npyiter_copy_from_buffers(NpyIter *iter)
NPY_IT_DBG_PRINT1("Iterator: Freeing refs and zeroing buffer "
"of operand %d\n", (int)iop);
/* Decrement refs */
- stransfer(NULL, 0, buffer, dtypes[iop]->elsize,
- transfersize, dtypes[iop]->elsize,
- transferdata);
+ if (stransfer(NULL, 0, buffer, dtypes[iop]->elsize,
+ transfersize, dtypes[iop]->elsize,
+ transferdata) < 0) {
+ /* Since this should only decrement, it should never error */
+ assert(0);
+ return -1;
+ }
/*
* Zero out the memory for safety. For instance,
* if during iteration some Python code copied an
@@ -2016,6 +2060,7 @@ npyiter_copy_from_buffers(NpyIter *iter)
}
NPY_IT_DBG_PRINT("Iterator: Finished copying buffers to outputs\n");
+ return 0;
}
/*
@@ -2023,7 +2068,7 @@ npyiter_copy_from_buffers(NpyIter *iter)
* for the start of a buffer. It decides which operands need a buffer,
* and copies the data into the buffers.
*/
-NPY_NO_EXPORT void
+NPY_NO_EXPORT int
npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs)
{
npy_uint32 itflags = NIT_ITFLAGS(iter);
@@ -2142,7 +2187,7 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs)
NBF_BUFITEREND(bufferdata) = iterindex + reduce_innersize;
if (reduce_innersize == 0) {
NBF_REDUCE_OUTERSIZE(bufferdata) = 0;
- return;
+ return 0;
}
else {
NBF_REDUCE_OUTERSIZE(bufferdata) = transfersize/reduce_innersize;
@@ -2508,14 +2553,15 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs)
"buffer (%d items)\n",
(int)iop, (int)op_transfersize);
- PyArray_TransferNDimToStrided(ndim_transfer,
- ptrs[iop], dst_stride,
- ad_ptrs[iop], src_strides, axisdata_incr,
- src_coords, axisdata_incr,
- src_shape, axisdata_incr,
- op_transfersize, src_itemsize,
- stransfer,
- transferdata);
+ if (PyArray_TransferNDimToStrided(
+ ndim_transfer, ptrs[iop], dst_stride,
+ ad_ptrs[iop], src_strides, axisdata_incr,
+ src_coords, axisdata_incr,
+ src_shape, axisdata_incr,
+ op_transfersize, src_itemsize,
+ stransfer, transferdata) < 0) {
+ return -1;
+ }
}
}
else if (ptrs[iop] == buffers[iop]) {
@@ -2551,8 +2597,80 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs)
NPY_IT_DBG_PRINT1("Iterator: Finished copying inputs to buffers "
"(buffered size is %d)\n", (int)NBF_SIZE(bufferdata));
+ return 0;
}
+
+/**
+ * This function clears any references still held by the buffers and should
+ * only be used to discard buffers if an error occurred.
+ *
+ * @param iter Iterator
+ */
+NPY_NO_EXPORT void
+npyiter_clear_buffers(NpyIter *iter)
+{
+ int nop = iter->nop;
+ NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+
+ if (NBF_SIZE(bufferdata) == 0) {
+ /* if the buffers are empty already, there is nothing to do */
+ return;
+ }
+
+ if (!(NIT_ITFLAGS(iter) & NPY_ITFLAG_NEEDSAPI)) {
+ /* Buffers do not require clearing, but should not be copied back */
+ NBF_SIZE(bufferdata) = 0;
+ return;
+ }
+
+ /*
+ * The iterator may be using a dtype with references, which always
+ * requires the API. In that case, further cleanup may be necessary.
+ *
+ * TODO: At this time, we assume that a dtype having references
+ * implies the need to hold the GIL at all times. A future
+ * `PyArray_Item_XDECREF` API could broaden this definition,
+ * at which point this assumption would become incorrect.
+ */
+ PyObject *type, *value, *traceback;
+ PyErr_Fetch(&type, &value, &traceback);
+
+ /* Cleanup any buffers with references */
+ char **buffers = NBF_BUFFERS(bufferdata);
+ PyArray_Descr **dtypes = NIT_DTYPES(iter);
+ for (int iop = 0; iop < nop; ++iop, ++buffers) {
+ /*
+ * We may want to find a better way to do this; on the other hand,
+ * this cleanup seems rare and fairly special. A dtype using
+ * references (right now only our own dtypes) must always keep the
+ * buffer in a well-defined state (either NULL or owning the
+ * reference), and only NumPy itself implements this cleanup.
+ */
+ if (!PyDataType_REFCHK(dtypes[iop])) {
+ continue;
+ }
+ if (*buffers == 0) {
+ continue;
+ }
+ int itemsize = dtypes[iop]->elsize;
+ for (npy_intp i = 0; i < NBF_SIZE(bufferdata); i++) {
+ /*
+ * See above comment, if this API is expanded the GIL assumption
+ * could become incorrect.
+ */
+ PyArray_Item_XDECREF(*buffers + (itemsize * i), dtypes[iop]);
+ }
+ /* Clear out the buffer just to be sure */
+ memset(*buffers, 0, NBF_SIZE(bufferdata) * itemsize);
+ }
+ /* Signal that the buffers are empty */
+ NBF_SIZE(bufferdata) = 0;
+ PyErr_Restore(type, value, traceback);
+}
+
+
/*
* This checks how much space can be buffered without encountering the
* same value twice, or for operands whose innermost stride is zero,
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index 7da17eafe..b379a28ac 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -476,7 +476,10 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
}
/* Prepare the next buffers and set iterend/size */
- npyiter_copy_to_buffers(iter, NULL);
+ if (npyiter_copy_to_buffers(iter, NULL) < 0) {
+ NpyIter_Deallocate(iter);
+ return NULL;
+ }
}
}
@@ -642,21 +645,27 @@ NpyIter_Copy(NpyIter *iter)
}
/*NUMPY_API
- * Deallocate an iterator
+ * Deallocate an iterator.
+ *
+ * To work correctly when an error is in progress, we have to check
+ * `PyErr_Occurred()`. This is necessary when buffers are not finalized
+ * or WritebackIfCopy is used. We could avoid that check by exposing a new
+ * function which is passed in whether or not a Python error is already set.
*/
NPY_NO_EXPORT int
NpyIter_Deallocate(NpyIter *iter)
{
+ int success = PyErr_Occurred() == NULL;
+
npy_uint32 itflags;
/*int ndim = NIT_NDIM(iter);*/
int iop, nop;
PyArray_Descr **dtype;
PyArrayObject **object;
npyiter_opitflags *op_itflags;
- npy_bool resolve = 1;
if (iter == NULL) {
- return NPY_SUCCEED;
+ return success;
}
itflags = NIT_ITFLAGS(iter);
@@ -667,13 +676,23 @@ NpyIter_Deallocate(NpyIter *iter)
/* Deallocate any buffers and buffering data */
if (itflags & NPY_ITFLAG_BUFFER) {
+ /* Ensure no data is held by the buffers before they are cleared */
+ if (success) {
+ if (npyiter_copy_from_buffers(iter) < 0) {
+ success = NPY_FAIL;
+ }
+ }
+ else {
+ npyiter_clear_buffers(iter);
+ }
+
NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
char **buffers;
NpyAuxData **transferdata;
/* buffers */
buffers = NBF_BUFFERS(bufferdata);
- for(iop = 0; iop < nop; ++iop, ++buffers) {
+ for (iop = 0; iop < nop; ++iop, ++buffers) {
PyArray_free(*buffers);
}
/* read bufferdata */
@@ -694,12 +713,12 @@ NpyIter_Deallocate(NpyIter *iter)
/*
* Deallocate all the dtypes and objects that were iterated and resolve
- * any writeback buffers created by the iterator
+ * any writeback buffers created by the iterator.
*/
- for(iop = 0; iop < nop; ++iop, ++dtype, ++object) {
+ for (iop = 0; iop < nop; ++iop, ++dtype, ++object) {
if (op_itflags[iop] & NPY_OP_ITFLAG_HAS_WRITEBACK) {
- if (resolve && PyArray_ResolveWritebackIfCopy(*object) < 0) {
- resolve = 0;
+ if (success && PyArray_ResolveWritebackIfCopy(*object) < 0) {
+ success = 0;
}
else {
PyArray_DiscardWritebackIfCopy(*object);
@@ -711,12 +730,10 @@ NpyIter_Deallocate(NpyIter *iter)
/* Deallocate the iterator memory */
PyObject_Free(iter);
- if (resolve == 0) {
- return NPY_FAIL;
- }
- return NPY_SUCCEED;
+ return success;
}
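
Because deallocation may now flush buffers and resolve writebacks, it can fail; callers should propagate that instead of assuming cleanup always succeeds, as the boolean-subscript paths earlier in this diff already do. A minimal caller sketch:

/* Finish an iteration, surfacing any error raised while flushing buffers. */
static PyObject *
finish_iteration(NpyIter *iter, PyObject *result)
{
    if (!NpyIter_Deallocate(iter)) {
        /* an error was already in flight, or writeback/flush just failed */
        Py_XDECREF(result);
        return NULL;
    }
    return result;
}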
+
/* Checks 'flags' for (C|F)_ORDER_INDEX, MULTI_INDEX, and EXTERNAL_LOOP,
* setting the appropriate internal flags in 'itflags'.
*
@@ -1733,73 +1750,70 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itf
return 1;
broadcast_error: {
- PyObject *errmsg, *tmp;
npy_intp remdims[NPY_MAXDIMS];
- char *tmpstr;
if (op_axes == NULL) {
- errmsg = PyUString_FromString("operands could not be broadcast "
- "together with shapes ");
- if (errmsg == NULL) {
+ PyObject *shape1 = PyUnicode_FromString("");
+ if (shape1 == NULL) {
return 0;
}
for (iop = 0; iop < nop; ++iop) {
if (op[iop] != NULL) {
- tmp = convert_shape_to_string(PyArray_NDIM(op[iop]),
- PyArray_DIMS(op[iop]),
- " ");
+ int ndims = PyArray_NDIM(op[iop]);
+ npy_intp *dims = PyArray_DIMS(op[iop]);
+ PyObject *tmp = convert_shape_to_string(ndims, dims, " ");
if (tmp == NULL) {
- Py_DECREF(errmsg);
+ Py_DECREF(shape1);
return 0;
}
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ Py_SETREF(shape1, PyUnicode_Concat(shape1, tmp));
+ Py_DECREF(tmp);
+ if (shape1 == NULL) {
return 0;
}
}
}
- if (itershape != NULL) {
- tmp = PyUString_FromString("and requested shape ");
- if (tmp == NULL) {
- Py_DECREF(errmsg);
- return 0;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
- return 0;
- }
-
- tmp = convert_shape_to_string(ndim, itershape, "");
- if (tmp == NULL) {
- Py_DECREF(errmsg);
- return 0;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ if (itershape == NULL) {
+ PyErr_Format(PyExc_ValueError,
+ "operands could not be broadcast together with "
+ "shapes %S", shape1);
+ Py_DECREF(shape1);
+ return 0;
+ }
+ else {
+ PyObject *shape2 = convert_shape_to_string(ndim, itershape, "");
+ if (shape2 == NULL) {
+ Py_DECREF(shape1);
return 0;
}
-
+ PyErr_Format(PyExc_ValueError,
+ "operands could not be broadcast together with "
+ "shapes %S and requested shape %S", shape1, shape2);
+ Py_DECREF(shape1);
+ Py_DECREF(shape2);
+ return 0;
}
- PyErr_SetObject(PyExc_ValueError, errmsg);
- Py_DECREF(errmsg);
}
else {
- errmsg = PyUString_FromString("operands could not be broadcast "
- "together with remapped shapes "
- "[original->remapped]: ");
+ PyObject *shape1 = PyUnicode_FromString("");
+ if (shape1 == NULL) {
+ return 0;
+ }
for (iop = 0; iop < nop; ++iop) {
if (op[iop] != NULL) {
int *axes = op_axes[iop];
+ int ndims = PyArray_NDIM(op[iop]);
+ npy_intp *dims = PyArray_DIMS(op[iop]);
+ char *tmpstr = (axes == NULL) ? " " : "->";
- tmpstr = (axes == NULL) ? " " : "->";
- tmp = convert_shape_to_string(PyArray_NDIM(op[iop]),
- PyArray_DIMS(op[iop]),
- tmpstr);
+ PyObject *tmp = convert_shape_to_string(ndims, dims, tmpstr);
if (tmp == NULL) {
+ Py_DECREF(shape1);
return 0;
}
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ Py_SETREF(shape1, PyUnicode_Concat(shape1, tmp));
+ Py_DECREF(tmp);
+ if (shape1 == NULL) {
return 0;
}
@@ -1814,80 +1828,83 @@ broadcast_error: {
remdims[idim] = -1;
}
}
- tmp = convert_shape_to_string(ndim, remdims, " ");
+ PyObject *tmp = convert_shape_to_string(ndim, remdims, " ");
if (tmp == NULL) {
+ Py_DECREF(shape1);
return 0;
}
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ Py_SETREF(shape1, PyUnicode_Concat(shape1, tmp));
+ Py_DECREF(tmp);
+ if (shape1 == NULL) {
return 0;
}
}
}
}
- if (itershape != NULL) {
- tmp = PyUString_FromString("and requested shape ");
- if (tmp == NULL) {
- Py_DECREF(errmsg);
- return 0;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
- return 0;
- }
-
- tmp = convert_shape_to_string(ndim, itershape, "");
- if (tmp == NULL) {
- Py_DECREF(errmsg);
- return 0;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ if (itershape == NULL) {
+ PyErr_Format(PyExc_ValueError,
+ "operands could not be broadcast together with "
+ "remapped shapes [original->remapped]: %S", shape1);
+ Py_DECREF(shape1);
+ return 0;
+ }
+ else {
+ PyObject *shape2 = convert_shape_to_string(ndim, itershape, "");
+ if (shape2 == NULL) {
+ Py_DECREF(shape1);
return 0;
}
-
+ PyErr_Format(PyExc_ValueError,
+ "operands could not be broadcast together with "
+ "remapped shapes [original->remapped]: %S and "
+ "requested shape %S", shape1, shape2);
+ Py_DECREF(shape1);
+ Py_DECREF(shape2);
+ return 0;
}
- PyErr_SetObject(PyExc_ValueError, errmsg);
- Py_DECREF(errmsg);
}
-
- return 0;
}
operand_different_than_broadcast: {
- npy_intp remdims[NPY_MAXDIMS];
- PyObject *errmsg, *tmp;
-
- /* Start of error message */
- if (op_flags[iop] & NPY_ITER_READONLY) {
- errmsg = PyUString_FromString("non-broadcastable operand "
- "with shape ");
- }
- else {
- errmsg = PyUString_FromString("non-broadcastable output "
- "operand with shape ");
- }
- if (errmsg == NULL) {
+ /* operand shape */
+ int ndims = PyArray_NDIM(op[iop]);
+ npy_intp *dims = PyArray_DIMS(op[iop]);
+ PyObject *shape1 = convert_shape_to_string(ndims, dims, "");
+ if (shape1 == NULL) {
return 0;
}
- /* Operand shape */
- tmp = convert_shape_to_string(PyArray_NDIM(op[iop]),
- PyArray_DIMS(op[iop]), "");
- if (tmp == NULL) {
+ /* Broadcast shape */
+ PyObject *shape2 = convert_shape_to_string(ndim, broadcast_shape, "");
+ if (shape2 == NULL) {
+ Py_DECREF(shape1);
return 0;
}
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+
+ if (op_axes == NULL || op_axes[iop] == NULL) {
+ /* operand shape not remapped */
+
+ if (op_flags[iop] & NPY_ITER_READONLY) {
+ PyErr_Format(PyExc_ValueError,
+ "non-broadcastable operand with shape %S doesn't "
+ "match the broadcast shape %S", shape1, shape2);
+ }
+ else {
+ PyErr_Format(PyExc_ValueError,
+ "non-broadcastable output operand with shape %S doesn't "
+ "match the broadcast shape %S", shape1, shape2);
+ }
+ Py_DECREF(shape1);
+ Py_DECREF(shape2);
return 0;
}
- /* Remapped operand shape */
- if (op_axes != NULL && op_axes[iop] != NULL) {
- int *axes = op_axes[iop];
+ else {
+ /* operand shape remapped */
+ npy_intp remdims[NPY_MAXDIMS];
+ int *axes = op_axes[iop];
for (idim = 0; idim < ndim; ++idim) {
- npy_intp i = axes[ndim-idim-1];
-
+ npy_intp i = axes[ndim - idim - 1];
if (i >= 0 && i < PyArray_NDIM(op[iop])) {
remdims[idim] = PyArray_DIM(op[iop], i);
}
@@ -1896,48 +1913,30 @@ operand_different_than_broadcast: {
}
}
- tmp = PyUString_FromString(" [remapped to ");
- if (tmp == NULL) {
- return 0;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
+ PyObject *shape3 = convert_shape_to_string(ndim, remdims, "");
+ if (shape3 == NULL) {
+ Py_DECREF(shape1);
+ Py_DECREF(shape2);
return 0;
}
- tmp = convert_shape_to_string(ndim, remdims, "]");
- if (tmp == NULL) {
- return 0;
+ if (op_flags[iop] & NPY_ITER_READONLY) {
+ PyErr_Format(PyExc_ValueError,
+ "non-broadcastable operand with shape %S "
+ "[remapped to %S] doesn't match the broadcast shape %S",
+ shape1, shape3, shape2);
}
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
- return 0;
+ else {
+ PyErr_Format(PyExc_ValueError,
+ "non-broadcastable output operand with shape %S "
+ "[remapped to %S] doesn't match the broadcast shape %S",
+ shape1, shape3, shape2);
}
- }
-
- tmp = PyUString_FromString(" doesn't match the broadcast shape ");
- if (tmp == NULL) {
- return 0;
- }
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
- return 0;
- }
-
- /* Broadcast shape */
- tmp = convert_shape_to_string(ndim, broadcast_shape, "");
- if (tmp == NULL) {
+ Py_DECREF(shape1);
+ Py_DECREF(shape2);
+ Py_DECREF(shape3);
return 0;
}
- PyUString_ConcatAndDel(&errmsg, tmp);
- if (errmsg == NULL) {
- return 0;
- }
-
- PyErr_SetObject(PyExc_ValueError, errmsg);
- Py_DECREF(errmsg);
-
- return 0;
}
}
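With NpyIter_Deallocate now reporting failures (casting errors flushed out of the buffers, or a failed writeback resolution), callers are expected to check its return value instead of treating deallocation as infallible. A hedged caller-side sketch, assuming `iter` was built elsewhere and `iternext` came from NpyIter_GetIterNext:

    /* sketch: finish an iteration in which buffered casts may have failed */
    do {
        /* ... work on the current buffer contents ... */
    } while (iternext(iter));

    if (NpyIter_Deallocate(iter) != NPY_SUCCEED) {
        return NULL;   /* an error set during buffered copy-back is now visible */
    }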
diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h
index 1477c8631..378d6f711 100644
--- a/numpy/core/src/multiarray/nditer_impl.h
+++ b/numpy/core/src/multiarray/nditer_impl.h
@@ -269,7 +269,7 @@ struct NpyIter_AD {
#define NAD_STRIDES(axisdata) ( \
&(axisdata)->ad_flexdata + 0)
#define NAD_PTRS(axisdata) ((char **) \
- &(axisdata)->ad_flexdata + 1*(nop+1))
+ (&(axisdata)->ad_flexdata + 1*(nop+1)))
#define NAD_NSTRIDES() \
((nop) + ((itflags&NPY_ITFLAG_HASINDEX) ? 1 : 0))
@@ -342,10 +342,11 @@ NPY_NO_EXPORT int
npyiter_allocate_buffers(NpyIter *iter, char **errmsg);
NPY_NO_EXPORT void
npyiter_goto_iterindex(NpyIter *iter, npy_intp iterindex);
-NPY_NO_EXPORT void
+NPY_NO_EXPORT int
npyiter_copy_from_buffers(NpyIter *iter);
-NPY_NO_EXPORT void
+NPY_NO_EXPORT int
npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs);
-
+NPY_NO_EXPORT void
+npyiter_clear_buffers(NpyIter *iter);
#endif
diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c
index 7f31a5096..8839d1be7 100644
--- a/numpy/core/src/multiarray/nditer_pywrap.c
+++ b/numpy/core/src/multiarray/nditer_pywrap.c
@@ -894,7 +894,7 @@ NpyIter_NestedIters(PyObject *NPY_UNUSED(self),
Py_DECREF(item);
return NULL;
}
- axis = PyInt_AsLong(v);
+ axis = PyLong_AsLong(v);
Py_DECREF(v);
if (axis < 0 || axis >= NPY_MAXDIMS) {
PyErr_SetString(PyExc_ValueError,
@@ -1142,7 +1142,7 @@ npyiter_dealloc(NewNpyArrayIterObject *self)
"results.", 1) < 0) {
PyObject *s;
- s = PyUString_FromString("npyiter_dealloc");
+ s = PyUnicode_FromString("npyiter_dealloc");
if (s) {
PyErr_WriteUnraisable(s);
Py_DECREF(s);
@@ -1268,6 +1268,10 @@ npyiter_iternext(NewNpyArrayIterObject *self)
Py_RETURN_TRUE;
}
else {
+ if (PyErr_Occurred()) {
+ /* A casting error is set; buffer cleanup will occur at reset or dealloc. */
+ return NULL;
+ }
self->finished = 1;
Py_RETURN_FALSE;
}
@@ -1483,6 +1487,10 @@ npyiter_next(NewNpyArrayIterObject *self)
*/
if (self->started) {
if (!self->iternext(self->iter)) {
+ /*
+ * A casting error may be set here (or there may be no error, which
+ * results in a StopIteration). Buffers may only be cleaned up later.
+ */
self->finished = 1;
return NULL;
}
@@ -1514,7 +1522,7 @@ static PyObject *npyiter_shape_get(NewNpyArrayIterObject *self)
if (ret != NULL) {
for (idim = 0; idim < ndim; ++idim) {
PyTuple_SET_ITEM(ret, idim,
- PyInt_FromLong(shape[idim]));
+ PyLong_FromLong(shape[idim]));
}
return ret;
}
@@ -1543,7 +1551,7 @@ static PyObject *npyiter_multi_index_get(NewNpyArrayIterObject *self)
}
for (idim = 0; idim < ndim; ++idim) {
PyTuple_SET_ITEM(ret, idim,
- PyInt_FromLong(multi_index[idim]));
+ PyLong_FromLong(multi_index[idim]));
}
return ret;
}
@@ -1597,7 +1605,7 @@ npyiter_multi_index_set(NewNpyArrayIterObject *self, PyObject *value)
}
for (idim = 0; idim < ndim; ++idim) {
PyObject *v = PySequence_GetItem(value, idim);
- multi_index[idim] = PyInt_AsLong(v);
+ multi_index[idim] = PyLong_AsLong(v);
if (error_converting(multi_index[idim])) {
Py_XDECREF(v);
return -1;
@@ -1633,7 +1641,7 @@ static PyObject *npyiter_index_get(NewNpyArrayIterObject *self)
if (NpyIter_HasIndex(self->iter)) {
npy_intp ind = *NpyIter_GetIndexPtr(self->iter);
- return PyInt_FromLong(ind);
+ return PyLong_FromLong(ind);
}
else {
PyErr_SetString(PyExc_ValueError,
@@ -1657,7 +1665,7 @@ static int npyiter_index_set(NewNpyArrayIterObject *self, PyObject *value)
if (NpyIter_HasIndex(self->iter)) {
npy_intp ind;
- ind = PyInt_AsLong(value);
+ ind = PyLong_AsLong(value);
if (error_converting(ind)) {
return -1;
}
@@ -1689,7 +1697,7 @@ static PyObject *npyiter_iterindex_get(NewNpyArrayIterObject *self)
return NULL;
}
- return PyInt_FromLong(NpyIter_GetIterIndex(self->iter));
+ return PyLong_FromLong(NpyIter_GetIterIndex(self->iter));
}
static int npyiter_iterindex_set(NewNpyArrayIterObject *self, PyObject *value)
@@ -1707,7 +1715,7 @@ static int npyiter_iterindex_set(NewNpyArrayIterObject *self, PyObject *value)
return -1;
}
- iterindex = PyInt_AsLong(value);
+ iterindex = PyLong_AsLong(value);
if (error_converting(iterindex)) {
return -1;
}
@@ -1743,8 +1751,8 @@ static PyObject *npyiter_iterrange_get(NewNpyArrayIterObject *self)
return NULL;
}
- PyTuple_SET_ITEM(ret, 0, PyInt_FromLong(istart));
- PyTuple_SET_ITEM(ret, 1, PyInt_FromLong(iend));
+ PyTuple_SET_ITEM(ret, 0, PyLong_FromLong(istart));
+ PyTuple_SET_ITEM(ret, 1, PyLong_FromLong(iend));
return ret;
}
@@ -1892,7 +1900,7 @@ static PyObject *npyiter_ndim_get(NewNpyArrayIterObject *self)
return NULL;
}
- return PyInt_FromLong(NpyIter_GetNDim(self->iter));
+ return PyLong_FromLong(NpyIter_GetNDim(self->iter));
}
static PyObject *npyiter_nop_get(NewNpyArrayIterObject *self)
@@ -1903,7 +1911,7 @@ static PyObject *npyiter_nop_get(NewNpyArrayIterObject *self)
return NULL;
}
- return PyInt_FromLong(NpyIter_GetNOp(self->iter));
+ return PyLong_FromLong(NpyIter_GetNOp(self->iter));
}
static PyObject *npyiter_itersize_get(NewNpyArrayIterObject *self)
@@ -1914,7 +1922,7 @@ static PyObject *npyiter_itersize_get(NewNpyArrayIterObject *self)
return NULL;
}
- return PyInt_FromLong(NpyIter_GetIterSize(self->iter));
+ return PyLong_FromLong(NpyIter_GetIterSize(self->iter));
}
static PyObject *npyiter_finished_get(NewNpyArrayIterObject *self)
@@ -2213,7 +2221,7 @@ npyiter_subscript(NewNpyArrayIterObject *self, PyObject *op)
return NULL;
}
- if (PyInt_Check(op) || PyLong_Check(op) ||
+ if (PyLong_Check(op) ||
(PyIndex_Check(op) && !PySequence_Check(op))) {
npy_intp i = PyArray_PyIntAsIntp(op);
if (error_converting(i)) {
@@ -2223,8 +2231,8 @@ npyiter_subscript(NewNpyArrayIterObject *self, PyObject *op)
}
else if (PySlice_Check(op)) {
Py_ssize_t istart = 0, iend = 0, istep = 0, islicelength;
- if (NpySlice_GetIndicesEx(op, NpyIter_GetNOp(self->iter),
- &istart, &iend, &istep, &islicelength) < 0) {
+ if (PySlice_GetIndicesEx(op, NpyIter_GetNOp(self->iter),
+ &istart, &iend, &istep, &islicelength) < 0) {
return NULL;
}
if (istep != 1) {
@@ -2262,7 +2270,7 @@ npyiter_ass_subscript(NewNpyArrayIterObject *self, PyObject *op,
return -1;
}
- if (PyInt_Check(op) || PyLong_Check(op) ||
+ if (PyLong_Check(op) ||
(PyIndex_Check(op) && !PySequence_Check(op))) {
npy_intp i = PyArray_PyIntAsIntp(op);
if (error_converting(i)) {
@@ -2272,8 +2280,8 @@ npyiter_ass_subscript(NewNpyArrayIterObject *self, PyObject *op,
}
else if (PySlice_Check(op)) {
Py_ssize_t istart = 0, iend = 0, istep = 0, islicelength = 0;
- if (NpySlice_GetIndicesEx(op, NpyIter_GetNOp(self->iter),
- &istart, &iend, &istep, &islicelength) < 0) {
+ if (PySlice_GetIndicesEx(op, NpyIter_GetNOp(self->iter),
+ &istart, &iend, &istep, &islicelength) < 0) {
return -1;
}
if (istep != 1) {
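At the Python level the changes above mean the nditer's tp_iternext slot can now return NULL with an exception set (a buffered casting failure) as well as NULL without one (normal exhaustion). A small consumer-side sketch of the usual way to tell the two apart, assuming `it` is the iterator object:

    /* sketch: consume an iterator, separating errors from exhaustion */
    PyObject *item;
    while ((item = PyIter_Next(it)) != NULL) {
        /* ... use item ... */
        Py_DECREF(item);
    }
    if (PyErr_Occurred()) {
        return NULL;   /* e.g. a casting error raised mid-iteration */
    }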
diff --git a/numpy/core/src/multiarray/nditer_templ.c.src b/numpy/core/src/multiarray/nditer_templ.c.src
index 0f0d59972..05ce6ae75 100644
--- a/numpy/core/src/multiarray/nditer_templ.c.src
+++ b/numpy/core/src/multiarray/nditer_templ.c.src
@@ -249,7 +249,10 @@ npyiter_buffered_reduce_iternext_iters@tag_nop@(NpyIter *iter)
memcpy(prev_dataptrs, NAD_PTRS(axisdata), NPY_SIZEOF_INTP*nop);
/* Write back to the arrays */
- npyiter_copy_from_buffers(iter);
+ if (npyiter_copy_from_buffers(iter) < 0) {
+ npyiter_clear_buffers(iter);
+ return 0;
+ }
/* Check if we're past the end */
if (NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
@@ -262,7 +265,10 @@ npyiter_buffered_reduce_iternext_iters@tag_nop@(NpyIter *iter)
}
/* Prepare the next buffers and set iterend/size */
- npyiter_copy_to_buffers(iter, prev_dataptrs);
+ if (npyiter_copy_to_buffers(iter, prev_dataptrs) < 0) {
+ npyiter_clear_buffers(iter);
+ return 0;
+ }
return 1;
}
@@ -303,7 +309,10 @@ npyiter_buffered_iternext(NpyIter *iter)
}
/* Write back to the arrays */
- npyiter_copy_from_buffers(iter);
+ if (npyiter_copy_from_buffers(iter) < 0) {
+ npyiter_clear_buffers(iter);
+ return 0;
+ }
/* Check if we're past the end */
if (NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
@@ -316,7 +325,10 @@ npyiter_buffered_iternext(NpyIter *iter)
}
/* Prepare the next buffers and set iterend/size */
- npyiter_copy_to_buffers(iter, NULL);
+ if (npyiter_copy_to_buffers(iter, NULL) < 0) {
+ npyiter_clear_buffers(iter);
+ return 0;
+ }
return 1;
}
diff --git a/numpy/core/src/multiarray/npy_buffer.h b/numpy/core/src/multiarray/npy_buffer.h
index 5ff8b6c2c..d10f1a020 100644
--- a/numpy/core/src/multiarray/npy_buffer.h
+++ b/numpy/core/src/multiarray/npy_buffer.h
@@ -3,8 +3,8 @@
extern NPY_NO_EXPORT PyBufferProcs array_as_buffer;
-NPY_NO_EXPORT void
-_dealloc_cached_buffer_info(PyObject *self);
+NPY_NO_EXPORT int
+_buffer_info_free(void *buffer_info, PyObject *obj);
NPY_NO_EXPORT PyArray_Descr*
_descriptor_from_pep3118_format(char const *s);
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index 19ac7d7f9..a629dfe97 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -397,14 +397,21 @@ is_scalar_with_conversion(PyObject *o2, double* out_exponent)
PyObject *temp;
const int optimize_fpexps = 1;
- if (PyInt_Check(o2)) {
- *out_exponent = (double)PyInt_AsLong(o2);
+ if (PyLong_Check(o2)) {
+ long tmp = PyLong_AsLong(o2);
+ if (error_converting(tmp)) {
+ PyErr_Clear();
+ return NPY_NOSCALAR;
+ }
+ *out_exponent = (double)tmp;
return NPY_INTPOS_SCALAR;
}
+
if (optimize_fpexps && PyFloat_Check(o2)) {
*out_exponent = PyFloat_AsDouble(o2);
return NPY_FLOAT_SCALAR;
}
+
if (PyArray_Check(o2)) {
if ((PyArray_NDIM((PyArrayObject *)o2) == 0) &&
((PyArray_ISINTEGER((PyArrayObject *)o2) ||
@@ -442,13 +449,13 @@ is_scalar_with_conversion(PyObject *o2, double* out_exponent)
else if (PyIndex_Check(o2)) {
PyObject* value = PyNumber_Index(o2);
Py_ssize_t val;
- if (value==NULL) {
+ if (value == NULL) {
if (PyErr_Occurred()) {
PyErr_Clear();
}
return NPY_NOSCALAR;
}
- val = PyInt_AsSsize_t(value);
+ val = PyLong_AsSsize_t(value);
if (error_converting(val)) {
PyErr_Clear();
return NPY_NOSCALAR;
@@ -826,7 +833,7 @@ _array_nonzero(PyArrayObject *mp)
n = PyArray_SIZE(mp);
if (n == 1) {
int res;
- if (Npy_EnterRecursiveCall(" while converting array to bool")) {
+ if (Py_EnterRecursiveCall(" while converting array to bool")) {
return -1;
}
res = PyArray_DESCR(mp)->f->nonzero(PyArray_DATA(mp), mp);
@@ -880,7 +887,7 @@ array_scalar_forward(PyArrayObject *v,
/* Need to guard against recursion if our array holds references */
if (PyDataType_REFCHK(PyArray_DESCR(v))) {
PyObject *res;
- if (Npy_EnterRecursiveCall(where) != 0) {
+ if (Py_EnterRecursiveCall(where) != 0) {
Py_DECREF(scalar);
return NULL;
}
diff --git a/numpy/core/src/multiarray/refcount.c b/numpy/core/src/multiarray/refcount.c
index c869b5eea..41dd059b0 100644
--- a/numpy/core/src/multiarray/refcount.c
+++ b/numpy/core/src/multiarray/refcount.c
@@ -36,7 +36,7 @@ PyArray_Item_INCREF(char *data, PyArray_Descr *descr)
return;
}
if (descr->type_num == NPY_OBJECT) {
- NPY_COPY_PYOBJECT_PTR(&temp, data);
+ memcpy(&temp, data, sizeof(temp));
Py_XINCREF(temp);
}
else if (PyDataType_HASFIELDS(descr)) {
@@ -98,7 +98,7 @@ PyArray_Item_XDECREF(char *data, PyArray_Descr *descr)
}
if (descr->type_num == NPY_OBJECT) {
- NPY_COPY_PYOBJECT_PTR(&temp, data);
+ memcpy(&temp, data, sizeof(temp));
Py_XDECREF(temp);
}
else if (PyDataType_HASFIELDS(descr)) {
@@ -181,7 +181,7 @@ PyArray_INCREF(PyArrayObject *mp)
}
else {
for( i = 0; i < n; i++, data++) {
- NPY_COPY_PYOBJECT_PTR(&temp, data);
+ memcpy(&temp, data, sizeof(temp));
Py_XINCREF(temp);
}
}
@@ -192,7 +192,7 @@ PyArray_INCREF(PyArrayObject *mp)
return -1;
}
while(it->index < it->size) {
- NPY_COPY_PYOBJECT_PTR(&temp, it->dataptr);
+ memcpy(&temp, it->dataptr, sizeof(temp));
Py_XINCREF(temp);
PyArray_ITER_NEXT(it);
}
@@ -238,7 +238,7 @@ PyArray_XDECREF(PyArrayObject *mp)
}
else {
for (i = 0; i < n; i++, data++) {
- NPY_COPY_PYOBJECT_PTR(&temp, data);
+ memcpy(&temp, data, sizeof(temp));
Py_XDECREF(temp);
}
}
@@ -246,7 +246,7 @@ PyArray_XDECREF(PyArrayObject *mp)
else { /* handles misaligned data too */
PyArray_RawIterBaseInit(&it, mp);
while(it.index < it.size) {
- NPY_COPY_PYOBJECT_PTR(&temp, it.dataptr);
+ memcpy(&temp, it.dataptr, sizeof(temp));
Py_XDECREF(temp);
PyArray_ITER_NEXT(&it);
}
@@ -292,24 +292,26 @@ static void
_fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype)
{
if (!PyDataType_FLAGCHK(dtype, NPY_ITEM_REFCOUNT)) {
- if ((obj == Py_None) || (PyInt_Check(obj) && PyInt_AsLong(obj)==0)) {
+ PyObject *arr;
+
+ if ((obj == Py_None) ||
+ (PyLong_Check(obj) && PyLong_AsLong(obj) == 0)) {
return;
}
- else {
- PyObject *arr;
- Py_INCREF(dtype);
- arr = PyArray_NewFromDescr(&PyArray_Type, dtype,
- 0, NULL, NULL, NULL,
- 0, NULL);
- if (arr!=NULL) {
- dtype->f->setitem(obj, optr, arr);
- }
- Py_XDECREF(arr);
+ /* Clear possible long conversion error */
+ PyErr_Clear();
+ Py_INCREF(dtype);
+ arr = PyArray_NewFromDescr(&PyArray_Type, dtype,
+ 0, NULL, NULL, NULL,
+ 0, NULL);
+ if (arr!=NULL) {
+ dtype->f->setitem(obj, optr, arr);
}
+ Py_XDECREF(arr);
}
if (dtype->type_num == NPY_OBJECT) {
Py_XINCREF(obj);
- NPY_COPY_PYOBJECT_PTR(optr, &obj);
+ memcpy(optr, &obj, sizeof(obj));
}
else if (PyDataType_HASFIELDS(dtype)) {
PyObject *key, *value, *title = NULL;
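Replacing NPY_COPY_PYOBJECT_PTR with a plain memcpy keeps the same semantics: the PyObject pointer is copied byte-wise out of (or into) array storage that may be unaligned, rather than dereferencing the buffer as a PyObject **. A minimal illustration of the pattern:

    /* sketch: load and store a PyObject * through possibly unaligned array memory */
    PyObject *temp;
    memcpy(&temp, data, sizeof(temp));   /* read the pointer out of the buffer */
    Py_XINCREF(temp);
    memcpy(data, &temp, sizeof(temp));   /* write it back the same way */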
diff --git a/numpy/core/src/multiarray/scalarapi.c b/numpy/core/src/multiarray/scalarapi.c
index 6f3d102a4..0e93cbbe9 100644
--- a/numpy/core/src/multiarray/scalarapi.c
+++ b/numpy/core/src/multiarray/scalarapi.c
@@ -35,7 +35,7 @@ scalar_value(PyObject *scalar, PyArray_Descr *descr)
{
int type_num;
int align;
- npy_intp memloc;
+ uintptr_t memloc;
if (descr == NULL) {
descr = PyArray_DescrFromScalar(scalar);
type_num = descr->type_num;
@@ -138,7 +138,7 @@ scalar_value(PyObject *scalar, PyArray_Descr *descr)
}
else if (_CHK(Flexible)) {
if (_CHK(String)) {
- return (void *)PyString_AS_STRING(scalar);
+ return (void *)PyBytes_AS_STRING(scalar);
}
if (_CHK(Unicode)) {
/* Treat this the same as the NPY_UNICODE base class */
@@ -168,7 +168,7 @@ scalar_value(PyObject *scalar, PyArray_Descr *descr)
* Use the alignment flag to figure out where the data begins
* after a PyObject_HEAD
*/
- memloc = (npy_intp)scalar;
+ memloc = (uintptr_t)scalar;
memloc += sizeof(PyObject);
/* now round-up to the nearest alignment value */
align = descr->alignment;
@@ -373,14 +373,15 @@ PyArray_FromScalar(PyObject *scalar, PyArray_Descr *outcode)
NPY_NO_EXPORT PyObject *
PyArray_ScalarFromObject(PyObject *object)
{
- PyObject *ret=NULL;
+ PyObject *ret = NULL;
+
if (PyArray_IsZeroDim(object)) {
return PyArray_ToScalar(PyArray_DATA((PyArrayObject *)object),
(PyArrayObject *)object);
}
/*
* Booleans in Python are implemented as a subclass of integers,
- * so PyBool_Check must be called before PyInt_Check.
+ * so PyBool_Check must be called before PyLong_Check.
*/
if (PyBool_Check(object)) {
if (object == Py_True) {
@@ -390,42 +391,49 @@ PyArray_ScalarFromObject(PyObject *object)
PyArrayScalar_RETURN_FALSE;
}
}
- else if (PyInt_Check(object)) {
- ret = PyArrayScalar_New(Long);
- if (ret == NULL) {
- return NULL;
+ else if (PyLong_Check(object)) {
+ /* Check whether the value fits in a C long */
+ npy_long val_long = PyLong_AsLong(object);
+ if (!error_converting(val_long)) {
+ ret = PyArrayScalar_New(Long);
+ if (ret != NULL) {
+ PyArrayScalar_VAL(ret, Long) = val_long;
+ }
+ return ret;
}
- PyArrayScalar_VAL(ret, Long) = PyInt_AS_LONG(object);
+ PyErr_Clear();
+
+ /* Check whether the value fits in a C long long */
+ npy_longlong val_longlong = PyLong_AsLongLong(object);
+ if (!error_converting(val_longlong)) {
+ ret = PyArrayScalar_New(LongLong);
+ if (ret != NULL) {
+ PyArrayScalar_VAL(ret, LongLong) = val_longlong;
+ }
+ return ret;
+ }
+ PyErr_Clear();
+
+ return NULL;
}
else if (PyFloat_Check(object)) {
ret = PyArrayScalar_New(Double);
- if (ret == NULL) {
- return NULL;
+ if (ret != NULL) {
+ PyArrayScalar_VAL(ret, Double) = PyFloat_AS_DOUBLE(object);
}
- PyArrayScalar_VAL(ret, Double) = PyFloat_AS_DOUBLE(object);
+ return ret;
}
else if (PyComplex_Check(object)) {
ret = PyArrayScalar_New(CDouble);
- if (ret == NULL) {
- return NULL;
+ if (ret != NULL) {
+ PyArrayScalar_VAL(ret, CDouble).real = PyComplex_RealAsDouble(object);
+ PyArrayScalar_VAL(ret, CDouble).imag = PyComplex_ImagAsDouble(object);
}
- PyArrayScalar_VAL(ret, CDouble).real = PyComplex_RealAsDouble(object);
- PyArrayScalar_VAL(ret, CDouble).imag = PyComplex_ImagAsDouble(object);
+ return ret;
}
- else if (PyLong_Check(object)) {
- npy_longlong val;
- val = PyLong_AsLongLong(object);
- if (error_converting(val)) {
- PyErr_Clear();
- return NULL;
- }
- ret = PyArrayScalar_New(LongLong);
- if (ret == NULL) {
- return NULL;
- }
- PyArrayScalar_VAL(ret, LongLong) = val;
+ else {
+ return NULL;
}
- return ret;
}
/*New reference */
@@ -613,7 +621,7 @@ PyArray_DescrFromScalar(PyObject *sc)
PyArray_DESCR_REPLACE(descr);
type_num = descr->type_num;
if (type_num == NPY_STRING) {
- descr->elsize = PyString_GET_SIZE(sc);
+ descr->elsize = PyBytes_GET_SIZE(sc);
}
else if (type_num == NPY_UNICODE) {
descr->elsize = PyUnicode_GET_LENGTH(sc) * 4;
@@ -755,8 +763,8 @@ PyArray_Scalar(void *data, PyArray_Descr *descr, PyObject *base)
}
if (PyTypeNum_ISFLEXIBLE(type_num)) {
if (type_num == NPY_STRING) {
- destptr = PyString_AS_STRING(obj);
- ((PyStringObject *)obj)->ob_shash = -1;
+ destptr = PyBytes_AS_STRING(obj);
+ ((PyBytesObject *)obj)->ob_shash = -1;
memcpy(destptr, data, itemsize);
return obj;
}
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index 088b380aa..e480628e7 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -67,8 +67,11 @@ gentype_alloc(PyTypeObject *type, Py_ssize_t nitems)
const size_t size = _PyObject_VAR_SIZE(type, nitems + 1);
obj = (PyObject *)PyObject_Malloc(size);
+ if (obj == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
/*
- * Fixme. Need to check for no memory.
* If we don't need to zero memory, we could use
* PyObject_{New, NewVar} for this whole function.
*/
@@ -149,7 +152,7 @@ static PyObject *
gentype_add(PyObject *m1, PyObject* m2)
{
/* special case str.__radd__, which should not call array_add */
- if (PyString_Check(m1) || PyUnicode_Check(m1)) {
+ if (PyBytes_Check(m1) || PyUnicode_Check(m1)) {
Py_INCREF(Py_NotImplemented);
return Py_NotImplemented;
}
@@ -274,7 +277,8 @@ gentype_format(PyObject *self, PyObject *args)
if (Py_TYPE(self) == &PyBoolArrType_Type) {
obj = PyBool_FromLong(PyArrayScalar_VAL(self, Bool));
}
- else if (PyArray_IsScalar(self, Integer)) {
+ else if (PyArray_IsScalar(self, Integer)
+ && !PyArray_IsScalar(self, Timedelta)) {
obj = Py_TYPE(self)->tp_as_number->nb_int(self);
}
else if (PyArray_IsScalar(self, Floating)) {
@@ -447,7 +451,7 @@ _void_to_hex(const char* argbuf, const Py_ssize_t arglen,
}
memcpy(&retbuf[j], echars, strlen(echars));
- retval = PyUString_FromStringAndSize(retbuf, slen);
+ retval = PyUnicode_FromStringAndSize(retbuf, slen);
PyMem_Free(retbuf);
return retval;
@@ -518,21 +522,15 @@ datetimetype_repr(PyObject *self)
*/
if ((scal->obmeta.num == 1 && scal->obmeta.base != NPY_FR_h) ||
scal->obmeta.base == NPY_FR_GENERIC) {
- ret = PyUString_FromString("numpy.datetime64('");
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString(iso));
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString("')"));
+ ret = PyUnicode_FromFormat("numpy.datetime64('%s')", iso);
}
else {
- ret = PyUString_FromString("numpy.datetime64('");
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString(iso));
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString("','"));
- ret = append_metastr_to_string(&scal->obmeta, 1, ret);
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString("')"));
+ PyObject *meta = metastr_to_unicode(&scal->obmeta, 1);
+ if (meta == NULL) {
+ return NULL;
+ }
+ ret = PyUnicode_FromFormat("numpy.datetime64('%s','%S')", iso, meta);
+ Py_DECREF(meta);
}
return ret;
@@ -542,7 +540,7 @@ static PyObject *
timedeltatype_repr(PyObject *self)
{
PyTimedeltaScalarObject *scal;
- PyObject *ret;
+ PyObject *val, *ret;
if (!PyArray_IsScalar(self, Timedelta)) {
PyErr_SetString(PyExc_RuntimeError,
@@ -554,32 +552,34 @@ timedeltatype_repr(PyObject *self)
/* The value */
if (scal->obval == NPY_DATETIME_NAT) {
- ret = PyUString_FromString("numpy.timedelta64('NaT'");
+ val = PyUnicode_FromString("'NaT'");
}
else {
- /*
- * Can't use "%lld" if HAVE_LONG_LONG is not defined
- */
+ /* Can't use "%lld" if HAVE_LONG_LONG is not defined */
#if defined(HAVE_LONG_LONG)
- ret = PyUString_FromFormat("numpy.timedelta64(%lld",
- (long long)scal->obval);
+ val = PyUnicode_FromFormat("%lld", (long long)scal->obval);
#else
- ret = PyUString_FromFormat("numpy.timedelta64(%ld",
- (long)scal->obval);
+ val = PyUnicode_FromFormat("%ld", (long)scal->obval);
#endif
}
+ if (val == NULL) {
+ return NULL;
+ }
+
/* The metadata unit */
if (scal->obmeta.base == NPY_FR_GENERIC) {
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString(")"));
+ ret = PyUnicode_FromFormat("numpy.timedelta64(%S)", val);
}
else {
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString(",'"));
- ret = append_metastr_to_string(&scal->obmeta, 1, ret);
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString("')"));
+ PyObject *meta = metastr_to_unicode(&scal->obmeta, 1);
+ if (meta == NULL) {
+ Py_DECREF(val);
+ return NULL;
+ }
+ ret = PyUnicode_FromFormat("numpy.timedelta64(%S,'%S')", val, meta);
+ Py_DECREF(meta);
}
+ Py_DECREF(val);
return ret;
}
@@ -611,7 +611,7 @@ datetimetype_str(PyObject *self)
return NULL;
}
- return PyUString_FromString(iso);
+ return PyUnicode_FromString(iso);
}
static char *_datetime_verbose_strings[NPY_DATETIME_NUMUNITS] = {
@@ -657,21 +657,19 @@ timedeltatype_str(PyObject *self)
}
if (scal->obval == NPY_DATETIME_NAT) {
- ret = PyUString_FromString("NaT");
+ ret = PyUnicode_FromString("NaT");
}
else {
/*
* Can't use "%lld" if HAVE_LONG_LONG is not defined
*/
#if defined(HAVE_LONG_LONG)
- ret = PyUString_FromFormat("%lld ",
- (long long)(scal->obval * scal->obmeta.num));
+ ret = PyUnicode_FromFormat("%lld %s",
+ (long long)(scal->obval * scal->obmeta.num), basestr);
#else
- ret = PyUString_FromFormat("%ld ",
- (long)(scal->obval * scal->obmeta.num));
+ ret = PyUnicode_FromFormat("%ld %s",
+ (long)(scal->obval * scal->obmeta.num), basestr);
#endif
- PyUString_ConcatAndDel(&ret,
- PyUString_FromString(basestr));
}
return ret;
@@ -795,7 +793,7 @@ legacy_@name@_format@kind@(@type@ val)
PyOS_snprintf(buf, sizeof(buf), "(%s%sj)", re, im);
}
- return PyUString_FromString(buf);
+ return PyUnicode_FromString(buf);
}
#undef _FMT1
@@ -836,7 +834,7 @@ legacy_@name@_format@kind@(npy_@name@ val){
strcpy(&buf[cnt],".0");
}
- return PyUString_FromString(buf);
+ return PyUnicode_FromString(buf);
}
#undef _FMT1
@@ -890,7 +888,7 @@ static PyObject *
static PyObject *
c@name@type_@kind@(PyObject *self)
{
- PyObject *rstr, *istr, *ret;
+ PyObject *rstr, *istr;
npy_c@name@ val = PyArrayScalar_VAL(self, C@Name@);
TrimMode trim = TrimMode_DptZeros;
@@ -903,47 +901,47 @@ c@name@type_@kind@(PyObject *self)
if (istr == NULL) {
return NULL;
}
-
- PyUString_ConcatAndDel(&istr, PyUString_FromString("j"));
- return istr;
+ PyObject *ret = PyUnicode_FromFormat("%Sj", istr);
+ Py_DECREF(istr);
+ return ret;
}
if (npy_isfinite(val.real)) {
rstr = @name@type_@kind@_either(val.real, trim, trim, 0);
- if (rstr == NULL) {
- return NULL;
- }
}
else if (npy_isnan(val.real)) {
- rstr = PyUString_FromString("nan");
+ rstr = PyUnicode_FromString("nan");
}
else if (val.real > 0){
- rstr = PyUString_FromString("inf");
+ rstr = PyUnicode_FromString("inf");
}
else {
- rstr = PyUString_FromString("-inf");
+ rstr = PyUnicode_FromString("-inf");
+ }
+ if (rstr == NULL) {
+ return NULL;
}
if (npy_isfinite(val.imag)) {
istr = @name@type_@kind@_either(val.imag, trim, trim, 1);
- if (istr == NULL) {
- return NULL;
- }
}
else if (npy_isnan(val.imag)) {
- istr = PyUString_FromString("+nan");
+ istr = PyUnicode_FromString("+nan");
}
else if (val.imag > 0){
- istr = PyUString_FromString("+inf");
+ istr = PyUnicode_FromString("+inf");
}
else {
- istr = PyUString_FromString("-inf");
+ istr = PyUnicode_FromString("-inf");
+ }
+ if (istr == NULL) {
+ Py_DECREF(rstr);
+ return NULL;
}
- ret = PyUString_FromString("(");
- PyUString_ConcatAndDel(&ret, rstr);
- PyUString_ConcatAndDel(&ret, istr);
- PyUString_ConcatAndDel(&ret, PyUString_FromString("j)"));
+ PyObject *ret = PyUnicode_FromFormat("(%S%Sj)", rstr, istr);
+ Py_DECREF(rstr);
+ Py_DECREF(istr);
return ret;
}
@@ -1058,7 +1056,7 @@ gentype_richcompare(PyObject *self, PyObject *other, int cmp_op)
static PyObject *
gentype_ndim_get(PyObject *NPY_UNUSED(self))
{
- return PyInt_FromLong(0);
+ return PyLong_FromLong(0);
}
static PyObject *
@@ -1099,7 +1097,7 @@ inttype_numerator_get(PyObject *self)
static PyObject *
inttype_denominator_get(PyObject *self)
{
- return PyInt_FromLong(1);
+ return PyLong_FromLong(1);
}
@@ -1119,7 +1117,7 @@ gentype_itemsize_get(PyObject *self)
typecode = PyArray_DescrFromScalar(self);
elsize = typecode->elsize;
- ret = PyInt_FromLong((long) elsize);
+ ret = PyLong_FromLong((long) elsize);
Py_DECREF(typecode);
return ret;
}
@@ -1127,7 +1125,7 @@ gentype_itemsize_get(PyObject *self)
static PyObject *
gentype_size_get(PyObject *NPY_UNUSED(self))
{
- return PyInt_FromLong(1);
+ return PyLong_FromLong(1);
}
static PyObject *
@@ -1147,12 +1145,16 @@ gentype_sizeof(PyObject *self)
NPY_NO_EXPORT void
gentype_struct_free(PyObject *ptr)
{
- PyArrayInterface *arrif;
- PyObject *context;
-
- arrif = (PyArrayInterface*)PyCapsule_GetPointer(ptr, NULL);
- context = (PyObject *)PyCapsule_GetContext(ptr);
- Py_DECREF(context);
+ PyArrayInterface *arrif = (PyArrayInterface*)PyCapsule_GetPointer(ptr, NULL);
+ if (arrif == NULL) {
+ PyErr_WriteUnraisable(ptr);
+ return;
+ }
+ PyObject *context = (PyObject *)PyCapsule_GetContext(ptr);
+ if (context == NULL && PyErr_Occurred()) {
+ PyErr_WriteUnraisable(ptr);
+ }
+ Py_XDECREF(context);
Py_XDECREF(arrif->descr);
PyArray_free(arrif->shape);
PyArray_free(arrif);
@@ -1307,7 +1309,7 @@ gentype_imag_get(PyObject *self)
ret = PyObject_GetAttrString(obj, "imag");
if (ret == NULL) {
PyErr_Clear();
- obj = PyInt_FromLong(0);
+ obj = PyLong_FromLong(0);
newtype = PyArray_DescrFromType(NPY_OBJECT);
ret = PyArray_Scalar((char *)&obj, newtype, NULL);
Py_DECREF(newtype);
@@ -1743,13 +1745,8 @@ gentype_reduce(PyObject *self, PyObject *NPY_UNUSED(args))
if (arr == NULL) {
return NULL;
}
- /* arr.item() */
- PyObject *val = PyArray_GETITEM(arr, PyArray_DATA(arr));
- Py_DECREF(arr);
- if (val == NULL) {
- return NULL;
- }
- PyObject *tup = Py_BuildValue("NN", obj, val);
+ /* Use the whole array, which handles structured void correctly */
+ PyObject *tup = Py_BuildValue("NN", obj, arr);
if (tup == NULL) {
return NULL;
}
@@ -2312,7 +2309,7 @@ voidtype_ass_subscript(PyVoidScalarObject *self, PyObject *ind, PyObject *val)
return -1;
}
- if (PyBaseString_Check(ind)) {
+ if (PyUnicode_Check(ind)) {
/*
* Much like in voidtype_setfield, we cannot simply use ndarray's
* __setitem__ since assignment to void scalars should not broadcast
@@ -2385,6 +2382,55 @@ static PySequenceMethods voidtype_as_sequence = {
};
+/*
+ * This function implements simple buffer export for user defined subclasses
+ * of `np.generic`. All other scalar types override the buffer export.
+ */
+static int
+gentype_arrtype_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+ if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) {
+ PyErr_Format(PyExc_TypeError,
+ "NumPy scalar %R can only exported as a buffer without format.",
+ self);
+ return -1;
+ }
+ if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+ PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+ return -1;
+ }
+ PyArray_Descr *descr = PyArray_DescrFromScalar(self);
+ if (descr == NULL) {
+ return -1;
+ }
+ if (!PyDataType_ISUSERDEF(descr)) {
+ /* This path would also reject the (hopefully) impossible "object" */
+ PyErr_Format(PyExc_TypeError,
+ "user-defined scalar %R registered for built-in dtype %S? "
+ "This should be impossible.",
+ self, descr);
+ return -1;
+ }
+ view->ndim = 0;
+ view->len = descr->elsize;
+ view->itemsize = descr->elsize;
+ view->shape = NULL;
+ view->strides = NULL;
+ view->suboffsets = NULL;
+ view->readonly = 1; /* assume general (user) scalars are readonly. */
+ Py_INCREF(self);
+ view->obj = self;
+ view->buf = scalar_value(self, descr);
+ Py_DECREF(descr);
+ view->format = NULL;
+ return 0;
+}
+
+
+static PyBufferProcs gentype_arrtype_as_buffer = {
+ .bf_getbuffer = (getbufferproc)gentype_arrtype_getbuffer,
+};
+
/**begin repeat
* #name = bool, byte, short, int, long, longlong, ubyte, ushort, uint, ulong,
@@ -2403,6 +2449,7 @@ static int
@name@_getbuffer(PyObject *self, Py_buffer *view, int flags)
{
if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+ PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
return -1;
}
Py@Name@ScalarObject *scalar = (Py@Name@ScalarObject *)self;
@@ -2415,6 +2462,7 @@ static int
view->shape = NULL;
view->strides = NULL;
view->suboffsets = NULL;
+ view->readonly = 1;
Py_INCREF(self);
view->obj = self;
view->buf = &(scalar->obval);
@@ -2441,6 +2489,7 @@ static int
unicode_getbuffer(PyObject *self, Py_buffer *view, int flags)
{
if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+ PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
return -1;
}
PyUnicodeScalarObject *scalar = (PyUnicodeScalarObject *)self;
@@ -2452,6 +2501,7 @@ unicode_getbuffer(PyObject *self, Py_buffer *view, int flags)
view->shape = NULL;
view->strides = NULL;
view->suboffsets = NULL;
+ view->readonly = 1;
Py_INCREF(self);
view->obj = self;
@@ -2481,7 +2531,7 @@ unicode_getbuffer(PyObject *self, Py_buffer *view, int flags)
view->format = scalar->buffer_fmt;
}
else {
- scalar->buffer_fmt = PyObject_Malloc(22);
+ scalar->buffer_fmt = PyMem_Malloc(22);
if (scalar->buffer_fmt == NULL) {
Py_SETREF(view->obj, NULL);
return -1;
@@ -2508,6 +2558,7 @@ static int
@name@_getbuffer(PyObject *self, Py_buffer *view, int flags)
{
if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+ PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
return -1;
}
Py@Name@ScalarObject *scalar = (Py@Name@ScalarObject *)self;
@@ -2519,6 +2570,7 @@ static int
view->shape = &length;
view->strides = NULL;
view->suboffsets = NULL;
+ view->readonly = 1;
Py_INCREF(self);
view->obj = self;
@@ -2558,19 +2610,46 @@ NPY_NO_EXPORT PyTypeObject PyGenericArrType_Type = {
.tp_basicsize = sizeof(PyObject),
};
+
static void
void_dealloc(PyVoidScalarObject *v)
{
- _dealloc_cached_buffer_info((PyObject *)v);
-
if (v->flags & NPY_ARRAY_OWNDATA) {
npy_free_cache(v->obval, Py_SIZE(v));
}
Py_XDECREF(v->descr);
Py_XDECREF(v->base);
+ if (_buffer_info_free(v->_buffer_info, (PyObject *)v) < 0) {
+ PyErr_WriteUnraisable(NULL);
+ }
Py_TYPE(v)->tp_free(v);
}
+
+static PyObject *
+object_arrtype_alloc(PyTypeObject *type, Py_ssize_t items)
+{
+ /*
+ * Object scalars should not actually exist; if one does get created,
+ * we should consider it a bug.
+ */
+ static PyObject *visibleDeprecationWarning = NULL;
+ npy_cache_import("numpy", "VisibleDeprecationWarning",
+ &visibleDeprecationWarning);
+ if (visibleDeprecationWarning == NULL) {
+ return NULL;
+ }
+ if (PyErr_WarnEx(visibleDeprecationWarning,
+ "Creating a NumPy object scalar. NumPy object scalars should "
+ "never be created. If you see this message please inform the "
+ "NumPy developers. Since this message should never be shown "
+ "this will raise a TypeError in the future.", 1) < 0) {
+ return NULL;
+ }
+ return gentype_alloc(type, items);
+}
+
+
static void
object_arrtype_dealloc(PyObject *v)
{
@@ -2583,6 +2662,7 @@ unicode_arrtype_dealloc(PyObject *v)
{
/* note: may be null if it was never requested */
PyMem_Free(PyArrayScalar_VAL(v, Unicode));
+ PyMem_Free(((PyUnicodeScalarObject *)v)->buffer_fmt);
/* delegate to the base class */
PyUnicode_Type.tp_dealloc(v);
}
@@ -2868,7 +2948,7 @@ bool_arrtype_nonzero(PyObject *a)
* ulong, ulonglong#
* #Name = Byte, Short, Int, Long, UByte, UShort, LongLong, UInt,
* ULong, ULongLong#
- * #type = PyInt_FromLong*6, PyLong_FromLongLong*1,
+ * #type = PyLong_FromLong*6, PyLong_FromLongLong*1,
* PyLong_FromUnsignedLong*2, PyLong_FromUnsignedLongLong#
*/
static PyNumberMethods @name@_arrtype_as_number;
@@ -2897,7 +2977,7 @@ bool_index(PyObject *a)
return NULL;
}
else {
- return PyInt_FromLong(PyArrayScalar_VAL(a, Bool));
+ return PyLong_FromLong(PyArrayScalar_VAL(a, Bool));
}
}
@@ -2923,7 +3003,7 @@ void_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
* For a VOID scalar first see if obj is an integer or long
* and create new memory of that size (filled with 0) for the scalar
*/
- if (PyLong_Check(obj) || PyInt_Check(obj) ||
+ if (PyLong_Check(obj) ||
PyArray_IsScalar(obj, Integer) ||
(PyArray_Check(obj) &&
PyArray_NDIM((PyArrayObject *)obj)==0 &&
@@ -3298,6 +3378,7 @@ NPY_NO_EXPORT PyTypeObject PyObjectArrType_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
.tp_name = "numpy.object_",
.tp_basicsize = sizeof(PyObjectScalarObject),
+ .tp_alloc = object_arrtype_alloc,
.tp_dealloc = (destructor)object_arrtype_dealloc,
.tp_as_sequence = &object_arrtype_as_sequence,
.tp_as_mapping = &object_arrtype_as_mapping,
@@ -3770,6 +3851,7 @@ initialize_numeric_types(void)
PyGenericArrType_Type.tp_alloc = gentype_alloc;
PyGenericArrType_Type.tp_free = (freefunc)gentype_free;
PyGenericArrType_Type.tp_richcompare = gentype_richcompare;
+ PyGenericArrType_Type.tp_as_buffer = &gentype_arrtype_as_buffer;
PyBoolArrType_Type.tp_as_number = &bool_arrtype_as_number;
/*
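The scalar buffer exports above are now explicitly read-only, and requesting a writable buffer raises BufferError rather than failing without an exception set. A standalone sketch of what a consumer of the buffer protocol sees (the `scalar` variable is assumed, not from the patch):

    /* sketch: acquire a read-only view of a NumPy scalar via the buffer protocol */
    Py_buffer view;
    if (PyObject_GetBuffer(scalar, &view, PyBUF_SIMPLE) < 0) {
        return NULL;                  /* propagate the error */
    }
    /* view.readonly is 1: scalars export read-only memory */
    PyBuffer_Release(&view);

    if (PyObject_GetBuffer(scalar, &view, PyBUF_WRITABLE) < 0) {
        PyErr_Clear();                /* BufferError: scalar buffer is readonly */
    }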
diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c
index 30507112d..02c349759 100644
--- a/numpy/core/src/multiarray/shape.c
+++ b/numpy/core/src/multiarray/shape.c
@@ -133,7 +133,7 @@ PyArray_Resize(PyArrayObject *self, PyArray_Dims *newshape, int refcheck,
if (newnbytes > oldnbytes && PyArray_ISWRITEABLE(self)) {
/* Fill new memory with zeros */
if (PyDataType_FLAGCHK(PyArray_DESCR(self), NPY_ITEM_REFCOUNT)) {
- PyObject *zero = PyInt_FromLong(0);
+ PyObject *zero = PyLong_FromLong(0);
char *optr;
optr = PyArray_BYTES(self) + oldnbytes;
npy_intp n_new = newsize - oldsize;
@@ -332,7 +332,7 @@ _putzero(char *optr, PyObject *zero, PyArray_Descr *dtype)
for (i = 0; i < nsize; i++) {
Py_INCREF(zero);
- NPY_COPY_PYOBJECT_PTR(optr, &zero);
+ memcpy(optr, &zero, sizeof(zero));
optr += sizeof(zero);
}
}
@@ -458,14 +458,12 @@ _attempt_nocopy_reshape(PyArrayObject *self, int newnd, const npy_intp *newdims,
static void
raise_reshape_size_mismatch(PyArray_Dims *newshape, PyArrayObject *arr)
{
- PyObject *msg = PyUString_FromFormat("cannot reshape array of size %zd "
- "into shape ", PyArray_SIZE(arr));
PyObject *tmp = convert_shape_to_string(newshape->len, newshape->ptr, "");
-
- PyUString_ConcatAndDel(&msg, tmp);
- if (msg != NULL) {
- PyErr_SetObject(PyExc_ValueError, msg);
- Py_DECREF(msg);
+ if (tmp != NULL) {
+ PyErr_Format(PyExc_ValueError,
+ "cannot reshape array of size %zd into shape %S",
+ PyArray_SIZE(arr), tmp);
+ Py_DECREF(tmp);
}
}
@@ -979,55 +977,6 @@ PyArray_Flatten(PyArrayObject *a, NPY_ORDER order)
return (PyObject *)ret;
}
-/* See shape.h for parameters documentation */
-NPY_NO_EXPORT PyObject *
-build_shape_string(npy_intp n, npy_intp const *vals)
-{
- npy_intp i;
- PyObject *ret, *tmp;
-
- /*
- * Negative dimension indicates "newaxis", which can
- * be discarded for printing if it's a leading dimension.
- * Find the first non-"newaxis" dimension.
- */
- i = 0;
- while (i < n && vals[i] < 0) {
- ++i;
- }
-
- if (i == n) {
- return PyUString_FromFormat("()");
- }
- else {
- ret = PyUString_FromFormat("(%" NPY_INTP_FMT, vals[i++]);
- if (ret == NULL) {
- return NULL;
- }
- }
-
- for (; i < n; ++i) {
- if (vals[i] < 0) {
- tmp = PyUString_FromString(",newaxis");
- }
- else {
- tmp = PyUString_FromFormat(",%" NPY_INTP_FMT, vals[i]);
- }
- if (tmp == NULL) {
- Py_DECREF(ret);
- return NULL;
- }
-
- PyUString_ConcatAndDel(&ret, tmp);
- if (ret == NULL) {
- return NULL;
- }
- }
-
- tmp = PyUString_FromFormat(")");
- PyUString_ConcatAndDel(&ret, tmp);
- return ret;
-}
/*NUMPY_API
*
diff --git a/numpy/core/src/multiarray/shape.h b/numpy/core/src/multiarray/shape.h
index d25292556..875b5430f 100644
--- a/numpy/core/src/multiarray/shape.h
+++ b/numpy/core/src/multiarray/shape.h
@@ -2,13 +2,6 @@
#define _NPY_ARRAY_SHAPE_H_
/*
- * Builds a string representation of the shape given in 'vals'.
- * A negative value in 'vals' gets interpreted as newaxis.
- */
-NPY_NO_EXPORT PyObject *
-build_shape_string(npy_intp n, npy_intp const *vals);
-
-/*
* Creates a sorted stride perm matching the KEEPORDER behavior
* of the NpyIter object. Because this operates based on multiple
* input strides, the 'stride' member of the npy_stride_sort_item
diff --git a/numpy/core/src/multiarray/strfuncs.c b/numpy/core/src/multiarray/strfuncs.c
index 363cbdba2..d9d9b7c0a 100644
--- a/numpy/core/src/multiarray/strfuncs.c
+++ b/numpy/core/src/multiarray/strfuncs.c
@@ -3,14 +3,25 @@
#include <Python.h>
#include <numpy/arrayobject.h>
-
#include "npy_pycompat.h"
-
+#include "npy_import.h"
#include "strfuncs.h"
static PyObject *PyArray_StrFunction = NULL;
static PyObject *PyArray_ReprFunction = NULL;
+
+static void
+npy_PyErr_SetStringChained(PyObject *type, const char *message)
+{
+ PyObject *exc, *val, *tb;
+
+ PyErr_Fetch(&exc, &val, &tb);
+ PyErr_SetString(type, message);
+ npy_PyErr_ChainExceptionsCause(exc, val, tb);
+}
+
+
/*NUMPY_API
* Set the array print function to be a Python function.
*/
@@ -36,164 +47,52 @@ PyArray_SetStringFunction(PyObject *op, int repr)
}
-/*
- * Extend string. On failure, returns NULL and leaves *strp alone.
- * XXX we do this in multiple places; time for a string library?
- */
-static char *
-extend_str(char **strp, Py_ssize_t n, Py_ssize_t *maxp)
-{
- char *str = *strp;
- Py_ssize_t new_cap;
-
- if (n >= *maxp - 16) {
- new_cap = *maxp * 2;
-
- if (new_cap <= *maxp) { /* overflow */
- return NULL;
- }
- str = PyArray_realloc(*strp, new_cap);
- if (str != NULL) {
- *strp = str;
- *maxp = new_cap;
- }
- }
- return str;
-}
-
-
-static int
-dump_data(char **string, Py_ssize_t *n, Py_ssize_t *max_n, char *data, int nd,
- npy_intp const *dimensions, npy_intp const *strides, PyArrayObject* self)
-{
- PyObject *op = NULL, *sp = NULL;
- char *ostring;
- npy_intp i, N, ret = 0;
-
-#define CHECK_MEMORY do { \
- if (extend_str(string, *n, max_n) == NULL) { \
- ret = -1; \
- goto end; \
- } \
- } while (0)
-
- if (nd == 0) {
- if ((op = PyArray_GETITEM(self, data)) == NULL) {
- return -1;
- }
- sp = PyObject_Repr(op);
- if (sp == NULL) {
- ret = -1;
- goto end;
- }
- ostring = PyString_AsString(sp);
- N = PyString_Size(sp)*sizeof(char);
- *n += N;
- CHECK_MEMORY;
- memmove(*string + (*n - N), ostring, N);
- }
- else {
- CHECK_MEMORY;
- (*string)[*n] = '[';
- *n += 1;
- for (i = 0; i < dimensions[0]; i++) {
- if (dump_data(string, n, max_n,
- data + (*strides)*i,
- nd - 1, dimensions + 1,
- strides + 1, self) < 0) {
- return -1;
- }
- CHECK_MEMORY;
- if (i < dimensions[0] - 1) {
- (*string)[*n] = ',';
- (*string)[*n+1] = ' ';
- *n += 2;
- }
- }
- CHECK_MEMORY;
- (*string)[*n] = ']';
- *n += 1;
- }
-
-#undef CHECK_MEMORY
-
-end:
- Py_XDECREF(op);
- Py_XDECREF(sp);
- return ret;
-}
-
-
-static PyObject *
-array_repr_builtin(PyArrayObject *self, int repr)
-{
- PyObject *ret;
- char *string;
- /* max_n initial value is arbitrary, dump_data will extend it */
- Py_ssize_t n = 0, max_n = PyArray_NBYTES(self) * 4 + 7;
-
- if ((string = PyArray_malloc(max_n)) == NULL) {
- return PyErr_NoMemory();
- }
-
- if (dump_data(&string, &n, &max_n, PyArray_DATA(self),
- PyArray_NDIM(self), PyArray_DIMS(self),
- PyArray_STRIDES(self), self) < 0) {
- PyArray_free(string);
- return NULL;
- }
-
- if (repr) {
- if (PyArray_ISEXTENDED(self)) {
- ret = PyUString_FromFormat("array(%s, '%c%d')",
- string,
- PyArray_DESCR(self)->type,
- PyArray_DESCR(self)->elsize);
- }
- else {
- ret = PyUString_FromFormat("array(%s, '%c')",
- string,
- PyArray_DESCR(self)->type);
- }
- }
- else {
- ret = PyUString_FromStringAndSize(string, n);
- }
-
- PyArray_free(string);
- return ret;
-}
-
-
NPY_NO_EXPORT PyObject *
array_repr(PyArrayObject *self)
{
- PyObject *s;
+ static PyObject *repr = NULL;
- if (PyArray_ReprFunction == NULL) {
- s = array_repr_builtin(self, 1);
+ if (PyArray_ReprFunction != NULL) {
+ return PyObject_CallFunctionObjArgs(PyArray_ReprFunction, self, NULL);
}
- else {
- s = PyObject_CallFunctionObjArgs(PyArray_ReprFunction, self, NULL);
+
+ /*
+ * We need to do a delayed import here as initialization on module load
+ * leads to circular import problems.
+ */
+ npy_cache_import("numpy.core.arrayprint", "_default_array_repr", &repr);
+ if (repr == NULL) {
+ npy_PyErr_SetStringChained(PyExc_RuntimeError,
+ "Unable to configure default ndarray.__repr__");
+ return NULL;
}
- return s;
+ return PyObject_CallFunctionObjArgs(repr, self, NULL);
}
NPY_NO_EXPORT PyObject *
array_str(PyArrayObject *self)
{
- PyObject *s;
+ static PyObject *str = NULL;
- if (PyArray_StrFunction == NULL) {
- s = array_repr_builtin(self, 0);
+ if (PyArray_StrFunction != NULL) {
+ return PyObject_CallFunctionObjArgs(PyArray_StrFunction, self, NULL);
}
- else {
- s = PyObject_CallFunctionObjArgs(PyArray_StrFunction, self, NULL);
+
+ /*
+ * We need to do a delayed import here as initialization on module load leads
+ * to circular import problems.
+ */
+ npy_cache_import("numpy.core.arrayprint", "_default_array_str", &str);
+ if (str == NULL) {
+ npy_PyErr_SetStringChained(PyExc_RuntimeError,
+ "Unable to configure default ndarray.__str__");
+ return NULL;
}
- return s;
+ return PyObject_CallFunctionObjArgs(str, self, NULL);
}
+
NPY_NO_EXPORT PyObject *
array_format(PyArrayObject *self, PyObject *args)
{
@@ -221,4 +120,3 @@ array_format(PyArrayObject *self, PyObject *args)
);
}
}
-
diff --git a/numpy/core/src/multiarray/temp_elide.c b/numpy/core/src/multiarray/temp_elide.c
index 09b948218..b19dee418 100644
--- a/numpy/core/src/multiarray/temp_elide.c
+++ b/numpy/core/src/multiarray/temp_elide.c
@@ -62,12 +62,8 @@
#define NPY_ELIDE_DEBUG 0
#define NPY_MAX_STACKSIZE 10
-#if PY_VERSION_HEX >= 0x03060000
/* TODO can pep523 be used to somehow? */
#define PYFRAMEEVAL_FUNC "_PyEval_EvalFrameDefault"
-#else
-#define PYFRAMEEVAL_FUNC "PyEval_EvalFrameEx"
-#endif
/*
* Heuristic size of the array in bytes at which backtrace overhead generation
* becomes less than speed gained by in-place operations. Depends on stack depth
diff --git a/numpy/core/src/multiarray/usertypes.c b/numpy/core/src/multiarray/usertypes.c
index bc320138d..3eaf99196 100644
--- a/numpy/core/src/multiarray/usertypes.c
+++ b/numpy/core/src/multiarray/usertypes.c
@@ -38,6 +38,11 @@ maintainer email: oliphant.travis@ieee.org
#include "usertypes.h"
#include "dtypemeta.h"
+#include "scalartypes.h"
+#include "array_method.h"
+#include "convert_datatype.h"
+#include "legacy_dtype_implementation.h"
+
NPY_NO_EXPORT PyArray_Descr **userdescrs=NULL;
@@ -127,6 +132,9 @@ PyArray_InitArrFuncs(PyArray_ArrFuncs *f)
f->scalarkind = NULL;
f->cancastscalarkindto = NULL;
f->cancastto = NULL;
+ f->fastclip = NULL;
+ f->fastputmask = NULL;
+ f->fasttake = NULL;
}
@@ -192,7 +200,7 @@ PyArray_RegisterDataType(PyArray_Descr *descr)
}
}
typenum = NPY_USERDEF + NPY_NUMUSERTYPES;
- descr->type_num = typenum;
+ descr->type_num = -1;
if (PyDataType_ISUNSIZED(descr)) {
PyErr_SetString(PyExc_ValueError, "cannot register a" \
"flexible data-type");
@@ -215,6 +223,27 @@ PyArray_RegisterDataType(PyArray_Descr *descr)
PyErr_SetString(PyExc_ValueError, "missing typeobject");
return -1;
}
+ if (descr->flags & (NPY_ITEM_IS_POINTER | NPY_ITEM_REFCOUNT)) {
+ /*
+ * User dtype can't actually do reference counting, however, there
+ * are existing hacks (e.g. xpress), which use a structured one:
+ * dtype((xpress.var, [('variable', 'O')]))
+ * so we have to support this. But such a structure must be constant
+ * (i.e. fixed at registration time, this is the case for `xpress`).
+ */
+ if (descr->names == NULL || descr->fields == NULL ||
+ !PyDict_CheckExact(descr->fields)) {
+ PyErr_Format(PyExc_ValueError,
+ "Failed to register dtype for %S: Legacy user dtypes "
+ "using `NPY_ITEM_IS_POINTER` or `NPY_ITEM_REFCOUNT` are"
+ "unsupported. It is possible to create such a dtype only "
+ "if it is a structured dtype with names and fields "
+ "hardcoded at registration time.\n"
+ "Please contact the NumPy developers if this used to work "
+ "but now fails.", descr->typeobj);
+ return -1;
+ }
+ }
if (test_deprecated_arrfuncs_members(f) < 0) {
return -1;
@@ -226,9 +255,13 @@ PyArray_RegisterDataType(PyArray_Descr *descr)
PyErr_SetString(PyExc_MemoryError, "RegisterDataType");
return -1;
}
+
userdescrs[NPY_NUMUSERTYPES++] = descr;
+ descr->type_num = typenum;
if (dtypemeta_wrap_legacy_descriptor(descr) < 0) {
+ descr->type_num = -1;
+ NPY_NUMUSERTYPES--;
return -1;
}
@@ -260,11 +293,11 @@ PyArray_RegisterCastFunc(PyArray_Descr *descr, int totype,
return -1;
}
}
- key = PyInt_FromLong(totype);
+ key = PyLong_FromLong(totype);
if (PyErr_Occurred()) {
return -1;
}
- cobj = NpyCapsule_FromVoidPtr((void *)castfunc, NULL);
+ cobj = PyCapsule_New((void *)castfunc, NULL, NULL);
if (cobj == NULL) {
Py_DECREF(key);
return -1;
@@ -291,7 +324,7 @@ PyArray_RegisterCanCast(PyArray_Descr *descr, int totype,
if (!PyTypeNum_ISUSERDEF(descr->type_num) &&
!PyTypeNum_ISUSERDEF(totype)) {
PyErr_SetString(PyExc_ValueError,
- "At least one of the types provided to"
+ "At least one of the types provided to "
"RegisterCanCast must be user-defined.");
return -1;
}
@@ -339,3 +372,185 @@ PyArray_RegisterCanCast(PyArray_Descr *descr, int totype,
return _append_new(&descr->f->cancastscalarkindto[scalar], totype);
}
}
+
+
+/*
+ * Legacy user DTypes implemented the common DType operation
+ * (as used in type promotion/result_type, and e.g. the type for
+ * concatenation), by using "safe cast" logic.
+ *
+ * New DTypes do have this behaviour generally, but we use can-cast
+ * when legacy user dtypes are involved.
+ */
+NPY_NO_EXPORT PyArray_DTypeMeta *
+legacy_userdtype_common_dtype_function(
+ PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other)
+{
+ int skind1 = NPY_NOSCALAR, skind2 = NPY_NOSCALAR, skind;
+
+ if (!other->legacy) {
+ /* legacy DTypes can always defer to new style ones */
+ Py_INCREF(Py_NotImplemented);
+ return (PyArray_DTypeMeta *)Py_NotImplemented;
+ }
+ /* Defer so that only one of the types handles the cast */
+ if (cls->type_num < other->type_num) {
+ Py_INCREF(Py_NotImplemented);
+ return (PyArray_DTypeMeta *)Py_NotImplemented;
+ }
+
+ /* Check whether casting is possible from one type to the other */
+ if (PyArray_CanCastSafely(cls->type_num, other->type_num)) {
+ Py_INCREF(other);
+ return other;
+ }
+ if (PyArray_CanCastSafely(other->type_num, cls->type_num)) {
+ Py_INCREF(cls);
+ return cls;
+ }
+
+ /*
+ * The following code used to be part of PyArray_PromoteTypes().
+ * We can expect that this code is never used.
+ * In principle, it allows for promotion of two different user dtypes
+ * to a single NumPy dtype of the same "kind". In practice,
+ * using the same `kind` as NumPy was never possible due to a
+ * simplification where `PyArray_EquivTypes(descr1, descr2)` will
+ * return True if both kind and element size match (e.g. bfloat16 and
+ * float16 would be equivalent).
+ * The option is also very obscure and not used in the examples.
+ */
+
+ /* Convert the 'kind' char into a scalar kind */
+ switch (cls->kind) {
+ case 'b':
+ skind1 = NPY_BOOL_SCALAR;
+ break;
+ case 'u':
+ skind1 = NPY_INTPOS_SCALAR;
+ break;
+ case 'i':
+ skind1 = NPY_INTNEG_SCALAR;
+ break;
+ case 'f':
+ skind1 = NPY_FLOAT_SCALAR;
+ break;
+ case 'c':
+ skind1 = NPY_COMPLEX_SCALAR;
+ break;
+ }
+ switch (other->kind) {
+ case 'b':
+ skind2 = NPY_BOOL_SCALAR;
+ break;
+ case 'u':
+ skind2 = NPY_INTPOS_SCALAR;
+ break;
+ case 'i':
+ skind2 = NPY_INTNEG_SCALAR;
+ break;
+ case 'f':
+ skind2 = NPY_FLOAT_SCALAR;
+ break;
+ case 'c':
+ skind2 = NPY_COMPLEX_SCALAR;
+ break;
+ }
+
+ /* If both are scalars, there may be a promotion possible */
+ if (skind1 != NPY_NOSCALAR && skind2 != NPY_NOSCALAR) {
+
+ /* Start with the larger scalar kind */
+ skind = (skind1 > skind2) ? skind1 : skind2;
+ int ret_type_num = _npy_smallest_type_of_kind_table[skind];
+
+ for (;;) {
+
+ /* If there is no larger type of this kind, try a larger kind */
+ if (ret_type_num < 0) {
+ ++skind;
+ /* Use -1 to signal no promoted type found */
+ if (skind < NPY_NSCALARKINDS) {
+ ret_type_num = _npy_smallest_type_of_kind_table[skind];
+ }
+ else {
+ break;
+ }
+ }
+
+ /* If we found a type to which we can promote both, done! */
+ if (PyArray_CanCastSafely(cls->type_num, ret_type_num) &&
+ PyArray_CanCastSafely(other->type_num, ret_type_num)) {
+ return PyArray_DTypeFromTypeNum(ret_type_num);
+ }
+
+ /* Try the next larger type of this kind */
+ ret_type_num = _npy_next_larger_type_table[ret_type_num];
+ }
+ }
+
+ Py_INCREF(Py_NotImplemented);
+ return (PyArray_DTypeMeta *)Py_NotImplemented;
+}
+
+
+/**
+ * This function wraps a legacy cast into an array-method. This is mostly
+ * used for legacy user-dtypes, but for example numeric to/from datetime
+ * casts were only defined that way as well.
+ *
+ * @param from
+ * @param to
+ * @param casting If negative, the casting level is deduced from the legacy
+ * can-cast logic; otherwise the provided casting is used.
+ */
+NPY_NO_EXPORT int
+PyArray_AddLegacyWrapping_CastingImpl(
+ PyArray_DTypeMeta *from, PyArray_DTypeMeta *to, NPY_CASTING casting)
+{
+ if (casting < 0) {
+ if (from == to) {
+ casting = NPY_NO_CASTING;
+ }
+ else if (PyArray_LegacyCanCastTypeTo(
+ from->singleton, to->singleton, NPY_SAFE_CASTING)) {
+ casting = NPY_SAFE_CASTING;
+ }
+ else if (PyArray_LegacyCanCastTypeTo(
+ from->singleton, to->singleton, NPY_SAME_KIND_CASTING)) {
+ casting = NPY_SAME_KIND_CASTING;
+ }
+ else {
+ casting = NPY_UNSAFE_CASTING;
+ }
+ }
+
+ PyArray_DTypeMeta *dtypes[2] = {from, to};
+ PyArrayMethod_Spec spec = {
+ /* Name is not actually used, but allows identifying these. */
+ .name = "legacy_cast",
+ .nin = 1,
+ .nout = 1,
+ .casting = casting,
+ .dtypes = dtypes,
+ };
+
+ if (from == to) {
+ spec.flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_SUPPORTS_UNALIGNED;
+ PyType_Slot slots[] = {
+ {NPY_METH_get_loop, NULL},
+ {NPY_METH_resolve_descriptors, &legacy_same_dtype_resolve_descriptors},
+ {0, NULL}};
+ spec.slots = slots;
+ return PyArray_AddCastingImplementation_FromSpec(&spec, 1);
+ }
+ else {
+ spec.flags = NPY_METH_REQUIRES_PYAPI;
+ PyType_Slot slots[] = {
+ {NPY_METH_get_loop, NULL},
+ {NPY_METH_resolve_descriptors, &simple_cast_resolve_descriptors},
+ {0, NULL}};
+ spec.slots = slots;
+ return PyArray_AddCastingImplementation_FromSpec(&spec, 1);
+ }
+}
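As a rough illustration of the fallback walk in `legacy_userdtype_common_dtype_function`, the kind-table promotion can be sketched on its own. In the sketch below, `smallest_of_kind_stub`, `next_larger_stub` and `can_cast_stub` are hypothetical stand-ins for `_npy_smallest_type_of_kind_table`, `_npy_next_larger_type_table` and `PyArray_CanCastSafely`; they are not NumPy API.

/*
 * Hedged sketch of the scalar-kind promotion walk used above.
 * The *_stub helpers are hypothetical stand-ins, not NumPy API.
 */
static int
promote_by_kind_sketch(int type1, int type2, int skind1, int skind2, int nkinds)
{
    int skind = (skind1 > skind2) ? skind1 : skind2;
    int candidate = smallest_of_kind_stub(skind);

    for (;;) {
        if (candidate < 0) {
            /* this kind is exhausted, move on to the next larger kind */
            if (++skind >= nkinds) {
                return -1;  /* no common promotion exists */
            }
            candidate = smallest_of_kind_stub(skind);
            continue;
        }
        if (can_cast_stub(type1, candidate) && can_cast_stub(type2, candidate)) {
            return candidate;  /* both inputs cast safely to the candidate */
        }
        candidate = next_larger_stub(candidate);
    }
}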
diff --git a/numpy/core/src/multiarray/usertypes.h b/numpy/core/src/multiarray/usertypes.h
index b3e386c5c..8b2fc80e6 100644
--- a/numpy/core/src/multiarray/usertypes.h
+++ b/numpy/core/src/multiarray/usertypes.h
@@ -1,6 +1,8 @@
#ifndef _NPY_PRIVATE_USERTYPES_H_
#define _NPY_PRIVATE_USERTYPES_H_
+#include "array_method.h"
+
extern NPY_NO_EXPORT PyArray_Descr **userdescrs;
NPY_NO_EXPORT void
@@ -17,4 +19,12 @@ NPY_NO_EXPORT int
PyArray_RegisterCastFunc(PyArray_Descr *descr, int totype,
PyArray_VectorUnaryFunc *castfunc);
+NPY_NO_EXPORT PyArray_DTypeMeta *
+legacy_userdtype_common_dtype_function(
+ PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other);
+
+NPY_NO_EXPORT int
+PyArray_AddLegacyWrapping_CastingImpl(
+ PyArray_DTypeMeta *from, PyArray_DTypeMeta *to, NPY_CASTING casting);
+
#endif
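A hedged sketch of how the new helper might be called when registering casts for a legacy user DType; `my_dtype_meta` and `other_dtype_meta` are hypothetical variables, and the negative casting value asks the helper to derive the level from the legacy can-cast tables (the `casting < 0` branch in the implementation).

/* Hedged usage sketch; the two DTypeMeta variables are hypothetical. */
static int
register_legacy_cast_sketch(PyArray_DTypeMeta *my_dtype_meta,
                            PyArray_DTypeMeta *other_dtype_meta)
{
    /* negative casting: derive safe/same-kind/unsafe from the legacy tables */
    if (PyArray_AddLegacyWrapping_CastingImpl(
            my_dtype_meta, other_dtype_meta, -1) < 0) {
        return -1;
    }
    return 0;
}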
diff --git a/numpy/core/src/npymath/npy_math_internal.h.src b/numpy/core/src/npymath/npy_math_internal.h.src
index 18b6d1434..ff4663dc3 100644
--- a/numpy/core/src/npymath/npy_math_internal.h.src
+++ b/numpy/core/src/npymath/npy_math_internal.h.src
@@ -398,8 +398,8 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x)
/**end repeat1**/
/**begin repeat1
- * #kind = atan2,hypot,pow,fmod,copysign#
- * #KIND = ATAN2,HYPOT,POW,FMOD,COPYSIGN#
+ * #kind = atan2,hypot,pow,copysign#
+ * #KIND = ATAN2,HYPOT,POW,COPYSIGN#
*/
#ifdef @kind@@c@
#undef @kind@@c@
@@ -412,6 +412,32 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
#endif
/**end repeat1**/
+/**begin repeat1
+ * #kind = fmod#
+ * #KIND = FMOD#
+ */
+#ifdef @kind@@c@
+#undef @kind@@c@
+#endif
+#ifndef HAVE_FMOD@C@
+NPY_INPLACE @type@
+npy_@kind@@c@(@type@ x, @type@ y)
+{
+ int are_inputs_inf = (npy_isinf(x) && npy_isinf(y));
+ /* force-set the invalid flag, since it is not raised by default on gcc < 8 */
+ if (npy_isnan(x) || npy_isnan(y)) {
+ npy_set_floatstatus_invalid();
+ }
+ if (are_inputs_inf || !y) {
+ if (!npy_isnan(x)) {
+ npy_set_floatstatus_invalid();
+ }
+ }
+ return (@type@) npy_@kind@((double)x, (double) y);
+}
+#endif
+/**end repeat1**/
+
#ifdef modf@c@
#undef modf@c@
#endif
@@ -473,8 +499,8 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x)
/**end repeat1**/
/**begin repeat1
- * #kind = atan2,hypot,pow,fmod,copysign#
- * #KIND = ATAN2,HYPOT,POW,FMOD,COPYSIGN#
+ * #kind = atan2,hypot,pow,copysign#
+ * #KIND = ATAN2,HYPOT,POW,COPYSIGN#
*/
#ifdef HAVE_@KIND@@C@
NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
@@ -484,6 +510,29 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
#endif
/**end repeat1**/
+/**begin repeat1
+ * #kind = fmod#
+ * #KIND = FMOD#
+ */
+#ifdef HAVE_FMOD@C@
+NPY_INPLACE @type@
+npy_@kind@@c@(@type@ x, @type@ y)
+{
+ int are_inputs_inf = (npy_isinf(x) && npy_isinf(y));
+ /* force-set the invalid flag, since it is not raised by default on gcc < 8 */
+ if (npy_isnan(x) || npy_isnan(y)) {
+ npy_set_floatstatus_invalid();
+ }
+ if (are_inputs_inf || !y) {
+ if (!npy_isnan(x)) {
+ npy_set_floatstatus_invalid();
+ }
+ }
+ return @kind@@c@(x, y);
+}
+#endif
+/**end repeat1**/
+
#ifdef HAVE_MODF@C@
NPY_INPLACE @type@ npy_modf@c@(@type@ x, @type@ *iptr)
{
@@ -625,6 +674,38 @@ NPY_INPLACE @type@ npy_logaddexp2@c@(@type@ x, @type@ y)
}
/*
+ * Wrapper function for remainder edge cases
+ * Internally calls npy_divmod*
+ */
+NPY_INPLACE @type@
+npy_remainder@c@(@type@ a, @type@ b)
+{
+ @type@ mod;
+ if (NPY_UNLIKELY(!b)) {
+ mod = npy_fmod@c@(a, b);
+ } else {
+ npy_divmod@c@(a, b, &mod);
+ }
+ return mod;
+}
+
+NPY_INPLACE @type@
+npy_floor_divide@c@(@type@ a, @type@ b) {
+ @type@ div, mod;
+ if (NPY_UNLIKELY(!b)) {
+ div = a / b;
+ if (!a || npy_isnan(a)) {
+ npy_set_floatstatus_invalid();
+ } else {
+ npy_set_floatstatus_divbyzero();
+ }
+ } else {
+ div = npy_divmod@c@(a, b, &mod);
+ }
+ return div;
+}
+
+/*
* Python version of divmod.
*
* The implementation is mostly copied from cpython 3.5.
@@ -634,12 +715,19 @@ npy_divmod@c@(@type@ a, @type@ b, @type@ *modulus)
{
@type@ div, mod, floordiv;
+ /* force-set the invalid flag, since it is not raised by default on gcc < 8 */
+ if (npy_isnan(a) || npy_isnan(b)) {
+ npy_set_floatstatus_invalid();
+ }
mod = npy_fmod@c@(a, b);
-
- if (!b) {
+ if (NPY_UNLIKELY(!b)) {
+ div = a / b;
+ if (a && !npy_isnan(a)) {
+ npy_set_floatstatus_divbyzero();
+ }
/* If b == 0, return result of fmod. For IEEE is nan */
*modulus = mod;
- return mod;
+ return div;
}
/* a - mod should be very nearly an integer multiple of b */
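To make the edge cases above concrete, here is a hedged sketch of the expected double-precision results, assuming IEEE-754 semantics and that the no-suffix double variants are exposed through `numpy/npy_math.h` like the existing `npy_divmod`.

#include <stdio.h>
#include <numpy/npy_math.h>

/* Hedged sketch of the edge cases handled by the wrappers above. */
int main(void)
{
    double mod;
    printf("%g\n", npy_remainder(1.0, 0.0));        /* nan, invalid flag set     */
    printf("%g\n", npy_floor_divide(1.0, 0.0));     /* inf, divide-by-zero flag  */
    printf("%g\n", npy_floor_divide(0.0, 0.0));     /* nan, invalid flag set     */
    printf("%g %g\n", npy_divmod(-1.0, 0.0, &mod),  /* -inf, divide-by-zero flag */
           mod);                                    /* nan (result of fmod)      */
    return 0;
}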
diff --git a/numpy/core/src/npymath/npy_math_private.h b/numpy/core/src/npymath/npy_math_private.h
index e4a919db6..212d11a0b 100644
--- a/numpy/core/src/npymath/npy_math_private.h
+++ b/numpy/core/src/npymath/npy_math_private.h
@@ -25,7 +25,6 @@
#include "npy_fpmath.h"
#include "numpy/npy_math.h"
-#include "numpy/npy_cpu.h"
#include "numpy/npy_endian.h"
#include "numpy/npy_common.h"
diff --git a/numpy/core/src/npysort/binsearch.c.src b/numpy/core/src/npysort/binsearch.c.src
index c04e197b7..41165897b 100644
--- a/numpy/core/src/npysort/binsearch.c.src
+++ b/numpy/core/src/npysort/binsearch.c.src
@@ -35,7 +35,7 @@
* #CMP = LT, LTE#
*/
-NPY_VISIBILITY_HIDDEN void
+NPY_NO_EXPORT void
binsearch_@side@_@suff@(const char *arr, const char *key, char *ret,
npy_intp arr_len, npy_intp key_len,
npy_intp arr_str, npy_intp key_str, npy_intp ret_str,
@@ -81,7 +81,7 @@ binsearch_@side@_@suff@(const char *arr, const char *key, char *ret,
}
}
-NPY_VISIBILITY_HIDDEN int
+NPY_NO_EXPORT int
argbinsearch_@side@_@suff@(const char *arr, const char *key,
const char *sort, char *ret,
npy_intp arr_len, npy_intp key_len,
@@ -153,7 +153,7 @@ argbinsearch_@side@_@suff@(const char *arr, const char *key,
* #CMP = <, <=#
*/
-NPY_VISIBILITY_HIDDEN void
+NPY_NO_EXPORT void
npy_binsearch_@side@(const char *arr, const char *key, char *ret,
npy_intp arr_len, npy_intp key_len,
npy_intp arr_str, npy_intp key_str, npy_intp ret_str,
@@ -195,7 +195,7 @@ npy_binsearch_@side@(const char *arr, const char *key, char *ret,
}
}
-NPY_VISIBILITY_HIDDEN int
+NPY_NO_EXPORT int
npy_argbinsearch_@side@(const char *arr, const char *key,
const char *sort, char *ret,
npy_intp arr_len, npy_intp key_len,
diff --git a/numpy/core/src/npysort/heapsort.c.src b/numpy/core/src/npysort/heapsort.c.src
index c2e3b63cb..4bfea1388 100644
--- a/numpy/core/src/npysort/heapsort.c.src
+++ b/numpy/core/src/npysort/heapsort.c.src
@@ -60,7 +60,7 @@
* npy_cdouble, npy_clongdouble, npy_datetime, npy_timedelta#
*/
-int
+NPY_NO_EXPORT int
heapsort_@suff@(void *start, npy_intp n, void *NOT_USED)
{
@type@ tmp, *a;
@@ -111,7 +111,7 @@ heapsort_@suff@(void *start, npy_intp n, void *NOT_USED)
}
-int
+NPY_NO_EXPORT int
aheapsort_@suff@(void *vv, npy_intp *tosort, npy_intp n, void *NOT_USED)
{
@type@ *v = vv;
@@ -177,7 +177,7 @@ aheapsort_@suff@(void *vv, npy_intp *tosort, npy_intp n, void *NOT_USED)
* #type = npy_char, npy_ucs4#
*/
-int
+NPY_NO_EXPORT int
heapsort_@suff@(void *start, npy_intp n, void *varr)
{
PyArrayObject *arr = varr;
@@ -231,7 +231,7 @@ heapsort_@suff@(void *start, npy_intp n, void *varr)
}
-int
+NPY_NO_EXPORT int
aheapsort_@suff@(void *vv, npy_intp *tosort, npy_intp n, void *varr)
{
@type@ *v = vv;
@@ -291,7 +291,7 @@ aheapsort_@suff@(void *vv, npy_intp *tosort, npy_intp n, void *varr)
*/
-int
+NPY_NO_EXPORT int
npy_heapsort(void *start, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -348,7 +348,7 @@ npy_heapsort(void *start, npy_intp num, void *varr)
}
-int
+NPY_NO_EXPORT int
npy_aheapsort(void *vv, npy_intp *tosort, npy_intp n, void *varr)
{
char *v = vv;
diff --git a/numpy/core/src/npysort/mergesort.c.src b/numpy/core/src/npysort/mergesort.c.src
index 6f659617a..f83fbf758 100644
--- a/numpy/core/src/npysort/mergesort.c.src
+++ b/numpy/core/src/npysort/mergesort.c.src
@@ -103,7 +103,7 @@ mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw)
}
-int
+NPY_NO_EXPORT int
mergesort_@suff@(void *start, npy_intp num, void *NOT_USED)
{
@type@ *pl, *pr, *pw;
@@ -166,7 +166,7 @@ amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw)
}
-int
+NPY_NO_EXPORT int
amergesort_@suff@(void *v, npy_intp *tosort, npy_intp num, void *NOT_USED)
{
npy_intp *pl, *pr, *pw;
@@ -245,7 +245,7 @@ mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw, @type@ *vp, size_t len)
}
-int
+NPY_NO_EXPORT int
mergesort_@suff@(void *start, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -326,7 +326,7 @@ amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw, size_t l
}
-int
+NPY_NO_EXPORT int
amergesort_@suff@(void *v, npy_intp *tosort, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -407,7 +407,7 @@ npy_mergesort0(char *pl, char *pr, char *pw, char *vp, npy_intp elsize,
}
-int
+NPY_NO_EXPORT int
npy_mergesort(void *start, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -485,7 +485,7 @@ npy_amergesort0(npy_intp *pl, npy_intp *pr, char *v, npy_intp *pw,
}
-int
+NPY_NO_EXPORT int
npy_amergesort(void *v, npy_intp *tosort, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
diff --git a/numpy/core/src/npysort/quicksort.c.src b/numpy/core/src/npysort/quicksort.c.src
index 49a2c4906..933f75808 100644
--- a/numpy/core/src/npysort/quicksort.c.src
+++ b/numpy/core/src/npysort/quicksort.c.src
@@ -85,7 +85,7 @@
* npy_cdouble, npy_clongdouble, npy_datetime, npy_timedelta#
*/
-int
+NPY_NO_EXPORT int
quicksort_@suff@(void *start, npy_intp num, void *NOT_USED)
{
@type@ vp;
@@ -160,7 +160,7 @@ stack_pop:
}
-int
+NPY_NO_EXPORT int
aquicksort_@suff@(void *vv, npy_intp* tosort, npy_intp num, void *NOT_USED)
{
@type@ *v = vv;
@@ -253,7 +253,7 @@ stack_pop:
* #type = npy_char, npy_ucs4#
*/
-int
+NPY_NO_EXPORT int
quicksort_@suff@(void *start, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -341,7 +341,7 @@ stack_pop:
}
-int
+NPY_NO_EXPORT int
aquicksort_@suff@(void *vv, npy_intp* tosort, npy_intp num, void *varr)
{
@type@ *v = vv;
@@ -434,7 +434,7 @@ stack_pop:
*/
-int
+NPY_NO_EXPORT int
npy_quicksort(void *start, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -539,7 +539,7 @@ stack_pop:
}
-int
+NPY_NO_EXPORT int
npy_aquicksort(void *vv, npy_intp* tosort, npy_intp num, void *varr)
{
char *v = vv;
diff --git a/numpy/core/src/npysort/radixsort.c.src b/numpy/core/src/npysort/radixsort.c.src
index 72887d7e4..99d8ed42a 100644
--- a/numpy/core/src/npysort/radixsort.c.src
+++ b/numpy/core/src/npysort/radixsort.c.src
@@ -46,7 +46,7 @@ nth_byte_@suff@(@type@ key, npy_intp l) {
return (key >> (l << 3)) & 0xFF;
}
-@type@*
+static @type@*
radixsort0_@suff@(@type@ *arr, @type@ *aux, npy_intp num)
{
npy_intp cnt[sizeof(@type@)][1 << 8] = { { 0 } };
@@ -95,7 +95,7 @@ radixsort0_@suff@(@type@ *arr, @type@ *aux, npy_intp num)
return arr;
}
-int
+NPY_NO_EXPORT int
radixsort_@suff@(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
void *sorted;
@@ -136,7 +136,7 @@ radixsort_@suff@(void *start, npy_intp num, void *NPY_UNUSED(varr))
return 0;
}
-npy_intp*
+static npy_intp*
aradixsort0_@suff@(@type@ *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
{
npy_intp cnt[sizeof(@type@)][1 << 8] = { { 0 } };
@@ -185,7 +185,7 @@ aradixsort0_@suff@(@type@ *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
return tosort;
}
-int
+NPY_NO_EXPORT int
aradixsort_@suff@(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
{
npy_intp *sorted;
diff --git a/numpy/core/src/npysort/selection.c.src b/numpy/core/src/npysort/selection.c.src
index be645450f..0e285b320 100644
--- a/numpy/core/src/npysort/selection.c.src
+++ b/numpy/core/src/npysort/selection.c.src
@@ -280,7 +280,7 @@ static int
* kth 8: 0 1 2 3 4 5 6 [8 7] -> stack []
*
*/
-int
+NPY_NO_EXPORT int
@name@introselect_@suff@(@type@ *v,
#if @arg@
npy_intp* tosort,
@@ -323,7 +323,8 @@ int
store_pivot(kth, kth, pivots, npiv);
return 0;
}
- else if (@inexact@ && kth == num - 1) {
+ // Parentheses around @inexact@ tell clang the dead code is intentional
+ else if ((@inexact@) && kth == num - 1) {
/* useful to check if NaN present via partition(d, (x, -1)) */
npy_intp k;
npy_intp maxidx = low;
diff --git a/numpy/core/src/npysort/timsort.c.src b/numpy/core/src/npysort/timsort.c.src
index 26313ca5b..3fdd46f61 100644
--- a/numpy/core/src/npysort/timsort.c.src
+++ b/numpy/core/src/npysort/timsort.c.src
@@ -42,7 +42,7 @@
-npy_intp compute_min_run(npy_intp num)
+static npy_intp compute_min_run(npy_intp num)
{
npy_intp r = 0;
@@ -476,7 +476,7 @@ force_collapse_@suff@(@type@ *arr, run *stack, npy_intp *stack_ptr,
}
-int
+NPY_NO_EXPORT int
timsort_@suff@(void *start, npy_intp num, void *NPY_UNUSED(varr))
{
int ret;
@@ -854,7 +854,7 @@ aforce_collapse_@suff@(@type@ *arr, npy_intp *tosort, run *stack,
}
-int
+NPY_NO_EXPORT int
atimsort_@suff@(void *v, npy_intp *tosort, npy_intp num,
void *NPY_UNUSED(varr))
{
@@ -904,7 +904,7 @@ cleanup:
* run length to reduce the cost of insertion sort.
*/
-npy_intp compute_min_run_short(npy_intp num)
+static npy_intp compute_min_run_short(npy_intp num)
{
npy_intp r = 0;
@@ -1303,7 +1303,7 @@ force_collapse_@suff@(@type@ *arr, run *stack, npy_intp *stack_ptr,
}
-int
+NPY_NO_EXPORT int
timsort_@suff@(void *start, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -1691,7 +1691,7 @@ aforce_collapse_@suff@(@type@ *arr, npy_intp *tosort, run *stack,
}
-int
+NPY_NO_EXPORT int
atimsort_@suff@(void *start, npy_intp *tosort, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -2128,7 +2128,7 @@ npy_force_collapse(char *arr, run *stack, npy_intp *stack_ptr,
}
-int
+NPY_NO_EXPORT int
npy_timsort(void *start, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
@@ -2524,7 +2524,7 @@ npy_aforce_collapse(char *arr, npy_intp *tosort, run *stack,
}
-int
+NPY_NO_EXPORT int
npy_atimsort(void *start, npy_intp *tosort, npy_intp num, void *varr)
{
PyArrayObject *arr = varr;
diff --git a/numpy/core/src/umath/_rational_tests.c.src b/numpy/core/src/umath/_rational_tests.c.src
index 13e33d0a5..7b1e5627a 100644
--- a/numpy/core/src/umath/_rational_tests.c.src
+++ b/numpy/core/src/umath/_rational_tests.c.src
@@ -406,8 +406,9 @@ pyrational_new(PyTypeObject* type, PyObject* args, PyObject* kwds) {
Py_INCREF(x[0]);
return x[0];
}
- else if (PyString_Check(x[0])) {
- const char* s = PyString_AS_STRING(x[0]);
+ // TODO: allow construction from unicode strings
+ else if (PyBytes_Check(x[0])) {
+ const char* s = PyBytes_AS_STRING(x[0]);
rational x;
if (scan_rational(&s,&x)) {
const char* p;
@@ -429,7 +430,7 @@ pyrational_new(PyTypeObject* type, PyObject* args, PyObject* kwds) {
PyObject* y;
int eq;
x[i] = PyTuple_GET_ITEM(args, i);
- n[i] = PyInt_AsLong(x[i]);
+ n[i] = PyLong_AsLong(x[i]);
if (error_converting(n[i])) {
if (PyErr_ExceptionMatches(PyExc_TypeError)) {
PyErr_Format(PyExc_TypeError,
@@ -440,7 +441,7 @@ pyrational_new(PyTypeObject* type, PyObject* args, PyObject* kwds) {
return 0;
}
/* Check that we had an exact integer */
- y = PyInt_FromLong(n[i]);
+ y = PyLong_FromLong(n[i]);
if (!y) {
return 0;
}
@@ -477,7 +478,7 @@ pyrational_new(PyTypeObject* type, PyObject* args, PyObject* kwds) {
else { \
PyObject* y_; \
int eq_; \
- long n_ = PyInt_AsLong(object); \
+ long n_ = PyLong_AsLong(object); \
if (error_converting(n_)) { \
if (PyErr_ExceptionMatches(PyExc_TypeError)) { \
PyErr_Clear(); \
@@ -486,7 +487,7 @@ pyrational_new(PyTypeObject* type, PyObject* args, PyObject* kwds) {
} \
return 0; \
} \
- y_ = PyInt_FromLong(n_); \
+ y_ = PyLong_FromLong(n_); \
if (!y_) { \
return 0; \
} \
@@ -526,11 +527,11 @@ static PyObject*
pyrational_repr(PyObject* self) {
rational x = ((PyRational*)self)->r;
if (d(x)!=1) {
- return PyUString_FromFormat(
+ return PyUnicode_FromFormat(
"rational(%ld,%ld)",(long)x.n,(long)d(x));
}
else {
- return PyUString_FromFormat(
+ return PyUnicode_FromFormat(
"rational(%ld)",(long)x.n);
}
}
@@ -539,11 +540,11 @@ static PyObject*
pyrational_str(PyObject* self) {
rational x = ((PyRational*)self)->r;
if (d(x)!=1) {
- return PyUString_FromFormat(
+ return PyUnicode_FromFormat(
"%ld/%ld",(long)x.n,(long)d(x));
}
else {
- return PyUString_FromFormat(
+ return PyUnicode_FromFormat(
"%ld",(long)x.n);
}
}
@@ -590,7 +591,7 @@ RATIONAL_BINOP_2(floor_divide,
}
RATIONAL_UNOP(negative,rational,rational_negative(x),PyRational_FromRational)
RATIONAL_UNOP(absolute,rational,rational_abs(x),PyRational_FromRational)
-RATIONAL_UNOP(int,long,rational_int(x),PyInt_FromLong)
+RATIONAL_UNOP(int,long,rational_int(x),PyLong_FromLong)
RATIONAL_UNOP(float,double,rational_double(x),PyFloat_FromDouble)
static PyObject*
@@ -646,12 +647,12 @@ static PyNumberMethods pyrational_as_number = {
static PyObject*
pyrational_n(PyObject* self, void* closure) {
- return PyInt_FromLong(((PyRational*)self)->r.n);
+ return PyLong_FromLong(((PyRational*)self)->r.n);
}
static PyObject*
pyrational_d(PyObject* self, void* closure) {
- return PyInt_FromLong(d(((PyRational*)self)->r));
+ return PyLong_FromLong(d(((PyRational*)self)->r));
}
static PyGetSetDef pyrational_getset[] = {
@@ -662,7 +663,7 @@ static PyGetSetDef pyrational_getset[] = {
static PyTypeObject PyRational_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
- "rational", /* tp_name */
+ "numpy.core._rational_tests.rational", /* tp_name */
sizeof(PyRational), /* tp_basicsize */
0, /* tp_itemsize */
0, /* tp_dealloc */
@@ -726,17 +727,17 @@ npyrational_setitem(PyObject* item, void* data, void* arr) {
r = ((PyRational*)item)->r;
}
else {
- long n = PyInt_AsLong(item);
+ long long n = PyLong_AsLongLong(item);
PyObject* y;
int eq;
if (error_converting(n)) {
return -1;
}
- y = PyInt_FromLong(n);
+ y = PyLong_FromLongLong(n);
if (!y) {
return -1;
}
- eq = PyObject_RichCompareBool(item,y,Py_EQ);
+ eq = PyObject_RichCompareBool(item, y, Py_EQ);
Py_DECREF(y);
if (eq<0) {
return -1;
@@ -748,7 +749,7 @@ npyrational_setitem(PyObject* item, void* data, void* arr) {
}
r = make_rational_int(n);
}
- memcpy(data,&r,sizeof(rational));
+ memcpy(data, &r, sizeof(rational));
return 0;
}
@@ -1126,7 +1127,7 @@ PyMODINIT_FUNC PyInit__rational_tests(void) {
if (PyErr_Occurred()) {
goto fail;
}
- numpy_str = PyUString_FromString("numpy");
+ numpy_str = PyUnicode_FromString("numpy");
if (!numpy_str) {
goto fail;
}
diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src
index d08aabd64..750fbeb92 100644
--- a/numpy/core/src/umath/_umath_tests.c.src
+++ b/numpy/core/src/umath/_umath_tests.c.src
@@ -461,6 +461,15 @@ addUfuncs(PyObject *dictionary) {
PyDict_SetItemString(dictionary, "cross1d", f);
Py_DECREF(f);
+ f = PyUFunc_FromFuncAndDataAndSignature(NULL, NULL,
+ NULL, 0, 0, 0, PyUFunc_None, "_pickleable_module_global.ufunc",
+ "A dotted name for pickle testing, does nothing.", 0, NULL);
+ if (f == NULL) {
+ return -1;
+ }
+ PyDict_SetItemString(dictionary, "_pickleable_module_global_ufunc", f);
+ Py_DECREF(f);
+
return 0;
}
@@ -480,7 +489,7 @@ UMath_Tests_test_signature(PyObject *NPY_UNUSED(dummy), PyObject *args)
return NULL;
}
- if (PyString_Check(signature)) {
+ if (PyBytes_Check(signature)) {
sig_str = signature;
} else if (PyUnicode_Check(signature)) {
sig_str = PyUnicode_AsUTF8String(signature);
@@ -493,7 +502,7 @@ UMath_Tests_test_signature(PyObject *NPY_UNUSED(dummy), PyObject *args)
NULL, NULL, NULL,
0, nin, nout, PyUFunc_None, "no name",
"doc:none",
- 1, PyString_AS_STRING(sig_str));
+ 1, PyBytes_AS_STRING(sig_str));
if (sig_str != signature) {
Py_DECREF(sig_str);
}
@@ -588,11 +597,11 @@ static PyObject *
UMath_Tests_test_dispatch(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(dummy2))
{
const char *highest_func, *highest_var;
- NPY_CPU_DISPATCH_CALL(highest_func = _umath_tests_dispatch_func, ())
- NPY_CPU_DISPATCH_CALL(highest_var = _umath_tests_dispatch_var)
+ NPY_CPU_DISPATCH_CALL(highest_func = _umath_tests_dispatch_func, ());
+ NPY_CPU_DISPATCH_CALL(highest_var = _umath_tests_dispatch_var);
const char *highest_func_xb = "nobase", *highest_var_xb = "nobase";
- NPY_CPU_DISPATCH_CALL_XB(highest_func_xb = _umath_tests_dispatch_func, ())
- NPY_CPU_DISPATCH_CALL_XB(highest_var_xb = _umath_tests_dispatch_var)
+ NPY_CPU_DISPATCH_CALL_XB(highest_func_xb = _umath_tests_dispatch_func, ());
+ NPY_CPU_DISPATCH_CALL_XB(highest_var_xb = _umath_tests_dispatch_var);
PyObject *dict = PyDict_New(), *item;
if (dict == NULL) {
@@ -610,7 +619,7 @@ UMath_Tests_test_dispatch(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(dumm
if (item == NULL || PyDict_SetItemString(dict, "all", item) < 0) {
goto err;
}
- NPY_CPU_DISPATCH_CALL_ALL(_umath_tests_dispatch_attach, (item))
+ NPY_CPU_DISPATCH_CALL_ALL(_umath_tests_dispatch_attach, (item));
if (PyErr_Occurred()) {
goto err;
}
@@ -671,7 +680,7 @@ PyMODINIT_FUNC PyInit__umath_tests(void) {
d = PyModule_GetDict(m);
- version = PyString_FromString("0.1");
+ version = PyUnicode_FromString("0.1");
PyDict_SetItemString(d, "__version__", version);
Py_DECREF(version);
diff --git a/numpy/core/src/umath/extobj.c b/numpy/core/src/umath/extobj.c
index 3404a0c6a..cd81f7734 100644
--- a/numpy/core/src/umath/extobj.c
+++ b/numpy/core/src/umath/extobj.c
@@ -109,8 +109,8 @@ _error_handler(int method, PyObject *errobj, char *errtype, int retstatus, int *
errtype, name);
goto fail;
}
- args = Py_BuildValue("NN", PyUString_FromString(errtype),
- PyInt_FromLong((long) retstatus));
+ args = Py_BuildValue("NN", PyUnicode_FromString(errtype),
+ PyLong_FromLong((long) retstatus));
if (args == NULL) {
goto fail;
}
@@ -212,7 +212,7 @@ _extract_pyvals(PyObject *ref, const char *name, int *bufsize,
}
if (bufsize != NULL) {
- *bufsize = PyInt_AsLong(PyList_GET_ITEM(ref, 0));
+ *bufsize = PyLong_AsLong(PyList_GET_ITEM(ref, 0));
if (error_converting(*bufsize)) {
return -1;
}
@@ -229,7 +229,7 @@ _extract_pyvals(PyObject *ref, const char *name, int *bufsize,
}
if (errmask != NULL) {
- *errmask = PyInt_AsLong(PyList_GET_ITEM(ref, 1));
+ *errmask = PyLong_AsLong(PyList_GET_ITEM(ref, 1));
if (*errmask < 0) {
if (PyErr_Occurred()) {
return -1;
diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h
index 74bf01643..5c22c6f1c 100644
--- a/numpy/core/src/umath/fast_loop_macros.h
+++ b/numpy/core/src/umath/fast_loop_macros.h
@@ -46,14 +46,20 @@ abs_ptrdiff(char *a, char *b)
npy_intp i;\
for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)
-/** (ip1, ip2) -> (op1) */
-#define BINARY_LOOP\
+#define BINARY_DEFS\
char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
npy_intp n = dimensions[0];\
npy_intp i;\
+
+#define BINARY_LOOP_SLIDING\
for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1)
+/** (ip1, ip2) -> (op1) */
+#define BINARY_LOOP\
+ BINARY_DEFS\
+ BINARY_LOOP_SLIDING
+
/** (ip1, ip2) -> (op1, op2) */
#define BINARY_LOOP_TWO_OUT\
char *ip1 = args[0], *ip2 = args[1], *op1 = args[2], *op2 = args[3];\
@@ -155,10 +161,7 @@ abs_ptrdiff(char *a, char *b)
#define IVDEP_LOOP
#endif
#define BASE_BINARY_LOOP_INP(tin, tout, op) \
- char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
- npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
- npy_intp n = dimensions[0];\
- npy_intp i;\
+ BINARY_DEFS\
IVDEP_LOOP \
for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1) { \
const tin in1 = *(tin *)ip1; \
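A minimal hedged sketch of how the split is meant to be used: `BINARY_DEFS` declares the pointers, strides and count once, work such as scalar-divisor checks can then be hoisted, and `BINARY_LOOP_SLIDING` performs the usual striding walk. `npy_int32` stands in for the templated `@type@`; this is not an actual NumPy loop.

static void
example_binary_loop(char **args, npy_intp const *dimensions, npy_intp const *steps)
{
    BINARY_DEFS                       /* declares ip1/ip2/op1, strides and n */

    if (n == 0) {
        return;                       /* hoisted out of the striding loop */
    }
    BINARY_LOOP_SLIDING {             /* the usual (ip1, ip2) -> (op1) walk */
        const npy_int32 in1 = *(npy_int32 *)ip1;
        const npy_int32 in2 = *(npy_int32 *)ip2;
        *(npy_int32 *)op1 = in1 + in2;
    }
}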
diff --git a/numpy/core/src/umath/funcs.inc.src b/numpy/core/src/umath/funcs.inc.src
index 273779ee8..9b04dc779 100644
--- a/numpy/core/src/umath/funcs.inc.src
+++ b/numpy/core/src/umath/funcs.inc.src
@@ -26,13 +26,13 @@ Py_square(PyObject *o)
static PyObject *
Py_get_one(PyObject *NPY_UNUSED(o))
{
- return PyInt_FromLong(1);
+ return PyLong_FromLong(1);
}
static PyObject *
Py_reciprocal(PyObject *o)
{
- PyObject *one = PyInt_FromLong(1);
+ PyObject *one = PyLong_FromLong(1);
PyObject *result;
if (!one) {
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 0cfa1cea7..6403efaee 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -20,6 +20,9 @@
#include <string.h> /* for memchr */
+/* Use Libdivide for faster division */
+#include "numpy/libdivide/libdivide.h"
+
/*
* cutoff blocksize for pairwise summation
* decreasing it decreases errors slightly as more pairs are summed but
@@ -570,7 +573,7 @@ NPY_NO_EXPORT void
/**begin repeat1
* #isa = , _avx2#
* #ISA = , AVX2#
- * #CHK = 1, HAVE_ATTRIBUTE_TARGET_AVX2#
+ * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)#
* #ATTR = , NPY_GCC_TARGET_AVX2#
*/
@@ -840,28 +843,88 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
}
+/* libdivide only supports 32- and 64-bit types,
+ * so we pick the best fit for each integer type. */
+#if NPY_BITSOF_@TYPE@ <= 32
+#define libdivide_@type@_t libdivide_s32_t
+#define libdivide_@type@_gen libdivide_s32_gen
+#define libdivide_@type@_do libdivide_s32_do
+#else
+#define libdivide_@type@_t libdivide_s64_t
+#define libdivide_@type@_gen libdivide_s64_gen
+#define libdivide_@type@_do libdivide_s64_do
+#endif
+
NPY_NO_EXPORT void
@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- BINARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
+ BINARY_DEFS
+
+ /* When the divisor is a constant, use libdivide for faster division */
+ if (steps[1] == 0) {
+ /* In case of empty array, just return */
+ if (n == 0) {
+ return;
+ }
+
const @type@ in2 = *(@type@ *)ip2;
- /*
- * FIXME: On x86 at least, dividing the smallest representable integer
- * by -1 causes a SIFGPE (division overflow). We treat this case here
- * (to avoid a SIGFPE crash at python level), but a good solution would
- * be to treat integer division problems separately from FPU exceptions
- * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
- */
- if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
+
+ /* If divisor is 0, we need not compute anything */
+ if (in2 == 0) {
npy_set_floatstatus_divbyzero();
- *((@type@ *)op1) = 0;
- }
- else if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
- *((@type@ *)op1) = in1/in2 - 1;
+ BINARY_LOOP_SLIDING {
+ *((@type@ *)op1) = 0;
+ }
}
else {
- *((@type@ *)op1) = in1/in2;
+ struct libdivide_@type@_t fast_d = libdivide_@type@_gen(in2);
+ BINARY_LOOP_SLIDING {
+ const @type@ in1 = *(@type@ *)ip1;
+ /*
+ * FIXME: On x86 at least, dividing the smallest representable integer
+ * by -1 causes a SIGFPE (division overflow). We treat this case here
+ * (to avoid a SIGFPE crash at python level), but a good solution would
+ * be to treat integer division problems separately from FPU exceptions
+ * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+ */
+ if (in1 == NPY_MIN_@TYPE@ && in2 == -1) {
+ npy_set_floatstatus_divbyzero();
+ *((@type@ *)op1) = 0;
+ }
+ else {
+ *((@type@ *)op1) = libdivide_@type@_do(in1, &fast_d);
+
+ /* Negative quotients need to be rounded down */
+ if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) {
+ *((@type@ *)op1) = *((@type@ *)op1) - 1;
+ }
+ }
+ }
+ }
+ }
+ else {
+ BINARY_LOOP_SLIDING {
+ const @type@ in1 = *(@type@ *)ip1;
+ const @type@ in2 = *(@type@ *)ip2;
+ /*
+ * FIXME: On x86 at least, dividing the smallest representable integer
+ * by -1 causes a SIGFPE (division overflow). We treat this case here
+ * (to avoid a SIGFPE crash at python level), but a good solution would
+ * be to treat integer division problems separately from FPU exceptions
+ * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+ */
+ if (in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1)) {
+ npy_set_floatstatus_divbyzero();
+ *((@type@ *)op1) = 0;
+ }
+ else {
+ *((@type@ *)op1) = in1/in2;
+
+ /* Negative quotients need to be rounded down */
+ if (((in1 > 0) != (in2 > 0)) && (*((@type@ *)op1) * in2 != in1)) {
+ *((@type@ *)op1) = *((@type@ *)op1) - 1;
+ }
+ }
}
}
}
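The core of the `steps[1] == 0` fast path above, as a hedged sketch outside the ufunc machinery (floor rounding and the overflow guard are omitted; the divisor must be nonzero, as the loop above checks first).

#include <stdint.h>
#include <stddef.h>
#include "numpy/libdivide/libdivide.h"

/* Hedged sketch: divide an array by one nonzero constant with libdivide. */
static void
divide_by_constant(const int64_t *src, int64_t *dst, size_t n, int64_t divisor)
{
    /* generate the magic-number structure once per divisor */
    struct libdivide_s64_t fast_d = libdivide_s64_gen(divisor);
    for (size_t i = 0; i < n; i++) {
        dst[i] = libdivide_s64_do(src[i], &fast_d);  /* src[i] / divisor */
    }
}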
@@ -1352,14 +1415,48 @@ TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const
NPY_NO_EXPORT void
TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- BINARY_LOOP {
- const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ /* NOTE: This code is similar to array floor divide */
+ BINARY_DEFS
+
+ /* When the divisor is a constant, use libdivide for faster division */
+ if (steps[1] == 0) {
+ /* In case of empty array, just return */
+ if (n == 0) {
+ return;
+ }
+
const npy_int64 in2 = *(npy_int64 *)ip2;
- if (in1 == NPY_DATETIME_NAT || in2 == 0) {
- *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+
+ /* If divisor is 0, we need not compute anything */
+ if (in2 == 0) {
+ npy_set_floatstatus_divbyzero();
+ BINARY_LOOP_SLIDING {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
}
else {
- *((npy_timedelta *)op1) = in1 / in2;
+ struct libdivide_s64_t fast_d = libdivide_s64_gen(in2);
+ BINARY_LOOP_SLIDING {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ if (in1 == NPY_DATETIME_NAT) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ *((npy_timedelta *)op1) = libdivide_s64_do(in1, &fast_d);
+ }
+ }
+ }
+ }
+ else {
+ BINARY_LOOP_SLIDING {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const npy_int64 in2 = *(npy_int64 *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == 0) {
+ *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+ }
+ else {
+ *((npy_timedelta *)op1) = in1 / in2;
+ }
}
}
}
@@ -1431,23 +1528,69 @@ TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const
NPY_NO_EXPORT void
TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- BINARY_LOOP {
- const npy_timedelta in1 = *(npy_timedelta *)ip1;
- const npy_timedelta in2 = *(npy_timedelta *)ip2;
- if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
- npy_set_floatstatus_invalid();
- *((npy_int64 *)op1) = 0;
+ /* NOTE: This code is similar to array floor divide */
+ BINARY_DEFS
+
+ /* When the divisor is a constant, use libdivide for faster division */
+ if (steps[1] == 0) {
+ /* In case of empty array, just return */
+ if (n == 0) {
+ return;
}
- else if (in2 == 0) {
+
+ const npy_timedelta in2 = *(npy_timedelta *)ip2;
+
+ /* If divisor is 0 or NAT, we need not compute anything */
+ if (in2 == 0) {
npy_set_floatstatus_divbyzero();
- *((npy_int64 *)op1) = 0;
+ BINARY_LOOP_SLIDING {
+ *((npy_int64 *)op1) = 0;
+ }
+ }
+ else if (in2 == NPY_DATETIME_NAT) {
+ npy_set_floatstatus_invalid();
+ BINARY_LOOP_SLIDING {
+ *((npy_int64 *)op1) = 0;
+ }
}
else {
- if (((in1 > 0) != (in2 > 0)) && (in1 % in2 != 0)) {
- *((npy_int64 *)op1) = in1/in2 - 1;
+ struct libdivide_s64_t fast_d = libdivide_s64_gen(in2);
+ BINARY_LOOP_SLIDING {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ if (in1 == NPY_DATETIME_NAT) {
+ npy_set_floatstatus_invalid();
+ *((npy_int64 *)op1) = 0;
+ }
+ else {
+ *((npy_int64 *)op1) = libdivide_s64_do(in1, &fast_d);
+
+ /* Negative quotients need to be rounded down */
+ if (((in1 > 0) != (in2 > 0)) && (*((npy_int64 *)op1) * in2 != in1)) {
+ *((npy_int64 *)op1) = *((npy_int64 *)op1) - 1;
+ }
+ }
+ }
+ }
+ }
+ else {
+ BINARY_LOOP_SLIDING {
+ const npy_timedelta in1 = *(npy_timedelta *)ip1;
+ const npy_timedelta in2 = *(npy_timedelta *)ip2;
+ if (in1 == NPY_DATETIME_NAT || in2 == NPY_DATETIME_NAT) {
+ npy_set_floatstatus_invalid();
+ *((npy_int64 *)op1) = 0;
+ }
+ else if (in2 == 0) {
+ npy_set_floatstatus_divbyzero();
+ *((npy_int64 *)op1) = 0;
}
else {
*((npy_int64 *)op1) = in1/in2;
+
+ /* Negative quotients need to be rounded down */
+ if (((in1 > 0) != (in2 > 0)) && (*((npy_int64 *)op1) * in2 != in1)) {
+ *((npy_int64 *)op1) = *((npy_int64 *)op1) - 1;
+ }
}
}
}
@@ -1491,26 +1634,6 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
*/
/**begin repeat
- * Float types
- * #type = npy_float, npy_double#
- * #TYPE = FLOAT, DOUBLE#
- * #scalarf = npy_sqrtf, npy_sqrt#
- */
-
-NPY_NO_EXPORT void
-@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (!run_unary_simd_sqrt_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = @scalarf@(in1);
- }
- }
-}
-
-/**end repeat**/
-
-/**begin repeat
* #func = rint, ceil, floor, trunc#
* #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
*/
@@ -1558,6 +1681,14 @@ DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void
*(npy_double *)op1 = npy_exp(in1);
}
}
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP {
+ const npy_double in1 = *(npy_double *)ip1;
+ *(npy_double *)op1 = npy_log(in1);
+ }
+}
/**begin repeat
* #isa = avx512f, fma#
@@ -1571,53 +1702,6 @@ DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void
* #typesub = f, #
*/
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_sqrt_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- if (!run_unary_@isa@_sqrt_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = npy_sqrt@typesub@(in1);
- }
- }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_absolute_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- if (!run_unary_@isa@_absolute_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- const @type@ tmp = in1 > 0 ? in1 : -in1;
- /* add 0 to clear -0.0 */
- *((@type@ *)op1) = tmp + 0;
- }
- }
- npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_square_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- if (!run_unary_@isa@_square_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = in1*in1;
- }
- }
-}
-
-NPY_NO_EXPORT NPY_GCC_OPT_3 void
-@TYPE@_reciprocal_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- if (!run_unary_@isa@_reciprocal_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *(@type@ *)op1 = 1.0f/in1;
- }
- }
-}
-
/**begin repeat2
* #func = rint, ceil, floor, trunc#
* #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
@@ -1700,6 +1784,16 @@ DOUBLE_exp_avx512f(char **args, npy_intp const *dimensions, npy_intp const *step
}
}
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ if (!run_unary_avx512f_log_DOUBLE(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const npy_double in1 = *(npy_double *)ip1;
+ *(npy_double *)op1 = npy_log(in1);
+ }
+ }
+}
/**begin repeat
* Float types
@@ -2004,8 +2098,7 @@ NPY_NO_EXPORT void
BINARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
const @type@ in2 = *(@type@ *)ip2;
- @type@ mod;
- *((@type@ *)op1) = npy_divmod@c@(in1, in2, &mod);
+ *((@type@ *)op1) = npy_floor_divide@c@(in1, in2);
}
}
@@ -2015,7 +2108,7 @@ NPY_NO_EXPORT void
BINARY_LOOP {
const @type@ in1 = *(@type@ *)ip1;
const @type@ in2 = *(@type@ *)ip2;
- npy_divmod@c@(in1, in2, (@type@ *)op1);
+ *((@type@ *) op1) = npy_remainder@c@(in1, in2);
}
}
@@ -2030,33 +2123,6 @@ NPY_NO_EXPORT void
}
NPY_NO_EXPORT void
-@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- char * margs[] = {args[0], args[0], args[1]};
- npy_intp msteps[] = {steps[0], steps[0], steps[1]};
- if (!run_binary_simd_multiply_@TYPE@(margs, dimensions, msteps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *((@type@ *)op1) = in1*in1;
- }
- }
-}
-
-NPY_NO_EXPORT void
-@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
-{
- @type@ one = 1.@c@;
- char * margs[] = {(char*)&one, args[0], args[1]};
- npy_intp msteps[] = {0, steps[0], steps[1]};
- if (!run_binary_simd_divide_@TYPE@(margs, dimensions, msteps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *((@type@ *)op1) = 1/in1;
- }
- }
-}
-
-NPY_NO_EXPORT void
@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
OUTPUT_LOOP {
@@ -2074,20 +2140,6 @@ NPY_NO_EXPORT void
}
NPY_NO_EXPORT void
-@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (!run_unary_simd_absolute_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- const @type@ tmp = in1 > 0 ? in1 : -in1;
- /* add 0 to clear -0.0 */
- *((@type@ *)op1) = tmp + 0;
- }
- }
- npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
-NPY_NO_EXPORT void
@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
if (!run_unary_simd_negative_@TYPE@(args, dimensions, steps)) {
@@ -2198,6 +2250,42 @@ NPY_NO_EXPORT void
/*
*****************************************************************************
+ ** LONGDOUBLE LOOPS **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void
+LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP {
+ const npy_longdouble in1 = *(npy_longdouble*)ip1;
+ *((npy_longdouble *)op1) = 1/in1;
+ }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ UNARY_LOOP {
+ const npy_longdouble in1 = *(npy_longdouble *)ip1;
+ const npy_longdouble tmp = in1 > 0 ? in1 : -in1;
+ /* add 0 to clear -0.0 */
+ *((npy_longdouble *)op1) = tmp + 0;
+ }
+ npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP {
+ const npy_longdouble in1 = *(npy_longdouble *)ip1;
+ *((npy_longdouble *)op1) = in1*in1;
+ }
+}
+
+/*
+ *****************************************************************************
** HALF-FLOAT LOOPS **
*****************************************************************************
*/
@@ -2360,8 +2448,13 @@ HALF_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps
BINARY_LOOP {
const npy_half in1 = *(npy_half *)ip1;
const npy_half in2 = *(npy_half *)ip2;
- npy_half mod;
- *((npy_half *)op1) = npy_half_divmod(in1, in2, &mod);
+
+ float fh1 = npy_half_to_float(in1);
+ float fh2 = npy_half_to_float(in2);
+ float div;
+
+ div = npy_floor_dividef(fh1, fh2);
+ *((npy_half *)op1) = npy_float_to_half(div);
}
}
@@ -2371,7 +2464,11 @@ HALF_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, v
BINARY_LOOP {
const npy_half in1 = *(npy_half *)ip1;
const npy_half in2 = *(npy_half *)ip2;
- npy_half_divmod(in1, in2, (npy_half *)op1);
+ float fh1 = npy_half_to_float(in1);
+ float fh2 = npy_half_to_float(in2);
+ float mod;
+ mod = npy_remainderf(fh1, fh2);
+ *((npy_half *)op1) = npy_float_to_half(mod);
}
}
@@ -2629,7 +2726,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_intp n,
NPY_NO_EXPORT void
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
- if (IS_BINARY_REDUCE && @PW@) {
+ // Parentheses around @PW@ tell clang the dead code is intentional
+ if (IS_BINARY_REDUCE && (@PW@)) {
npy_intp n = dimensions[0];
@ftype@ * or = ((@ftype@ *)args[0]);
@ftype@ * oi = ((@ftype@ *)args[0]) + 1;
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 5dd49c465..a0b68d168 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -6,6 +6,10 @@
#ifndef _NPY_UMATH_LOOPS_H_
#define _NPY_UMATH_LOOPS_H_
+#ifndef NPY_NO_EXPORT
+ #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
+#endif
+
#define BOOL_invert BOOL_logical_not
#define BOOL_add BOOL_logical_or
#define BOOL_bitwise_and BOOL_logical_and
@@ -167,32 +171,29 @@ NPY_NO_EXPORT void
** FLOAT LOOPS **
*****************************************************************************
*/
-
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_unary_fp.dispatch.h"
+#endif
/**begin repeat
* #TYPE = FLOAT, DOUBLE#
*/
-NPY_NO_EXPORT void
-@TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
/**begin repeat1
- * #func = maximum, minimum#
+ * #kind = sqrt, absolute, square, reciprocal#
*/
-NPY_NO_EXPORT void
-@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
/**end repeat1**/
+/**end repeat**/
-/**begin repeat1
- * #isa = avx512f, fma#
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
*/
-
-/**begin repeat2
- * #func = sqrt, absolute, square, reciprocal#
+/**begin repeat1
+ * #func = maximum, minimum#
*/
NPY_NO_EXPORT void
-@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-/**end repeat2**/
/**end repeat1**/
/**end repeat**/
@@ -202,6 +203,12 @@ DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void
NPY_NO_EXPORT void
DOUBLE_exp_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
/**begin repeat
* #func = sin, cos, exp, log#
*/
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
new file mode 100644
index 000000000..3a1ea82f9
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -0,0 +1,219 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 vsx2 neon
+ **/
+/**
+ * Force the use of SSE only on x86, even if AVX2 or AVX512F are enabled
+ * through the baseline, since scatter (AVX512F) and gather are very costly
+ * for handling non-contiguous memory access compared with SSE for the
+ * small operations this file covers.
+*/
+#define NPY_SIMD_FORCE_128
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+/**********************************************************
+ ** Scalars
+ **********************************************************/
+#if !NPY_SIMD
+NPY_FINLINE float c_recip_f32(float a)
+{ return 1.0f / a; }
+NPY_FINLINE float c_abs_f32(float a)
+{
+ const float tmp = a > 0 ? a : -a;
+ /* add 0 to clear -0.0 */
+ return tmp + 0;
+}
+NPY_FINLINE float c_square_f32(float a)
+{ return a * a; }
+#endif // !NPY_SIMD
+
+#if !NPY_SIMD_F64
+NPY_FINLINE double c_recip_f64(double a)
+{ return 1.0 / a; }
+NPY_FINLINE double c_abs_f64(double a)
+{
+ const double tmp = a > 0 ? a : -a;
+ /* add 0 to clear -0.0 */
+ return tmp + 0;
+}
+NPY_FINLINE double c_square_f64(double a)
+{ return a * a; }
+#endif // !NPY_SIMD_F64
+/**
+ * MSVC (32-bit mode) requires a clarified contiguous loop
+ * in order to use SSE; otherwise it uses a software version of square root
+ * that doesn't raise a domain error.
+ */
+#if defined(_MSC_VER) && defined(_M_IX86) && !NPY_SIMD
+ #include <emmintrin.h>
+ NPY_FINLINE float c_sqrt_f32(float _a)
+ {
+ __m128 a = _mm_load_ss(&_a);
+ __m128 lower = _mm_sqrt_ss(a);
+ return _mm_cvtss_f32(lower);
+ }
+ NPY_FINLINE double c_sqrt_f64(double _a)
+ {
+ __m128d a = _mm_load_sd(&_a);
+ __m128d lower = _mm_sqrt_pd(a);
+ return _mm_cvtsd_f64(lower);
+ }
+#else
+ #define c_sqrt_f32 npy_sqrtf
+ #define c_sqrt_f64 npy_sqrt
+#endif
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/** Notes:
+ * - avoid the use of libm, to unify fp/domain errors
+ * for both scalars and vectors across all compilers/architectures.
+ * - use the intrinsic npyv_load_till_* instead of npyv_load_tillz_
+ * to fill the remaining lanes with 1.0, avoiding a divide-by-zero fp
+ * exception in reciprocal.
+ */
+#define CONTIG 0
+#define NCONTIG 1
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ */
+#if @VCHK@
+/**begin repeat1
+ * #kind = sqrt, absolute, square, reciprocal#
+ * #intr = sqrt, abs, square, recip#
+ * #repl_0w1 = 0, 0, 0, 1#
+ */
+/**begin repeat2
+ * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG#
+ * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG#
+ * #unroll = 4, 4, 2, 2#
+ */
+static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+ const npyv_lanetype_@sfx@ *src = _src;
+ npyv_lanetype_@sfx@ *dst = _dst;
+
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * @unroll@;
+ for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+ /**begin repeat3
+ * #N = 0, 1, 2, 3#
+ */
+ #if @unroll@ > @N@
+ #if @STYPE@ == CONTIG
+ npyv_@sfx@ v_src@N@ = npyv_load_@sfx@(src + vstep*@N@);
+ #else
+ npyv_@sfx@ v_src@N@ = npyv_loadn_@sfx@(src + ssrc*vstep*@N@, ssrc);
+ #endif
+ npyv_@sfx@ v_unary@N@ = npyv_@intr@_@sfx@(v_src@N@);
+ #endif
+ /**end repeat3**/
+ /**begin repeat3
+ * #N = 0, 1, 2, 3#
+ */
+ #if @unroll@ > @N@
+ #if @DTYPE@ == CONTIG
+ npyv_store_@sfx@(dst + vstep*@N@, v_unary@N@);
+ #else
+ npyv_storen_@sfx@(dst + sdst*vstep*@N@, sdst, v_unary@N@);
+ #endif
+ #endif
+ /**end repeat3**/
+ }
+ for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ #if @STYPE@ == CONTIG
+ #if @repl_0w1@
+ npyv_@sfx@ v_src0 = npyv_load_till_@sfx@(src, len, 1);
+ #else
+ npyv_@sfx@ v_src0 = npyv_load_tillz_@sfx@(src, len);
+ #endif
+ #else
+ #if @repl_0w1@
+ npyv_@sfx@ v_src0 = npyv_loadn_till_@sfx@(src, ssrc, len, 1);
+ #else
+ npyv_@sfx@ v_src0 = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+ #endif
+ #endif
+ npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0);
+ #if @DTYPE@ == CONTIG
+ npyv_store_till_@sfx@(dst, len, v_unary0);
+ #else
+ npyv_storen_till_@sfx@(dst, sdst, len, v_unary0);
+ #endif
+ }
+ npyv_cleanup();
+}
+/**end repeat2**/
+/**end repeat1**/
+#endif // @VCHK@
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD, NPY_SIMD_F64#
+ */
+/**begin repeat1
+ * #kind = sqrt, absolute, square, reciprocal#
+ * #intr = sqrt, abs, square, recip#
+ * #clear = 0, 1, 0, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ const char *src = args[0]; char *dst = args[1];
+ const npy_intp src_step = steps[0];
+ const npy_intp dst_step = steps[1];
+ npy_intp len = dimensions[0];
+#if @VCHK@
+ const int lsize = sizeof(npyv_lanetype_@sfx@);
+ assert(src_step % lsize == 0 && dst_step % lsize == 0);
+ if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+ goto no_unroll;
+ }
+ const npy_intp ssrc = src_step / lsize;
+ const npy_intp sdst = dst_step / lsize;
+ if (!npyv_loadable_stride_@sfx@(ssrc) || !npyv_storable_stride_@sfx@(sdst)) {
+ goto no_unroll;
+ }
+ if (ssrc == 1 && sdst == 1) {
+ simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 1, dst, 1, len);
+ }
+ else if (sdst == 1) {
+ simd_@TYPE@_@kind@_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+ }
+ else if (ssrc == 1) {
+ simd_@TYPE@_@kind@_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+ } else {
+ simd_@TYPE@_@kind@_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+ }
+ goto clear;
+no_unroll:
+#endif // @VCHK@
+ for (; len > 0; --len, src += src_step, dst += dst_step) {
+ #if @VCHK@
+ // to guarantee the same precision and fp/domain errors for both scalars and vectors
+ simd_@TYPE@_@kind@_CONTIG_CONTIG(src, 0, dst, 0, 1);
+ #else
+ const npyv_lanetype_@sfx@ src0 = *(npyv_lanetype_@sfx@*)src;
+ *(npyv_lanetype_@sfx@*)dst = c_@intr@_@sfx@(src0);
+ #endif
+ }
+#if @VCHK@
+clear:;
+#endif
+#if @clear@
+ npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat1**/
+/**end repeat**/
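The `npyv_load_till_*` note above matters mostly for the reciprocal kernel: tail lanes beyond `len` are filled with 1.0 so the vector division cannot raise a spurious divide-by-zero. A hedged sketch of that tail step for single precision (valid for `len <= npyv_nlanes_f32`, assuming an `NPY_SIMD` build and the same intrinsics used in the kernels above):

#if NPY_SIMD
/* Hedged sketch of the tail handling in the reciprocal kernel above. */
static void
recip_f32_tail(const float *src, float *dst, npy_intp len)
{
    npyv_f32 v_src = npyv_load_till_f32(src, len, 1.0f);  /* fill tail lanes with 1.0  */
    npyv_f32 v_out = npyv_recip_f32(v_src);               /* no spurious FP exceptions */
    npyv_store_till_f32(dst, len, v_out);                 /* store only the first len  */
    npyv_cleanup();
}
#endif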
diff --git a/numpy/core/src/umath/loops_utils.h b/numpy/core/src/umath/loops_utils.h
new file mode 100644
index 000000000..f5540bdae
--- /dev/null
+++ b/numpy/core/src/umath/loops_utils.h
@@ -0,0 +1,42 @@
+#ifndef _NPY_UMATH_LOOPS_UTILS_H_
+#define _NPY_UMATH_LOOPS_UTILS_H_
+
+#include "numpy/npy_common.h" // NPY_FINLINE
+/*
+ * nomemoverlap - returns false if two strided arrays have an overlapping
+ * region in memory. ip_size/op_size give the byte extent of each array and
+ * may be negative, indicating negative steps.
+ */
+NPY_FINLINE npy_bool
+nomemoverlap(char *ip, npy_intp ip_size, char *op, npy_intp op_size)
+{
+ char *ip_start, *ip_end, *op_start, *op_end;
+ if (ip_size < 0) {
+ ip_start = ip + ip_size;
+ ip_end = ip;
+ }
+ else {
+ ip_start = ip;
+ ip_end = ip + ip_size;
+ }
+ if (op_size < 0) {
+ op_start = op + op_size;
+ op_end = op;
+ }
+ else {
+ op_start = op;
+ op_end = op + op_size;
+ }
+ return (ip_start == op_start && op_end == ip_end) ||
+ (ip_start > op_end) || (op_start > ip_end);
+}
+
+// returns true if two strided arrays have an overlapping region in memory;
+// the inverse of `nomemoverlap()`, taking the array length and step sizes instead
+NPY_FINLINE npy_bool
+is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst_step, npy_intp len)
+{
+ return !(nomemoverlap((char*)src, src_step*len, (char*)dst, dst_step*len));
+}
+
+#endif // _NPY_UMATH_LOOPS_UTILS_H_
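A small hedged sketch of how the new helper classifies two views of one buffer: identical views count as non-overlapping (so in-place operation is allowed), while a shifted view is reported as overlapping.

#include "loops_utils.h"

/* Hedged sketch: overlap classification for two strided views of one buffer. */
static void
overlap_examples(void)
{
    double buf[8];
    /* identical views: reported as non-overlapping (in-place is fine) */
    npy_bool same = is_mem_overlap(buf, sizeof(double), buf, sizeof(double), 4);
    /* shifted by one element: the two 32-byte ranges intersect */
    npy_bool shifted = is_mem_overlap(buf, sizeof(double), buf + 1, sizeof(double), 4);
    (void)same;     /* NPY_FALSE */
    (void)shifted;  /* NPY_TRUE  */
}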
diff --git a/numpy/core/src/umath/matmul.c.src b/numpy/core/src/umath/matmul.c.src
index 5cbb6e94d..0e47d1ab5 100644
--- a/numpy/core/src/umath/matmul.c.src
+++ b/numpy/core/src/umath/matmul.c.src
@@ -27,6 +27,7 @@
*****************************************************************************
*/
+#if defined(HAVE_CBLAS)
/*
* -1 to be conservative, in case blas internally uses a for loop with an
* inclusive upper bound
@@ -61,7 +62,6 @@ is_blasable2d(npy_intp byte_stride1, npy_intp byte_stride2,
return NPY_FALSE;
}
-#if defined(HAVE_CBLAS)
static const npy_cdouble oneD = {1.0, 0.0}, zeroD = {0.0, 0.0};
static const npy_cfloat oneF = {1.0, 0.0}, zeroF = {0.0, 0.0};
diff --git a/numpy/core/src/umath/npy_simd_data.h b/numpy/core/src/umath/npy_simd_data.h
index 36c8b6c03..45487d0a8 100644
--- a/numpy/core/src/umath/npy_simd_data.h
+++ b/numpy/core/src/umath/npy_simd_data.h
@@ -1,6 +1,7 @@
#ifndef __NPY_SIMD_DATA_H_
#define __NPY_SIMD_DATA_H_
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
/*
* Constants used in vector implementation of float64 exp(x)
*/
@@ -85,6 +86,7 @@ static npy_uint64 EXP_Table_tail[32] = {
0x3C99D3E12DD8A18B,
};
#endif
+#endif
/*
* Constants used in vector implementation of exp(x)
@@ -134,4 +136,156 @@ static npy_uint64 EXP_Table_tail[32] = {
#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f
#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f
+/*
+ * Lookup table of log(c_k)
+ * Reference form: Tang, Ping-Tak Peter. "Table-driven implementation of the
+ * logarithm function in IEEE floating-point arithmetic." ACM Transactions
+ * on Mathematical Software (TOMS) 16.4 (1990): 378-400.
+ */
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
+static npy_uint64 LOG_TABLE_TOP[64] = {
+ 0x0000000000000000,
+ 0x3F8FC0A8B1000000,
+ 0x3F9F829B0E780000,
+ 0x3FA77458F6340000,
+ 0x3FAF0A30C0100000,
+ 0x3FB341D7961C0000,
+ 0x3FB6F0D28AE60000,
+ 0x3FBA926D3A4A0000,
+ 0x3FBE27076E2A0000,
+ 0x3FC0D77E7CD10000,
+ 0x3FC29552F8200000,
+ 0x3FC44D2B6CCB0000,
+ 0x3FC5FF3070A80000,
+ 0x3FC7AB8902110000,
+ 0x3FC9525A9CF40000,
+ 0x3FCAF3C94E810000,
+ 0x3FCC8FF7C79B0000,
+ 0x3FCE27076E2B0000,
+ 0x3FCFB9186D5E0000,
+ 0x3FD0A324E2738000,
+ 0x3FD1675CABAB8000,
+ 0x3FD22941FBCF8000,
+ 0x3FD2E8E2BAE10000,
+ 0x3FD3A64C55698000,
+ 0x3FD4618BC21C8000,
+ 0x3FD51AAD872E0000,
+ 0x3FD5D1BDBF580000,
+ 0x3FD686C81E9B0000,
+ 0x3FD739D7F6BC0000,
+ 0x3FD7EAF83B828000,
+ 0x3FD89A3386C18000,
+ 0x3FD947941C210000,
+ 0x3FD9F323ECBF8000,
+ 0x3FDA9CEC9A9A0000,
+ 0x3FDB44F77BCC8000,
+ 0x3FDBEB4D9DA70000,
+ 0x3FDC8FF7C79A8000,
+ 0x3FDD32FE7E010000,
+ 0x3FDDD46A04C20000,
+ 0x3FDE744261D68000,
+ 0x3FDF128F5FAF0000,
+ 0x3FDFAF588F790000,
+ 0x3FE02552A5A5C000,
+ 0x3FE0723E5C1CC000,
+ 0x3FE0BE72E4254000,
+ 0x3FE109F39E2D4000,
+ 0x3FE154C3D2F4C000,
+ 0x3FE19EE6B467C000,
+ 0x3FE1E85F5E704000,
+ 0x3FE23130D7BEC000,
+ 0x3FE2795E1289C000,
+ 0x3FE2C0E9ED448000,
+ 0x3FE307D7334F0000,
+ 0x3FE34E289D9D0000,
+ 0x3FE393E0D3564000,
+ 0x3FE3D9026A714000,
+ 0x3FE41D8FE8468000,
+ 0x3FE4618BC21C4000,
+ 0x3FE4A4F85DB04000,
+ 0x3FE4E7D811B74000,
+ 0x3FE52A2D265BC000,
+ 0x3FE56BF9D5B40000,
+ 0x3FE5AD404C358000,
+ 0x3FE5EE02A9240000,
+};
+
+static npy_uint64 LOG_TABLE_TAIL[64] = {
+ 0x0000000000000000,
+ 0xBD5FE0E183092C59,
+ 0x3D2980267C7E09E4,
+ 0xBD62303B9CB0D5E1,
+ 0x3D662A6617CC9717,
+ 0xBD4717B6B33E44F8,
+ 0xBD62968C836CC8C2,
+ 0x3D6AAC6CA17A4554,
+ 0x3D6E5CBD3D50FFFC,
+ 0xBD6C69A65A23A170,
+ 0xBD35B967F4471DFC,
+ 0x3D6F4799F4F6543E,
+ 0xBD6B0B0DE3077D7E,
+ 0xBD537B720E4A694B,
+ 0x3D65AD1D904C1D4E,
+ 0xBD600349CC67F9B2,
+ 0xBD697794F689F843,
+ 0xBD3A342C2AF0003C,
+ 0x3D5F1546AAA3361C,
+ 0x3D50E35F73F7A018,
+ 0x3D630701CE63EAB9,
+ 0xBD3A6976F5EB0963,
+ 0x3D5D309C2CC91A85,
+ 0xBD6D0B1C68651946,
+ 0xBD609EC17A426426,
+ 0xBD3F4BD8DB0A7CC1,
+ 0x3D4394A11B1C1EE4,
+ 0x3D54AEC442BE1015,
+ 0xBD67FCB18ED9D603,
+ 0x3D67E1B259D2F3DA,
+ 0xBD6ED2A52C73BF78,
+ 0x3D56FABA4CDD147D,
+ 0x3D584BF2B68D766F,
+ 0x3D40931A909FEA5E,
+ 0x3D4EC5197DDB55D3,
+ 0x3D5B7BF7861D37AC,
+ 0x3D5A21AC25DB1EF3,
+ 0xBD542A9E21373414,
+ 0xBD6DAFA08CECADB1,
+ 0x3D3E1F8DF68DBCF3,
+ 0x3D3BB2CD720EC44C,
+ 0xBD49C24CA098362B,
+ 0x3D60FEC69C695D7F,
+ 0x3D6F404E57963891,
+ 0xBD657D49676844CC,
+ 0x3D592DFBC7D93617,
+ 0x3D65E9A98F33A396,
+ 0x3D52DD98B97BAEF0,
+ 0x3D1A07BD8B34BE7C,
+ 0xBD17AFA4392F1BA7,
+ 0xBD5DCA290F818480,
+ 0x3D5D1772F5386374,
+ 0x3D60BE1FB590A1F5,
+ 0xBD6E2CE9146D271A,
+ 0xBD65E6563BBD9FC9,
+ 0x3D66FAA404263D0B,
+ 0xBD5AA33736867A17,
+ 0x3D6EC27D0B7B37B3,
+ 0xBD244FDD840B8591,
+ 0x3D6BB09CB0985646,
+ 0x3D46ABB9DF22BC57,
+ 0xBD58CD7DC73BD194,
+ 0x3D6F2CFB29AAA5F0,
+ 0x3D66757006095FD2,
+};
+
+#define NPY_TANG_LOG_A1 0x1.55555555554e6p-4
+#define NPY_TANG_LOG_A2 0x1.9999999bac6d4p-7
+#define NPY_TANG_LOG_A3 0x1.2492307f1519fp-9
+#define NPY_TANG_LOG_A4 0x1.c8034c85dfffp-12
+
+#define NPY_TANG_LOG_LN2HI 0x1.62e42fefa4p-1
+#define NPY_TANG_LOG_LN2LO -0x1.8432a1b0e2634p-43
+#endif
+#endif
+
#endif
diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c
index bf6e5a698..a0090e302 100644
--- a/numpy/core/src/umath/override.c
+++ b/numpy/core/src/umath/override.c
@@ -605,7 +605,7 @@ PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method,
goto fail;
}
- method_name = PyUString_FromString(method);
+ method_name = PyUnicode_FromString(method);
if (method_name == NULL) {
goto fail;
}
diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c
index 4037a4757..f1423d8b9 100644
--- a/numpy/core/src/umath/reduction.c
+++ b/numpy/core/src/umath/reduction.c
@@ -16,7 +16,6 @@
#include "npy_config.h"
#include <numpy/arrayobject.h>
-#include "npy_config.h"
#include "npy_pycompat.h"
#include "ctors.h"
@@ -254,7 +253,6 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
}
op_flags[2] = NPY_ITER_READONLY;
}
-
/* Set up result array axes mapping, operand and wheremask use default */
int result_axes[NPY_MAXDIMS];
int *op_axes[3] = {result_axes, NULL, NULL};
@@ -363,7 +361,6 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
if (loop(iter, dataptr, strideptr, countptr,
iternext, needs_api, skip_first_count, data) < 0) {
-
goto fail;
}
}
@@ -379,7 +376,10 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
}
Py_INCREF(result);
- NpyIter_Deallocate(iter);
+ if (!NpyIter_Deallocate(iter)) {
+ Py_DECREF(result);
+ return NULL;
+ }
return result;
fail:
diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src
index 90cc7a513..86dade0f1 100644
--- a/numpy/core/src/umath/scalarmath.c.src
+++ b/numpy/core/src/umath/scalarmath.c.src
@@ -285,7 +285,11 @@ static void
@name@_ctype_floor_divide(@type@ a, @type@ b, @type@ *out) {
@type@ mod;
- *out = npy_divmod@c@(a, b, &mod);
+ if (!b) {
+ *out = a / b;
+ } else {
+ *out = npy_divmod@c@(a, b, &mod);
+ }
}
@@ -318,7 +322,11 @@ static void
half_ctype_floor_divide(npy_half a, npy_half b, npy_half *out) {
npy_half mod;
- *out = npy_half_divmod(a, b, &mod);
+ if (!b) {
+ *out = a / b;
+ } else {
+ *out = npy_half_divmod(a, b, &mod);
+ }
}
@@ -794,15 +802,8 @@ static PyObject *
{
PyObject *ret;
@type@ arg1, arg2;
- /*
- * NOTE: In gcc >= 4.1, the compiler will reorder floating point
- * operations and floating point error state checks. In
- * particular, the arithmetic operations were being reordered
- * so that the errors weren't caught. Declaring this output
- * variable volatile was the minimal fix for the issue.
- * (Ticket #1671)
- */
- volatile @otype@ out;
+ @otype@ out;
+
#if @twoout@
@otype@ out2;
PyObject *obj;
@@ -932,96 +933,14 @@ static PyObject *
* Double, LongDouble,
* CFloat, CDouble, CLongDouble#
*
- * #isint = (1,0)*5,0*7#
+ * #isint = 1*10,0*7#
+ * #isuint = (0,1)*5,0*7#
* #cmplx = 0*14,1*3#
* #iszero = _IS_ZERO*10, npy_half_iszero, _IS_ZERO*6#
* #zero = 0*10, NPY_HALF_ZERO, 0*6#
* #one = 1*10, NPY_HALF_ONE, 1*6#
*/
-#if @cmplx@
-static PyObject *
-@name@_power(PyObject *a, PyObject *b, PyObject *modulo)
-{
- PyObject *ret;
- @type@ arg1, arg2;
- int retstatus;
- int first;
- @type@ out = {@zero@, @zero@};
-
- BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, @name@_power);
-
- switch(_@name@_convert2_to_ctypes(a, &arg1, b, &arg2)) {
- case 0:
- break;
- case -1:
- /* can't cast both safely mixed-types? */
- return PyArray_Type.tp_as_number->nb_power(a,b,modulo);
- case -2:
- /* use default handling */
- if (PyErr_Occurred()) {
- return NULL;
- }
- return PyGenericArrType_Type.tp_as_number->nb_power(a,b,modulo);
- case -3:
- default:
- /*
- * special case for longdouble and clongdouble
- * because they have a recursive getitem in their dtype
- */
- Py_INCREF(Py_NotImplemented);
- return Py_NotImplemented;
- }
-
- if (modulo != Py_None) {
- /* modular exponentiation is not implemented (gh-8804) */
- Py_INCREF(Py_NotImplemented);
- return Py_NotImplemented;
- }
-
- npy_clear_floatstatus_barrier((char*)&out);
-
- /*
- * here we do the actual calculation with arg1 and arg2
- * as a function call.
- */
- if (@iszero@(arg2.real) && @iszero@(arg2.imag)) {
- out.real = @one@;
- out.imag = @zero@;
- }
- else {
- @name@_ctype_power(arg1, arg2, &out);
- }
-
- /* Check status flag. If it is set, then look up what to do */
- retstatus = npy_get_floatstatus_barrier((char*)&out);
- if (retstatus) {
- int bufsize, errmask;
- PyObject *errobj;
-
- if (PyUFunc_GetPyValues("@name@_scalars", &bufsize, &errmask,
- &errobj) < 0) {
- return NULL;
- }
- first = 1;
- if (PyUFunc_handlefperr(errmask, errobj, retstatus, &first)) {
- Py_XDECREF(errobj);
- return NULL;
- }
- Py_XDECREF(errobj);
- }
-
- ret = PyArrayScalar_New(@Name@);
- if (ret == NULL) {
- return NULL;
- }
- PyArrayScalar_ASSIGN(ret, @Name@, out);
-
- return ret;
-}
-
-#elif @isint@
-
static PyObject *
@name@_power(PyObject *a, PyObject *b, PyObject *modulo)
{
@@ -1058,85 +977,25 @@ static PyObject *
return Py_NotImplemented;
}
+#if !@isint@
npy_clear_floatstatus_barrier((char*)&out);
-
+#endif
/*
* here we do the actual calculation with arg1 and arg2
* as a function call.
*/
+#if @isint@ && !@isuint@
if (arg2 < 0) {
PyErr_SetString(PyExc_ValueError,
"Integers to negative integer powers are not allowed.");
return NULL;
}
+#endif
@name@_ctype_power(arg1, arg2, &out);
- ret = PyArrayScalar_New(@Name@);
- if (ret == NULL) {
- return NULL;
- }
- PyArrayScalar_ASSIGN(ret, @Name@, out);
-
- return ret;
-}
-
-#else
-
-static PyObject *
-@name@_power(PyObject *a, PyObject *b, PyObject *modulo)
-{
- PyObject *ret;
- @type@ arg1, arg2;
- int retstatus;
- int first;
-
- @type@ out = @zero@;
-
- BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, @name@_power);
-
- switch(_@name@_convert2_to_ctypes(a, &arg1, b, &arg2)) {
- case 0:
- break;
- case -1:
- /* can't cast both safely mixed-types? */
- return PyArray_Type.tp_as_number->nb_power(a,b,modulo);
- case -2:
- /* use default handling */
- if (PyErr_Occurred()) {
- return NULL;
- }
- return PyGenericArrType_Type.tp_as_number->nb_power(a,b,modulo);
- case -3:
- default:
- /*
- * special case for longdouble and clongdouble
- * because they have a recursive getitem in their dtype
- */
- Py_INCREF(Py_NotImplemented);
- return Py_NotImplemented;
- }
-
- if (modulo != Py_None) {
- /* modular exponentiation is not implemented (gh-8804) */
- Py_INCREF(Py_NotImplemented);
- return Py_NotImplemented;
- }
-
- npy_clear_floatstatus_barrier((char*)&out);
-
- /*
- * here we do the actual calculation with arg1 and arg2
- * as a function call.
- */
- if (@iszero@(arg2)) {
- out = @one@;
- }
- else {
- @name@_ctype_power(arg1, arg2, &out);
- }
-
+#if !@isint@
/* Check status flag. If it is set, then look up what to do */
- retstatus = npy_get_floatstatus_barrier((char*)&out);
+ int retstatus = npy_get_floatstatus_barrier((char*)&out);
if (retstatus) {
int bufsize, errmask;
PyObject *errobj;
@@ -1145,13 +1004,14 @@ static PyObject *
&errobj) < 0) {
return NULL;
}
- first = 1;
+ int first = 1;
if (PyUFunc_handlefperr(errmask, errobj, retstatus, &first)) {
Py_XDECREF(errobj);
return NULL;
}
Py_XDECREF(errobj);
}
+#endif
ret = PyArrayScalar_New(@Name@);
if (ret == NULL) {
@@ -1162,7 +1022,6 @@ static PyObject *
return ret;
}
-#endif
/**end repeat**/
#undef _IS_ZERO
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 8f01d33fa..a118fb0d0 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -28,6 +28,8 @@
#undef __AVX512F__
#endif
#endif
+#include "simd/simd.h"
+#include "loops_utils.h" // nomemoverlap
#include <assert.h>
#include <stdlib.h>
#include <float.h>
@@ -50,37 +52,6 @@
*/
#define MAX_STEP_SIZE 2097152
-/*
- * nomemoverlap - returns true if two strided arrays have an overlapping
- * region in memory. ip_size/op_size = size of the arrays which can be negative
- * indicating negative steps.
- */
-static NPY_INLINE npy_bool
-nomemoverlap(char *ip,
- npy_intp ip_size,
- char *op,
- npy_intp op_size)
-{
- char *ip_start, *ip_end, *op_start, *op_end;
- if (ip_size < 0) {
- ip_start = ip + ip_size;
- ip_end = ip;
- }
- else {
- ip_start = ip;
- ip_end = ip + ip_size;
- }
- if (op_size < 0) {
- op_start = op + op_size;
- op_end = op;
- }
- else {
- op_start = op;
- op_end = op + op_size;
- }
- return (ip_start > op_end) | (op_start > ip_end);
-}
-
#define IS_BINARY_STRIDE_ONE(esize, vsize) \
((steps[0] == esize) && \
(steps[1] == esize) && \
@@ -114,16 +85,16 @@ nomemoverlap(char *ip,
* should have no overlap in memory.
*/
#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
- ((abs(steps[0]) < MAX_STEP_SIZE) && \
- (abs(steps[1]) < MAX_STEP_SIZE) && \
- (abs(steps[2]) < MAX_STEP_SIZE) && \
+ ((labs(steps[0]) < MAX_STEP_SIZE) && \
+ (labs(steps[1]) < MAX_STEP_SIZE) && \
+ (labs(steps[2]) < MAX_STEP_SIZE) && \
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
(nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))
#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
- ((abs(steps[0]) < MAX_STEP_SIZE) && \
- (abs(steps[1]) < MAX_STEP_SIZE) && \
- (abs(steps[2]) < MAX_STEP_SIZE) && \
+ ((labs(steps[0]) < MAX_STEP_SIZE) && \
+ (labs(steps[1]) < MAX_STEP_SIZE) && \
+ (labs(steps[2]) < MAX_STEP_SIZE) && \
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
(nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0])))
@@ -134,7 +105,7 @@ nomemoverlap(char *ip,
*/
#define IS_OUTPUT_BLOCKABLE_UNARY(esizein, esizeout, vsize) \
((steps[0] & (esizein-1)) == 0 && \
- steps[1] == (esizeout) && abs(steps[0]) < MAX_STEP_SIZE && \
+ steps[1] == (esizeout) && labs(steps[0]) < MAX_STEP_SIZE && \
(nomemoverlap(args[1], steps[1] * dimensions[0], args[0], steps[0] * dimensions[0])))
#define IS_BLOCKABLE_REDUCE(esize, vsize) \
@@ -389,7 +360,7 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c
*/
/**begin repeat2
- * #func = sqrt, absolute, square, reciprocal, rint, floor, ceil, trunc#
+ * #func = rint, floor, ceil, trunc#
*/
#if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -480,17 +451,38 @@ run_unary_avx512f_exp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp c
return 0;
}
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE void
+AVX512F_log_DOUBLE(npy_double *, npy_double *, const npy_intp n, const npy_intp stride);
+#endif
+static NPY_INLINE int
+run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
+ if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), sizeof(npy_double), 64)) {
+ AVX512F_log_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]);
+ return 1;
+ }
+ else
+ return 0;
+#endif
+#endif
+ return 0;
+}
+
/**begin repeat
* Float types
* #type = npy_float, npy_double, npy_longdouble#
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
* #vector = 1, 1, 0#
+ * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
*/
/**begin repeat1
- * #func = sqrt, absolute, negative, minimum, maximum#
- * #check = IS_BLOCKABLE_UNARY*3, IS_BLOCKABLE_REDUCE*2 #
- * #name = unary*3, unary_reduce*2#
+ * #func = absolute, negative, minimum, maximum#
+ * #check = IS_BLOCKABLE_UNARY*2, IS_BLOCKABLE_REDUCE*2 #
+ * #name = unary*2, unary_reduce*2#
*/
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -533,6 +525,18 @@ static void
sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
npy_intp n);
+#elif @VECTOR@
+
+static void
+simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+static void
+simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+static void
+simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2,
+ npy_intp n);
+
#endif
static NPY_INLINE int
@@ -564,6 +568,25 @@ run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp
sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n);
return 1;
}
+#elif @VECTOR@
+ @type@ * ip1 = (@type@ *)args[0];
+ @type@ * ip2 = (@type@ *)args[1];
+ @type@ * op = (@type@ *)args[2];
+ npy_intp n = dimensions[0];
+ /* argument one scalar */
+ if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ /* argument two scalar */
+ else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
+ else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) {
+ simd_binary_@kind@_@TYPE@(op, ip1, ip2, n);
+ return 1;
+ }
#endif
return 0;
}
@@ -841,7 +864,7 @@ sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
}
}
}
-#elif __AVX2__
+#elif defined __AVX2__
const npy_intp vector_size_bytes = 32;
LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
op[i] = ip1[i] @OP@ ip2[i];
@@ -983,7 +1006,7 @@ sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
}
-#elif __AVX2__
+#elif defined __AVX2__
const npy_intp vector_size_bytes = 32;
const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]);
LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
@@ -1050,7 +1073,7 @@ sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
}
}
-#elif __AVX2__
+#elif defined __AVX2__
const npy_intp vector_size_bytes = 32;
const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]);
LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
@@ -1304,33 +1327,6 @@ sse2_binary_scalar2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, @type@ * ip2, npy
}
/**end repeat1**/
-static void
-sse2_sqrt_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
-{
- /* align output to VECTOR_SIZE_BYTES bytes */
- LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
- op[i] = @scalarf@(ip[i]);
- }
- assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
- npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
- if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ d = @vpre@_load_@vsuf@(&ip[i]);
- @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
- }
- }
- else {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ d = @vpre@_loadu_@vsuf@(&ip[i]);
- @vpre@_store_@vsuf@(&op[i], @vpre@_sqrt_@vsuf@(d));
- }
- }
- LOOP_BLOCKED_END {
- op[i] = @scalarf@(ip[i]);
- }
-}
-
-
static NPY_INLINE
@type@ scalar_abs_@type@(@type@ v)
{
@@ -1795,14 +1791,27 @@ avx512_permute_x4var_pd(__m512d t0,
__m512d t3,
__m512i index)
{
-
- __mmask8 lut_mask = _mm512_cmp_epi64_mask(index, _mm512_set1_epi64(15),
- _MM_CMPINT_GT);
+ __mmask8 lut_mask = _mm512_cmp_epi64_mask(
+ _mm512_and_epi64(_mm512_set1_epi64(0x10ULL), index),
+ _mm512_set1_epi64(0), _MM_CMPINT_GT);
__m512d res1 = _mm512_permutex2var_pd(t0, index, t1);
__m512d res2 = _mm512_permutex2var_pd(t2, index, t3);
return _mm512_mask_blend_pd(lut_mask, res1, res2);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
+ __m512d t4, __m512d t5, __m512d t6, __m512d t7,
+ __m512i index)
+{
+ __mmask8 lut_mask = _mm512_cmp_epi64_mask(
+ _mm512_and_epi64(_mm512_set1_epi64(0x20ULL), index),
+ _mm512_set1_epi64(0), _MM_CMPINT_GT);
+ __m512d res1 = avx512_permute_x4var_pd(t0, t1, t2, t3, index);
+ __m512d res2 = avx512_permute_x4var_pd(t4, t5, t6, t7, index);
+ return _mm512_mask_blend_pd(lut_mask, res1, res2);
+}
+
/**begin repeat
* #vsub = ps, pd#
* #type= npy_float, npy_double#
@@ -2107,7 +2116,7 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co
x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip, vindex_ip, load_mask);
}
#if @is_signbit@
- x1 = _mm512_and_@vsuffix@(x1,signbit);
+ x1 = _mm512_and_@vsuffix@(x1,signbit);
#endif
@mask@ fpclassmask = _mm512_fpclass_@vsuffix@_mask(x1, @IMM8@);
@@ -2200,7 +2209,7 @@ AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const
}
@vtype1@ out = _mm512_scalef_@vsuffix@(x1, _mm512_cvtepi32_@vsuffix@(x2));
-
+
if (stride_op == 1) {
_mm512_mask_storeu_@vsuffix@(op, load_mask, out);
}
@@ -2392,9 +2401,8 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
*/
/**begin repeat1
- * #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
- * #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
- * #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ * #func = rint, ceil, floor, trunc#
+ * #vectorf = rint, ceil, floor, trunc#
*/
#if defined @CHK@
@@ -2409,10 +2417,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
npy_intp num_remaining_elements = array_size;
@vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
@mask@ load_mask = @isa@_get_full_load_mask_ps();
-#if @replace_0_with_1@
- @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
-#endif
-
/*
* Note: while generally indices are npy_intp, we ensure that our maximum index
* will fit in an int32 as a precondition for this function via
@@ -2429,20 +2433,10 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
if (num_remaining_elements < num_lanes) {
load_mask = @isa@_get_partial_load_mask_ps(num_remaining_elements,
num_lanes);
-#if @replace_0_with_1@
- inv_load_mask = @isa@_invert_mask_ps(load_mask);
-#endif
}
@vtype@ x;
if (stride == 1) {
x = @isa@_masked_load_ps(load_mask, ip);
-#if @replace_0_with_1@
- /*
- * Replace masked elements with 1.0f to avoid divide by zero fp
- * exception in reciprocal
- */
- x = @isa@_set_masked_lanes_ps(x, ones_f, inv_load_mask);
-#endif
}
else {
x = @isa@_masked_gather_ps(ones_f, ip, vindex, load_mask);
@@ -2478,9 +2472,8 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
*/
/**begin repeat1
- * #func = sqrt, absolute, square, reciprocal, rint, ceil, floor, trunc#
- * #vectorf = sqrt, abs, square, reciprocal, rint, ceil, floor, trunc#
- * #replace_0_with_1 = 0, 0, 0, 1, 0, 0, 0, 0#
+ * #func = rint, ceil, floor, trunc#
+ * #vectorf = rint, ceil, floor, trunc#
*/
#if defined @CHK@
@@ -2494,9 +2487,6 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double);
npy_intp num_remaining_elements = array_size;
@mask@ load_mask = @isa@_get_full_load_mask_pd();
-#if @replace_0_with_1@
- @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
-#endif
@vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
/*
@@ -2514,20 +2504,10 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
if (num_remaining_elements < num_lanes) {
load_mask = @isa@_get_partial_load_mask_pd(num_remaining_elements,
num_lanes);
-#if @replace_0_with_1@
- inv_load_mask = @isa@_invert_mask_pd(load_mask);
-#endif
}
@vtype@ x;
if (stride == 1) {
x = @isa@_masked_load_pd(load_mask, ip);
-#if @replace_0_with_1@
- /*
- * Replace masked elements with 1.0f to avoid divide by zero fp
- * exception in reciprocal
- */
- x = @isa@_set_masked_lanes_pd(x, ones_d, @castmask@(inv_load_mask));
-#endif
}
else {
x = @isa@_masked_gather_pd(ones_d, ip, vindex, @castmask@(load_mask));
@@ -2550,6 +2530,7 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
* #vtype = __m256, __m512#
* #vsize = 256, 512#
* #BYTES = 32, 64#
+ * #NUM_LANES = 8, 16#
* #mask = __m256, __mmask16#
* #vsub = , _mask#
* #or_masks =_mm256_or_ps, _mm512_kor#
@@ -2593,7 +2574,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
NPY_TRIG_OP my_trig_op)
{
const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
- const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
+ const npy_int num_lanes = @NUM_LANES@;
npy_float large_number = 71476.0625f;
if (my_trig_op == npy_compute_sin) {
large_number = 117435.992f;
@@ -2642,12 +2623,12 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
num_lanes);
}
- @vtype@ x;
+ @vtype@ x_in;
if (stride == 1) {
- x = @isa@_masked_load_ps(load_mask, ip);
+ x_in = @isa@_masked_load_ps(load_mask, ip);
}
else {
- x = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask);
+ x_in = @isa@_masked_gather_ps(zero_f, ip, vindex, load_mask);
}
/*
@@ -2656,10 +2637,10 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
* these numbers
*/
- glibc_mask = @isa@_in_range_mask(x, large_number,-large_number);
+ glibc_mask = @isa@_in_range_mask(x_in, large_number,-large_number);
glibc_mask = @and_masks@(load_mask, glibc_mask);
- nan_mask = _mm@vsize@_cmp_ps@vsub@(x, x, _CMP_NEQ_UQ);
- x = @isa@_set_masked_lanes_ps(x, zero_f, @or_masks@(nan_mask, glibc_mask));
+ nan_mask = _mm@vsize@_cmp_ps@vsub@(x_in, x_in, _CMP_NEQ_UQ);
+ @vtype@ x = @isa@_set_masked_lanes_ps(x_in, zero_f, @or_masks@(nan_mask, glibc_mask));
npy_int iglibc_mask = @mask_to_int@(glibc_mask);
if (iglibc_mask != @full_mask@) {
@@ -2698,20 +2679,23 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
}
/* process elements using glibc for large elements */
- if (my_trig_op == npy_compute_cos) {
- for (int ii = 0, jj = 0; iglibc_mask != 0; ii++, jj += stride) {
- if (iglibc_mask & 0x01) {
- op[ii] = npy_cosf(ip[jj]);
+ if (iglibc_mask != 0) {
+ float NPY_DECL_ALIGNED(@BYTES@) ip_fback[@NUM_LANES@];
+ _mm@vsize@_store_ps(ip_fback, x_in);
+
+ if (my_trig_op == npy_compute_cos) {
+ for (int ii = 0; ii < num_lanes; ++ii, iglibc_mask >>= 1) {
+ if (iglibc_mask & 0x01) {
+ op[ii] = npy_cosf(ip_fback[ii]);
+ }
}
- iglibc_mask = iglibc_mask >> 1;
}
- }
- else {
- for (int ii = 0, jj = 0; iglibc_mask != 0; ii++, jj += stride) {
- if (iglibc_mask & 0x01) {
- op[ii] = npy_sinf(ip[jj]);
+ else {
+ for (int ii = 0; ii < num_lanes; ++ii, iglibc_mask >>= 1) {
+ if (iglibc_mask & 0x01) {
+ op[ii] = npy_sinf(ip_fback[ii]);
+ }
}
- iglibc_mask = iglibc_mask >> 1;
}
}
ip += num_lanes*stride;
@@ -3051,7 +3035,7 @@ AVX512F_exp_DOUBLE(npy_double * op,
__m512d mTable_tail_1 = _mm512_loadu_pd(&(EXP_Table_tail[8*1]));
__m512d mTable_tail_2 = _mm512_loadu_pd(&(EXP_Table_tail[8*2]));
__m512d mTable_tail_3 = _mm512_loadu_pd(&(EXP_Table_tail[8*3]));
-
+
__mmask8 overflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
__mmask8 load_mask = avx512_get_full_load_mask_pd();
__mmask8 xmin_mask, xmax_mask, inf_mask, nan_mask, nearzero_mask;
@@ -3075,12 +3059,12 @@ AVX512F_exp_DOUBLE(npy_double * op,
xmax_mask = _mm512_cmp_pd_mask(x, mTH_max, _CMP_GT_OQ);
xmin_mask = _mm512_cmp_pd_mask(x, mTH_min, _CMP_LT_OQ);
inf_mask = _mm512_cmp_pd_mask(x, mTH_inf, _CMP_EQ_OQ);
- __m512i x_abs = _mm512_and_epi64(_mm512_castpd_si512(x),
+ __m512i x_abs = _mm512_and_epi64(_mm512_castpd_si512(x),
_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF));
- nearzero_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(x_abs),
+ nearzero_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(x_abs),
mTH_nearzero, _CMP_LT_OQ);
nearzero_mask = _mm512_kxor(nearzero_mask, nan_mask);
- overflow_mask = _mm512_kor(overflow_mask,
+ overflow_mask = _mm512_kor(overflow_mask,
_mm512_kxor(xmax_mask, inf_mask));
x = avx512_set_masked_lanes_pd(x, zeros_d,
_mm512_kor(_mm512_kor(nan_mask, xmin_mask),
@@ -3088,7 +3072,7 @@ AVX512F_exp_DOUBLE(npy_double * op,
/* z = x * 32/ln2 */
__m512d z = _mm512_mul_pd(x, InvLn2N);
-
+
/* round to nearest */
__m512d kd = _mm512_add_pd(z, mShift);
__m512i ki = _mm512_castpd_si512(kd);
@@ -3115,9 +3099,9 @@ AVX512F_exp_DOUBLE(npy_double * op,
__m512d tail = avx512_permute_x4var_pd(mTable_tail_0, mTable_tail_1,
mTable_tail_2, mTable_tail_3, j);
- /*
+ /*
* s = top + tail;
- * exp(x) = 2^m * (top + (tail + s * p));
+ * exp(x) = 2^m * (top + (tail + s * p));
*/
__m512d s = _mm512_add_pd(top, tail);
__m512d res = _mm512_fmadd_pd(s, p, tail);
@@ -3125,9 +3109,9 @@ AVX512F_exp_DOUBLE(npy_double * op,
res= _mm512_scalef_pd(res, _mm512_div_pd(kd, _mm512_set1_pd(32)));
/* return special cases */
- res = avx512_set_masked_lanes_pd(res, _mm512_add_pd(x, ones_d),
+ res = avx512_set_masked_lanes_pd(res, _mm512_add_pd(x, ones_d),
nearzero_mask);
- res = avx512_set_masked_lanes_pd(res, _mm512_set1_pd(NPY_NAN),
+ res = avx512_set_masked_lanes_pd(res, _mm512_set1_pd(NPY_NAN),
nan_mask);
res = avx512_set_masked_lanes_pd(res, mTH_inf, xmax_mask);
res = avx512_set_masked_lanes_pd(res, zeros_d, xmin_mask);
@@ -3145,6 +3129,209 @@ AVX512F_exp_DOUBLE(npy_double * op,
#endif
#endif
+/*
+ * Vectorized implementation of log double using AVX512
+ * Reference:
+ * [1] Tang, Ping Tak Peter. Table-lookup algorithms for elementary functions
+ * and their error analysis. No. CONF-9106103-1. Argonne National Lab.,
+ * IL (USA), 1991.
+ * [2] Tang, Ping-Tak Peter. "Table-driven implementation of the logarithm
+ * function in IEEE floating-point arithmetic." ACM Transactions on
+ * Mathematical Software (TOMS) 16.4 (1990): 378-400.
+ * [3] Muller, Jean-Michel. "Elementary functions: algorithms and
+ * implementation." (2016).
+ * 1) if x = 0; return -INF
+ * 2) if x < 0; return NAN
+ * 3) if x is INF; return INF
+ * 4) if x is NAN; return NAN
+ * 5) if x is in (1.0 - 0x1p-4, 1.0 + 0x1.09p-4), call npy_log()
+ * 6) Range reduction:
+ * log(x) = log(2^m * z)
+ * = m*ln2 + log(z)
+ * 7) log(z) = log(z / c_k) + log(c_k);
+ * where c_k = 1 + k/64, k = 0,1,...,64
+ * s.t. |x - c_k| <= 1/128 when x is in [1,2].
+ * 8) r = 2(x - c_k)/(x + c_k)
+ * log(x/c_k) = log((1 + r/2) / (1 - r/2))
+ * = p(r)
+ * = 2((r/2) + 1/3*(r/2)^3 + 1/5*(r/2)^5 + ...)
+ */
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
+#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
+static NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F void
+AVX512F_log_DOUBLE(npy_double * op,
+ npy_double * ip,
+ const npy_intp array_size,
+ const npy_intp steps)
+{
+ npy_intp num_remaining_elements = array_size;
+ const npy_intp stride = steps / (npy_intp)sizeof(npy_double);
+ const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double);
+ npy_int32 indexarr[8];
+ for (npy_int32 ii = 0; ii < 8; ii++) {
+ indexarr[ii] = ii*stride;
+ }
+
+ __m512d zeros_d = _mm512_set1_pd(0.0f);
+ __m512d ones_d = _mm512_set1_pd(1.0f);
+ __m512d mInf = _mm512_set1_pd(NPY_INFINITY);
+ __m512d mInv64 = (__m512d)(_mm512_set1_epi64(0x3f90000000000000));
+ __m512d mNeg_nan = _mm512_set1_pd(-NPY_NAN);
+ __m512d mNan = _mm512_set1_pd(NPY_NAN);
+ __m512d mNeg_inf = _mm512_set1_pd(-NPY_INFINITY);
+ __m512d mA1 = _mm512_set1_pd(NPY_TANG_LOG_A1);
+ __m512d mA2 = _mm512_set1_pd(NPY_TANG_LOG_A2);
+ __m512d mA3 = _mm512_set1_pd(NPY_TANG_LOG_A3);
+ __m512d mA4 = _mm512_set1_pd(NPY_TANG_LOG_A4);
+ __m512d mLN2HI = _mm512_set1_pd(NPY_TANG_LOG_LN2HI);
+ __m512d mLN2LO = _mm512_set1_pd(NPY_TANG_LOG_LN2LO);
+
+ __m512d mTo_glibc_min = _mm512_set1_pd(1.0 - 0x1p-4);
+ __m512d mTo_glibc_max = _mm512_set1_pd(1.0 + 0x1.09p-4);
+ __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]);
+
+ /* Load lookup table data */
+ /**begin repeat
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+
+ __m512d mLUT_TOP_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*@i@]));
+ __m512d mLUT_TAIL_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*@i@]));
+
+ /**end repeat**/
+
+ __mmask8 load_mask = avx512_get_full_load_mask_pd();
+ __mmask8 invalid_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
+ __mmask8 divide_by_zero_mask = invalid_mask;
+
+ __mmask8 inf_mask, nan_mask, zero_mask, negx_mask, denormal_mask,
+ glibc_mask;
+
+ __m512d x_in;
+ while (num_remaining_elements > 0) {
+ if (num_remaining_elements < num_lanes) {
+ load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements,
+ num_lanes);
+ }
+
+ if (1 == stride) {
+ x_in = avx512_masked_load_pd(load_mask, ip);
+ }
+ else {
+ x_in = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask);
+ }
+
+ /* call glibc when x is in [1.0 - 0x1p-4, 1.0 + 0x1.09p-4] */
+ __mmask8 m1 = _mm512_cmp_pd_mask(x_in, mTo_glibc_max, _CMP_LT_OQ);
+ __mmask8 m2 = _mm512_cmp_pd_mask(x_in, mTo_glibc_min, _CMP_GT_OQ);
+ glibc_mask = m1 & m2;
+
+ if (glibc_mask != 0xFF) {
+ zero_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_EQ_OQ);
+ inf_mask = _mm512_cmp_pd_mask(x_in, mInf, _CMP_EQ_OQ);
+ negx_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_LT_OQ);
+ nan_mask = _mm512_cmp_pd_mask(x_in, x_in, _CMP_NEQ_UQ);
+
+ divide_by_zero_mask = divide_by_zero_mask | (zero_mask & load_mask);
+ invalid_mask = invalid_mask | negx_mask;
+
+ __m512d x = avx512_set_masked_lanes_pd(x_in, zeros_d, negx_mask);
+ __m512i ix = (__m512i)x;
+
+ /* Normalize x when it is denormal */
+ __m512i top12 = _mm512_and_epi64(ix,
+ _mm512_set1_epi64(0xfff0000000000000));
+ denormal_mask = _mm512_cmp_epi64_mask(top12, _mm512_set1_epi64(0),
+ _CMP_EQ_OQ);
+ denormal_mask = (~zero_mask) & denormal_mask;
+ ix = (__m512i)_mm512_mask_mul_pd(x, denormal_mask,
+ x, _mm512_set1_pd(0x1p52));
+ ix = _mm512_mask_sub_epi64(ix, denormal_mask,
+ ix, _mm512_set1_epi64(52ULL << 52));
+
+ /*
+ * x = 2^k * z; where z in range [1,2]
+ */
+ __m512i tmp = _mm512_sub_epi64(ix,
+ _mm512_set1_epi64(0x3ff0000000000000));
+ __m512i i = _mm512_and_epi64(_mm512_srai_epi64(tmp, 52 - 6),
+ _mm512_set1_epi64(0x3fULL));
+ __m512i ik = _mm512_srai_epi64(tmp, 52);
+ __m512d z = (__m512d)(_mm512_sub_epi64(ix, _mm512_and_epi64(tmp,
+ _mm512_set1_epi64(0xfff0000000000000))));
+ /* c = i/64 + 1 */
+ __m256i i_32 = _mm512_cvtepi64_epi32(i);
+ __m512d c = _mm512_fmadd_pd(_mm512_cvtepi32_pd(i_32), mInv64, ones_d);
+
+ /* u = 2 * (z - c) / (z + c) */
+ __m512d u = _mm512_div_pd(_mm512_sub_pd(z, c), _mm512_add_pd(z, c));
+ u = _mm512_mul_pd(_mm512_set1_pd(2.0), u);
+
+ /* v = u * u */
+ __m512d v = _mm512_mul_pd(u,u);
+
+ /* log(z/c) = u + u*v*(A1 + v*(A2 + v*(A3 + v*A4))) */
+ __m512d res = _mm512_fmadd_pd(v, mA4, mA3);
+ res = _mm512_fmadd_pd(v, res, mA2);
+ res = _mm512_fmadd_pd(v, res, mA1);
+ res = _mm512_mul_pd(v, res);
+ res = _mm512_fmadd_pd(u, res, u);
+
+ /* Load lookup table data */
+ __m512d c_hi = avx512_permute_x8var_pd(mLUT_TOP_0, mLUT_TOP_1,
+ mLUT_TOP_2, mLUT_TOP_3, mLUT_TOP_4, mLUT_TOP_5,
+ mLUT_TOP_6, mLUT_TOP_7, i);
+ __m512d c_lo = avx512_permute_x8var_pd(mLUT_TAIL_0, mLUT_TAIL_1,
+ mLUT_TAIL_2, mLUT_TAIL_3, mLUT_TAIL_4, mLUT_TAIL_5,
+ mLUT_TAIL_6, mLUT_TAIL_7, i);
+
+ /*
+ * log(x) = k * ln2_hi + c_hi +
+ * k * ln2_lo + c_lo +
+ * log(z/c)
+ */
+ __m256i ik_32 = _mm512_cvtepi64_epi32(ik);
+ __m512d k = _mm512_cvtepi32_pd(ik_32);
+ __m512d tt = _mm512_fmadd_pd(k, mLN2HI, c_hi);
+ __m512d tt2 = _mm512_fmadd_pd(k, mLN2LO, c_lo);
+ tt = _mm512_add_pd(tt, tt2);
+ res = _mm512_add_pd(tt, res);
+
+ /* return special cases */
+ res = avx512_set_masked_lanes_pd(res, mNan, nan_mask);
+ res = avx512_set_masked_lanes_pd(res, mNeg_nan, negx_mask);
+ res = avx512_set_masked_lanes_pd(res, mNeg_inf, zero_mask);
+ res = avx512_set_masked_lanes_pd(res, mInf, inf_mask);
+
+ _mm512_mask_storeu_pd(op, load_mask, res);
+ }
+
+ /* call glibc's log when x is around 1.0 */
+ if (glibc_mask != 0) {
+ double NPY_DECL_ALIGNED(64) ip_fback[8];
+ _mm512_store_pd(ip_fback, x_in);
+
+ for (int ii = 0; ii < 8; ++ii, glibc_mask >>= 1) {
+ if (glibc_mask & 0x01) {
+ op[ii] = npy_log(ip_fback[ii]);
+ }
+ }
+ }
+ ip += num_lanes * stride;
+ op += num_lanes;
+ num_remaining_elements -= num_lanes;
+ }
+
+ if (invalid_mask) {
+ npy_set_floatstatus_invalid();
+ }
+ if (divide_by_zero_mask) {
+ npy_set_floatstatus_divbyzero();
+ }
+}
+#endif
+#endif
+
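For readers following the algorithm comment above, a minimal scalar sketch of the same range reduction may help. It is illustrative only (not part of the patch): it assumes the LOG_TABLE_TOP/LOG_TABLE_TAIL tables and the NPY_TANG_LOG_* constants added earlier in this patch (npy_math_private.h) are visible here, assumes <string.h> for memcpy, uses a hypothetical helper name, and omits the special cases (zero, negative, NaN, Inf, denormals) that the vector code handles with masks.

static double
tang_log_scalar_sketch(double x)
{
    /* reinterpret the double as its IEEE-754 bit pattern */
    npy_uint64 ix;
    memcpy(&ix, &x, sizeof(ix));

    /* x = 2^k * z with z in [1, 2); j indexes c_j = 1 + j/64 */
    npy_uint64 tmp = ix - 0x3ff0000000000000ULL;
    npy_int64 k = (npy_int64)tmp >> 52;
    int j = (int)((tmp >> (52 - 6)) & 0x3f);
    npy_uint64 iz = ix - (tmp & 0xfff0000000000000ULL);
    double z;
    memcpy(&z, &iz, sizeof(z));

    /* r = 2(z - c_j)/(z + c_j); log(z/c_j) ~= r + r*v*(A1 + v*(A2 + v*(A3 + v*A4))) */
    double c = 1.0 + j * 0x1p-6;
    double r = 2.0 * (z - c) / (z + c);
    double v = r * r;
    double poly = r + r * v * (NPY_TANG_LOG_A1 + v * (NPY_TANG_LOG_A2 +
                               v * (NPY_TANG_LOG_A3 + v * NPY_TANG_LOG_A4)));

    /* log(c_j) is stored split into a high part plus a small tail correction */
    double c_hi, c_lo;
    memcpy(&c_hi, &LOG_TABLE_TOP[j], sizeof(c_hi));
    memcpy(&c_lo, &LOG_TABLE_TAIL[j], sizeof(c_lo));

    /* log(x) = k*ln2_hi + c_hi + k*ln2_lo + c_lo + log(z/c_j) */
    return (k * NPY_TANG_LOG_LN2HI + c_hi) +
           (k * NPY_TANG_LOG_LN2LO + c_lo + poly);
}

The vectorized AVX512F_log_DOUBLE above performs exactly these steps eight lanes at a time, with the table lookups done through avx512_permute_x8var_pd and the special cases patched in afterwards via masked blends.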
/**begin repeat
* #TYPE = CFLOAT, CDOUBLE#
* #type = npy_float, npy_double#
@@ -3317,7 +3504,7 @@ AVX512F_absolute_@TYPE@(@type@ * op,
ip += 2*@num_lanes@*stride_ip1;
num_remaining_elements -= 2*@num_lanes@;
}
- npy_clear_floatstatus_barrier((char*)op);
+ npy_clear_floatstatus_barrier((char*)&num_remaining_elements);
}
#endif
@@ -3461,7 +3648,86 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
/**end repeat**/
#undef VECTOR_SIZE_BYTES
+#else /* NPY_HAVE_SSE2_INTRINSICS */
+
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #sfx = f32, f64#
+ * #CHK = , _F64#
+ */
+
+#if NPY_SIMD@CHK@
+
+/**begin repeat1
+* Arithmetic
+* # kind = add, subtract, multiply, divide#
+* # OP = +, -, *, /#
+* # VOP = add, sub, mul, div#
+*/
+
+static void
+simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+ op[i] = ip1[i] @OP@ ip2[i];
+ }
+ /* lots of specializations, to squeeze out max performance */
+ if (ip1 == ip2) {
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
+ npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a);
+ npyv_store_@sfx@(&op[i], c);
+ }
+ }
+ else {
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]);
+ npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]);
+ npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b);
+ npyv_store_@sfx@(&op[i], c);
+ }
+ }
+ LOOP_BLOCKED_END {
+ op[i] = ip1[i] @OP@ ip2[i];
+ }
+}
-#endif /* NPY_HAVE_SSE2_INTRINSICS */
+static void
+simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+ op[i] = ip1[0] @OP@ ip2[i];
+ }
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]);
+ npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
+ npyv_store_@sfx@(&op[i], v3);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = ip1[0] @OP@ ip2[i];
+ }
+}
+static void
+simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+{
+ const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]);
+ LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) {
+ op[i] = ip1[i] @OP@ ip2[0];
+ }
+ LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) {
+ npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]);
+ npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2);
+ npyv_store_@sfx@(&op[i], v3);
+ }
+ LOOP_BLOCKED_END {
+ op[i] = ip1[i] @OP@ ip2[0];
+ }
+}
+/**end repeat1**/
+#endif /* NPY_SIMD@CHK@ */
+/**end repeat**/
+#endif
#endif
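Since the /**begin repeat**/ blocks above are expanded by NumPy's .src template preprocessor, one concrete instantiation may make the new universal-SIMD fallback easier to read. Under the substitutions kind=add, TYPE=FLOAT, sfx=f32, OP=+, VOP=add, the first loop added above expands to roughly the following (a sketch of the generated code, not an addition to the patch; the npyv_* calls are the universal intrinsics from simd/simd.h and the LOOP_* macros are defined earlier in this file):

static void
simd_binary_add_FLOAT(npy_float * op, npy_float * ip1, npy_float * ip2, npy_intp n)
{
    /* scalar peel until the output pointer is aligned to the SIMD width */
    LOOP_BLOCK_ALIGN_VAR(op, npy_float, NPY_SIMD_WIDTH) {
        op[i] = ip1[i] + ip2[i];
    }
    if (ip1 == ip2) {
        /* specialization: both inputs alias, load once per iteration */
        LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
            npyv_f32 a = npyv_load_f32(&ip1[i]);
            npyv_f32 c = npyv_add_f32(a, a);
            npyv_store_f32(&op[i], c);
        }
    }
    else {
        LOOP_BLOCKED(npy_float, NPY_SIMD_WIDTH) {
            npyv_f32 a = npyv_load_f32(&ip1[i]);
            npyv_f32 b = npyv_load_f32(&ip2[i]);
            npyv_f32 c = npyv_add_f32(a, b);
            npyv_store_f32(&op[i], c);
        }
    }
    /* scalar tail for the elements that do not fill a full vector */
    LOOP_BLOCKED_END {
        op[i] = ip1[i] + ip2[i];
    }
}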
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index b35f377d7..1a035eb61 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -1536,7 +1536,14 @@ iterator_loop(PyUFuncObject *ufunc,
NPY_END_THREADS;
}
- return NpyIter_Deallocate(iter);
+ /*
+ * Currently `innerloop` may leave an error set, in which case
+ * NpyIter_Deallocate will always return an error as well.
+ */
+ if (NpyIter_Deallocate(iter) == NPY_FAIL) {
+ return -1;
+ }
+ return 0;
}
/*
@@ -2425,15 +2432,15 @@ _get_identity(PyUFuncObject *ufunc, npy_bool *reorderable) {
switch(ufunc->identity) {
case PyUFunc_One:
*reorderable = 1;
- return PyInt_FromLong(1);
+ return PyLong_FromLong(1);
case PyUFunc_Zero:
*reorderable = 1;
- return PyInt_FromLong(0);
+ return PyLong_FromLong(0);
case PyUFunc_MinusOne:
*reorderable = 1;
- return PyInt_FromLong(-1);
+ return PyLong_FromLong(-1);
case PyUFunc_ReorderableNone:
*reorderable = 1;
@@ -3233,9 +3240,13 @@ PyUFunc_GenericFunction_int(PyUFuncObject *ufunc,
goto fail;
}
- /* Check whether any errors occurred during the loop */
+ /*
+ * Check whether any errors occurred during the loop. The loops should
+ * indicate this in retval, but since the inner-loop currently does not
+ * report errors, this does not happen in all branches (at this time).
+ */
if (PyErr_Occurred() ||
- _check_ufunc_fperr(errormask, extobj, ufunc_name) < 0) {
+ _check_ufunc_fperr(errormask, extobj, ufunc_name) < 0) {
retval = -1;
goto fail;
}
@@ -3307,7 +3318,6 @@ get_binary_op_function(PyUFuncObject *ufunc, int *otype,
void **out_innerloopdata)
{
int i;
- PyUFunc_Loop1d *funcdata;
NPY_UF_DBG_PRINT1("Getting binary op function for type number %d\n",
*otype);
@@ -3315,7 +3325,7 @@ get_binary_op_function(PyUFuncObject *ufunc, int *otype,
/* If the type is custom and there are userloops, search for it here */
if (ufunc->userloops != NULL && PyTypeNum_ISUSERDEF(*otype)) {
PyObject *key, *obj;
- key = PyInt_FromLong(*otype);
+ key = PyLong_FromLong(*otype);
if (key == NULL) {
return -1;
}
@@ -3325,7 +3335,10 @@ get_binary_op_function(PyUFuncObject *ufunc, int *otype,
return -1;
}
else if (obj != NULL) {
- funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
+ PyUFunc_Loop1d *funcdata = PyCapsule_GetPointer(obj, NULL);
+ if (funcdata == NULL) {
+ return -1;
+ }
while (funcdata != NULL) {
int *types = funcdata->arg_types;
@@ -3997,8 +4010,17 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
finish:
Py_XDECREF(op_dtypes[0]);
- NpyIter_Deallocate(iter);
- NpyIter_Deallocate(iter_inner);
+ int res = 0;
+ if (!NpyIter_Deallocate(iter)) {
+ res = -1;
+ }
+ if (!NpyIter_Deallocate(iter_inner)) {
+ res = -1;
+ }
+ if (res < 0) {
+ Py_DECREF(out);
+ return NULL;
+ }
return (PyObject *)out;
@@ -4379,7 +4401,10 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
finish:
Py_XDECREF(op_dtypes[0]);
- NpyIter_Deallocate(iter);
+ if (!NpyIter_Deallocate(iter)) {
+ Py_DECREF(out);
+ return NULL;
+ }
return (PyObject *)out;
@@ -4388,7 +4413,6 @@ fail:
Py_XDECREF(op_dtypes[0]);
NpyIter_Deallocate(iter);
-
return NULL;
}
@@ -4812,8 +4836,8 @@ ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args)
if (res == NULL) {
return NULL;
}
- PyList_SET_ITEM(res, 0, PyInt_FromLong(NPY_BUFSIZE));
- PyList_SET_ITEM(res, 1, PyInt_FromLong(UFUNC_ERR_DEFAULT));
+ PyList_SET_ITEM(res, 0, PyLong_FromLong(NPY_BUFSIZE));
+ PyList_SET_ITEM(res, 1, PyLong_FromLong(UFUNC_ERR_DEFAULT));
PyList_SET_ITEM(res, 2, Py_None); Py_INCREF(Py_None);
return res;
}
@@ -5133,7 +5157,7 @@ PyUFunc_RegisterLoopForDescr(PyUFuncObject *ufunc,
return -1;
}
- key = PyInt_FromLong((long) user_dtype->type_num);
+ key = PyLong_FromLong((long) user_dtype->type_num);
if (key == NULL) {
return -1;
}
@@ -5168,9 +5192,12 @@ PyUFunc_RegisterLoopForDescr(PyUFuncObject *ufunc,
result = -1;
}
else {
- PyUFunc_Loop1d *current;
int cmp = 1;
- current = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(cobj);
+ PyUFunc_Loop1d *current = PyCapsule_GetPointer(cobj, NULL);
+ if (current == NULL) {
+ result = -1;
+ goto done;
+ }
while (current != NULL) {
cmp = cmp_arg_types(current->arg_types,
arg_typenums, ufunc->nargs);
@@ -5204,6 +5231,7 @@ PyUFunc_RegisterLoopForDescr(PyUFuncObject *ufunc,
}
}
+done:
PyArray_free(arg_typenums);
Py_DECREF(key);
@@ -5235,7 +5263,7 @@ PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc,
if (ufunc->userloops == NULL) {
ufunc->userloops = PyDict_New();
}
- key = PyInt_FromLong((long) usertype);
+ key = PyLong_FromLong((long) usertype);
if (key == NULL) {
return -1;
}
@@ -5272,7 +5300,7 @@ PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc,
}
/* If it's not there, then make one and return. */
else if (cobj == NULL) {
- cobj = NpyCapsule_FromVoidPtr((void *)funcdata, _loop1d_list_free);
+ cobj = PyCapsule_New((void *)funcdata, NULL, _loop1d_list_free);
if (cobj == NULL) {
goto fail;
}
@@ -5290,7 +5318,10 @@ PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc,
* is exactly like this one, then just replace.
* Otherwise insert.
*/
- current = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(cobj);
+ current = PyCapsule_GetPointer(cobj, NULL);
+ if (current == NULL) {
+ goto fail;
+ }
while (current != NULL) {
cmp = cmp_arg_types(current->arg_types, newtypes, ufunc->nargs);
if (cmp >= 0) {
@@ -5361,7 +5392,7 @@ ufunc_dealloc(PyUFuncObject *ufunc)
static PyObject *
ufunc_repr(PyUFuncObject *ufunc)
{
- return PyUString_FromFormat("<ufunc '%s'>", ufunc->name);
+ return PyUnicode_FromFormat("<ufunc '%s'>", ufunc->name);
}
static int
@@ -5388,13 +5419,11 @@ ufunc_traverse(PyUFuncObject *self, visitproc visit, void *arg)
static PyObject *
ufunc_outer(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
{
- int i;
int errval;
PyObject *override = NULL;
PyObject *ret;
PyArrayObject *ap1 = NULL, *ap2 = NULL, *ap_new = NULL;
PyObject *new_args, *tmp;
- PyObject *shape1, *shape2, *newshape;
static PyObject *_numpy_matrix;
@@ -5435,7 +5464,19 @@ ufunc_outer(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
"matrix",
&_numpy_matrix);
+ const char *matrix_deprecation_msg = (
+ "%s.outer() was passed a numpy matrix as %s argument. "
+ "Special handling of matrix is deprecated and will result in an "
+ "error in most cases. Please convert the matrix to a NumPy "
+ "array to retain the old behaviour. You can use `matrix.A` "
+ "to achieve this.");
+
if (PyObject_IsInstance(tmp, _numpy_matrix)) {
+ /* DEPRECATED 2020-05-13, NumPy 1.20 */
+ if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+ matrix_deprecation_msg, ufunc->name, "first") < 0) {
+ return NULL;
+ }
ap1 = (PyArrayObject *) PyArray_FromObject(tmp, NPY_NOTYPE, 0, 0);
}
else {
@@ -5450,6 +5491,12 @@ ufunc_outer(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
return NULL;
}
if (PyObject_IsInstance(tmp, _numpy_matrix)) {
+ /* DEPRECATED 2020-05-13, NumPy 1.20 */
+ if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+ matrix_deprecation_msg, ufunc->name, "second") < 0) {
+ Py_DECREF(ap1);
+ return NULL;
+ }
ap2 = (PyArrayObject *) PyArray_FromObject(tmp, NPY_NOTYPE, 0, 0);
}
else {
@@ -5460,34 +5507,45 @@ ufunc_outer(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
Py_DECREF(ap1);
return NULL;
}
- /* Construct new shape tuple */
- shape1 = PyTuple_New(PyArray_NDIM(ap1));
- if (shape1 == NULL) {
- goto fail;
- }
- for (i = 0; i < PyArray_NDIM(ap1); i++) {
- PyTuple_SET_ITEM(shape1, i,
- PyLong_FromLongLong((npy_longlong)PyArray_DIMS(ap1)[i]));
- }
- shape2 = PyTuple_New(PyArray_NDIM(ap2));
- for (i = 0; i < PyArray_NDIM(ap2); i++) {
- PyTuple_SET_ITEM(shape2, i, PyInt_FromLong((long) 1));
+ /* Construct new shape from ap1 and ap2 and then reshape */
+ PyArray_Dims newdims;
+ npy_intp newshape[NPY_MAXDIMS];
+ newdims.len = PyArray_NDIM(ap1) + PyArray_NDIM(ap2);
+ newdims.ptr = newshape;
+
+ if (newdims.len > NPY_MAXDIMS) {
+ PyErr_Format(PyExc_ValueError,
+ "maximum supported dimension for an ndarray is %d, but "
+ "`%s.outer()` result would have %d.",
+ NPY_MAXDIMS, ufunc->name, newdims.len);
+ return NPY_FAIL;
}
- if (shape2 == NULL) {
- Py_DECREF(shape1);
+ if (newdims.ptr == NULL) {
goto fail;
}
- newshape = PyNumber_Add(shape1, shape2);
- Py_DECREF(shape1);
- Py_DECREF(shape2);
- if (newshape == NULL) {
- goto fail;
+ memcpy(newshape, PyArray_DIMS(ap1), PyArray_NDIM(ap1) * sizeof(npy_intp));
+ for (int i = PyArray_NDIM(ap1); i < newdims.len; i++) {
+ newshape[i] = 1;
}
- ap_new = (PyArrayObject *)PyArray_Reshape(ap1, newshape);
- Py_DECREF(newshape);
+
+ ap_new = (PyArrayObject *)PyArray_Newshape(ap1, &newdims, NPY_CORDER);
if (ap_new == NULL) {
goto fail;
}
+ if (PyArray_NDIM(ap_new) != newdims.len ||
+ !PyArray_CompareLists(PyArray_DIMS(ap_new), newshape, newdims.len)) {
+ PyErr_Format(PyExc_TypeError,
+ "%s.outer() called with ndarray-subclass of type '%s' "
+ "which modified its shape after a reshape. `outer()` relies "
+ "on reshaping the inputs and is for example not supported for "
+ "the 'np.matrix' class (the usage of matrix is generally "
+ "discouraged). "
+ "To work around this issue, please convert the inputs to "
+ "numpy arrays.",
+ ufunc->name, Py_TYPE(ap_new)->tp_name);
+ goto fail;
+ }
+
new_args = Py_BuildValue("(OO)", ap_new, ap2);
Py_DECREF(ap1);
Py_DECREF(ap2);
@@ -5920,6 +5978,7 @@ _typecharfromnum(int num) {
return ret;
}
+
static PyObject *
ufunc_get_doc(PyUFuncObject *ufunc)
{
@@ -5940,40 +5999,40 @@ ufunc_get_doc(PyUFuncObject *ufunc)
* introspection on name and nin + nout to automate the first part
* of it the doc string shouldn't need the calling convention
*/
- doc = PyObject_CallFunctionObjArgs(
- _sig_formatter, (PyObject *)ufunc, NULL);
+ doc = PyObject_CallFunctionObjArgs(_sig_formatter,
+ (PyObject *)ufunc, NULL);
if (doc == NULL) {
return NULL;
}
if (ufunc->doc != NULL) {
- PyUString_ConcatAndDel(&doc,
- PyUString_FromFormat("\n\n%s", ufunc->doc));
+ Py_SETREF(doc, PyUnicode_FromFormat("%S\n\n%s", doc, ufunc->doc));
}
return doc;
}
+
static PyObject *
ufunc_get_nin(PyUFuncObject *ufunc)
{
- return PyInt_FromLong(ufunc->nin);
+ return PyLong_FromLong(ufunc->nin);
}
static PyObject *
ufunc_get_nout(PyUFuncObject *ufunc)
{
- return PyInt_FromLong(ufunc->nout);
+ return PyLong_FromLong(ufunc->nout);
}
static PyObject *
ufunc_get_nargs(PyUFuncObject *ufunc)
{
- return PyInt_FromLong(ufunc->nargs);
+ return PyLong_FromLong(ufunc->nargs);
}
static PyObject *
ufunc_get_ntypes(PyUFuncObject *ufunc)
{
- return PyInt_FromLong(ufunc->ntypes);
+ return PyLong_FromLong(ufunc->ntypes);
}
static PyObject *
@@ -6003,7 +6062,7 @@ ufunc_get_types(PyUFuncObject *ufunc)
t[ni + 2 + j] = _typecharfromnum(ufunc->types[n]);
n++;
}
- str = PyUString_FromStringAndSize(t, no + ni + 2);
+ str = PyUnicode_FromStringAndSize(t, no + ni + 2);
PyList_SET_ITEM(list, k, str);
}
PyArray_free(t);
@@ -6013,7 +6072,7 @@ ufunc_get_types(PyUFuncObject *ufunc)
static PyObject *
ufunc_get_name(PyUFuncObject *ufunc)
{
- return PyUString_FromString(ufunc->name);
+ return PyUnicode_FromString(ufunc->name);
}
static PyObject *
@@ -6029,7 +6088,7 @@ ufunc_get_signature(PyUFuncObject *ufunc)
if (!ufunc->core_enabled) {
Py_RETURN_NONE;
}
- return PyUString_FromString(ufunc->core_signature);
+ return PyUnicode_FromString(ufunc->core_signature);
}
#undef _typecharfromnum
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index ea20bb24f..3ce06322f 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -12,6 +12,11 @@
#define _MULTIARRAYMODULE
#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+// printf debug tracing
+#ifndef NPY_UF_DBG_TRACING
+ #define NPY_UF_DBG_TRACING 0
+#endif
+
#include <stdbool.h>
#include "Python.h"
@@ -36,17 +41,17 @@ npy_casting_to_py_object(NPY_CASTING casting)
{
switch (casting) {
case NPY_NO_CASTING:
- return PyUString_FromString("no");
+ return PyUnicode_FromString("no");
case NPY_EQUIV_CASTING:
- return PyUString_FromString("equiv");
+ return PyUnicode_FromString("equiv");
case NPY_SAFE_CASTING:
- return PyUString_FromString("safe");
+ return PyUnicode_FromString("safe");
case NPY_SAME_KIND_CASTING:
- return PyUString_FromString("same_kind");
+ return PyUnicode_FromString("same_kind");
case NPY_UNSAFE_CASTING:
- return PyUString_FromString("unsafe");
+ return PyUnicode_FromString("unsafe");
default:
- return PyInt_FromLong(casting);
+ return PyLong_FromLong(casting);
}
}
@@ -236,21 +241,6 @@ PyUFunc_ValidateCasting(PyUFuncObject *ufunc,
return 0;
}
-/*
- * Returns a new reference to type if it is already NBO, otherwise
- * returns a copy converted to NBO.
- */
-static PyArray_Descr *
-ensure_dtype_nbo(PyArray_Descr *type)
-{
- if (PyArray_ISNBO(type->byteorder)) {
- Py_INCREF(type);
- return type;
- }
- else {
- return PyArray_DescrNewByteorder(type, NPY_NATIVE);
- }
-}
/*UFUNC_API
*
@@ -1336,7 +1326,6 @@ find_userloop(PyUFuncObject *ufunc,
void **out_innerloopdata)
{
npy_intp i, nin = ufunc->nin, j, nargs = nin + ufunc->nout;
- PyUFunc_Loop1d *funcdata;
/* Use this to try to avoid repeating the same userdef loop search */
int last_userdef = -1;
@@ -1356,7 +1345,7 @@ find_userloop(PyUFuncObject *ufunc,
last_userdef = type_num;
- key = PyInt_FromLong(type_num);
+ key = PyLong_FromLong(type_num);
if (key == NULL) {
return -1;
}
@@ -1368,9 +1357,11 @@ find_userloop(PyUFuncObject *ufunc,
else if (obj == NULL) {
continue;
}
- for (funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
- funcdata != NULL;
- funcdata = funcdata->next) {
+ PyUFunc_Loop1d *funcdata = PyCapsule_GetPointer(obj, NULL);
+ if (funcdata == NULL) {
+ return -1;
+ }
+ for (; funcdata != NULL; funcdata = funcdata->next) {
int *types = funcdata->arg_types;
for (j = 0; j < nargs; ++j) {
@@ -1744,7 +1735,6 @@ linear_search_userloop_type_resolver(PyUFuncObject *self,
char *out_err_dst_typecode)
{
npy_intp i, nop = self->nin + self->nout;
- PyUFunc_Loop1d *funcdata;
/* Use this to try to avoid repeating the same userdef loop search */
int last_userdef = -1;
@@ -1764,7 +1754,7 @@ linear_search_userloop_type_resolver(PyUFuncObject *self,
last_userdef = type_num;
- key = PyInt_FromLong(type_num);
+ key = PyLong_FromLong(type_num);
if (key == NULL) {
return -1;
}
@@ -1776,9 +1766,11 @@ linear_search_userloop_type_resolver(PyUFuncObject *self,
else if (obj == NULL) {
continue;
}
- for (funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
- funcdata != NULL;
- funcdata = funcdata->next) {
+ PyUFunc_Loop1d *funcdata = PyCapsule_GetPointer(obj, NULL);
+ if (funcdata == NULL) {
+ return -1;
+ }
+ for (; funcdata != NULL; funcdata = funcdata->next) {
int *types = funcdata->arg_types;
switch (ufunc_loop_matches(self, op,
input_casting, output_casting,
@@ -1816,7 +1808,6 @@ type_tuple_userloop_type_resolver(PyUFuncObject *self,
PyArray_Descr **out_dtype)
{
int i, j, nin = self->nin, nop = nin + self->nout;
- PyUFunc_Loop1d *funcdata;
/* Use this to try to avoid repeating the same userdef loop search */
int last_userdef = -1;
@@ -1831,7 +1822,7 @@ type_tuple_userloop_type_resolver(PyUFuncObject *self,
last_userdef = type_num;
- key = PyInt_FromLong(type_num);
+ key = PyLong_FromLong(type_num);
if (key == NULL) {
return -1;
}
@@ -1844,9 +1835,11 @@ type_tuple_userloop_type_resolver(PyUFuncObject *self,
continue;
}
- for (funcdata = (PyUFunc_Loop1d *)NpyCapsule_AsVoidPtr(obj);
- funcdata != NULL;
- funcdata = funcdata->next) {
+ PyUFunc_Loop1d *funcdata = PyCapsule_GetPointer(obj, NULL);
+ if (funcdata == NULL) {
+ return -1;
+ }
+ for (; funcdata != NULL; funcdata = funcdata->next) {
int *types = funcdata->arg_types;
int matched = 1;
diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c
index bad42d657..474db0245 100644
--- a/numpy/core/src/umath/umathmodule.c
+++ b/numpy/core/src/umath/umathmodule.c
@@ -75,7 +75,8 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) {
int nin, nout, i, nargs;
PyUFunc_PyFuncData *fdata;
PyUFuncObject *self;
- char *fname, *str, *types, *doc;
+ const char *fname = NULL;
+ char *str, *types, *doc;
Py_ssize_t fname_len = -1;
void * ptr, **data;
int offset[2];
@@ -95,12 +96,12 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) {
pyname = PyObject_GetAttrString(function, "__name__");
if (pyname) {
- (void) PyString_AsStringAndSize(pyname, &fname, &fname_len);
+ fname = PyUnicode_AsUTF8AndSize(pyname, &fname_len);
}
- if (PyErr_Occurred()) {
+ if (fname == NULL) {
+ PyErr_Clear();
fname = "?";
fname_len = 1;
- PyErr_Clear();
}
/*
@@ -173,25 +174,22 @@ PyObject *
add_newdoc_ufunc(PyObject *NPY_UNUSED(dummy), PyObject *args)
{
PyUFuncObject *ufunc;
- PyObject *str, *tmp;
- char *docstr, *newdocstr;
-
+ PyObject *str;
if (!PyArg_ParseTuple(args, "O!O!:_add_newdoc_ufunc", &PyUFunc_Type, &ufunc,
&PyUnicode_Type, &str)) {
return NULL;
}
- tmp = PyUnicode_AsUTF8String(str);
- if (tmp == NULL) {
+ if (ufunc->doc != NULL) {
+ PyErr_SetString(PyExc_ValueError,
+ "Cannot change docstring of ufunc with non-NULL docstring");
return NULL;
}
- docstr = PyBytes_AS_STRING(tmp);
- if (NULL != ufunc->doc) {
- PyErr_SetString(PyExc_ValueError,
- "Cannot change docstring of ufunc with non-NULL docstring");
- Py_DECREF(tmp);
+ PyObject *tmp = PyUnicode_AsUTF8String(str);
+ if (tmp == NULL) {
return NULL;
}
+ char *docstr = PyBytes_AS_STRING(tmp);
/*
* This introduces a memory leak, as the memory allocated for the doc
@@ -199,7 +197,11 @@ add_newdoc_ufunc(PyObject *NPY_UNUSED(dummy), PyObject *args)
* this should not be a problem since the user would have to
* repeatedly create, document, and throw away ufuncs.
*/
- newdocstr = malloc(strlen(docstr) + 1);
+ char *newdocstr = malloc(strlen(docstr) + 1);
+ if (!newdocstr) {
+ Py_DECREF(tmp);
+ return PyErr_NoMemory();
+ }
strcpy(newdocstr, docstr);
ufunc->doc = newdocstr;
@@ -232,30 +234,28 @@ NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_array_finalize = NULL;
NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_ufunc = NULL;
NPY_VISIBILITY_HIDDEN PyObject *npy_um_str_pyvals_name = NULL;
-/* intern some strings used in ufuncs */
+/* intern some strings used in ufuncs, returns 0 on success */
static int
intern_strings(void)
{
- npy_um_str_out = PyUString_InternFromString("out");
- npy_um_str_where = PyUString_InternFromString("where");
- npy_um_str_axes = PyUString_InternFromString("axes");
- npy_um_str_axis = PyUString_InternFromString("axis");
- npy_um_str_keepdims = PyUString_InternFromString("keepdims");
- npy_um_str_casting = PyUString_InternFromString("casting");
- npy_um_str_order = PyUString_InternFromString("order");
- npy_um_str_dtype = PyUString_InternFromString("dtype");
- npy_um_str_subok = PyUString_InternFromString("subok");
- npy_um_str_signature = PyUString_InternFromString("signature");
- npy_um_str_sig = PyUString_InternFromString("sig");
- npy_um_str_extobj = PyUString_InternFromString("extobj");
- npy_um_str_array_prepare = PyUString_InternFromString("__array_prepare__");
- npy_um_str_array_wrap = PyUString_InternFromString("__array_wrap__");
- npy_um_str_array_finalize = PyUString_InternFromString("__array_finalize__");
- npy_um_str_ufunc = PyUString_InternFromString("__array_ufunc__");
- npy_um_str_pyvals_name = PyUString_InternFromString(UFUNC_PYVALS_NAME);
-
- return npy_um_str_out && npy_um_str_subok && npy_um_str_array_prepare &&
- npy_um_str_array_wrap && npy_um_str_array_finalize && npy_um_str_ufunc;
+ if (!(npy_um_str_out = PyUnicode_InternFromString("out"))) return -1;
+ if (!(npy_um_str_where = PyUnicode_InternFromString("where"))) return -1;
+ if (!(npy_um_str_axes = PyUnicode_InternFromString("axes"))) return -1;
+ if (!(npy_um_str_axis = PyUnicode_InternFromString("axis"))) return -1;
+ if (!(npy_um_str_keepdims = PyUnicode_InternFromString("keepdims"))) return -1;
+ if (!(npy_um_str_casting = PyUnicode_InternFromString("casting"))) return -1;
+ if (!(npy_um_str_order = PyUnicode_InternFromString("order"))) return -1;
+ if (!(npy_um_str_dtype = PyUnicode_InternFromString("dtype"))) return -1;
+ if (!(npy_um_str_subok = PyUnicode_InternFromString("subok"))) return -1;
+ if (!(npy_um_str_signature = PyUnicode_InternFromString("signature"))) return -1;
+ if (!(npy_um_str_sig = PyUnicode_InternFromString("sig"))) return -1;
+ if (!(npy_um_str_extobj = PyUnicode_InternFromString("extobj"))) return -1;
+ if (!(npy_um_str_array_prepare = PyUnicode_InternFromString("__array_prepare__"))) return -1;
+ if (!(npy_um_str_array_wrap = PyUnicode_InternFromString("__array_wrap__"))) return -1;
+ if (!(npy_um_str_array_finalize = PyUnicode_InternFromString("__array_finalize__"))) return -1;
+ if (!(npy_um_str_ufunc = PyUnicode_InternFromString("__array_ufunc__"))) return -1;
+ if (!(npy_um_str_pyvals_name = PyUnicode_InternFromString(UFUNC_PYVALS_NAME))) return -1;
+ return 0;
}
/* Setup the umath part of the module */
@@ -326,7 +326,7 @@ int initumath(PyObject *m)
PyDict_SetItemString(d, "conj", s);
PyDict_SetItemString(d, "mod", s2);
- if (!intern_strings()) {
+ if (intern_strings() < 0) {
PyErr_SetString(PyExc_RuntimeError,
"cannot intern umath strings while initializing _multiarray_umath.");
return -1;
diff --git a/numpy/core/tests/data/umath-validation-set-log b/numpy/core/tests/data/umath-validation-set-log
index a7bd98481..b8f6b0875 100644
--- a/numpy/core/tests/data/umath-validation-set-log
+++ b/numpy/core/tests/data/umath-validation-set-log
@@ -116,3 +116,156 @@ np.float32,0x3f494ab1,0xbe763131,4
np.float32,0x3f476b69,0xbe7fc2c6,4
np.float32,0x3f4884e8,0xbe7a214a,4
np.float32,0x3f486945,0xbe7aae76,4
+#float64
+## +ve denormal ##
+np.float64,0x0000000000000001,0xc0874385446d71c3,1
+np.float64,0x0001000000000000,0xc086395a2079b70c,1
+np.float64,0x000fffffffffffff,0xc086232bdd7abcd2,1
+np.float64,0x0007ad63e2168cb6,0xc086290bc0b2980f,1
+## -ve denormal ##
+np.float64,0x8000000000000001,0xfff8000000000001,1
+np.float64,0x8001000000000000,0xfff8000000000001,1
+np.float64,0x800fffffffffffff,0xfff8000000000001,1
+np.float64,0x8007ad63e2168cb6,0xfff8000000000001,1
+## +/-0.0f, MAX, MIN##
+np.float64,0x0000000000000000,0xfff0000000000000,1
+np.float64,0x8000000000000000,0xfff0000000000000,1
+np.float64,0x7fefffffffffffff,0x40862e42fefa39ef,1
+np.float64,0xffefffffffffffff,0xfff8000000000001,1
+## near 1.0f ##
+np.float64,0x3ff0000000000000,0x0000000000000000,1
+np.float64,0x3fe8000000000000,0xbfd269621134db92,1
+np.float64,0x3ff0000000000001,0x3cafffffffffffff,1
+np.float64,0x3ff0000020000000,0x3e7fffffe000002b,1
+np.float64,0x3ff0000000000001,0x3cafffffffffffff,1
+np.float64,0x3fefffffe0000000,0xbe70000008000005,1
+np.float64,0x3fefffffffffffff,0xbca0000000000000,1
+## random numbers ##
+np.float64,0x02500186f3d9da56,0xc0855b8abf135773,1
+np.float64,0x09200815a3951173,0xc082ff1ad7131bdc,1
+np.float64,0x0da029623b0243d4,0xc0816fc994695bb5,1
+np.float64,0x48703b8ac483a382,0x40579213a313490b,1
+np.float64,0x09207b74c87c9860,0xc082fee20ff349ef,1
+np.float64,0x62c077698e8df947,0x407821c996d110f0,1
+np.float64,0x2350b45e87c3cfb0,0xc073d6b16b51d072,1
+np.float64,0x3990a23f9ff2b623,0xc051aa60eadd8c61,1
+np.float64,0x0d011386a116c348,0xc081a6cc7ea3b8fb,1
+np.float64,0x1fe0f0303ebe273a,0xc0763870b78a81ca,1
+np.float64,0x0cd1260121d387da,0xc081b7668d61a9d1,1
+np.float64,0x1e6135a8f581d422,0xc077425ac10f08c2,1
+np.float64,0x622168db5fe52d30,0x4077b3c669b9fadb,1
+np.float64,0x69f188e1ec6d1718,0x407d1e2f18c63889,1
+np.float64,0x3aa1bf1d9c4dd1a3,0xc04d682e24bde479,1
+np.float64,0x6c81c4011ce4f683,0x407ee5190e8a8e6a,1
+np.float64,0x2191fa55aa5a5095,0xc0750c0c318b5e2d,1
+np.float64,0x32a1f602a32bf360,0xc06270caa493fc17,1
+np.float64,0x16023c90ba93249b,0xc07d0f88e0801638,1
+np.float64,0x1c525fe6d71fa9ff,0xc078af49c66a5d63,1
+np.float64,0x1a927675815d65b7,0xc079e5bdd7fe376e,1
+np.float64,0x41227b8fe70da028,0x402aa0c9f9a84c71,1
+np.float64,0x4962bb6e853fe87d,0x405a34aa04c83747,1
+np.float64,0x23d2cda00b26b5a4,0xc0737c13a06d00ea,1
+np.float64,0x2d13083fd62987fa,0xc06a25055aeb474e,1
+np.float64,0x10e31e4c9b4579a1,0xc0804e181929418e,1
+np.float64,0x26d3247d556a86a9,0xc0716774171da7e8,1
+np.float64,0x6603379398d0d4ac,0x407a64f51f8a887b,1
+np.float64,0x02d38af17d9442ba,0xc0852d955ac9dd68,1
+np.float64,0x6a2382b4818dd967,0x407d4129d688e5d4,1
+np.float64,0x2ee3c403c79b3934,0xc067a091fefaf8b6,1
+np.float64,0x6493a699acdbf1a4,0x4079663c8602bfc5,1
+np.float64,0x1c8413c4f0de3100,0xc0788c99697059b6,1
+np.float64,0x4573f1ed350d9622,0x404e9bd1e4c08920,1
+np.float64,0x2f34265c9200b69c,0xc067310cfea4e986,1
+np.float64,0x19b43e65fa22029b,0xc07a7f8877de22d6,1
+np.float64,0x0af48ab7925ed6bc,0xc0825c4fbc0e5ade,1
+np.float64,0x4fa49699cad82542,0x4065c76d2a318235,1
+np.float64,0x7204a15e56ade492,0x40815bb87484dffb,1
+np.float64,0x4734aa08a230982d,0x40542a4bf7a361a9,1
+np.float64,0x1ae4ed296c2fd749,0xc079ac4921f20abb,1
+np.float64,0x472514ea4370289c,0x4053ff372bd8f18f,1
+np.float64,0x53a54b3f73820430,0x406b5411fc5f2e33,1
+np.float64,0x64754de5a15684fa,0x407951592e99a5ab,1
+np.float64,0x69358e279868a7c3,0x407c9c671a882c31,1
+np.float64,0x284579ec61215945,0xc0706688e55f0927,1
+np.float64,0x68b5c58806447adc,0x407c43d6f4eff760,1
+np.float64,0x1945a83f98b0e65d,0xc07acc15eeb032cc,1
+np.float64,0x0fc5eb98a16578bf,0xc080b0d02eddca0e,1
+np.float64,0x6a75e208f5784250,0x407d7a7383bf8f05,1
+np.float64,0x0fe63a029c47645d,0xc080a59ca1e98866,1
+np.float64,0x37963ac53f065510,0xc057236281f7bdb6,1
+np.float64,0x135661bb07067ff7,0xc07ee924930c21e4,1
+np.float64,0x4b4699469d458422,0x405f73843756e887,1
+np.float64,0x1a66d73e4bf4881b,0xc07a039ba1c63adf,1
+np.float64,0x12a6b9b119a7da59,0xc07f62e49c6431f3,1
+np.float64,0x24c719aa8fd1bdb5,0xc072d26da4bf84d3,1
+np.float64,0x0fa6ff524ffef314,0xc080bb8514662e77,1
+np.float64,0x1db751d66fdd4a9a,0xc077b77cb50d7c92,1
+np.float64,0x4947374c516da82c,0x4059e9acfc7105bf,1
+np.float64,0x1b1771ab98f3afc8,0xc07989326b8e1f66,1
+np.float64,0x25e78805baac8070,0xc0720a818e6ef080,1
+np.float64,0x4bd7a148225d3687,0x406082d004ea3ee7,1
+np.float64,0x53d7d6b2bbbda00a,0x406b9a398967cbd5,1
+np.float64,0x6997fb9f4e1c685f,0x407ce0a703413eba,1
+np.float64,0x069802c2ff71b951,0xc083df39bf7acddc,1
+np.float64,0x4d683ac9890f66d8,0x4062ae21d8c2acf0,1
+np.float64,0x5a2825863ec14f4c,0x40722d718d549552,1
+np.float64,0x0398799a88f4db80,0xc084e93dab8e2158,1
+np.float64,0x5ed87a8b77e135a5,0x40756d7051777b33,1
+np.float64,0x5828cd6d79b9bede,0x4070cafb22fc6ca1,1
+np.float64,0x7b18ba2a5ec6f068,0x408481386b3ed6fe,1
+np.float64,0x4938fd60922198fe,0x4059c206b762ea7e,1
+np.float64,0x31b8f44fcdd1a46e,0xc063b2faa8b6434e,1
+np.float64,0x5729341c0d918464,0x407019cac0c4a7d7,1
+np.float64,0x13595e9228ee878e,0xc07ee7235a7d8088,1
+np.float64,0x17698b0dc9dd4135,0xc07c1627e3a5ad5f,1
+np.float64,0x63b977c283abb0cc,0x4078cf1ec6ed65be,1
+np.float64,0x7349cc0d4dc16943,0x4081cc697ce4cb53,1
+np.float64,0x4e49a80b732fb28d,0x4063e67e3c5cbe90,1
+np.float64,0x07ba14b848a8ae02,0xc0837ac032a094e0,1
+np.float64,0x3da9f17b691bfddc,0xc03929c25366acda,1
+np.float64,0x02ea39aa6c3ac007,0xc08525af6f21e1c4,1
+np.float64,0x3a6a42f04ed9563d,0xc04e98e825dca46b,1
+np.float64,0x1afa877cd7900be7,0xc0799d6648cb34a9,1
+np.float64,0x58ea986649e052c6,0x4071512e939ad790,1
+np.float64,0x691abbc04647f536,0x407c89aaae0fcb83,1
+np.float64,0x43aabc5063e6f284,0x4044b45d18106fd2,1
+np.float64,0x488b003c893e0bea,0x4057df012a2dafbe,1
+np.float64,0x77eb076ed67caee5,0x40836720de94769e,1
+np.float64,0x5c1b46974aba46f4,0x40738731ba256007,1
+np.float64,0x1a5b29ecb5d3c261,0xc07a0becc77040d6,1
+np.float64,0x5d8b6ccf868c6032,0x4074865c1865e2db,1
+np.float64,0x4cfb6690b4aaf5af,0x406216cd8c7e8ddb,1
+np.float64,0x76cbd8eb5c5fc39e,0x4083038dc66d682b,1
+np.float64,0x28bbd1fec5012814,0xc07014c2dd1b9711,1
+np.float64,0x33dc1b3a4fd6bf7a,0xc060bd0756e07d8a,1
+np.float64,0x52bbe89b37de99f3,0x406a10041aa7d343,1
+np.float64,0x07bc479d15eb2dd3,0xc0837a1a6e3a3b61,1
+np.float64,0x18fc5275711a901d,0xc07aff3e9d62bc93,1
+np.float64,0x114c9758e247dc71,0xc080299a7cf15b05,1
+np.float64,0x25ac8f6d60755148,0xc07233c4c0c511d4,1
+np.float64,0x260cae2bb9e9fd7e,0xc071f128c7e82eac,1
+np.float64,0x572ccdfe0241de82,0x40701bedc84bb504,1
+np.float64,0x0ddcef6c8d41f5ee,0xc0815a7e16d07084,1
+np.float64,0x6dad1d59c988af68,0x407fb4a0bc0142b1,1
+np.float64,0x025d200580d8b6d1,0xc08556c0bc32b1b2,1
+np.float64,0x7aad344b6aa74c18,0x40845bbc453f22be,1
+np.float64,0x5b5d9d6ad9d14429,0x4073036d2d21f382,1
+np.float64,0x49cd8d8dcdf19954,0x405b5c034f5c7353,1
+np.float64,0x63edb9483335c1e6,0x4078f2dd21378786,1
+np.float64,0x7b1dd64c9d2c26bd,0x408482b922017bc9,1
+np.float64,0x782e13e0b574be5f,0x40837e2a0090a5ad,1
+np.float64,0x592dfe18b9d6db2f,0x40717f777fbcb1ec,1
+np.float64,0x654e3232ac60d72c,0x4079e71a95a70446,1
+np.float64,0x7b8e42ad22091456,0x4084a9a6f1e61722,1
+np.float64,0x570e88dfd5860ae6,0x407006ae6c0d137a,1
+np.float64,0x294e98346cb98ef1,0xc06f5edaac12bd44,1
+np.float64,0x1adeaa4ab792e642,0xc079b1431d5e2633,1
+np.float64,0x7b6ead3377529ac8,0x40849eabc8c7683c,1
+np.float64,0x2b8eedae8a9b2928,0xc06c400054deef11,1
+np.float64,0x65defb45b2dcf660,0x407a4b53f181c05a,1
+np.float64,0x1baf582d475e7701,0xc07920bcad4a502c,1
+np.float64,0x461f39cf05a0f15a,0x405126368f984fa1,1
+np.float64,0x7e5f6f5dcfff005b,0x4085a37d610439b4,1
+np.float64,0x136f66e4d09bd662,0xc07ed8a2719f2511,1
+np.float64,0x65afd8983fb6ca1f,0x407a2a7f48bf7fc1,1
+np.float64,0x572fa7f95ed22319,0x40701d706cf82e6f,1
diff --git a/numpy/core/tests/examples/checks.pyx b/numpy/core/tests/examples/checks.pyx
index ecf0ad3fa..151979db7 100644
--- a/numpy/core/tests/examples/checks.pyx
+++ b/numpy/core/tests/examples/checks.pyx
@@ -24,3 +24,7 @@ def get_td64_value(obj):
def get_dt64_unit(obj):
return cnp.get_datetime64_unit(obj)
+
+
+def is_integer(obj):
+ return isinstance(obj, (cnp.integer, int))
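
The helper added above leans on NumPy's abstract scalar hierarchy: np.integer is the common base class of all signed and unsigned NumPy integer scalars, and plain Python int is accepted explicitly via the isinstance tuple. A minimal pure-Python sketch of the same check (the Cython version uses the cnp cimport instead):

    import numpy as np

    def is_integer(obj):
        # np.integer covers np.int8 ... np.uint64; plain Python ints are
        # not NumPy scalars, so they are listed explicitly.
        return isinstance(obj, (np.integer, int))

    assert is_integer(1)
    assert is_integer(np.int8(1))
    assert is_integer(np.uint64(1))
    assert not is_integer(1.0)
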
diff --git a/numpy/core/tests/examples/setup.py b/numpy/core/tests/examples/setup.py
index 9860bf5f7..6e34aa778 100644
--- a/numpy/core/tests/examples/setup.py
+++ b/numpy/core/tests/examples/setup.py
@@ -9,12 +9,11 @@ from Cython.Build import cythonize
from setuptools.extension import Extension
import os
-here = os.path.dirname(__file__)
macros = [("NPY_NO_DEPRECATED_API", 0)]
checks = Extension(
"checks",
- sources=[os.path.join(here, "checks.pyx")],
+ sources=[os.path.join('.', "checks.pyx")],
include_dirs=[np.get_include()],
define_macros=macros,
)
diff --git a/numpy/core/tests/test__exceptions.py b/numpy/core/tests/test__exceptions.py
index 494b51f34..51c056936 100644
--- a/numpy/core/tests/test__exceptions.py
+++ b/numpy/core/tests/test__exceptions.py
@@ -1,11 +1,21 @@
"""
Tests of the ._exceptions module. Primarily for exercising the __str__ methods.
"""
+
+import pickle
+
import numpy as np
_ArrayMemoryError = np.core._exceptions._ArrayMemoryError
+_UFuncNoLoopError = np.core._exceptions._UFuncNoLoopError
class TestArrayMemoryError:
+ def test_pickling(self):
+ """ Test that _ArrayMemoryError can be pickled """
+ error = _ArrayMemoryError((1023,), np.dtype(np.uint8))
+ res = pickle.loads(pickle.dumps(error))
+ assert res._total_size == error._total_size
+
def test_str(self):
e = _ArrayMemoryError((1023,), np.dtype(np.uint8))
str(e) # not crashing is enough
@@ -40,3 +50,9 @@ class TestArrayMemoryError:
e = _ArrayMemoryError((2, 4), np.dtype((np.uint64, 16)))
assert e._total_size == 1024
+
+
+class TestUFuncNoLoopError:
+ def test_pickling(self):
+ """ Test that _UFuncNoLoopError can be pickled """
+ assert isinstance(pickle.dumps(_UFuncNoLoopError), bytes)
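
These pickling tests rely on the standard Python exception protocol, under which an exception is reconstructed from its constructor arguments. A minimal sketch of that pattern (not NumPy's actual _ArrayMemoryError implementation; the class name here is made up):

    import pickle

    class ArrayMemoryErrorSketch(MemoryError):
        def __init__(self, shape, itemsize):
            super().__init__(shape, itemsize)  # stored in .args, drives pickling
            self._total_size = itemsize
            for dim in shape:
                self._total_size *= dim

    err = ArrayMemoryErrorSketch((1023,), 1)
    restored = pickle.loads(pickle.dumps(err))
    assert restored._total_size == err._total_size == 1023
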
diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py
index 067cadf78..0f42f7076 100644
--- a/numpy/core/tests/test_api.py
+++ b/numpy/core/tests/test_api.py
@@ -281,6 +281,19 @@ def test_array_astype():
a = np.array(1000, dtype='i4')
assert_raises(TypeError, a.astype, 'U1', casting='safe')
+
+@pytest.mark.parametrize("dt", ["d", "f", "S13", "U32"])
+def test_array_astype_to_void(dt):
+ dt = np.dtype(dt)
+ arr = np.array([], dtype=dt)
+ assert arr.astype("V").dtype.itemsize == dt.itemsize
+
+def test_object_array_astype_to_void():
+ # This is different to `test_array_astype_to_void` as object arrays
+ # are inspected. The default void is "V8" (8 bytes, the itemsize of double)
+ arr = np.array([], dtype="O").astype("V")
+ assert arr.dtype == "V8"
+
@pytest.mark.parametrize("t",
np.sctypes['uint'] + np.sctypes['int'] + np.sctypes['float']
)
@@ -317,6 +330,29 @@ def test_string_to_boolean_cast_errors(dtype, out_dtype):
with assert_raises(ValueError):
arr.astype(out_dtype)
+@pytest.mark.parametrize("str_type", [str, bytes, np.str_, np.unicode_])
+@pytest.mark.parametrize("scalar_type",
+ [np.complex64, np.complex128, np.clongdouble])
+def test_string_to_complex_cast(str_type, scalar_type):
+ value = scalar_type(b"1+3j")
+ assert scalar_type(value) == 1+3j
+ assert np.array([value], dtype=object).astype(scalar_type)[()] == 1+3j
+ assert np.array(value).astype(scalar_type)[()] == 1+3j
+ arr = np.zeros(1, dtype=scalar_type)
+ arr[0] = value
+ assert arr[0] == 1+3j
+
+@pytest.mark.parametrize("dtype", np.typecodes["AllFloat"])
+def test_none_to_nan_cast(dtype):
+ # Note that at the time of writing this test, the scalar constructors
+ # reject None
+ arr = np.zeros(1, dtype=dtype)
+ arr[0] = None
+ assert np.isnan(arr)[0]
+ assert np.isnan(np.array(None, dtype=dtype))[()]
+ assert np.isnan(np.array([None], dtype=dtype))[0]
+ assert np.isnan(np.array(None).astype(dtype))[()]
+
def test_copyto_fromscalar():
a = np.arange(6, dtype='f4').reshape(2, 3)
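
The behaviours pinned down by the new tests can be seen directly through the public API; the assertions below simply restate cases from the tests above:

    import numpy as np

    assert np.complex128(b"1+3j") == 1 + 3j        # string parsed as complex
    arr = np.zeros(1, dtype=np.float64)
    arr[0] = None                                  # None assigns as NaN
    assert np.isnan(arr[0])
    assert np.array([], dtype="O").astype("V").dtype == "V8"  # object -> void default
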
diff --git a/numpy/core/tests/test_array_coercion.py b/numpy/core/tests/test_array_coercion.py
index 30019b253..78def9360 100644
--- a/numpy/core/tests/test_array_coercion.py
+++ b/numpy/core/tests/test_array_coercion.py
@@ -11,6 +11,7 @@ from itertools import product
import numpy as np
from numpy.core._rational_tests import rational
+from numpy.core._multiarray_umath import _discover_array_parameters
from numpy.testing import (
assert_array_equal, assert_warns, IS_PYPY)
@@ -308,6 +309,13 @@ class TestScalarDiscovery:
# coercion should also raise (error type may change)
with pytest.raises(Exception):
np.array(scalar, dtype=dtype)
+
+ if (isinstance(scalar, rational) and
+ np.issubdtype(dtype, np.signedinteger)):
+ return
+
+ with pytest.raises(Exception):
+ np.array([scalar], dtype=dtype)
# assignment should also raise
res = np.zeros((), dtype=dtype)
with pytest.raises(Exception):
@@ -323,6 +331,46 @@ class TestScalarDiscovery:
ass[()] = scalar
assert_array_equal(ass, cast)
+ @pytest.mark.parametrize("dtype_char", np.typecodes["All"])
+ def test_default_dtype_instance(self, dtype_char):
+ if dtype_char in "SU":
+ dtype = np.dtype(dtype_char + "1")
+ elif dtype_char == "V":
+ # Legacy behaviour was to use V8. The reason was float64 being the
+ # default dtype and that having 8 bytes.
+ dtype = np.dtype("V8")
+ else:
+ dtype = np.dtype(dtype_char)
+
+ discovered_dtype, _ = _discover_array_parameters([], type(dtype))
+
+ assert discovered_dtype == dtype
+ assert discovered_dtype.itemsize == dtype.itemsize
+
+ @pytest.mark.parametrize("dtype", np.typecodes["Integer"])
+ def test_scalar_to_int_coerce_does_not_cast(self, dtype):
+ """
+ Signed integers are currently different in that they do not cast other
+ NumPy scalars, but instead use scalar.__int__(). The hardcoded
+ exception to this rule is `np.array(scalar, dtype=integer)`.
+ """
+ dtype = np.dtype(dtype)
+ invalid_int = np.ulonglong(-1)
+
+ float_nan = np.float64(np.nan)
+
+ for scalar in [float_nan, invalid_int]:
+ # This is a special case using casting logic and thus not failing:
+ coerced = np.array(scalar, dtype=dtype)
+ cast = np.array(scalar).astype(dtype)
+ assert_array_equal(coerced, cast)
+
+ # However these fail:
+ with pytest.raises((ValueError, OverflowError)):
+ np.array([scalar], dtype=dtype)
+ with pytest.raises((ValueError, OverflowError)):
+ cast[()] = scalar
+
class TestTimeScalars:
@pytest.mark.parametrize("dtype", [np.int64, np.float32])
@@ -332,13 +380,21 @@ class TestTimeScalars:
param(np.datetime64("NaT", "generic"), id="datetime64[generic](NaT)"),
param(np.datetime64(1, "D"), id="datetime64[D]")],)
def test_coercion_basic(self, dtype, scalar):
+ # Note the `[scalar]` is there because np.array(scalar) uses stricter
+ # `scalar.__int__()` rules for backward compatibility right now.
arr = np.array(scalar, dtype=dtype)
cast = np.array(scalar).astype(dtype)
- ass = np.ones((), dtype=dtype)
- ass[()] = scalar # raises, as would np.array([scalar], dtype=dtype)
-
assert_array_equal(arr, cast)
- assert_array_equal(cast, cast)
+
+ ass = np.ones((), dtype=dtype)
+ if issubclass(dtype, np.integer):
+ with pytest.raises(TypeError):
+ # raises, as would np.array([scalar], dtype=dtype); this is a
+ # conversion from times, but follows the behaviour of integers.
+ ass[()] = scalar
+ else:
+ ass[()] = scalar
+ assert_array_equal(ass, cast)
@pytest.mark.parametrize("dtype", [np.int64, np.float32])
@pytest.mark.parametrize("scalar",
@@ -441,7 +497,7 @@ class TestNested:
for i in range(np.MAXDIMS - 1):
nested = [nested]
- with pytest.raises(ValueError):
+ with pytest.warns(DeprecationWarning):
# It will refuse to assign the array into
np.array(nested, dtype="float64")
@@ -478,6 +534,27 @@ class TestNested:
with pytest.raises(ValueError):
np.array([[], np.empty((0, 1))], dtype=object)
+ def test_array_of_different_depths(self):
+ # When multiple arrays (or array-likes) are included in a
+ # sequence and have different depths, we currently discover
+ # as many dimensions as they share (see also gh-17224).
+ arr = np.zeros((3, 2))
+ mismatch_first_dim = np.zeros((1, 2))
+ mismatch_second_dim = np.zeros((3, 3))
+
+ dtype, shape = _discover_array_parameters(
+ [arr, mismatch_second_dim], dtype=np.dtype("O"))
+ assert shape == (2, 3)
+
+ dtype, shape = _discover_array_parameters(
+ [arr, mismatch_first_dim], dtype=np.dtype("O"))
+ assert shape == (2,)
+ # The second case is currently supported because the arrays
+ # can be stored as objects:
+ res = np.asarray([arr, mismatch_first_dim], dtype=np.dtype("O"))
+ assert res[0] is arr
+ assert res[1] is mismatch_first_dim
+
class TestBadSequences:
# These are tests for bad objects passed into `np.array`, in general
@@ -570,3 +647,45 @@ class TestArrayLikes:
with pytest.raises(ValueError):
# The error type does not matter much here.
np.array([obj])
+
+ def test_arraylike_classes(self):
+ # The classes of array-likes should generally be acceptable to be
+ # stored inside a numpy (object) array. This tests all of the
+ # special attributes (since all are checked during coercion).
+ arr = np.array(np.int64)
+ assert arr[()] is np.int64
+ arr = np.array([np.int64])
+ assert arr[0] is np.int64
+
+ # This also works for properties/unbound methods:
+ class ArrayLike:
+ @property
+ def __array_interface__(self):
+ pass
+
+ @property
+ def __array_struct__(self):
+ pass
+
+ def __array__(self):
+ pass
+
+ arr = np.array(ArrayLike)
+ assert arr[()] is ArrayLike
+ arr = np.array([ArrayLike])
+ assert arr[0] is ArrayLike
+
+ @pytest.mark.skipif(
+ np.dtype(np.intp).itemsize < 8, reason="Needs 64bit platform")
+ def test_too_large_array_error_paths(self):
+ """Test the error paths, including for memory leaks"""
+ arr = np.array(0, dtype="uint8")
+ # Guarantees that a contiguous copy won't work:
+ arr = np.broadcast_to(arr, 2**62)
+
+ for i in range(5):
+ # repeat, to ensure caching cannot have an effect:
+ with pytest.raises(MemoryError):
+ np.array(arr)
+ with pytest.raises(MemoryError):
+ np.array([arr])
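
A standalone illustration of the mixed-depth discovery described in test_array_of_different_depths: when nested arrays do not share all dimensions, only the shared ones are discovered and the remainder is stored as objects.

    import numpy as np

    arr = np.zeros((3, 2))
    mismatch_first_dim = np.zeros((1, 2))

    res = np.asarray([arr, mismatch_first_dim], dtype=object)
    assert res.shape == (2,)            # only the outer dimension is shared
    assert res[0] is arr
    assert res[1] is mismatch_first_dim
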
diff --git a/numpy/core/tests/test_casting_unittests.py b/numpy/core/tests/test_casting_unittests.py
new file mode 100644
index 000000000..fec0ae7c7
--- /dev/null
+++ b/numpy/core/tests/test_casting_unittests.py
@@ -0,0 +1,301 @@
+"""
+The tests exercise the casting machinery in a more low-level manner.
+The reason is mostly to test a new implementation of the casting machinery.
+
+Unlike most tests in NumPy, these are closer to unit-tests rather
+than integration tests.
+"""
+
+import pytest
+import textwrap
+import enum
+
+import numpy as np
+
+from numpy.core._multiarray_umath import (
+ _get_castingimpl as get_castingimpl)
+from numpy.core._multiarray_tests import uses_new_casts
+
+
+# Simple dtypes: skips object, parametric and long double (unsupported by struct)
+simple_dtypes = "?bhilqBHILQefdFD"
+if np.dtype("l").itemsize != np.dtype("q").itemsize:
+ # Remove l and L, the table was generated with 64bit linux in mind.
+ # TODO: Should have two tables or find a different solution.
+ simple_dtypes = simple_dtypes.replace("l", "").replace("L", "")
+simple_dtypes = [type(np.dtype(c)) for c in simple_dtypes]
+
+
+def simple_dtype_instances():
+ for dtype_class in simple_dtypes:
+ dt = dtype_class()
+ yield pytest.param(dt, id=str(dt))
+ if dt.byteorder != "|":
+ dt = dt.newbyteorder()
+ yield pytest.param(dt, id=str(dt))
+
+
+def get_expected_stringlength(dtype):
+ """Returns the string length when casting the basic dtypes to strings.
+ """
+ if dtype == np.bool_:
+ return 5
+ if dtype.kind in "iu":
+ if dtype.itemsize == 1:
+ length = 3
+ elif dtype.itemsize == 2:
+ length = 5
+ elif dtype.itemsize == 4:
+ length = 10
+ elif dtype.itemsize == 8:
+ length = 20
+ else:
+ raise AssertionError(f"did not find expected length for {dtype}")
+
+ if dtype.kind == "i":
+ length += 1 # adds one character for the sign
+
+ return length
+
+ # Note: Can't do dtype comparison for longdouble on windows
+ if dtype.char == "g":
+ return 48
+ elif dtype.char == "G":
+ return 48 * 2
+ elif dtype.kind == "f":
+ return 32 # also for half apparently.
+ elif dtype.kind == "c":
+ return 32 * 2
+
+ raise AssertionError(f"did not find expected length for {dtype}")
+
+
+class Casting(enum.IntEnum):
+ no = 0
+ equiv = 1
+ safe = 2
+ same_kind = 3
+ unsafe = 4
+ cast_is_view = 1 << 16
+
+
+def _get_cancast_table():
+ table = textwrap.dedent("""
+ X ? b h i l q B H I L Q e f d g F D G S U V O M m
+ ? # = = = = = = = = = = = = = = = = = = = = = . =
+ b . # = = = = . . . . . = = = = = = = = = = = . =
+ h . ~ # = = = . . . . . ~ = = = = = = = = = = . =
+ i . ~ ~ # = = . . . . . ~ ~ = = ~ = = = = = = . =
+ l . ~ ~ ~ # # . . . . . ~ ~ = = ~ = = = = = = . =
+ q . ~ ~ ~ # # . . . . . ~ ~ = = ~ = = = = = = . =
+ B . ~ = = = = # = = = = = = = = = = = = = = = . =
+ H . ~ ~ = = = ~ # = = = ~ = = = = = = = = = = . =
+ I . ~ ~ ~ = = ~ ~ # = = ~ ~ = = ~ = = = = = = . =
+ L . ~ ~ ~ ~ ~ ~ ~ ~ # # ~ ~ = = ~ = = = = = = . ~
+ Q . ~ ~ ~ ~ ~ ~ ~ ~ # # ~ ~ = = ~ = = = = = = . ~
+ e . . . . . . . . . . . # = = = = = = = = = = . .
+ f . . . . . . . . . . . ~ # = = = = = = = = = . .
+ d . . . . . . . . . . . ~ ~ # = ~ = = = = = = . .
+ g . . . . . . . . . . . ~ ~ ~ # ~ ~ = = = = = . .
+ F . . . . . . . . . . . . . . . # = = = = = = . .
+ D . . . . . . . . . . . . . . . ~ # = = = = = . .
+ G . . . . . . . . . . . . . . . ~ ~ # = = = = . .
+ S . . . . . . . . . . . . . . . . . . # = = = . .
+ U . . . . . . . . . . . . . . . . . . . # = = . .
+ V . . . . . . . . . . . . . . . . . . . . # = . .
+ O . . . . . . . . . . . . . . . . . . . . = # . .
+ M . . . . . . . . . . . . . . . . . . . . = = # .
+ m . . . . . . . . . . . . . . . . . . . . = = . #
+ """).strip().split("\n")
+ dtypes = [type(np.dtype(c)) for c in table[0][2::2]]
+
+ convert_cast = {".": Casting.unsafe, "~": Casting.same_kind,
+ "=": Casting.safe, "#": Casting.equiv,
+ " ": -1}
+
+ cancast = {}
+ for from_dt, row in zip(dtypes, table[1:]):
+ cancast[from_dt] = {}
+ for to_dt, c in zip(dtypes, row[2::2]):
+ cancast[from_dt][to_dt] = convert_cast[c]
+
+ return cancast
+
+CAST_TABLE = _get_cancast_table()
+
+
+class TestChanges:
+ """
+ These test cases exercise some behaviour changes
+ """
+ @pytest.mark.parametrize("string", ["S", "U"])
+ @pytest.mark.parametrize("floating", ["e", "f", "d", "g"])
+ def test_float_to_string(self, floating, string):
+ assert np.can_cast(floating, string)
+ # 100 is long enough to hold any formatted floating
+ if uses_new_casts():
+ assert np.can_cast(floating, f"{string}100")
+ else:
+ assert not np.can_cast(floating, f"{string}100")
+ assert np.can_cast(floating, f"{string}100", casting="same_kind")
+
+ def test_to_void(self):
+ # But in general, we do consider these safe:
+ assert np.can_cast("d", "V")
+ assert np.can_cast("S20", "V")
+
+ # Do not consider it a safe cast if the void is too small:
+ if uses_new_casts():
+ assert not np.can_cast("d", "V1")
+ assert not np.can_cast("S20", "V1")
+ assert not np.can_cast("U1", "V1")
+ # Structured to unstructured is just like any other:
+ assert np.can_cast("d,i", "V", casting="same_kind")
+ else:
+ assert np.can_cast("d", "V1")
+ assert np.can_cast("S20", "V1")
+ assert np.can_cast("U1", "V1")
+ assert not np.can_cast("d,i", "V", casting="same_kind")
+
+
+class TestCasting:
+ @pytest.mark.parametrize("from_Dt", simple_dtypes)
+ def test_simple_cancast(self, from_Dt):
+ for to_Dt in simple_dtypes:
+ cast = get_castingimpl(from_Dt, to_Dt)
+
+ for from_dt in [from_Dt(), from_Dt().newbyteorder()]:
+ default = cast._resolve_descriptors((from_dt, None))[1][1]
+ assert default == to_Dt()
+ del default
+
+ for to_dt in [to_Dt(), to_Dt().newbyteorder()]:
+ casting, (from_res, to_res) = cast._resolve_descriptors(
+ (from_dt, to_dt))
+ assert(type(from_res) == from_Dt)
+ assert(type(to_res) == to_Dt)
+ if casting & Casting.cast_is_view:
+ # If a view is acceptable, this is "no" casting
+ # and byte order must be matching.
+ assert casting == Casting.no | Casting.cast_is_view
+ # The above table lists this as "equivalent"
+ assert Casting.equiv == CAST_TABLE[from_Dt][to_Dt]
+ # Note that to_res may not be the same as from_dt
+ assert from_res.isnative == to_res.isnative
+ else:
+ if from_Dt == to_Dt:
+ # Note that to_res may not be the same as from_dt
+ assert from_res.isnative != to_res.isnative
+ assert casting == CAST_TABLE[from_Dt][to_Dt]
+
+ if from_Dt is to_Dt:
+ assert(from_dt is from_res)
+ assert(to_dt is to_res)
+
+
+ def string_with_modified_length(self, dtype, change_length):
+ fact = 1 if dtype.char == "S" else 4
+ length = dtype.itemsize // fact + change_length
+ return np.dtype(f"{dtype.byteorder}{dtype.char}{length}")
+
+ @pytest.mark.parametrize("other_DT", simple_dtypes)
+ @pytest.mark.parametrize("string_char", ["S", "U"])
+ def test_string_cancast(self, other_DT, string_char):
+ fact = 1 if string_char == "S" else 4
+
+ string_DT = type(np.dtype(string_char))
+ cast = get_castingimpl(other_DT, string_DT)
+
+ other_dt = other_DT()
+ expected_length = get_expected_stringlength(other_dt)
+ string_dt = np.dtype(f"{string_char}{expected_length}")
+
+ safety, (res_other_dt, res_dt) = cast._resolve_descriptors((other_dt, None))
+ assert res_dt.itemsize == expected_length * fact
+ assert safety == Casting.safe # we consider to string casts "safe"
+ assert isinstance(res_dt, string_DT)
+
+ # These casts currently implement changing the string length, so
+ # check the cast-safety for too long/fixed string lengths:
+ for change_length in [-1, 0, 1]:
+ if change_length >= 0:
+ expected_safety = Casting.safe
+ else:
+ expected_safety = Casting.same_kind
+
+ to_dt = self.string_with_modified_length(string_dt, change_length)
+ safety, (_, res_dt) = cast._resolve_descriptors((other_dt, to_dt))
+ assert res_dt is to_dt
+ assert safety == expected_safety
+
+ # The opposite direction is always considered unsafe:
+ cast = get_castingimpl(string_DT, other_DT)
+
+ safety, _ = cast._resolve_descriptors((string_dt, other_dt))
+ assert safety == Casting.unsafe
+
+ cast = get_castingimpl(string_DT, other_DT)
+ safety, (_, res_dt) = cast._resolve_descriptors((string_dt, None))
+ assert safety == Casting.unsafe
+ assert other_dt is res_dt # returns the singleton for simple dtypes
+
+ @pytest.mark.parametrize("other_dt", ["S8", "<U8", ">U8"])
+ @pytest.mark.parametrize("string_char", ["S", "U"])
+ def test_string_to_string_cancast(self, other_dt, string_char):
+ other_dt = np.dtype(other_dt)
+
+ fact = 1 if string_char == "S" else 4
+ div = 1 if other_dt.char == "S" else 4
+
+ string_DT = type(np.dtype(string_char))
+ cast = get_castingimpl(type(other_dt), string_DT)
+
+ expected_length = other_dt.itemsize // div
+ string_dt = np.dtype(f"{string_char}{expected_length}")
+
+ safety, (res_other_dt, res_dt) = cast._resolve_descriptors((other_dt, None))
+ assert res_dt.itemsize == expected_length * fact
+ assert isinstance(res_dt, string_DT)
+
+ if other_dt.char == string_char:
+ if other_dt.isnative:
+ expected_safety = Casting.no | Casting.cast_is_view
+ else:
+ expected_safety = Casting.equiv
+ elif string_char == "U":
+ expected_safety = Casting.safe
+ else:
+ expected_safety = Casting.unsafe
+
+ assert expected_safety == safety
+
+ for change_length in [-1, 0, 1]:
+ to_dt = self.string_with_modified_length(string_dt, change_length)
+ safety, (_, res_dt) = cast._resolve_descriptors((other_dt, to_dt))
+
+ assert res_dt is to_dt
+ if expected_safety == Casting.unsafe:
+ assert safety == expected_safety
+ elif change_length < 0:
+ assert safety == Casting.same_kind
+ elif change_length == 0:
+ assert safety == expected_safety
+ elif change_length > 0:
+ assert safety == Casting.safe
+
+ def test_void_to_string_special_case(self):
+ # Cover a small special case in void to string casting that could
+ # probably just as well be turned into an error (compare
+ # `test_object_to_parametric_internal_error` below).
+ assert np.array([], dtype="V5").astype("S").dtype.itemsize == 5
+ assert np.array([], dtype="V5").astype("U").dtype.itemsize == 4 * 5
+
+ def test_object_to_parametric_internal_error(self):
+ # We reject casting from object to a parametric type, without
+ # figuring out the correct instance first.
+ object_dtype = type(np.dtype(object))
+ other_dtype = type(np.dtype(str))
+ cast = get_castingimpl(object_dtype, other_dtype)
+ with pytest.raises(TypeError,
+ match="casting from object to the parametric DType"):
+ cast._resolve_descriptors((np.dtype("O"), None))
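
The uses_new_casts() switch above means that the fixed-width result below depends on which casting implementation is active; the flexible-width cases hold either way (restated from TestChanges):

    import numpy as np

    assert np.can_cast("f8", "U")                          # flexible width: safe
    assert np.can_cast("f8", "U100", casting="same_kind")  # always at least same_kind
    np.can_cast("f8", "U100")   # "safe" only with the new casting implementation
    assert np.can_cast("d", "V")                           # unstructured void target
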
diff --git a/numpy/core/tests/test_cython.py b/numpy/core/tests/test_cython.py
index 63524b269..a1f09d0fe 100644
--- a/numpy/core/tests/test_cython.py
+++ b/numpy/core/tests/test_cython.py
@@ -34,21 +34,19 @@ def install_temp(request, tmp_path):
here = os.path.dirname(__file__)
ext_dir = os.path.join(here, "examples")
- tmp_path = tmp_path._str
- cytest = os.path.join(tmp_path, "cytest")
+ cytest = str(tmp_path / "cytest")
shutil.copytree(ext_dir, cytest)
# build the examples and "install" them into a temporary directory
- install_log = os.path.join(tmp_path, "tmp_install_log.txt")
+ install_log = str(tmp_path / "tmp_install_log.txt")
subprocess.check_call(
[
sys.executable,
"setup.py",
"build",
"install",
- "--prefix",
- os.path.join(tmp_path, "installdir"),
+ "--prefix", str(tmp_path / "installdir"),
"--single-version-externally-managed",
"--record",
install_log,
@@ -126,3 +124,11 @@ def test_get_datetime64_unit(install_temp):
result = checks.get_dt64_unit(td64)
expected = 5
assert result == expected
+
+
+def test_abstract_scalars(install_temp):
+ import checks
+
+ assert checks.is_integer(1)
+ assert checks.is_integer(np.int8(1))
+ assert checks.is_integer(np.uint64(1))
diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py
index 59a3954fd..62f6381d5 100644
--- a/numpy/core/tests/test_datetime.py
+++ b/numpy/core/tests/test_datetime.py
@@ -26,6 +26,7 @@ class TestDateTime:
def test_datetime_dtype_creation(self):
for unit in ['Y', 'M', 'W', 'D',
'h', 'm', 's', 'ms', 'us',
+ 'μs', # alias for us
'ns', 'ps', 'fs', 'as']:
dt1 = np.dtype('M8[750%s]' % unit)
assert_(dt1 == np.dtype('datetime64[750%s]' % unit))
@@ -429,6 +430,10 @@ class TestDateTime:
np.timedelta64)
assert_equal(actual, expected)
+ def test_timedelta_nat_format(self):
+ # gh-17552
+ assert_equal('NaT', '{0}'.format(np.timedelta64('nat')))
+
def test_timedelta_scalar_construction_units(self):
# String construction detecting units
assert_equal(np.datetime64('2010').dtype,
@@ -1653,8 +1658,9 @@ class TestDateTime:
'1959-10-13T12:34:56')
assert_equal(np.datetime_as_string(np.datetime64(datetime, 'ms')),
'1959-10-13T12:34:56.789')
- assert_equal(np.datetime_as_string(np.datetime64(datetime, 'us')),
- '1959-10-13T12:34:56.789012')
+ for us in ['us', 'μs', b'us']: # check non-ascii and bytes too
+ assert_equal(np.datetime_as_string(np.datetime64(datetime, us)),
+ '1959-10-13T12:34:56.789012')
datetime = '1969-12-31T23:34:56.789012345678901234'
@@ -2389,3 +2395,19 @@ class TestDateTimeData:
def test_basic(self):
a = np.array(['1980-03-23'], dtype=np.datetime64)
assert_equal(np.datetime_data(a.dtype), ('D', 1))
+
+ def test_bytes(self):
+ # byte units are converted to unicode
+ dt = np.datetime64('2000', (b'ms', 5))
+ assert np.datetime_data(dt.dtype) == ('ms', 5)
+
+ dt = np.datetime64('2000', b'5ms')
+ assert np.datetime_data(dt.dtype) == ('ms', 5)
+
+ def test_non_ascii(self):
+ # μs is normalized to us
+ dt = np.datetime64('2000', ('μs', 5))
+ assert np.datetime_data(dt.dtype) == ('us', 5)
+
+ dt = np.datetime64('2000', '5μs')
+ assert np.datetime_data(dt.dtype) == ('us', 5)
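
The unit handling exercised above, seen through the public constructors: both the non-ASCII alias 'μs' and byte-string units are accepted and normalized (assertions restate the tests):

    import numpy as np

    assert np.datetime_data(np.datetime64("2000", "5μs").dtype) == ("us", 5)
    assert np.datetime_data(np.datetime64("2000", (b"ms", 5)).dtype) == ("ms", 5)
    assert "{0}".format(np.timedelta64("nat")) == "NaT"     # gh-17552
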
diff --git a/numpy/core/tests/test_defchararray.py b/numpy/core/tests/test_defchararray.py
index bbb94f7d3..59fc54722 100644
--- a/numpy/core/tests/test_defchararray.py
+++ b/numpy/core/tests/test_defchararray.py
@@ -179,6 +179,12 @@ class TestComparisons:
def test_less(self):
assert_array_equal((self.A < self.B), [[True, False], [False, False]])
+ def test_type(self):
+ out1 = np.char.equal(self.A, self.B)
+ out2 = np.char.equal('a', 'a')
+ assert_(isinstance(out1, np.ndarray))
+ assert_(isinstance(out2, np.ndarray))
+
class TestComparisonsMixed1(TestComparisons):
"""Ticket #1276"""
diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py
index 68502adda..a67fe62c3 100644
--- a/numpy/core/tests/test_deprecations.py
+++ b/numpy/core/tests/test_deprecations.py
@@ -81,6 +81,8 @@ class _DeprecationTestCase:
kwargs : dict
Keyword arguments for `function`
"""
+ __tracebackhide__ = True # Hide traceback for py.test
+
# reset the log
self.log[:] = []
@@ -615,7 +617,7 @@ class BuiltInRoundComplexDType(_DeprecationTestCase):
self.assert_deprecated(round, args=(scalar,))
self.assert_deprecated(round, args=(scalar, 0))
self.assert_deprecated(round, args=(scalar,), kwargs={'ndigits': 0})
-
+
def test_not_deprecated(self):
for scalar_type in self.not_deprecated_types:
scalar = scalar_type(0)
@@ -678,3 +680,108 @@ class TestDeprecatedGlobals(_DeprecationTestCase):
# from np.compat
self.assert_deprecated(lambda: np.long)
self.assert_deprecated(lambda: np.unicode)
+
+
+class TestMatrixInOuter(_DeprecationTestCase):
+ # 2020-05-13 NumPy 1.20.0
+ message = (r"add.outer\(\) was passed a numpy matrix as "
+ r"(first|second) argument.")
+
+ def test_deprecated(self):
+ arr = np.array([1, 2, 3])
+ m = np.array([1, 2, 3]).view(np.matrix)
+ self.assert_deprecated(np.add.outer, args=(m, m), num=2)
+ self.assert_deprecated(np.add.outer, args=(arr, m))
+ self.assert_deprecated(np.add.outer, args=(m, arr))
+ self.assert_not_deprecated(np.add.outer, args=(arr, arr))
+
+
+class TestRaggedArray(_DeprecationTestCase):
+ # 2020-07-24, NumPy 1.20.0
+ message = "setting an array element with a sequence"
+
+ def test_deprecated(self):
+ arr = np.ones((1, 1))
+ # Deprecated if the array is a leaf node:
+ self.assert_deprecated(lambda: np.array([arr, 0], dtype=np.float64))
+ self.assert_deprecated(lambda: np.array([0, arr], dtype=np.float64))
+ # And when it is an assignment into a lower dimensional subarray:
+ self.assert_deprecated(lambda: np.array([arr, [0]], dtype=np.float64))
+ self.assert_deprecated(lambda: np.array([[0], arr], dtype=np.float64))
+
+
+class FlatteningConcatenateUnsafeCast(_DeprecationTestCase):
+ # NumPy 1.20, 2020-09-03
+ message = "concatenate with `axis=None` will use same-kind casting"
+
+ def test_deprecated(self):
+ self.assert_deprecated(np.concatenate,
+ args=(([0.], [1.]),),
+ kwargs=dict(axis=None, out=np.empty(2, dtype=np.int64)))
+
+ def test_not_deprecated(self):
+ self.assert_not_deprecated(np.concatenate,
+ args=(([0.], [1.]),),
+ kwargs={'axis': None, 'out': np.empty(2, dtype=np.int64),
+ 'casting': "unsafe"})
+
+ with assert_raises(TypeError):
+ # Tests should notice if the deprecation warning is given first...
+ np.concatenate(([0.], [1.]), out=np.empty(2, dtype=np.int64),
+ casting="same_kind")
+
+
+class TestDeprecateSubarrayDTypeDuringArrayCoercion(_DeprecationTestCase):
+ warning_cls = FutureWarning
+ message = "(creating|casting) an array (with|to) a subarray dtype"
+
+ def test_deprecated_array(self):
+ # Arrays are more complex, since they "broadcast" on success:
+ arr = np.array([1, 2])
+
+ self.assert_deprecated(lambda: arr.astype("(2)i,"))
+ with pytest.warns(FutureWarning):
+ res = arr.astype("(2)i,")
+
+ assert_array_equal(res, [[1, 2], [1, 2]])
+
+ self.assert_deprecated(lambda: np.array(arr, dtype="(2)i,"))
+ with pytest.warns(FutureWarning):
+ res = np.array(arr, dtype="(2)i,")
+
+ assert_array_equal(res, [[1, 2], [1, 2]])
+
+ with pytest.warns(FutureWarning):
+ res = np.array([[(1,), (2,)], arr], dtype="(2)i,")
+
+ assert_array_equal(res, [[[1, 1], [2, 2]], [[1, 2], [1, 2]]])
+
+ def test_deprecated_and_error(self):
+ # These error paths do not give a warning, but will succeed in the
+ # future.
+ arr = np.arange(5 * 2).reshape(5, 2)
+ def check():
+ with pytest.raises(ValueError):
+ arr.astype("(2,2)f")
+
+ self.assert_deprecated(check)
+
+ def check():
+ with pytest.raises(ValueError):
+ np.array(arr, dtype="(2,2)f")
+
+ self.assert_deprecated(check)
+
+
+class TestDeprecatedUnpickleObjectScalar(_DeprecationTestCase):
+ # Deprecated 2020-11-24, NumPy 1.20
+ """
+ Technically, it should be impossible to create numpy object scalars,
+ but there was an unpickle path that would in theory allow it. That
+ path is invalid and must lead to the warning.
+ """
+ message = "Unpickling a scalar with object dtype is deprecated."
+
+ def test_deprecated(self):
+ ctor = np.core.multiarray.scalar
+ self.assert_deprecated(lambda: ctor(np.dtype("O"), 1))
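
For user code, the ragged-array deprecation above means that mixing elements of different shapes needs an explicit object dtype; a small sketch of the non-deprecated spelling:

    import numpy as np

    ragged = np.array([np.ones((1, 1)), 0], dtype=object)   # explicit object dtype
    assert ragged.shape == (2,)
    assert ragged[1] == 0
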
diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py
index 2e2b0dbe2..0ebcc72da 100644
--- a/numpy/core/tests/test_dtype.py
+++ b/numpy/core/tests/test_dtype.py
@@ -6,6 +6,7 @@ import gc
import numpy as np
from numpy.core._rational_tests import rational
+from numpy.core._multiarray_tests import create_custom_field_dtype
from numpy.testing import (
assert_, assert_equal, assert_array_equal, assert_raises, HAS_REFCOUNT)
from numpy.compat import pickle
@@ -152,6 +153,9 @@ class TestBuiltin:
'formats': ['f4', 'i4'],
'offsets': [4, 0]})
assert_equal(x == y, False)
+ # But it is currently an equivalent cast:
+ assert np.can_cast(x, y, casting="equiv")
+
class TestRecord:
def test_equivalent_record(self):
@@ -313,6 +317,24 @@ class TestRecord:
'formats':['i1', 'O'],
'offsets':[np.dtype('intp').itemsize, 0]})
+ @pytest.mark.parametrize(["obj", "dtype", "expected"],
+ [([], ("(2)f4,"), np.empty((0, 2), dtype="f4")),
+ (3, "(3)f4,", [3, 3, 3]),
+ (np.float64(2), "(2)f4,", [2, 2]),
+ ([((0, 1), (1, 2)), ((2,),)], '(2,2)f4', None),
+ (["1", "2"], "(2)i,", None)])
+ def test_subarray_list(self, obj, dtype, expected):
+ dtype = np.dtype(dtype)
+ res = np.array(obj, dtype=dtype)
+
+ if expected is None:
+ # iterate the 1-d list to fill the array
+ expected = np.empty(len(obj), dtype=dtype)
+ for i in range(len(expected)):
+ expected[i] = obj[i]
+
+ assert_array_equal(res, expected)
+
def test_comma_datetime(self):
dt = np.dtype('M8[D],datetime64[Y],i8')
assert_equal(dt, np.dtype([('f0', 'M8[D]'),
@@ -766,6 +788,26 @@ class TestMonsterType:
('yi', np.dtype((a, (3, 2))))])
assert_dtype_equal(c, d)
+ def test_list_recursion(self):
+ l = list()
+ l.append(('f', l))
+ with pytest.raises(RecursionError):
+ np.dtype(l)
+
+ def test_tuple_recursion(self):
+ d = np.int32
+ for i in range(100000):
+ d = (d, (1,))
+ with pytest.raises(RecursionError):
+ np.dtype(d)
+
+ def test_dict_recursion(self):
+ d = dict(names=['self'], formats=[None], offsets=[0])
+ d['formats'][0] = d
+ with pytest.raises(RecursionError):
+ np.dtype(d)
+
+
class TestMetadata:
def test_no_metadata(self):
d = np.dtype(int)
@@ -1338,3 +1380,35 @@ class TestFromCTypes:
pair_type = np.dtype('{},{}'.format(*pair))
expected = np.dtype([('f0', pair[0]), ('f1', pair[1])])
assert_equal(pair_type, expected)
+
+
+class TestUserDType:
+ @pytest.mark.leaks_references(reason="dynamically creates custom dtype.")
+ def test_custom_structured_dtype(self):
+ class mytype:
+ pass
+
+ blueprint = np.dtype([("field", object)])
+ dt = create_custom_field_dtype(blueprint, mytype, 0)
+ assert dt.type == mytype
+ # We cannot (currently) *create* this dtype with `np.dtype` because
+ # mytype does not inherit from `np.generic`. This seems like an
+ # unnecessary restriction, but one that has been around forever:
+ assert np.dtype(mytype) == np.dtype("O")
+
+ def test_custom_structured_dtype_errors(self):
+ class mytype:
+ pass
+
+ blueprint = np.dtype([("field", object)])
+
+ with pytest.raises(ValueError):
+ # Tests what happens if fields are unset during creation
+ # which is currently rejected due to the containing object
+ # (see PyArray_RegisterDataType).
+ create_custom_field_dtype(blueprint, mytype, 1)
+
+ with pytest.raises(RuntimeError):
+ # Tests that a dtype must have its type field set up to np.dtype
+ # or in this case a builtin instance.
+ create_custom_field_dtype(blueprint, mytype, 2)
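
The recursion guards added above, in their simplest form: a dtype specification that refers to itself is now rejected with a RecursionError.

    import numpy as np

    spec = []
    spec.append(("f", spec))      # self-referential field specification
    try:
        np.dtype(spec)
    except RecursionError:
        pass
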
diff --git a/numpy/core/tests/test_function_base.py b/numpy/core/tests/test_function_base.py
index 62a9772c8..dad7a5883 100644
--- a/numpy/core/tests/test_function_base.py
+++ b/numpy/core/tests/test_function_base.py
@@ -402,3 +402,8 @@ class TestLinspace:
stop = array(2, dtype='O')
y = linspace(start, stop, 3)
assert_array_equal(y, array([1., 1.5, 2.]))
+
+ def test_round_negative(self):
+ y = linspace(-1, 3, num=8, dtype=int)
+ t = array([-1, -1, 0, 0, 1, 1, 2, 3], dtype=int)
+ assert_array_equal(y, t)
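
What test_round_negative pins down, stated directly: with an integer dtype, linspace truncates its floating-point samples toward zero rather than rounding them.

    import numpy as np

    y = np.linspace(-1, 3, num=8, dtype=int)
    assert list(y) == [-1, -1, 0, 0, 1, 1, 2, 3]
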
diff --git a/numpy/core/tests/test_indexing.py b/numpy/core/tests/test_indexing.py
index 1069cbe8d..667c49240 100644
--- a/numpy/core/tests/test_indexing.py
+++ b/numpy/core/tests/test_indexing.py
@@ -7,8 +7,8 @@ import numpy as np
from numpy.core._multiarray_tests import array_indexing
from itertools import product
from numpy.testing import (
- assert_, assert_equal, assert_raises, assert_array_equal, assert_warns,
- HAS_REFCOUNT,
+ assert_, assert_equal, assert_raises, assert_raises_regex,
+ assert_array_equal, assert_warns, HAS_REFCOUNT,
)
@@ -1239,6 +1239,41 @@ class TestBooleanIndexing:
assert_raises(IndexError, lambda: a[False, [0, 1], ...])
+ def test_boolean_indexing_fast_path(self):
+ # These used to either give the wrong error, or incorrectly give no
+ # error.
+ a = np.ones((3, 3))
+
+ # This used to incorrectly work (and give an array of shape (0,))
+ idx1 = np.array([[False]*9])
+ assert_raises_regex(IndexError,
+ "boolean index did not match indexed array along dimension 0; "
+ "dimension is 3 but corresponding boolean dimension is 1",
+ lambda: a[idx1])
+
+ # This used to incorrectly give a ValueError: operands could not be broadcast together
+ idx2 = np.array([[False]*8 + [True]])
+ assert_raises_regex(IndexError,
+ "boolean index did not match indexed array along dimension 0; "
+ "dimension is 3 but corresponding boolean dimension is 1",
+ lambda: a[idx2])
+
+ # This is the same as it used to be. The above two should work like this.
+ idx3 = np.array([[False]*10])
+ assert_raises_regex(IndexError,
+ "boolean index did not match indexed array along dimension 0; "
+ "dimension is 3 but corresponding boolean dimension is 1",
+ lambda: a[idx3])
+
+ # This used to give ValueError: non-broadcastable operand
+ a = np.ones((1, 1, 2))
+ idx = np.array([[[True], [False]]])
+ assert_raises_regex(IndexError,
+ "boolean index did not match indexed array along dimension 1; "
+ "dimension is 1 but corresponding boolean dimension is 2",
+ lambda: a[idx])
+
+
class TestArrayToIndexDeprecation:
"""Creating an an index from array not 0-D is an error.
diff --git a/numpy/core/tests/test_memmap.py b/numpy/core/tests/test_memmap.py
index feef80ce8..a1e0c8f8f 100644
--- a/numpy/core/tests/test_memmap.py
+++ b/numpy/core/tests/test_memmap.py
@@ -11,7 +11,8 @@ from numpy import (
from numpy import arange, allclose, asarray
from numpy.testing import (
- assert_, assert_equal, assert_array_equal, suppress_warnings
+ assert_, assert_equal, assert_array_equal, suppress_warnings, IS_PYPY,
+ break_cycles
)
class TestMemmap:
@@ -25,6 +26,10 @@ class TestMemmap:
def teardown(self):
self.tmpfp.close()
+ self.data = None
+ if IS_PYPY:
+ break_cycles()
+ break_cycles()
shutil.rmtree(self.tempdir)
def test_roundtrip(self):
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index b7d4a6a92..048b1688f 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -21,8 +21,8 @@ import builtins
from decimal import Decimal
import numpy as np
-from numpy.compat import strchar
import numpy.core._multiarray_tests as _multiarray_tests
+from numpy.core._rational_tests import rational
from numpy.testing import (
assert_, assert_raises, assert_warns, assert_equal, assert_almost_equal,
assert_array_equal, assert_raises_regex, assert_array_almost_equal,
@@ -207,6 +207,23 @@ class TestFlags:
a[2] = 10
# only warn once
assert_(len(w) == 1)
+
+ @pytest.mark.parametrize(["flag", "flag_value", "writeable"],
+ [("writeable", True, True),
+ # Delete _warn_on_write after deprecation and simplify
+ # the parameterization:
+ ("_warn_on_write", True, False),
+ ("writeable", False, False)])
+ def test_readonly_flag_protocols(self, flag, flag_value, writeable):
+ a = np.arange(10)
+ setattr(a.flags, flag, flag_value)
+
+ class MyArr():
+ __array_struct__ = a.__array_struct__
+
+ assert memoryview(a).readonly is not writeable
+ assert a.__array_interface__['data'][1] is not writeable
+ assert np.asarray(MyArr()).flags.writeable is writeable
def test_otherflags(self):
assert_equal(self.a.flags.carray, True)
@@ -828,7 +845,37 @@ class TestCreation:
def test_void(self):
arr = np.array([], dtype='V')
- assert_equal(arr.dtype.kind, 'V')
+ assert arr.dtype == 'V8' # current default
+ # Same length scalars (those that go to the same void) work:
+ arr = np.array([b"1234", b"1234"], dtype="V")
+ assert arr.dtype == "V4"
+
+ # Promoting different lengths will fail (pre 1.20 this worked)
+ # by going via S5 and casting to V5.
+ with pytest.raises(TypeError):
+ np.array([b"1234", b"12345"], dtype="V")
+ with pytest.raises(TypeError):
+ np.array([b"12345", b"1234"], dtype="V")
+
+ # Check the same for the casting path:
+ arr = np.array([b"1234", b"1234"], dtype="O").astype("V")
+ assert arr.dtype == "V4"
+ with pytest.raises(TypeError):
+ np.array([b"1234", b"12345"], dtype="O").astype("V")
+
+ @pytest.mark.parametrize("idx",
+ [pytest.param(Ellipsis, id="arr"), pytest.param((), id="scalar")])
+ def test_structured_void_promotion(self, idx):
+ arr = np.array(
+ [np.array(1, dtype="i,i")[idx], np.array(2, dtype='i,i')[idx]],
+ dtype="V")
+ assert_array_equal(arr, np.array([(1, 1), (2, 2)], dtype="i,i"))
+ # The following fails to promote the two dtypes, resulting in an error
+ with pytest.raises(TypeError):
+ np.array(
+ [np.array(1, dtype="i,i")[idx], np.array(2, dtype='i,i,i')[idx]],
+ dtype="V")
+
def test_too_big_error(self):
# 45341 is the smallest integer greater than sqrt(2**31 - 1).
@@ -1587,6 +1634,47 @@ class TestMethods:
sort_kinds = ['quicksort', 'heapsort', 'stable']
+ def test_all_where(self):
+ a = np.array([[True, False, True],
+ [False, False, False],
+ [True, True, True]])
+ wh_full = np.array([[True, False, True],
+ [False, False, False],
+ [True, False, True]])
+ wh_lower = np.array([[False],
+ [False],
+ [True]])
+ for _ax in [0, None]:
+ assert_equal(a.all(axis=_ax, where=wh_lower),
+ np.all(a[wh_lower[:,0],:], axis=_ax))
+ assert_equal(np.all(a, axis=_ax, where=wh_lower),
+ a[wh_lower[:,0],:].all(axis=_ax))
+
+ assert_equal(a.all(where=wh_full), True)
+ assert_equal(np.all(a, where=wh_full), True)
+ assert_equal(a.all(where=False), True)
+ assert_equal(np.all(a, where=False), True)
+
+ def test_any_where(self):
+ a = np.array([[True, False, True],
+ [False, False, False],
+ [True, True, True]])
+ wh_full = np.array([[False, True, False],
+ [True, True, True],
+ [False, False, False]])
+ wh_middle = np.array([[False],
+ [True],
+ [False]])
+ for _ax in [0, None]:
+ assert_equal(a.any(axis=_ax, where=wh_middle),
+ np.any(a[wh_middle[:,0],:], axis=_ax))
+ assert_equal(np.any(a, axis=_ax, where=wh_middle),
+ a[wh_middle[:,0],:].any(axis=_ax))
+ assert_equal(a.any(where=wh_full), False)
+ assert_equal(np.any(a, where=wh_full), False)
+ assert_equal(a.any(where=False), False)
+ assert_equal(np.any(a, where=False), False)
+
def test_compress(self):
tgt = [[5, 6, 7, 8, 9]]
arr = np.arange(10).reshape(2, 5)
@@ -2014,7 +2102,7 @@ class TestMethods:
strtype = '>i2'
else:
strtype = '<i2'
- mydtype = [('name', strchar + '5'), ('col2', strtype)]
+ mydtype = [('name', 'U5'), ('col2', strtype)]
r = np.array([('a', 1), ('b', 255), ('c', 3), ('d', 258)],
dtype=mydtype)
r.sort(order='col2')
@@ -3851,13 +3939,6 @@ class TestPickling:
with pytest.raises(ImportError):
array.__reduce_ex__(5)
- elif sys.version_info[:2] < (3, 6):
- # when calling __reduce_ex__ explicitly with protocol=5 on python
- # raise a ValueError saying that protocol 5 is not available for
- # this python version
- with pytest.raises(ValueError):
- array.__reduce_ex__(5)
-
def test_record_array_with_object_dtype(self):
my_object = object()
@@ -4562,6 +4643,13 @@ class TestPutmask:
np.putmask(x[1:4], x[:3], [True, False, True])
assert_equal(x, np.array([True, True, True, True]))
+ def test_writeable(self):
+ a = np.arange(5)
+ a.flags.writeable = False
+
+ with pytest.raises(ValueError):
+ np.putmask(a, a >= 2, 3)
+
class TestTake:
def tst_basic(self, x):
@@ -5039,6 +5127,17 @@ class TestIO:
s = f.read()
assert_equal(s, '1.51,2.00,3.51,4.00')
+ def test_tofile_cleanup(self):
+ x = np.zeros((10), dtype=object)
+ with open(self.filename, 'wb') as f:
+ assert_raises(IOError, lambda: x.tofile(f, sep=''))
+ # Dup-ed file handle should be closed or remove will fail on Windows OS
+ os.remove(self.filename)
+
+ # Also make sure that we close the Python handle
+ assert_raises(IOError, lambda: x.tofile(self.filename))
+ os.remove(self.filename)
+
def test_locale(self):
with CommaDecimalPointLocale():
self.test_numbers()
@@ -5063,6 +5162,33 @@ class TestIO:
res = np.fromstring(x_str, dtype="(3,4)i4")
assert_array_equal(x, res)
+ def test_parsing_subarray_unsupported(self):
+ # We currently do not support parsing subarray dtypes
+ data = "12,42,13," * 50
+ with pytest.raises(ValueError):
+ expected = np.fromstring(data, dtype="(3,)i", sep=",")
+
+ with open(self.filename, "w") as f:
+ f.write(data)
+
+ with pytest.raises(ValueError):
+ np.fromfile(self.filename, dtype="(3,)i", sep=",")
+
+ def test_read_shorter_than_count_subarray(self):
+ # Test that requesting more values does not cause any problems
+ # in conjunction with subarray dimensions being absorbed into the
+ # array dimension.
+ expected = np.arange(511 * 10, dtype="i").reshape(-1, 10)
+
+ binary = expected.tobytes()
+ with pytest.raises(ValueError):
+ with pytest.warns(DeprecationWarning):
+ np.fromstring(binary, dtype="(10,)i", count=10000)
+
+ expected.tofile(self.filename)
+ res = np.fromfile(self.filename, dtype="(10,)i", count=10000)
+ assert_array_equal(res, expected)
+
class TestFromBuffer:
@pytest.mark.parametrize('byteorder', ['<', '>'])
@@ -5575,6 +5701,33 @@ class TestStats:
with assert_raises(np.core._exceptions.AxisError):
np.arange(10).mean(axis=2)
+ def test_mean_where(self):
+ a = np.arange(16).reshape((4, 4))
+ wh_full = np.array([[False, True, False, True],
+ [True, False, True, False],
+ [True, True, False, False],
+ [False, False, True, True]])
+ wh_partial = np.array([[False],
+ [True],
+ [True],
+ [False]])
+ _cases = [(1, True, [1.5, 5.5, 9.5, 13.5]),
+ (0, wh_full, [6., 5., 10., 9.]),
+ (1, wh_full, [2., 5., 8.5, 14.5]),
+ (0, wh_partial, [6., 7., 8., 9.])]
+ for _ax, _wh, _res in _cases:
+ assert_allclose(a.mean(axis=_ax, where=_wh),
+ np.array(_res))
+ assert_allclose(np.mean(a, axis=_ax, where=_wh),
+ np.array(_res))
+ with pytest.warns(RuntimeWarning) as w:
+ assert_allclose(a.mean(axis=1, where=wh_partial),
+ np.array([np.nan, 5.5, 9.5, np.nan]))
+ with pytest.warns(RuntimeWarning) as w:
+ assert_equal(a.mean(where=False), np.nan)
+ with pytest.warns(RuntimeWarning) as w:
+ assert_equal(np.mean(a, where=False), np.nan)
+
def test_var_values(self):
for mat in [self.rmat, self.cmat, self.omat]:
for axis in [0, 1, None]:
@@ -5623,6 +5776,34 @@ class TestStats:
with assert_raises(np.core._exceptions.AxisError):
np.arange(10).var(axis=2)
+ def test_var_where(self):
+ a = np.arange(25).reshape((5, 5))
+ wh_full = np.array([[False, True, False, True, True],
+ [True, False, True, True, False],
+ [True, True, False, False, True],
+ [False, True, True, False, True],
+ [True, False, True, True, False]])
+ wh_partial = np.array([[False],
+ [True],
+ [True],
+ [False],
+ [True]])
+ _cases = [(0, True, [50., 50., 50., 50., 50.]),
+ (1, True, [2., 2., 2., 2., 2.])]
+ for _ax, _wh, _res in _cases:
+ assert_allclose(a.var(axis=_ax, where=_wh),
+ np.array(_res))
+ assert_allclose(np.var(a, axis=_ax, where=_wh),
+ np.array(_res))
+ assert_allclose(np.var(a, axis=1, where=wh_full),
+ np.var(a[wh_full].reshape((5, 3)), axis=1))
+ assert_allclose(np.var(a, axis=0, where=wh_partial),
+ np.var(a[wh_partial[:,0]], axis=0))
+ with pytest.warns(RuntimeWarning) as w:
+ assert_equal(a.var(where=False), np.nan)
+ with pytest.warns(RuntimeWarning) as w:
+ assert_equal(np.var(a, where=False), np.nan)
+
def test_std_values(self):
for mat in [self.rmat, self.cmat, self.omat]:
for axis in [0, 1, None]:
@@ -5630,6 +5811,42 @@ class TestStats:
res = _std(mat, axis=axis)
assert_almost_equal(res, tgt)
+ def test_std_where(self):
+ a = np.arange(25).reshape((5,5))[::-1]
+ whf = np.array([[False, True, False, True, True],
+ [True, False, True, False, True],
+ [True, True, False, True, False],
+ [True, False, True, True, False],
+ [False, True, False, True, True]])
+ whp = np.array([[False],
+ [False],
+ [True],
+ [True],
+ [False]])
+ _cases = [
+ (0, True, 7.07106781*np.ones((5))),
+ (1, True, 1.41421356*np.ones((5))),
+ (0, whf,
+ np.array([4.0824829 , 8.16496581, 5., 7.39509973, 8.49836586])),
+ (0, whp, 2.5*np.ones((5)))
+ ]
+ for _ax, _wh, _res in _cases:
+ assert_allclose(a.std(axis=_ax, where=_wh), _res)
+ assert_allclose(np.std(a, axis=_ax, where=_wh), _res)
+
+ assert_allclose(a.std(axis=1, where=whf),
+ np.std(a[whf].reshape((5,3)), axis=1))
+ assert_allclose(np.std(a, axis=1, where=whf),
+ (a[whf].reshape((5,3))).std(axis=1))
+ assert_allclose(a.std(axis=0, where=whp),
+ np.std(a[whp[:,0]], axis=0))
+ assert_allclose(np.std(a, axis=0, where=whp),
+ (a[whp[:,0]]).std(axis=0))
+ with pytest.warns(RuntimeWarning) as w:
+ assert_equal(a.std(where=False), np.nan)
+ with pytest.warns(RuntimeWarning) as w:
+ assert_equal(np.std(a, where=False), np.nan)
+
def test_subclass(self):
class TestArray(np.ndarray):
def __new__(cls, data, info):
@@ -7134,6 +7351,21 @@ class TestNewBufferProtocol:
_multiarray_tests.get_buffer_info,
np.arange(5)[::2], ('SIMPLE',))
+ @pytest.mark.parametrize(["obj", "error"], [
+ pytest.param(np.array([1, 2], dtype=rational), ValueError, id="array"),
+ pytest.param(rational(1, 2), TypeError, id="scalar")])
+ def test_export_and_pickle_user_dtype(self, obj, error):
+ # User dtypes should export successfully when FORMAT was not requested.
+ with pytest.raises(error):
+ _multiarray_tests.get_buffer_info(obj, ("STRIDED_RO", "FORMAT"))
+
+ _multiarray_tests.get_buffer_info(obj, ("STRIDED_RO",))
+
+ # This is currently also necessary to implement pickling:
+ pickle_obj = pickle.dumps(obj)
+ res = pickle.loads(pickle_obj)
+ assert_array_equal(res, obj)
+
def test_padding(self):
for j in range(8):
x = np.array([(1,), (2,)], dtype={'f0': (int, j)})
@@ -7169,9 +7401,10 @@ class TestNewBufferProtocol:
x3 = np.arange(dt3.itemsize, dtype=np.int8).view(dt3)
self._check_roundtrip(x3)
- def test_relaxed_strides(self):
- # Test that relaxed strides are converted to non-relaxed
- c = np.ones((1, 10, 10), dtype='i8')
+ @pytest.mark.valgrind_error(reason="leaks buffer info cache temporarily.")
+ def test_relaxed_strides(self, c=np.ones((1, 10, 10), dtype='i8')):
+ # Note: c defined as parameter so that it is persistent and leak
+ # checks will notice gh-16934 (buffer info cache leak).
# Check for NPY_RELAXED_STRIDES_CHECKING:
if np.ones((10, 1), order="C").flags.f_contiguous:
@@ -7196,6 +7429,23 @@ class TestNewBufferProtocol:
arr, ['C_CONTIGUOUS'])
assert_(strides[-1] == 8)
+ @pytest.mark.valgrind_error(reason="leaks buffer info cache temporarily.")
+ @pytest.mark.skipif(not np.ones((10, 1), order="C").flags.f_contiguous,
+ reason="Test is unnecessary (but fails) without relaxed strides.")
+ def test_relaxed_strides_buffer_info_leak(self, arr=np.ones((1, 10))):
+ """Test that alternating export of C- and F-order buffers from
+ an array which is both C- and F-order when relaxed strides is
+ active works.
+ This test defines the array in the signature so that it persists
+ across runs; any leak then adds references every time the test runs
+ (catching the leak with pytest-leaks).
+ """
+ for i in range(10):
+ _, s = _multiarray_tests.get_buffer_info(arr, ['F_CONTIGUOUS'])
+ assert s == (8, 8)
+ _, s = _multiarray_tests.get_buffer_info(arr, ['C_CONTIGUOUS'])
+ assert s == (80, 8)
+
def test_out_of_order_fields(self):
dt = np.dtype(dict(
formats=['<i4', '<i4'],
@@ -7283,6 +7533,25 @@ class TestNewBufferProtocol:
f.a = 3
assert_equal(arr['a'], 3)
+ @pytest.mark.parametrize("obj", [np.ones(3), np.ones(1, dtype="i,i")[()]])
+ def test_error_if_stored_buffer_info_is_corrupted(self, obj):
+ """
+ If a user extends a NumPy array before 1.20 and then runs it
+ on NumPy 1.20+, a C-subclassed array might in theory modify
+ the new buffer-info field. This checks that an error is raised
+ if this happens (for buffer export); an error is written on delete.
+ This is a sanity check to help users transition to safe code, it
+ may be deleted at any point.
+ """
+ # corrupt buffer info:
+ _multiarray_tests.corrupt_or_fix_bufferinfo(obj)
+ name = type(obj)
+ with pytest.raises(RuntimeError,
+ match=f".*{name} appears to be C subclassed"):
+ memoryview(obj)
+ # Fix buffer info again before we delete (or we lose the memory)
+ _multiarray_tests.corrupt_or_fix_bufferinfo(obj)
+
class TestArrayAttributeDeletion:
@@ -7419,6 +7688,18 @@ def test_array_interface_offset():
arr1 = np.asarray(DummyArray())
assert_equal(arr1, arr[1:])
+def test_array_interface_unicode_typestr():
+ arr = np.array([1, 2, 3], dtype='int32')
+ interface = dict(arr.__array_interface__)
+ interface['typestr'] = '\N{check mark}'
+
+ class DummyArray:
+ __array_interface__ = interface
+
+ # should not be UnicodeEncodeError
+ with pytest.raises(TypeError):
+ np.asarray(DummyArray())
+
def test_flat_element_deletion():
it = np.ones(3).flat
try:
@@ -8199,6 +8480,22 @@ class TestArange:
assert_raises(ZeroDivisionError, np.arange, 0, 0, 0)
assert_raises(ZeroDivisionError, np.arange, 0.0, 0.0, 0.0)
+ def test_require_range(self):
+ assert_raises(TypeError, np.arange)
+ assert_raises(TypeError, np.arange, step=3)
+ assert_raises(TypeError, np.arange, dtype='int64')
+ assert_raises(TypeError, np.arange, start=4)
+
+ def test_start_stop_kwarg(self):
+ keyword_stop = np.arange(stop=3)
+ keyword_zerotostop = np.arange(start=0, stop=3)
+ keyword_start_stop = np.arange(start=3, stop=9)
+
+ assert len(keyword_stop) == 3
+ assert len(keyword_zerotostop) == 3
+ assert len(keyword_start_stop) == 6
+ assert_array_equal(keyword_stop, keyword_zerotostop)
+
class TestArrayFinalize:
""" Tests __array_finalize__ """
diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py
index 7b3c3a40d..e10c7ad92 100644
--- a/numpy/core/tests/test_nditer.py
+++ b/numpy/core/tests/test_nditer.py
@@ -2880,3 +2880,68 @@ def test_warn_noclose():
casting='equiv', op_dtypes=[np.dtype('f4')])
del it
assert len(sup.log) == 1
+
+
+@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+@pytest.mark.parametrize(["in_dtype", "buf_dtype"],
+ [("i", "O"), ("O", "i"), # most simple cases
+ ("i,O", "O,O"), # structured partially only copying O
+ ("O,i", "i,O"), # structured casting to and from O
+ ])
+@pytest.mark.parametrize("steps", [1, 2, 3])
+def test_partial_iteration_cleanup(in_dtype, buf_dtype, steps):
+ value = 123 # relies on python cache (leak-check will still find it)
+ arr = np.full(int(np.BUFSIZE * 2.5), value).astype(in_dtype)
+ count = sys.getrefcount(value)
+
+ it = np.nditer(arr, op_dtypes=[np.dtype(buf_dtype)],
+ flags=["buffered", "external_loop", "refs_ok"], casting="unsafe")
+ for step in range(steps):
+ # The iteration finishes in 3 steps, the first two are partial
+ next(it)
+
+ # Note that resetting does not free references
+ del it
+ assert count == sys.getrefcount(value)
+
+ # Repeat the test with `iternext`
+ it = np.nditer(arr, op_dtypes=[np.dtype(buf_dtype)],
+ flags=["buffered", "external_loop", "refs_ok"], casting="unsafe")
+ for step in range(steps):
+ it.iternext()
+
+ del it # should ensure cleanup
+ assert count == sys.getrefcount(value)
+
+
+@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+@pytest.mark.parametrize(["in_dtype", "buf_dtype"],
+ [("O", "i"), # most simple cases
+ ("O,i", "i,O"), # structured casting to and from O
+ ])
+def test_partial_iteration_error(in_dtype, buf_dtype):
+ value = 123 # relies on python cache (leak-check will still find it)
+ arr = np.full(int(np.BUFSIZE * 2.5), value).astype(in_dtype)
+ if in_dtype == "O":
+ arr[int(np.BUFSIZE * 1.5)] = None
+ else:
+ arr[int(np.BUFSIZE * 1.5)]["f0"] = None
+
+ count = sys.getrefcount(value)
+
+ it = np.nditer(arr, op_dtypes=[np.dtype(buf_dtype)],
+ flags=["buffered", "external_loop", "refs_ok"], casting="unsafe")
+ with pytest.raises(TypeError):
+ # pytest.raises seems to have issues with the error originating
+ # in the for loop, so manually unravel:
+ next(it)
+ next(it) # raises TypeError
+
+    # Repeat the test with `iternext` after resetting; the buffers should
+    # already be cleared of any references, so resetting is sufficient.
+ it.reset()
+ with pytest.raises(TypeError):
+ it.iternext()
+ it.iternext()
+
+ assert count == sys.getrefcount(value)
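
As background for the cleanup checks above, a minimal sketch (not part of the
diff) of buffered iteration over an object array; the context-manager form
calls it.close() on exit, finalizing write-backs and releasing the iterator's
resources:

    import numpy as np

    arr = np.arange(10).astype(object)
    with np.nditer(arr, op_dtypes=[np.dtype("float64")],
                   flags=["buffered", "refs_ok"], casting="unsafe") as it:
        total = sum(float(x) for x in it)   # each item is a 0-d float64 view
    print(total)                            # 45.0
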
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 2a87ffaf8..866a96e31 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -14,6 +14,7 @@ from numpy.testing import (
assert_array_equal, assert_almost_equal, assert_array_almost_equal,
assert_warns, assert_array_max_ulp, HAS_REFCOUNT
)
+from numpy.core._rational_tests import rational
from hypothesis import assume, given, strategies as st
from hypothesis.extra import numpy as hynp
@@ -863,6 +864,30 @@ class TestTypes:
assert_equal(np.promote_types('<m8', '<m8'), np.dtype('m8'))
assert_equal(np.promote_types('>m8', '>m8'), np.dtype('m8'))
+ def test_can_cast_and_promote_usertypes(self):
+        # The rational type defines safe casting for signed integers and
+        # booleans. Rational itself *does* cast safely to double.
+ # (rational does not actually cast to all signed integers, e.g.
+ # int64 can be both long and longlong and it registers only the first)
+ valid_types = ["int8", "int16", "int32", "int64", "bool"]
+ invalid_types = "BHILQP" + "FDG" + "mM" + "f" + "V"
+
+ rational_dt = np.dtype(rational)
+ for numpy_dtype in valid_types:
+ numpy_dtype = np.dtype(numpy_dtype)
+ assert np.can_cast(numpy_dtype, rational_dt)
+ assert np.promote_types(numpy_dtype, rational_dt) is rational_dt
+
+ for numpy_dtype in invalid_types:
+ numpy_dtype = np.dtype(numpy_dtype)
+ assert not np.can_cast(numpy_dtype, rational_dt)
+ with pytest.raises(TypeError):
+ np.promote_types(numpy_dtype, rational_dt)
+
+ double_dt = np.dtype("double")
+ assert np.can_cast(rational_dt, double_dt)
+ assert np.promote_types(double_dt, rational_dt) is double_dt
+
def test_promote_types_strings(self):
assert_equal(np.promote_types('bool', 'S'), np.dtype('S5'))
assert_equal(np.promote_types('b', 'S'), np.dtype('S4'))
@@ -897,6 +922,126 @@ class TestTypes:
assert_equal(np.promote_types('u8', 'S1'), np.dtype('S20'))
assert_equal(np.promote_types('u8', 'S30'), np.dtype('S30'))
+ @pytest.mark.parametrize(["dtype1", "dtype2"],
+ [[np.dtype("V6"), np.dtype("V10")],
+ [np.dtype([("name1", "i8")]), np.dtype([("name2", "i8")])],
+ [np.dtype("i8,i8"), np.dtype("i4,i4")],
+ ])
+ def test_invalid_void_promotion(self, dtype1, dtype2):
+ # Mainly test structured void promotion, which currently allows
+ # byte-swapping, but nothing else:
+ with pytest.raises(TypeError):
+ np.promote_types(dtype1, dtype2)
+
+ @pytest.mark.parametrize(["dtype1", "dtype2"],
+ [[np.dtype("V10"), np.dtype("V10")],
+ [np.dtype([("name1", "<i8")]), np.dtype([("name1", ">i8")])],
+ [np.dtype("i8,i8"), np.dtype("i8,>i8")],
+ ])
+ def test_valid_void_promotion(self, dtype1, dtype2):
+ assert np.promote_types(dtype1, dtype2) is dtype1
+
+ @pytest.mark.parametrize("dtype",
+ list(np.typecodes["All"]) +
+ ["i,i", "S3", "S100", "U3", "U100", rational])
+ def test_promote_identical_types_metadata(self, dtype):
+ # The same type passed in twice to promote types always
+ # preserves metadata
+ metadata = {1: 1}
+ dtype = np.dtype(dtype, metadata=metadata)
+
+ res = np.promote_types(dtype, dtype)
+ assert res.metadata == dtype.metadata
+
+ # byte-swapping preserves and makes the dtype native:
+ dtype = dtype.newbyteorder()
+ if dtype.isnative:
+ # The type does not have byte swapping
+ return
+
+ res = np.promote_types(dtype, dtype)
+ if res.char in "?bhilqpBHILQPefdgFDGOmM" or dtype.type is rational:
+ # Metadata is lost for simple promotions (they create a new dtype)
+ assert res.metadata is None
+ else:
+ assert res.metadata == metadata
+ if dtype.kind != "V":
+ # the result is native (except for structured void)
+ assert res.isnative
+
+ @pytest.mark.slow
+ @pytest.mark.parametrize(["dtype1", "dtype2"],
+ itertools.product(
+ list(np.typecodes["All"]) +
+ ["i,i", "S3", "S100", "U3", "U100", rational],
+ repeat=2))
+ def test_promote_types_metadata(self, dtype1, dtype2):
+ """Metadata handling in promotion does not appear formalized
+ right now in NumPy. This test should thus be considered to
+ document behaviour, rather than test the correct definition of it.
+
+        This test is very ugly; it was useful for rewriting part of the
+        promotion, but should probably be replaced/deleted eventually
+        (i.e. when metadata handling in promotion is better defined).
+ """
+ metadata1 = {1: 1}
+ metadata2 = {2: 2}
+ dtype1 = np.dtype(dtype1, metadata=metadata1)
+ dtype2 = np.dtype(dtype2, metadata=metadata2)
+
+ try:
+ res = np.promote_types(dtype1, dtype2)
+ except TypeError:
+ # Promotion failed, this test only checks metadata
+ return
+
+ if res.char in "?bhilqpBHILQPefdgFDGOmM" or res.type is rational:
+ # All simple types lose metadata (due to using promotion table):
+ assert res.metadata is None
+        elif res == dtype1:
+            # If the result matches one of the inputs exactly, it is
+            # usually returned unchanged:
+            assert res is dtype1
+        elif res == dtype2:
+            # dtype1 may have been cast to the same type/kind as dtype2.
+            # If the resulting dtype is identical we currently pick the cast
+            # version of dtype1, which lost the metadata:
+            if np.promote_types(dtype1, dtype2.kind) == dtype2:
+                assert res.metadata is None
+            else:
+                assert res.metadata == metadata2
+ else:
+ assert res.metadata is None
+
+ # Try again for byteswapped version
+ dtype1 = dtype1.newbyteorder()
+ assert dtype1.metadata == metadata1
+ res_bs = np.promote_types(dtype1, dtype2)
+ if res_bs.names is not None:
+ # Structured promotion doesn't remove byteswap:
+ assert res_bs.newbyteorder() == res
+ else:
+ assert res_bs == res
+ assert res_bs.metadata == res.metadata
+
+ @pytest.mark.parametrize(["dtype1", "dtype2"],
+ [[np.dtype("V6"), np.dtype("V10")],
+ [np.dtype([("name1", "i8")]), np.dtype([("name2", "i8")])],
+ [np.dtype("i8,i8"), np.dtype("i4,i4")],
+ ])
+ def test_invalid_void_promotion(self, dtype1, dtype2):
+ # Mainly test structured void promotion, which currently allows
+ # byte-swapping, but nothing else:
+ with pytest.raises(TypeError):
+ np.promote_types(dtype1, dtype2)
+
+ @pytest.mark.parametrize(["dtype1", "dtype2"],
+ [[np.dtype("V10"), np.dtype("V10")],
+ [np.dtype([("name1", "<i8")]), np.dtype([("name1", ">i8")])],
+ [np.dtype("i8,i8"), np.dtype("i8,>i8")],
+ ])
+ def test_valid_void_promotion(self, dtype1, dtype2):
+ assert np.promote_types(dtype1, dtype2) is dtype1
+
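
To make the promotion behaviour documented above concrete, a small sketch (not
part of the diff; it reflects observed behaviour, which the docstring above
explicitly does not treat as a stable definition):

    import numpy as np

    dt = np.dtype("i4", metadata={"unit": "m"})
    print(np.promote_types(dt, dt).metadata)            # identical inputs keep it: {'unit': 'm'}

    swapped = dt.newbyteorder()
    print(np.promote_types(swapped, swapped).metadata)  # simple non-native types drop it: None
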
def test_can_cast(self):
assert_(np.can_cast(np.int32, np.int64))
assert_(np.can_cast(np.float64, complex))
@@ -2521,7 +2666,7 @@ class TestCreationFuncs:
self.check_function(np.zeros)
def test_ones(self):
- self.check_function(np.zeros)
+ self.check_function(np.ones)
def test_empty(self):
self.check_function(np.empty)
diff --git a/numpy/core/tests/test_overrides.py b/numpy/core/tests/test_overrides.py
index 7e73d8c03..6862fca03 100644
--- a/numpy/core/tests/test_overrides.py
+++ b/numpy/core/tests/test_overrides.py
@@ -1,5 +1,7 @@
import inspect
import sys
+import tempfile
+from io import StringIO
from unittest import mock
import numpy as np
@@ -425,3 +427,155 @@ class TestNumPyFunctions:
# note: the internal implementation of np.sum() calls the .sum() method
array = np.array(1).view(MyArray)
assert_equal(np.sum(array), 'summed')
+
+
+class TestArrayLike:
+ def setup(self):
+ class MyArray():
+ def __init__(self, function=None):
+ self.function = function
+
+ def __array_function__(self, func, types, args, kwargs):
+ try:
+ my_func = getattr(self, func.__name__)
+ except AttributeError:
+ return NotImplemented
+ return my_func(*args, **kwargs)
+
+ self.MyArray = MyArray
+
+ class MyNoArrayFunctionArray():
+ def __init__(self, function=None):
+ self.function = function
+
+ self.MyNoArrayFunctionArray = MyNoArrayFunctionArray
+
+ def add_method(self, name, arr_class, enable_value_error=False):
+ def _definition(*args, **kwargs):
+ # Check that `like=` isn't propagated downstream
+ assert 'like' not in kwargs
+
+ if enable_value_error and 'value_error' in kwargs:
+ raise ValueError
+
+ return arr_class(getattr(arr_class, name))
+ setattr(arr_class, name, _definition)
+
+ def func_args(*args, **kwargs):
+ return args, kwargs
+
+ @requires_array_function
+ def test_array_like_not_implemented(self):
+ self.add_method('array', self.MyArray)
+
+ ref = self.MyArray.array()
+
+ with assert_raises_regex(TypeError, 'no implementation found'):
+ array_like = np.asarray(1, like=ref)
+
+ _array_tests = [
+ ('array', *func_args((1,))),
+ ('asarray', *func_args((1,))),
+ ('asanyarray', *func_args((1,))),
+ ('ascontiguousarray', *func_args((2, 3))),
+ ('asfortranarray', *func_args((2, 3))),
+ ('require', *func_args((np.arange(6).reshape(2, 3),),
+ requirements=['A', 'F'])),
+ ('empty', *func_args((1,))),
+ ('full', *func_args((1,), 2)),
+ ('ones', *func_args((1,))),
+ ('zeros', *func_args((1,))),
+ ('arange', *func_args(3)),
+ ('frombuffer', *func_args(b'\x00' * 8, dtype=int)),
+ ('fromiter', *func_args(range(3), dtype=int)),
+ ('fromstring', *func_args('1,2', dtype=int, sep=',')),
+ ('loadtxt', *func_args(lambda: StringIO('0 1\n2 3'))),
+ ('genfromtxt', *func_args(lambda: StringIO(u'1,2.1'),
+ dtype=[('int', 'i8'), ('float', 'f8')],
+ delimiter=',')),
+ ]
+
+ @pytest.mark.parametrize('function, args, kwargs', _array_tests)
+ @pytest.mark.parametrize('numpy_ref', [True, False])
+ @requires_array_function
+ def test_array_like(self, function, args, kwargs, numpy_ref):
+ self.add_method('array', self.MyArray)
+ self.add_method(function, self.MyArray)
+ np_func = getattr(np, function)
+ my_func = getattr(self.MyArray, function)
+
+ if numpy_ref is True:
+ ref = np.array(1)
+ else:
+ ref = self.MyArray.array()
+
+ like_args = tuple(a() if callable(a) else a for a in args)
+ array_like = np_func(*like_args, **kwargs, like=ref)
+
+ if numpy_ref is True:
+ assert type(array_like) is np.ndarray
+
+ np_args = tuple(a() if callable(a) else a for a in args)
+ np_arr = np_func(*np_args, **kwargs)
+
+ # Special-case np.empty to ensure values match
+ if function == "empty":
+ np_arr.fill(1)
+ array_like.fill(1)
+
+ assert_equal(array_like, np_arr)
+ else:
+ assert type(array_like) is self.MyArray
+ assert array_like.function is my_func
+
+ @pytest.mark.parametrize('function, args, kwargs', _array_tests)
+ @pytest.mark.parametrize('ref', [1, [1], "MyNoArrayFunctionArray"])
+ @requires_array_function
+ def test_no_array_function_like(self, function, args, kwargs, ref):
+ self.add_method('array', self.MyNoArrayFunctionArray)
+ self.add_method(function, self.MyNoArrayFunctionArray)
+ np_func = getattr(np, function)
+
+ # Instantiate ref if it's the MyNoArrayFunctionArray class
+ if ref == "MyNoArrayFunctionArray":
+ ref = self.MyNoArrayFunctionArray.array()
+
+ like_args = tuple(a() if callable(a) else a for a in args)
+
+ with assert_raises_regex(TypeError,
+ 'The `like` argument must be an array-like that implements'):
+ np_func(*like_args, **kwargs, like=ref)
+
+ @pytest.mark.parametrize('numpy_ref', [True, False])
+ def test_array_like_fromfile(self, numpy_ref):
+ self.add_method('array', self.MyArray)
+ self.add_method("fromfile", self.MyArray)
+
+ if numpy_ref is True:
+ ref = np.array(1)
+ else:
+ ref = self.MyArray.array()
+
+ data = np.random.random(5)
+
+ fname = tempfile.mkstemp()[1]
+ data.tofile(fname)
+
+ array_like = np.fromfile(fname, like=ref)
+ if numpy_ref is True:
+ assert type(array_like) is np.ndarray
+ np_res = np.fromfile(fname, like=ref)
+ assert_equal(np_res, data)
+ assert_equal(array_like, np_res)
+ else:
+ assert type(array_like) is self.MyArray
+ assert array_like.function is self.MyArray.fromfile
+
+ @requires_array_function
+ def test_exception_handling(self):
+ self.add_method('array', self.MyArray, enable_value_error=True)
+
+ ref = self.MyArray.array()
+
+ with assert_raises(ValueError):
+ np.array(1, value_error=True, like=ref)
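
As a sketch of the protocol these tests target (not part of the diff): the
new `like=` argument dispatches array-creation functions through the
reference object's __array_function__, so a minimal duck array can intercept
np.asarray. The `Wrapped` class below is hypothetical:

    import numpy as np

    class Wrapped:
        def __init__(self, data):
            self.data = np.asarray(data)

        def __array_function__(self, func, types, args, kwargs):
            # `like=` has already been stripped from kwargs at this point
            if func.__name__ == "asarray":
                return Wrapped(np.asarray(*args, **kwargs))
            return NotImplemented

    ref = Wrapped([0])
    out = np.asarray([1, 2, 3], like=ref)
    print(type(out).__name__)  # Wrapped
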
diff --git a/numpy/core/tests/test_records.py b/numpy/core/tests/test_records.py
index 4350a3407..4d4b4b515 100644
--- a/numpy/core/tests/test_records.py
+++ b/numpy/core/tests/test_records.py
@@ -1,5 +1,6 @@
import collections.abc
import textwrap
+from io import BytesIO
from os import path
from pathlib import Path
import pytest
@@ -79,8 +80,14 @@ class TestFromrecords:
r1 = np.rec.fromfile(fd, formats='f8,i4,a5', shape=3, byteorder='big')
fd.seek(2880 * 2)
r2 = np.rec.array(fd, formats='f8,i4,a5', shape=3, byteorder='big')
+ fd.seek(2880 * 2)
+ bytes_array = BytesIO()
+ bytes_array.write(fd.read())
+ bytes_array.seek(0)
+ r3 = np.rec.fromfile(bytes_array, formats='f8,i4,a5', shape=3, byteorder='big')
fd.close()
assert_equal(r1, r2)
+ assert_equal(r2, r3)
def test_recarray_from_obj(self):
count = 10
@@ -417,7 +424,16 @@ class TestRecord:
# make sure we did not pickle the address
assert not isinstance(obj, bytes)
- assert_raises(TypeError, ctor, dtype, 13)
+ assert_raises(RuntimeError, ctor, dtype, 13)
+
+ # Test roundtrip:
+ dump = pickle.dumps(a[0])
+ unpickled = pickle.loads(dump)
+ assert a[0] == unpickled
+
+ # Also check the similar (impossible) "object scalar" path:
+ with pytest.warns(DeprecationWarning):
+ assert ctor(np.dtype("O"), data) is data
def test_objview_record(self):
# https://github.com/numpy/numpy/issues/2599
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index 170d20157..831e48e8b 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -14,7 +14,7 @@ from numpy.testing import (
assert_raises_regex, assert_warns, suppress_warnings,
_assert_valid_refcount, HAS_REFCOUNT,
)
-from numpy.testing._private.utils import _no_tracing
+from numpy.testing._private.utils import _no_tracing, requires_memory
from numpy.compat import asbytes, asunicode, pickle
try:
@@ -1506,10 +1506,10 @@ class TestRegression:
test_type(t)
def test_buffer_hashlib(self):
- from hashlib import md5
+ from hashlib import sha256
x = np.array([1, 2, 3], dtype=np.dtype('<i4'))
- assert_equal(md5(x).hexdigest(), '2a1dd1e1e59d0a384c26951e316cd7e6')
+ assert_equal(sha256(x).hexdigest(), '4636993d3e1da4e9d6b8f87b79e8f7c6d018580d52661950eabc3845c5897a4d')
def test_0d_string_scalar(self):
# Bug #1436; the following should succeed
@@ -2427,9 +2427,10 @@ class TestRegression:
assert b'numpy.core.multiarray' in s
def test_object_casting_errors(self):
- # gh-11993
+ # gh-11993 update to ValueError (see gh-16909), since strings can in
+ # principle be converted to complex, but this string cannot.
arr = np.array(['AAAAA', 18465886.0, 18465886.0], dtype=object)
- assert_raises(TypeError, arr.astype, 'c8')
+ assert_raises(ValueError, arr.astype, 'c8')
def test_eff1d_casting(self):
# gh-12711
@@ -2487,3 +2488,39 @@ class TestRegression:
assert arr.size * arr.itemsize > 2 ** 31
c_arr = np.ctypeslib.as_ctypes(arr)
assert_equal(c_arr._length_, arr.size)
+
+ def test_complex_conversion_error(self):
+ # gh-17068
+ with pytest.raises(TypeError, match=r"Unable to convert dtype.*"):
+ complex(np.array("now", np.datetime64))
+
+ def test__array_interface__descr(self):
+ # gh-17068
+ dt = np.dtype(dict(names=['a', 'b'],
+ offsets=[0, 0],
+ formats=[np.int64, np.int64]))
+ descr = np.array((1, 1), dtype=dt).__array_interface__['descr']
+ assert descr == [('', '|V8')] # instead of [(b'', '|V8')]
+
+ @pytest.mark.skipif(sys.maxsize < 2 ** 31 + 1, reason='overflows 32-bit python')
+ @requires_memory(free_bytes=9e9)
+ def test_dot_big_stride(self):
+ # gh-17111
+ # blas stride = stride//itemsize > int32 max
+ int32_max = np.iinfo(np.int32).max
+ n = int32_max + 3
+ a = np.empty([n], dtype=np.float32)
+ b = a[::n-1]
+ b[...] = 1
+ assert b.strides[0] > int32_max * b.dtype.itemsize
+ assert np.dot(b, b) == 2.0
+
+ def test_frompyfunc_name(self):
+ # name conversion was failing for python 3 strings
+ # resulting in the default '?' name. Also test utf-8
+ # encoding using non-ascii name.
+ def cassé(x):
+ return x
+
+ f = np.frompyfunc(cassé, 1, 1)
+ assert str(f) == "<ufunc 'cassé (vectorized)'>"
diff --git a/numpy/core/tests/test_scalarbuffer.py b/numpy/core/tests/test_scalarbuffer.py
index 574c56864..851cd3081 100644
--- a/numpy/core/tests/test_scalarbuffer.py
+++ b/numpy/core/tests/test_scalarbuffer.py
@@ -3,6 +3,7 @@ Test scalar buffer interface adheres to PEP 3118
"""
import numpy as np
from numpy.core._rational_tests import rational
+from numpy.core._multiarray_tests import get_buffer_info
import pytest
from numpy.testing import assert_, assert_equal, assert_raises
@@ -52,10 +53,20 @@ class TestScalarPEP3118:
assert_equal(mv_x.suboffsets, ())
@pytest.mark.parametrize('scalar, code', scalars_and_codes, ids=codes_only)
- def test_scalar_known_code(self, scalar, code):
+ def test_scalar_code_and_properties(self, scalar, code):
x = scalar()
+ expected = dict(strides=(), itemsize=x.dtype.itemsize, ndim=0,
+ shape=(), format=code, readonly=True)
+
mv_x = memoryview(x)
- assert_equal(mv_x.format, code)
+ assert self._as_dict(mv_x) == expected
+
+ @pytest.mark.parametrize('scalar', scalars_only, ids=codes_only)
+ def test_scalar_buffers_readonly(self, scalar):
+ x = scalar()
+ with pytest.raises(BufferError, match="scalar buffer is readonly"):
+ get_buffer_info(x, ["WRITABLE"])
def test_void_scalar_structured_data(self):
dt = np.dtype([('name', np.unicode_, 16), ('grades', np.float64, (2,))])
@@ -77,9 +88,14 @@ class TestScalarPEP3118:
assert_equal(mv_x.itemsize, mv_a.itemsize)
assert_equal(mv_x.format, mv_a.format)
+ # Check that we do not allow writeable buffer export (technically
+ # we could allow it sometimes here...)
+ with pytest.raises(BufferError, match="scalar buffer is readonly"):
+ get_buffer_info(x, ["WRITABLE"])
+
def _as_dict(self, m):
return dict(strides=m.strides, shape=m.shape, itemsize=m.itemsize,
- ndim=m.ndim, format=m.format)
+ ndim=m.ndim, format=m.format, readonly=m.readonly)
def test_datetime_memoryview(self):
# gh-11656
@@ -88,7 +104,7 @@ class TestScalarPEP3118:
dt1 = np.datetime64('2016-01-01')
dt2 = np.datetime64('2017-01-01')
expected = dict(strides=(1,), itemsize=1, ndim=1, shape=(8,),
- format='B')
+ format='B', readonly=True)
v = memoryview(dt1)
assert self._as_dict(v) == expected
@@ -100,6 +116,10 @@ class TestScalarPEP3118:
# Fails to create a PEP 3118 valid buffer
assert_raises((ValueError, BufferError), memoryview, a[0])
+ # Check that we do not allow writeable buffer export
+ with pytest.raises(BufferError, match="scalar buffer is readonly"):
+ get_buffer_info(dt1, ["WRITABLE"])
+
@pytest.mark.parametrize('s', [
pytest.param("\x32\x32", id="ascii"),
pytest.param("\uFE0F\uFE0F", id="basic multilingual"),
@@ -109,7 +129,8 @@ class TestScalarPEP3118:
s = np.str_(s) # only our subclass implements the buffer protocol
# all the same, characters always encode as ucs4
- expected = dict(strides=(), itemsize=8, ndim=0, shape=(), format='2w')
+ expected = dict(strides=(), itemsize=8, ndim=0, shape=(), format='2w',
+ readonly=True)
v = memoryview(s)
assert self._as_dict(v) == expected
@@ -119,7 +140,15 @@ class TestScalarPEP3118:
assert_equal(code_points, [ord(c) for c in s])
+ # Check that we do not allow writeable buffer export
+ with pytest.raises(BufferError, match="scalar buffer is readonly"):
+ get_buffer_info(s, ["WRITABLE"])
+
def test_user_scalar_fails_buffer(self):
r = rational(1)
with assert_raises(TypeError):
memoryview(r)
+
+ # Check that we do not allow writeable buffer export
+ with pytest.raises(BufferError, match="scalar buffer is readonly"):
+ get_buffer_info(r, ["WRITABLE"]) \ No newline at end of file
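
For context (not part of the diff): NumPy scalars export read-only PEP 3118
buffers, which is what the new assertions above verify through the internal
get_buffer_info helper. At the Python level this is visible as:

    import numpy as np

    x = np.float64(1.0)
    view = memoryview(x)
    print(view.readonly, view.format, view.itemsize)  # True d 8
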
diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py
index c7f44cf50..d8529418e 100644
--- a/numpy/core/tests/test_scalarmath.py
+++ b/numpy/core/tests/test_scalarmath.py
@@ -276,6 +276,10 @@ class TestModulus:
# Check nans, inf
with suppress_warnings() as sup:
sup.filter(RuntimeWarning, "invalid value encountered in remainder")
+ sup.filter(RuntimeWarning, "divide by zero encountered in remainder")
+ sup.filter(RuntimeWarning, "divide by zero encountered in floor_divide")
+ sup.filter(RuntimeWarning, "divide by zero encountered in divmod")
+ sup.filter(RuntimeWarning, "invalid value encountered in divmod")
for dt in np.typecodes['Float']:
fone = np.array(1.0, dtype=dt)
fzer = np.array(0.0, dtype=dt)
@@ -290,6 +294,9 @@ class TestModulus:
assert_(np.isnan(rem), 'dt: %s' % dt)
rem = operator.mod(finf, fone)
assert_(np.isnan(rem), 'dt: %s' % dt)
+ for op in [floordiv_and_mod, divmod]:
+ div, mod = op(fone, fzer)
+                    assert_(np.isinf(div), 'dt: %s' % dt)
+                    assert_(np.isnan(mod), 'dt: %s' % dt)
def test_inplace_floordiv_handling(self):
# issue gh-12927
diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py
index 94a916193..4e56ace90 100644
--- a/numpy/core/tests/test_shape_base.py
+++ b/numpy/core/tests/test_shape_base.py
@@ -342,19 +342,32 @@ class TestConcatenate:
assert_raises(ValueError, concatenate, (a, b), out=np.empty((1,4)))
concatenate((a, b), out=np.empty(4))
- def test_out_dtype(self):
- out = np.empty(4, np.float32)
- res = concatenate((array([1, 2]), array([3, 4])), out=out)
- assert_(out is res)
-
- out = np.empty(4, np.complex64)
- res = concatenate((array([0.1, 0.2]), array([0.3, 0.4])), out=out)
- assert_(out is res)
-
- # invalid cast
- out = np.empty(4, np.int32)
- assert_raises(TypeError, concatenate,
- (array([0.1, 0.2]), array([0.3, 0.4])), out=out)
+ @pytest.mark.parametrize("axis", [None, 0])
+ @pytest.mark.parametrize("out_dtype", ["c8", "f4", "f8", ">f8", "i8"])
+ @pytest.mark.parametrize("casting",
+ ['no', 'equiv', 'safe', 'same_kind', 'unsafe'])
+ def test_out_and_dtype(self, axis, out_dtype, casting):
+ # Compare usage of `out=out` with `dtype=out.dtype`
+ out = np.empty(4, dtype=out_dtype)
+ to_concat = (array([1.1, 2.2]), array([3.3, 4.4]))
+
+ if not np.can_cast(to_concat[0], out_dtype, casting=casting):
+ with assert_raises(TypeError):
+ concatenate(to_concat, out=out, axis=axis, casting=casting)
+ with assert_raises(TypeError):
+ concatenate(to_concat, dtype=out.dtype,
+ axis=axis, casting=casting)
+ else:
+ res_out = concatenate(to_concat, out=out,
+ axis=axis, casting=casting)
+ res_dtype = concatenate(to_concat, dtype=out.dtype,
+ axis=axis, casting=casting)
+ assert res_out is out
+ assert_array_equal(out, res_dtype)
+ assert res_dtype.dtype == out_dtype
+
+ with assert_raises(TypeError):
+ concatenate(to_concat, out=out, dtype=out_dtype, axis=axis)
def test_stack():
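
A short sketch of the keywords compared above (not part of the diff; the
`dtype=` and `casting=` arguments of np.concatenate are what these tests were
added for):

    import numpy as np

    parts = (np.array([1.1, 2.2]), np.array([3.3, 4.4]))
    print(np.concatenate(parts, dtype="float32"))                  # float64 -> float32 is same_kind
    print(np.concatenate(parts, dtype="int64", casting="unsafe"))  # int64 needs casting="unsafe"
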
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
new file mode 100644
index 000000000..196003cdd
--- /dev/null
+++ b/numpy/core/tests/test_simd.py
@@ -0,0 +1,671 @@
+# NOTE: Please avoid the use of numpy.testing since NPYV intrinsics
+# may be involved in its implementation.
+import pytest, math
+from numpy.core._simd import targets
+
+class _Test_Utility:
+    # submodule of the desired SIMD extension, e.g. targets["AVX512F"]
+ npyv = None
+ # the current data type suffix e.g. 's8'
+ sfx = None
+
+ def __getattr__(self, attr):
+ """
+        Delegate to NPYV intrinsics without the 'npyv' attribute prefix,
+        auto-suffixing the intrinsic name according to the class attribute 'sfx'.
+ """
+ return getattr(self.npyv, attr + "_" + self.sfx)
+
+ def _data(self, start=None, count=None, reverse=False):
+ """
+        Create a list of consecutive numbers according to the number of vector lanes.
+ """
+ if start is None:
+ start = 1
+ if count is None:
+ count = self.nlanes
+ rng = range(start, start + count)
+ if reverse:
+ rng = reversed(rng)
+ if self._is_fp():
+ return [x / 1.0 for x in rng]
+ return list(rng)
+
+ def _is_unsigned(self):
+ return self.sfx[0] == 'u'
+
+ def _is_signed(self):
+ return self.sfx[0] == 's'
+
+ def _is_fp(self):
+ return self.sfx[0] == 'f'
+
+ def _scalar_size(self):
+ return int(self.sfx[1:])
+
+ def _int_clip(self, seq):
+ if self._is_fp():
+ return seq
+ max_int = self._int_max()
+ min_int = self._int_min()
+ return [min(max(v, min_int), max_int) for v in seq]
+
+ def _int_max(self):
+ if self._is_fp():
+ return None
+ max_u = self._to_unsigned(self.setall(-1))[0]
+ if self._is_signed():
+ return max_u // 2
+ return max_u
+
+ def _int_min(self):
+ if self._is_fp():
+ return None
+ if self._is_unsigned():
+ return 0
+ return -(self._int_max() + 1)
+
+ def _true_mask(self):
+ max_unsig = getattr(self.npyv, "setall_u" + self.sfx[1:])(-1)
+ return max_unsig[0]
+
+ def _to_unsigned(self, vector):
+ if isinstance(vector, (list, tuple)):
+ return getattr(self.npyv, "load_u" + self.sfx[1:])(vector)
+ else:
+ sfx = vector.__name__.replace("npyv_", "")
+ if sfx[0] == "b":
+ cvt_intrin = "cvt_u{0}_b{0}"
+ else:
+ cvt_intrin = "reinterpret_u{0}_{1}"
+ return getattr(self.npyv, cvt_intrin.format(sfx[1:], sfx))(vector)
+
+ def _pinfinity(self):
+ v = self.npyv.setall_u32(0x7f800000)
+ return self.npyv.reinterpret_f32_u32(v)[0]
+
+ def _ninfinity(self):
+ v = self.npyv.setall_u32(0xff800000)
+ return self.npyv.reinterpret_f32_u32(v)[0]
+
+ def _nan(self):
+ v = self.npyv.setall_u32(0x7fc00000)
+ return self.npyv.reinterpret_f32_u32(v)[0]
+
+class _SIMD_BOOL(_Test_Utility):
+ """
+ To test all boolean vector types at once
+ """
+ def _data(self, start=None, count=None, reverse=False):
+ nlanes = getattr(self.npyv, "nlanes_u" + self.sfx[1:])
+ true_mask = self._true_mask()
+ rng = range(nlanes)
+ if reverse:
+ rng = reversed(rng)
+ return [true_mask if x % 2 else 0 for x in rng]
+
+ def _load_b(self, data):
+ len_str = self.sfx[1:]
+ load = getattr(self.npyv, "load_u" + len_str)
+ cvt = getattr(self.npyv, f"cvt_b{len_str}_u{len_str}")
+ return cvt(load(data))
+
+ def test_tobits(self):
+ data2bits = lambda data: sum([int(x != 0) << i for i, x in enumerate(data, 0)])
+ for data in (self._data(), self._data(reverse=True)):
+ vdata = self._load_b(data)
+ data_bits = data2bits(data)
+ tobits = bin(self.tobits(vdata))
+ assert tobits == bin(data_bits)
+
+class _SIMD_INT(_Test_Utility):
+ """
+ To test all integer vector types at once
+ """
+ def test_operators_shift(self):
+ if self.sfx in ("u8", "s8"):
+ return
+
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ for count in range(self._scalar_size()):
+ # load to cast
+ data_shl_a = self.load([a << count for a in data_a])
+ # left shift
+ shl = self.shl(vdata_a, count)
+ assert shl == data_shl_a
+ # left shift by an immediate constant
+ shli = self.shli(vdata_a, count)
+ assert shli == data_shl_a
+ # load to cast
+ data_shr_a = self.load([a >> count for a in data_a])
+ # right shift
+ shr = self.shr(vdata_a, count)
+ assert shr == data_shr_a
+ # right shift by an immediate constant
+ shri = self.shri(vdata_a, count)
+ assert shri == data_shr_a
+
+ def test_arithmetic_subadd_saturated(self):
+ if self.sfx in ("u32", "s32", "u64", "s64"):
+ return
+
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ data_adds = self._int_clip([a + b for a, b in zip(data_a, data_b)])
+ adds = self.adds(vdata_a, vdata_b)
+ assert adds == data_adds
+
+ data_subs = self._int_clip([a - b for a, b in zip(data_a, data_b)])
+ subs = self.subs(vdata_a, vdata_b)
+ assert subs == data_subs
+
+class _SIMD_FP(_Test_Utility):
+ """
+ To test all float vector types at once
+ """
+ def test_arithmetic_fused(self):
+ vdata_a, vdata_b, vdata_c = [self.load(self._data())]*3
+ vdata_cx2 = self.add(vdata_c, vdata_c)
+ # multiply and add, a*b + c
+ data_fma = self.load([a * b + c for a, b, c in zip(vdata_a, vdata_b, vdata_c)])
+ fma = self.muladd(vdata_a, vdata_b, vdata_c)
+ assert fma == data_fma
+ # multiply and subtract, a*b - c
+ fms = self.mulsub(vdata_a, vdata_b, vdata_c)
+ data_fms = self.sub(data_fma, vdata_cx2)
+ assert fms == data_fms
+ # negate multiply and add, -(a*b) + c
+ nfma = self.nmuladd(vdata_a, vdata_b, vdata_c)
+ data_nfma = self.sub(vdata_cx2, data_fma)
+ assert nfma == data_nfma
+ # negate multiply and subtract, -(a*b) - c
+ nfms = self.nmulsub(vdata_a, vdata_b, vdata_c)
+ data_nfms = self.mul(data_fma, self.setall(-1))
+ assert nfms == data_nfms
+
+ def test_abs(self):
+ pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+ data = self._data()
+ vdata = self.load(self._data())
+
+ abs_cases = ((-0, 0), (ninf, pinf), (pinf, pinf), (nan, nan))
+ for case, desired in abs_cases:
+ data_abs = [desired]*self.nlanes
+ vabs = self.abs(self.setall(case))
+ assert vabs == pytest.approx(data_abs, nan_ok=True)
+
+ vabs = self.abs(self.mul(vdata, self.setall(-1)))
+ assert vabs == data
+
+ def test_sqrt(self):
+ pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+ data = self._data()
+ vdata = self.load(self._data())
+
+ sqrt_cases = ((-0.0, -0.0), (0.0, 0.0), (-1.0, nan), (ninf, nan), (pinf, pinf))
+ for case, desired in sqrt_cases:
+ data_sqrt = [desired]*self.nlanes
+ sqrt = self.sqrt(self.setall(case))
+ assert sqrt == pytest.approx(data_sqrt, nan_ok=True)
+
+ data_sqrt = self.load([math.sqrt(x) for x in data]) # load to truncate precision
+ sqrt = self.sqrt(vdata)
+ assert sqrt == data_sqrt
+
+ def test_square(self):
+ pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+ data = self._data()
+ vdata = self.load(self._data())
+ # square
+ square_cases = ((nan, nan), (pinf, pinf), (ninf, pinf))
+ for case, desired in square_cases:
+ data_square = [desired]*self.nlanes
+ square = self.square(self.setall(case))
+ assert square == pytest.approx(data_square, nan_ok=True)
+
+ data_square = [x*x for x in data]
+ square = self.square(vdata)
+ assert square == data_square
+
+ def test_reciprocal(self):
+ pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+ data = self._data()
+ vdata = self.load(self._data())
+
+ recip_cases = ((nan, nan), (pinf, 0.0), (ninf, -0.0), (0.0, pinf), (-0.0, ninf))
+ for case, desired in recip_cases:
+ data_recip = [desired]*self.nlanes
+ recip = self.recip(self.setall(case))
+ assert recip == pytest.approx(data_recip, nan_ok=True)
+
+ data_recip = self.load([1/x for x in data]) # load to truncate precision
+ recip = self.recip(vdata)
+ assert recip == data_recip
+
+class _SIMD_ALL(_Test_Utility):
+ """
+ To test all vector types at once
+ """
+ def test_memory_load(self):
+ data = self._data()
+ # unaligned load
+ load_data = self.load(data)
+ assert load_data == data
+ # aligned load
+ loada_data = self.loada(data)
+ assert loada_data == data
+ # stream load
+ loads_data = self.loads(data)
+ assert loads_data == data
+ # load lower part
+ loadl = self.loadl(data)
+ loadl_half = list(loadl)[:self.nlanes//2]
+ data_half = data[:self.nlanes//2]
+ assert loadl_half == data_half
+ assert loadl != data # detect overflow
+
+ def test_memory_store(self):
+ data = self._data()
+ vdata = self.load(data)
+ # unaligned store
+ store = [0] * self.nlanes
+ self.store(store, vdata)
+ assert store == data
+ # aligned store
+ store_a = [0] * self.nlanes
+ self.storea(store_a, vdata)
+ assert store_a == data
+ # stream store
+ store_s = [0] * self.nlanes
+ self.stores(store_s, vdata)
+ assert store_s == data
+ # store lower part
+ store_l = [0] * self.nlanes
+ self.storel(store_l, vdata)
+ assert store_l[:self.nlanes//2] == data[:self.nlanes//2]
+ assert store_l != vdata # detect overflow
+ # store higher part
+ store_h = [0] * self.nlanes
+ self.storeh(store_h, vdata)
+ assert store_h[:self.nlanes//2] == data[self.nlanes//2:]
+ assert store_h != vdata # detect overflow
+
+ def test_memory_partial_load(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ data = self._data()
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4] # test out of range
+ for n in lanes:
+ load_till = self.load_till(data, n, 15)
+ data_till = data[:n] + [15] * (self.nlanes-n)
+ assert load_till == data_till
+ load_tillz = self.load_tillz(data, n)
+ data_tillz = data[:n] + [0] * (self.nlanes-n)
+ assert load_tillz == data_tillz
+
+ def test_memory_partial_store(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ data = self._data()
+ data_rev = self._data(reverse=True)
+ vdata = self.load(data)
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4]
+ for n in lanes:
+ data_till = data_rev.copy()
+ data_till[:n] = data[:n]
+ store_till = self._data(reverse=True)
+ self.store_till(store_till, n, vdata)
+ assert store_till == data_till
+
+ def test_memory_noncont_load(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ for stride in range(1, 64):
+ data = self._data(count=stride*self.nlanes)
+ data_stride = data[::stride]
+ loadn = self.loadn(data, stride)
+ assert loadn == data_stride
+
+ for stride in range(-64, 0):
+ data = self._data(stride, -stride*self.nlanes)
+ data_stride = self.load(data[::stride]) # cast unsigned
+ loadn = self.loadn(data, stride)
+ assert loadn == data_stride
+
+ def test_memory_noncont_partial_load(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4]
+ for stride in range(1, 64):
+ data = self._data(count=stride*self.nlanes)
+ data_stride = data[::stride]
+ for n in lanes:
+ data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
+ loadn_till = self.loadn_till(data, stride, n, 15)
+ assert loadn_till == data_stride_till
+ data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
+ loadn_tillz = self.loadn_tillz(data, stride, n)
+ assert loadn_tillz == data_stride_tillz
+
+ for stride in range(-64, 0):
+ data = self._data(stride, -stride*self.nlanes)
+ data_stride = list(self.load(data[::stride])) # cast unsigned
+ for n in lanes:
+ data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
+ loadn_till = self.loadn_till(data, stride, n, 15)
+ assert loadn_till == data_stride_till
+ data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
+ loadn_tillz = self.loadn_tillz(data, stride, n)
+ assert loadn_tillz == data_stride_tillz
+
+ def test_memory_noncont_store(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ vdata = self.load(self._data())
+ for stride in range(1, 64):
+ data = [15] * stride * self.nlanes
+ data[::stride] = vdata
+ storen = [15] * stride * self.nlanes
+ storen += [127]*64
+ self.storen(storen, stride, vdata)
+ assert storen[:-64] == data
+ assert storen[-64:] == [127]*64 # detect overflow
+
+ for stride in range(-64, 0):
+ data = [15] * -stride * self.nlanes
+ data[::stride] = vdata
+ storen = [127]*64
+ storen += [15] * -stride * self.nlanes
+ self.storen(storen, stride, vdata)
+ assert storen[64:] == data
+ assert storen[:64] == [127]*64 # detect overflow
+
+ def test_memory_noncont_partial_store(self):
+ if self.sfx in ("u8", "s8", "u16", "s16"):
+ return
+
+ data = self._data()
+ vdata = self.load(data)
+ lanes = list(range(1, self.nlanes + 1))
+ lanes += [self.nlanes**2, self.nlanes**4]
+ for stride in range(1, 64):
+ for n in lanes:
+ data_till = [15] * stride * self.nlanes
+ data_till[::stride] = data[:n] + [15] * (self.nlanes-n)
+ storen_till = [15] * stride * self.nlanes
+ storen_till += [127]*64
+ self.storen_till(storen_till, stride, n, vdata)
+ assert storen_till[:-64] == data_till
+ assert storen_till[-64:] == [127]*64 # detect overflow
+
+ for stride in range(-64, 0):
+ for n in lanes:
+ data_till = [15] * -stride * self.nlanes
+ data_till[::stride] = data[:n] + [15] * (self.nlanes-n)
+ storen_till = [127]*64
+ storen_till += [15] * -stride * self.nlanes
+ self.storen_till(storen_till, stride, n, vdata)
+ assert storen_till[64:] == data_till
+ assert storen_till[:64] == [127]*64 # detect overflow
+
+ def test_misc(self):
+ broadcast_zero = self.zero()
+ assert broadcast_zero == [0] * self.nlanes
+ for i in range(1, 10):
+ broadcasti = self.setall(i)
+ assert broadcasti == [i] * self.nlanes
+
+ data_a, data_b = self._data(), self._data(reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+        # The Python level of npyv_set_* doesn't support ignoring extra
+        # specified lanes or filling non-specified lanes with zero.
+ vset = self.set(*data_a)
+ assert vset == data_a
+        # The Python level of npyv_setf_* doesn't support ignoring extra
+        # specified lanes or filling non-specified lanes with the specified scalar.
+ vsetf = self.setf(10, *data_a)
+ assert vsetf == data_a
+
+        # We're testing the sanity of _simd's type-vector here; the
+        # reinterpret* intrinsics themselves are exercised by the compiler
+        # during the build of the _simd module.
+ sfxes = ["u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64", "f32"]
+ if self.npyv.simd_f64:
+ sfxes.append("f64")
+ for sfx in sfxes:
+ vec_name = getattr(self, "reinterpret_" + sfx)(vdata_a).__name__
+ assert vec_name == "npyv_" + sfx
+
+ # select & mask operations
+ select_a = self.select(self.cmpeq(self.zero(), self.zero()), vdata_a, vdata_b)
+ assert select_a == data_a
+ select_b = self.select(self.cmpneq(self.zero(), self.zero()), vdata_a, vdata_b)
+ assert select_b == data_b
+
+ # cleanup intrinsic is only used with AVX for
+ # zeroing registers to avoid the AVX-SSE transition penalty,
+ # so nothing to test here
+ self.npyv.cleanup()
+
+ def test_reorder(self):
+ data_a, data_b = self._data(), self._data(reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+ # lower half part
+ data_a_lo = data_a[:self.nlanes//2]
+ data_b_lo = data_b[:self.nlanes//2]
+ # higher half part
+ data_a_hi = data_a[self.nlanes//2:]
+ data_b_hi = data_b[self.nlanes//2:]
+ # combine two lower parts
+ combinel = self.combinel(vdata_a, vdata_b)
+ assert combinel == data_a_lo + data_b_lo
+ # combine two higher parts
+ combineh = self.combineh(vdata_a, vdata_b)
+ assert combineh == data_a_hi + data_b_hi
+ # combine x2
+ combine = self.combine(vdata_a, vdata_b)
+ assert combine == (data_a_lo + data_b_lo, data_a_hi + data_b_hi)
+ # zip(interleave)
+ data_zipl = [v for p in zip(data_a_lo, data_b_lo) for v in p]
+ data_ziph = [v for p in zip(data_a_hi, data_b_hi) for v in p]
+ vzip = self.zip(vdata_a, vdata_b)
+ assert vzip == (data_zipl, data_ziph)
+
+ def test_reorder_rev64(self):
+ # Reverse elements of each 64-bit lane
+ ssize = self._scalar_size()
+ if ssize == 64:
+ return
+ data_rev64 = [
+ y for x in range(0, self.nlanes, 64//ssize)
+ for y in reversed(range(x, x + 64//ssize))
+ ]
+ rev64 = self.rev64(self.load(range(self.nlanes)))
+ assert rev64 == data_rev64
+
+ def test_operators_comparison(self):
+ if self._is_fp():
+ data_a = self._data()
+ else:
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ mask_true = self._true_mask()
+ def to_bool(vector):
+ return [lane == mask_true for lane in vector]
+ # equal
+ data_eq = [a == b for a, b in zip(data_a, data_b)]
+ cmpeq = to_bool(self.cmpeq(vdata_a, vdata_b))
+ assert cmpeq == data_eq
+ # not equal
+ data_neq = [a != b for a, b in zip(data_a, data_b)]
+ cmpneq = to_bool(self.cmpneq(vdata_a, vdata_b))
+ assert cmpneq == data_neq
+ # greater than
+ data_gt = [a > b for a, b in zip(data_a, data_b)]
+ cmpgt = to_bool(self.cmpgt(vdata_a, vdata_b))
+ assert cmpgt == data_gt
+        # greater than or equal
+ data_ge = [a >= b for a, b in zip(data_a, data_b)]
+ cmpge = to_bool(self.cmpge(vdata_a, vdata_b))
+ assert cmpge == data_ge
+ # less than
+ data_lt = [a < b for a, b in zip(data_a, data_b)]
+ cmplt = to_bool(self.cmplt(vdata_a, vdata_b))
+ assert cmplt == data_lt
+        # less than or equal
+ data_le = [a <= b for a, b in zip(data_a, data_b)]
+ cmple = to_bool(self.cmple(vdata_a, vdata_b))
+ assert cmple == data_le
+
+ def test_operators_logical(self):
+ if self._is_fp():
+ data_a = self._data()
+ else:
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ if self._is_fp():
+ data_cast_a = self._to_unsigned(vdata_a)
+ data_cast_b = self._to_unsigned(vdata_b)
+ cast, cast_data = self._to_unsigned, self._to_unsigned
+ else:
+ data_cast_a, data_cast_b = data_a, data_b
+ cast, cast_data = lambda a: a, self.load
+
+ data_xor = cast_data([a ^ b for a, b in zip(data_cast_a, data_cast_b)])
+ vxor = cast(self.xor(vdata_a, vdata_b))
+ assert vxor == data_xor
+
+ data_or = cast_data([a | b for a, b in zip(data_cast_a, data_cast_b)])
+ vor = cast(getattr(self, "or")(vdata_a, vdata_b))
+ assert vor == data_or
+
+ data_and = cast_data([a & b for a, b in zip(data_cast_a, data_cast_b)])
+ vand = cast(getattr(self, "and")(vdata_a, vdata_b))
+ assert vand == data_and
+
+ data_not = cast_data([~a for a in data_cast_a])
+ vnot = cast(getattr(self, "not")(vdata_a))
+ assert vnot == data_not
+
+ def test_conversion_boolean(self):
+ bsfx = "b" + self.sfx[1:]
+ to_boolean = getattr(self.npyv, "cvt_%s_%s" % (bsfx, self.sfx))
+ from_boolean = getattr(self.npyv, "cvt_%s_%s" % (self.sfx, bsfx))
+
+ false_vb = to_boolean(self.setall(0))
+ true_vb = self.cmpeq(self.setall(0), self.setall(0))
+ assert false_vb != true_vb
+
+ false_vsfx = from_boolean(false_vb)
+ true_vsfx = from_boolean(true_vb)
+ assert false_vsfx != true_vsfx
+
+ def test_arithmetic_subadd(self):
+ if self._is_fp():
+ data_a = self._data()
+ else:
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ # non-saturated
+ data_add = self.load([a + b for a, b in zip(data_a, data_b)]) # load to cast
+ add = self.add(vdata_a, vdata_b)
+ assert add == data_add
+ data_sub = self.load([a - b for a, b in zip(data_a, data_b)])
+ sub = self.sub(vdata_a, vdata_b)
+ assert sub == data_sub
+
+ def test_arithmetic_mul(self):
+ if self.sfx in ("u64", "s64"):
+ return
+
+ if self._is_fp():
+ data_a = self._data()
+ else:
+ data_a = self._data(self._int_max() - self.nlanes)
+ data_b = self._data(self._int_min(), reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ data_mul = self.load([a * b for a, b in zip(data_a, data_b)])
+ mul = self.mul(vdata_a, vdata_b)
+ assert mul == data_mul
+
+ def test_arithmetic_div(self):
+ if not self._is_fp():
+ return
+
+ data_a, data_b = self._data(), self._data(reverse=True)
+ vdata_a, vdata_b = self.load(data_a), self.load(data_b)
+
+ # load to truncate f64 to precision of f32
+ data_div = self.load([a / b for a, b in zip(data_a, data_b)])
+ div = self.div(vdata_a, vdata_b)
+ assert div == data_div
+
+ def test_arithmetic_reduce_sum(self):
+ if not self._is_fp():
+ return
+ # reduce sum
+ data = self._data()
+ vdata = self.load(data)
+
+ data_sum = sum(data)
+ vsum = self.sum(vdata)
+ assert vsum == data_sum
+
+bool_sfx = ("b8", "b16", "b32", "b64")
+int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64")
+fp_sfx = ("f32", "f64")
+all_sfx = int_sfx + fp_sfx
+tests_registry = {
+ bool_sfx: _SIMD_BOOL,
+ int_sfx : _SIMD_INT,
+ fp_sfx : _SIMD_FP,
+ all_sfx : _SIMD_ALL
+}
+for target_name, npyv in targets.items():
+ simd_width = npyv.simd if npyv else ''
+ pretty_name = target_name.split('__') # multi-target separator
+ if len(pretty_name) > 1:
+ # multi-target
+ pretty_name = f"({' '.join(pretty_name)})"
+ else:
+ pretty_name = pretty_name[0]
+
+ skip = ""
+ skip_sfx = dict()
+ if not npyv:
+ skip = f"target '{pretty_name}' isn't supported by current machine"
+ elif not npyv.simd:
+ skip = f"target '{pretty_name}' isn't supported by NPYV"
+ elif not npyv.simd_f64:
+ skip_sfx["f64"] = f"target '{pretty_name}' doesn't support double-precision"
+
+ for sfxes, cls in tests_registry.items():
+ for sfx in sfxes:
+ skip_m = skip_sfx.get(sfx, skip)
+ inhr = (cls,)
+ attr = dict(npyv=targets[target_name], sfx=sfx)
+ tcls = type(f"Test{cls.__name__}_{simd_width}_{target_name}_{sfx}", inhr, attr)
+ if skip_m:
+ pytest.mark.skip(reason=skip_m)(tcls)
+ globals()[tcls.__name__] = tcls
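
A sketch of how the generated classes above drive the private
numpy.core._simd module (not part of the diff; which targets exist, and
whether an entry is non-None, depends entirely on the build and the host
CPU):

    from numpy.core._simd import targets

    for name, npyv in targets.items():
        if not npyv or not npyv.simd:
            continue                      # extension not compiled in or unsupported here
        ones = npyv.setall_u32(1)         # broadcast 1 to every 32-bit lane
        ramp = npyv.load_u32(range(npyv.nlanes_u32))
        print(name, list(npyv.add_u32(ones, ramp)))
        break
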
diff --git a/numpy/core/tests/test_simd_module.py b/numpy/core/tests/test_simd_module.py
new file mode 100644
index 000000000..3d710884a
--- /dev/null
+++ b/numpy/core/tests/test_simd_module.py
@@ -0,0 +1,97 @@
+import pytest
+from numpy.core._simd import targets
+"""
+This test unit only checks the sanity of common functionality, so it is
+enough to take one submodule that represents any enabled SIMD extension
+and run the tests on it.  A second submodule is required only for the
+single check on mixing data types between submodules.
+"""
+npyvs = [npyv_mod for npyv_mod in targets.values() if npyv_mod and npyv_mod.simd]
+npyv, npyv2 = (npyvs + [None, None])[:2]
+
+unsigned_sfx = ["u8", "u16", "u32", "u64"]
+signed_sfx = ["s8", "s16", "s32", "s64"]
+fp_sfx = ["f32"]
+if npyv and npyv.simd_f64:
+ fp_sfx.append("f64")
+
+int_sfx = unsigned_sfx + signed_sfx
+all_sfx = unsigned_sfx + int_sfx
+
+@pytest.mark.skipif(not npyv, reason="could not find any SIMD extension with NPYV support")
+class Test_SIMD_MODULE:
+
+ @pytest.mark.parametrize('sfx', all_sfx)
+ def test_num_lanes(self, sfx):
+ nlanes = getattr(npyv, "nlanes_" + sfx)
+ vector = getattr(npyv, "setall_" + sfx)(1)
+ assert len(vector) == nlanes
+
+ @pytest.mark.parametrize('sfx', all_sfx)
+ def test_type_name(self, sfx):
+ vector = getattr(npyv, "setall_" + sfx)(1)
+ assert vector.__name__ == "npyv_" + sfx
+
+ def test_raises(self):
+ a, b = [npyv.setall_u32(1)]*2
+ for sfx in all_sfx:
+ vcb = lambda intrin: getattr(npyv, f"{intrin}_{sfx}")
+ pytest.raises(TypeError, vcb("add"), a)
+ pytest.raises(TypeError, vcb("add"), a, b, a)
+ pytest.raises(TypeError, vcb("setall"))
+ pytest.raises(TypeError, vcb("setall"), [1])
+ pytest.raises(TypeError, vcb("load"), 1)
+ pytest.raises(ValueError, vcb("load"), [1])
+ pytest.raises(ValueError, vcb("store"), [1], getattr(npyv, f"reinterpret_{sfx}_u32")(a))
+
+ @pytest.mark.skipif(not npyv2, reason=(
+ "could not find a second SIMD extension with NPYV support"
+ ))
+ def test_nomix(self):
+ # mix among submodules isn't allowed
+ a = npyv.setall_u32(1)
+ a2 = npyv2.setall_u32(1)
+ pytest.raises(TypeError, npyv.add_u32, a2, a2)
+ pytest.raises(TypeError, npyv2.add_u32, a, a)
+
+ @pytest.mark.parametrize('sfx', unsigned_sfx)
+ def test_unsigned_overflow(self, sfx):
+ nlanes = getattr(npyv, "nlanes_" + sfx)
+ maxu = (1 << int(sfx[1:])) - 1
+ maxu_72 = (1 << 72) - 1
+ lane = getattr(npyv, "setall_" + sfx)(maxu_72)[0]
+ assert lane == maxu
+ lanes = getattr(npyv, "load_" + sfx)([maxu_72] * nlanes)
+ assert lanes == [maxu] * nlanes
+ lane = getattr(npyv, "setall_" + sfx)(-1)[0]
+ assert lane == maxu
+ lanes = getattr(npyv, "load_" + sfx)([-1] * nlanes)
+ assert lanes == [maxu] * nlanes
+
+ @pytest.mark.parametrize('sfx', signed_sfx)
+ def test_signed_overflow(self, sfx):
+ nlanes = getattr(npyv, "nlanes_" + sfx)
+ maxs_72 = (1 << 71) - 1
+ lane = getattr(npyv, "setall_" + sfx)(maxs_72)[0]
+ assert lane == -1
+ lanes = getattr(npyv, "load_" + sfx)([maxs_72] * nlanes)
+ assert lanes == [-1] * nlanes
+ mins_72 = -1 << 71
+ lane = getattr(npyv, "setall_" + sfx)(mins_72)[0]
+ assert lane == 0
+ lanes = getattr(npyv, "load_" + sfx)([mins_72] * nlanes)
+ assert lanes == [0] * nlanes
+
+ def test_truncate_f32(self):
+ f32 = npyv.setall_f32(0.1)[0]
+ assert f32 != 0.1
+ assert round(f32, 1) == 0.1
+
+ def test_compare(self):
+ data_range = range(0, npyv.nlanes_u32)
+ vdata = npyv.load_u32(data_range)
+ assert vdata == list(data_range)
+ assert vdata == tuple(data_range)
+ for i in data_range:
+ assert vdata[i] == data_range[i]
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index 1305f4877..aa17d6b08 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -1,5 +1,6 @@
import warnings
import itertools
+import sys
import pytest
@@ -11,7 +12,7 @@ import numpy.core._rational_tests as _rational_tests
from numpy.testing import (
assert_, assert_equal, assert_raises, assert_array_equal,
assert_almost_equal, assert_array_almost_equal, assert_no_warnings,
- assert_allclose,
+ assert_allclose, HAS_REFCOUNT,
)
from numpy.compat import pickle
@@ -177,6 +178,10 @@ class TestUfuncGenericLoops:
assert_array_equal(res_num.astype("O"), res_obj)
+def _pickleable_module_global():
+ pass
+
+
class TestUfunc:
def test_pickle(self):
for proto in range(2, pickle.HIGHEST_PROTOCOL + 1):
@@ -194,6 +199,15 @@ class TestUfunc:
b"(S'numpy.core.umath'\np1\nS'cos'\np2\ntp3\nRp4\n.")
assert_(pickle.loads(astring) is np.cos)
+ def test_pickle_name_is_qualname(self):
+ # This tests that a simplification of our ufunc pickle code will
+ # lead to allowing qualnames as names. Future ufuncs should
+        # possibly add a specific qualname, or a hook into pickling instead
+ # (dask+numba may benefit).
+ _pickleable_module_global.ufunc = umt._pickleable_module_global_ufunc
+ obj = pickle.loads(pickle.dumps(_pickleable_module_global.ufunc))
+ assert obj is umt._pickleable_module_global_ufunc
+
def test_reduceat_shifting_sum(self):
L = 6
x = np.arange(L)
@@ -1319,6 +1333,18 @@ class TestUfunc:
m = np.array([True], dtype=bool)
assert_equal(np.sqrt(a, where=m), [1])
+ def test_where_with_broadcasting(self):
+ # See gh-17198
+ a = np.random.random((5000, 4))
+ b = np.random.random((5000, 1))
+
+ where = a > 0.3
+ out = np.full_like(a, 0)
+ np.less(a, b, where=where, out=out)
+ b_where = np.broadcast_to(b, a.shape)[where]
+ assert_array_equal((a[where] < b_where), out[where].astype(bool))
+ assert not out[~where].any() # outside mask, out remains all 0
+
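
For readers unfamiliar with the masking used in the test above, a minimal
sketch (not part of the diff): positions where the mask is False are simply
left untouched in `out`:

    import numpy as np

    a = np.array([0.2, 0.35, 0.8])
    b = np.array([0.4])                 # broadcasts against `a`
    where = a > 0.3                     # [False, True, True]
    out = np.zeros_like(a)
    np.less(a, b, where=where, out=out)
    print(out)                          # [0. 1. 0.] -- index 0 was never written
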
def check_identityless_reduction(self, a):
# np.minimum.reduce is an identityless reduction
@@ -2074,3 +2100,60 @@ def test_ufunc_warn_with_nan(ufunc):
else:
raise ValueError('ufunc with more than 2 inputs')
+
+@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+def test_ufunc_casterrors():
+ # Tests that casting errors are correctly reported and buffers are
+ # cleared.
+ # The following array can be added to itself as an object array, but
+ # the result cannot be cast to an integer output:
+ value = 123 # relies on python cache (leak-check will still find it)
+ arr = np.array([value] * int(np.BUFSIZE * 1.5) +
+ ["string"] +
+ [value] * int(1.5 * np.BUFSIZE), dtype=object)
+ out = np.ones(len(arr), dtype=np.intp)
+
+ count = sys.getrefcount(value)
+ with pytest.raises(ValueError):
+ # Output casting failure:
+ np.add(arr, arr, out=out, casting="unsafe")
+
+ assert count == sys.getrefcount(value)
+ # output is unchanged after the error, this shows that the iteration
+ # was aborted (this is not necessarily defined behaviour)
+ assert out[-1] == 1
+
+ with pytest.raises(ValueError):
+ # Input casting failure:
+ np.add(arr, arr, out=out, dtype=np.intp, casting="unsafe")
+
+ assert count == sys.getrefcount(value)
+ # output is unchanged after the error, this shows that the iteration
+ # was aborted (this is not necessarily defined behaviour)
+ assert out[-1] == 1
+
+
+@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
+@pytest.mark.parametrize("offset",
+ [0, np.BUFSIZE//2, int(1.5*np.BUFSIZE)])
+def test_reduce_casterrors(offset):
+ # Test reporting of casting errors in reductions, we test various
+ # offsets to where the casting error will occur, since these may occur
+ # at different places during the reduction procedure. For example
+ # the first item may be special.
+ value = 123 # relies on python cache (leak-check will still find it)
+ arr = np.array([value] * offset +
+ ["string"] +
+ [value] * int(1.5 * np.BUFSIZE), dtype=object)
+ out = np.array(-1, dtype=np.intp)
+
+ count = sys.getrefcount(value)
+ with pytest.raises(ValueError):
+ # This is an unsafe cast, but we currently always allow that:
+ np.add.reduce(arr, dtype=np.intp, out=out)
+ assert count == sys.getrefcount(value)
+ # If an error occurred during casting, the operation is done at most until
+ # the error occurs (the result of which would be `value * offset`) and -1
+ # if the error happened immediately.
+ # This does not define behaviour, the output is invalid and thus undefined
+ assert out[()] < value * offset
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 941d99521..8162e52bd 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -13,7 +13,7 @@ from numpy.testing import (
assert_, assert_equal, assert_raises, assert_raises_regex,
assert_array_equal, assert_almost_equal, assert_array_almost_equal,
assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings,
- _gen_alignment_data, assert_array_almost_equal_nulp
+ _gen_alignment_data, assert_array_almost_equal_nulp, assert_warns
)
def on_powerpc():
@@ -249,6 +249,67 @@ class TestDivision:
assert_equal(x // 100, [0, 0, 0, 1, -1, -1, -1, -1, -2])
assert_equal(x % 100, [5, 10, 90, 0, 95, 90, 10, 0, 80])
+ @pytest.mark.parametrize("input_dtype",
+ [np.int8, np.int16, np.int32, np.int64])
+ def test_division_int_boundary(self, input_dtype):
+ iinfo = np.iinfo(input_dtype)
+
+ # Create list with min, 25th percentile, 0, 75th percentile, max
+ lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max]
+ divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max]
+ a = np.array(lst, dtype=input_dtype)
+
+ for divisor in divisors:
+ div_a = a // divisor
+ b = a.copy(); b //= divisor
+ div_lst = [i // divisor for i in lst]
+
+ msg = "Integer arrays floor division check (//)"
+ assert all(div_a == div_lst), msg
+
+ msg = "Integer arrays floor division check (//=)"
+ assert all(div_a == b), msg
+
+ with np.errstate(divide='raise'):
+ with pytest.raises(FloatingPointError):
+ a // 0
+ with pytest.raises(FloatingPointError):
+ a //= 0
+
+ np.array([], dtype=input_dtype) // 0
+
+ @pytest.mark.parametrize(
+ "dividend,divisor,quotient",
+ [(np.timedelta64(2,'Y'), np.timedelta64(2,'M'), 12),
+ (np.timedelta64(2,'Y'), np.timedelta64(-2,'M'), -12),
+ (np.timedelta64(-2,'Y'), np.timedelta64(2,'M'), -12),
+ (np.timedelta64(-2,'Y'), np.timedelta64(-2,'M'), 12),
+ (np.timedelta64(2,'M'), np.timedelta64(-2,'Y'), -1),
+ (np.timedelta64(2,'Y'), np.timedelta64(0,'M'), 0),
+ (np.timedelta64(2,'Y'), 2, np.timedelta64(1,'Y')),
+ (np.timedelta64(2,'Y'), -2, np.timedelta64(-1,'Y')),
+ (np.timedelta64(-2,'Y'), 2, np.timedelta64(-1,'Y')),
+ (np.timedelta64(-2,'Y'), -2, np.timedelta64(1,'Y')),
+ (np.timedelta64(-2,'Y'), -2, np.timedelta64(1,'Y')),
+ (np.timedelta64(-2,'Y'), -3, np.timedelta64(0,'Y')),
+ (np.timedelta64(-2,'Y'), 0, np.timedelta64('Nat','Y')),
+ ])
+ def test_division_int_timedelta(self, dividend, divisor, quotient):
+        # Check the quotient when the divisor is nonzero and the quotient
+        # is not NaT; otherwise expect a division-by-zero error below.
+ if divisor and (isinstance(quotient, int) or not np.isnat(quotient)):
+ msg = "Timedelta floor division check"
+ assert dividend // divisor == quotient, msg
+
+ # Test for arrays as well
+ msg = "Timedelta arrays floor division check"
+ dividend_array = np.array([dividend]*5)
+ quotient_array = np.array([quotient]*5)
+ assert all(dividend_array // divisor == quotient_array), msg
+ else:
+ with np.errstate(divide='raise', invalid='raise'):
+ with pytest.raises(FloatingPointError):
+ dividend // divisor
+
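
As a reminder of the errstate machinery these checks rely on (not part of the
diff): integer division by zero returns 0 and warns by default, and
np.errstate(divide='raise') turns it into a FloatingPointError:

    import numpy as np

    a = np.array([1, 2, 3], dtype=np.int32)
    with np.errstate(divide="ignore"):
        print(a // 0)                   # [0 0 0]
    with np.errstate(divide="raise"):
        try:
            a // 0
        except FloatingPointError as exc:
            print("raised:", exc)
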
def test_division_complex(self):
# check that implementation is correct
msg = "Complex division implementation check"
@@ -293,6 +354,42 @@ class TestDivision:
assert_equal(np.signbit(x//1), 0)
assert_equal(np.signbit((-x)//1), 1)
+ @pytest.mark.parametrize('dtype', np.typecodes['Float'])
+ def test_floor_division_errors(self, dtype):
+ fnan = np.array(np.nan, dtype=dtype)
+ fone = np.array(1.0, dtype=dtype)
+ fzer = np.array(0.0, dtype=dtype)
+ finf = np.array(np.inf, dtype=dtype)
+ # divide by zero error check
+ with np.errstate(divide='raise', invalid='ignore'):
+ assert_raises(FloatingPointError, np.floor_divide, fone, fzer)
+ with np.errstate(invalid='raise'):
+ assert_raises(FloatingPointError, np.floor_divide, fnan, fone)
+ assert_raises(FloatingPointError, np.floor_divide, fone, fnan)
+ assert_raises(FloatingPointError, np.floor_divide, fnan, fzer)
+
+ @pytest.mark.parametrize('dtype', np.typecodes['Float'])
+ def test_floor_division_corner_cases(self, dtype):
+ # test corner cases like 1.0//0.0 for errors and return vals
+ x = np.zeros(10, dtype=dtype)
+ y = np.ones(10, dtype=dtype)
+ fnan = np.array(np.nan, dtype=dtype)
+ fone = np.array(1.0, dtype=dtype)
+ fzer = np.array(0.0, dtype=dtype)
+ finf = np.array(np.inf, dtype=dtype)
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning, "invalid value encountered in floor_divide")
+ div = np.floor_divide(fnan, fone)
+            assert np.isnan(div), "dtype: %s, div: %s" % (dtype, div)
+            div = np.floor_divide(fone, fnan)
+            assert np.isnan(div), "dtype: %s, div: %s" % (dtype, div)
+            div = np.floor_divide(fnan, fzer)
+            assert np.isnan(div), "dtype: %s, div: %s" % (dtype, div)
+ # verify 1.0//0.0 computations return inf
+ with np.errstate(divide='ignore'):
+ z = np.floor_divide(y, x)
+ assert_(np.isinf(z).all())
+
def floor_divide_and_remainder(x, y):
return (np.floor_divide(x, y), np.remainder(x, y))
@@ -366,9 +463,90 @@ class TestRemainder:
else:
assert_(b > rem >= 0, msg)
+ @pytest.mark.parametrize('dtype', np.typecodes['Float'])
+ def test_float_divmod_errors(self, dtype):
+ # Check valid errors raised for divmod and remainder
+ fzero = np.array(0.0, dtype=dtype)
+ fone = np.array(1.0, dtype=dtype)
+ finf = np.array(np.inf, dtype=dtype)
+ fnan = np.array(np.nan, dtype=dtype)
+        # divmod combines the divide and remainder ops, so a single call
+        # can set both the divide-by-zero and the invalid flags
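+        # Rough sketch of the expected IEEE behaviour when both flags are
+        # merely ignored (not part of the checks below):
+        #     np.divmod(1.0, 0.0)  ->  (inf, nan)
+        # inf trips the divide-by-zero flag, nan trips the invalid flag.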
+ with np.errstate(divide='raise', invalid='ignore'):
+ assert_raises(FloatingPointError, np.divmod, fone, fzero)
+ with np.errstate(divide='ignore', invalid='raise'):
+ assert_raises(FloatingPointError, np.divmod, fone, fzero)
+ with np.errstate(invalid='raise'):
+ assert_raises(FloatingPointError, np.divmod, fzero, fzero)
+ with np.errstate(invalid='raise'):
+ assert_raises(FloatingPointError, np.divmod, finf, finf)
+ with np.errstate(divide='ignore', invalid='raise'):
+ assert_raises(FloatingPointError, np.divmod, finf, fzero)
+ with np.errstate(divide='raise', invalid='ignore'):
+ assert_raises(FloatingPointError, np.divmod, finf, fzero)
+
+ @pytest.mark.parametrize('dtype', np.typecodes['Float'])
+ @pytest.mark.parametrize('fn', [np.fmod, np.remainder])
+ def test_float_remainder_errors(self, dtype, fn):
+ fzero = np.array(0.0, dtype=dtype)
+ fone = np.array(1.0, dtype=dtype)
+ finf = np.array(np.inf, dtype=dtype)
+ fnan = np.array(np.nan, dtype=dtype)
+ with np.errstate(invalid='raise'):
+ assert_raises(FloatingPointError, fn, fone, fzero)
+ assert_raises(FloatingPointError, fn, fnan, fzero)
+ assert_raises(FloatingPointError, fn, fone, fnan)
+ assert_raises(FloatingPointError, fn, fnan, fone)
+
+ def test_float_remainder_overflow(self):
+ a = np.finfo(np.float64).tiny
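+        # 4 / tiny is 2**1024, just past the largest finite float64
+        # (~1.798e308), so the quotient overflows to inf while the
+        # remainder is exactly 0.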
+ with np.errstate(over='ignore', invalid='ignore'):
+ div, mod = np.divmod(4, a)
+            assert_(np.isinf(div))
+ assert_(mod == 0)
+ with np.errstate(over='raise', invalid='ignore'):
+ assert_raises(FloatingPointError, np.divmod, 4, a)
+ with np.errstate(invalid='raise', over='ignore'):
+ assert_raises(FloatingPointError, np.divmod, 4, a)
+
+ def test_float_divmod_corner_cases(self):
+ # check nan cases
+ for dt in np.typecodes['Float']:
+ fnan = np.array(np.nan, dtype=dt)
+ fone = np.array(1.0, dtype=dt)
+ fzer = np.array(0.0, dtype=dt)
+ finf = np.array(np.inf, dtype=dt)
+ with suppress_warnings() as sup:
+ sup.filter(RuntimeWarning, "invalid value encountered in divmod")
+ sup.filter(RuntimeWarning, "divide by zero encountered in divmod")
+ div, rem = np.divmod(fone, fzer)
+                assert np.isinf(div), 'dt: %s, div: %s' % (dt, div)
+                assert np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem)
+                div, rem = np.divmod(fzer, fzer)
+                assert np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem)
+                assert np.isnan(div), 'dt: %s, div: %s' % (dt, div)
+                div, rem = np.divmod(finf, finf)
+                assert np.isnan(div), 'dt: %s, div: %s' % (dt, div)
+                assert np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem)
+                div, rem = np.divmod(finf, fzer)
+                assert np.isinf(div), 'dt: %s, div: %s' % (dt, div)
+                assert np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem)
+                div, rem = np.divmod(fnan, fone)
+                assert np.isnan(rem), "dt: %s, rem: %s" % (dt, rem)
+                assert np.isnan(div), "dt: %s, div: %s" % (dt, div)
+                div, rem = np.divmod(fone, fnan)
+                assert np.isnan(rem), "dt: %s, rem: %s" % (dt, rem)
+                assert np.isnan(div), "dt: %s, div: %s" % (dt, div)
+                div, rem = np.divmod(fnan, fzer)
+                assert np.isnan(rem), "dt: %s, rem: %s" % (dt, rem)
+                assert np.isnan(div), "dt: %s, div: %s" % (dt, div)
+
def test_float_remainder_corner_cases(self):
# Check remainder magnitude.
for dt in np.typecodes['Float']:
+ fone = np.array(1.0, dtype=dt)
+ fzer = np.array(0.0, dtype=dt)
+ fnan = np.array(np.nan, dtype=dt)
b = np.array(1.0, dtype=dt)
a = np.nextafter(np.array(0.0, dtype=dt), -b)
rem = np.remainder(a, b)
@@ -379,6 +557,7 @@ class TestRemainder:
# Check nans, inf
with suppress_warnings() as sup:
sup.filter(RuntimeWarning, "invalid value encountered in remainder")
+ sup.filter(RuntimeWarning, "invalid value encountered in fmod")
for dt in np.typecodes['Float']:
fone = np.array(1.0, dtype=dt)
fzer = np.array(0.0, dtype=dt)
@@ -389,10 +568,30 @@ class TestRemainder:
# MSVC 2008 returns NaN here, so disable the check.
#rem = np.remainder(fone, finf)
#assert_(rem == fone, 'dt: %s, rem: %s' % (dt, rem))
+ rem = np.remainder(finf, fone)
+ fmod = np.fmod(finf, fone)
+ assert_(np.isnan(fmod), 'dt: %s, fmod: %s' % (dt, fmod))
+ assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
+ rem = np.remainder(finf, finf)
+                fmod = np.fmod(finf, finf)
+ assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
+ assert_(np.isnan(fmod), 'dt: %s, fmod: %s' % (dt, fmod))
+ rem = np.remainder(finf, fzer)
+ fmod = np.fmod(finf, fzer)
+ assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
+ assert_(np.isnan(fmod), 'dt: %s, fmod: %s' % (dt, fmod))
rem = np.remainder(fone, fnan)
+ fmod = np.fmod(fone, fnan)
assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
- rem = np.remainder(finf, fone)
+ assert_(np.isnan(fmod), 'dt: %s, fmod: %s' % (dt, fmod))
+ rem = np.remainder(fnan, fzer)
+ fmod = np.fmod(fnan, fzer)
assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
+                assert_(np.isnan(fmod), 'dt: %s, fmod: %s' % (dt, fmod))
+ rem = np.remainder(fnan, fone)
+ fmod = np.fmod(fnan, fone)
+ assert_(np.isnan(rem), 'dt: %s, rem: %s' % (dt, rem))
+                assert_(np.isnan(fmod), 'dt: %s, fmod: %s' % (dt, fmod))
class TestCbrt:
@@ -657,6 +856,24 @@ class TestLog:
yf = np.array(y, dtype=dt)*log2_
assert_almost_equal(np.log(xf), yf)
+        # test aliasing (issue #17761)
+ x = np.array([2, 0.937500, 3, 0.947500, 1.054697])
+ xf = np.log(x)
+ assert_almost_equal(np.log(x, out=x), xf)
+
+ def test_log_strides(self):
+ np.random.seed(42)
+ strides = np.array([-4,-3,-2,-1,1,2,3,4])
+ sizes = np.arange(2,100)
+ for ii in sizes:
+            x_f64 = np.float64(np.random.uniform(low=0.01, high=100.0, size=ii))
+ x_special = x_f64.copy()
+ x_special[3:-1:4] = 1.0
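+            # log(1.0) is exactly 0.0; scattering ones through the input is
+            # presumably meant to exercise the special-value handling of the
+            # vectorized log loop at every stride.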
+ y_true = np.log(x_f64)
+ y_special = np.log(x_special)
+ for jj in strides:
+ assert_array_almost_equal_nulp(np.log(x_f64[::jj]), y_true[::jj], nulp=2)
+ assert_array_almost_equal_nulp(np.log(x_special[::jj]), y_special[::jj], nulp=2)
class TestExp:
def test_exp_values(self):
@@ -883,6 +1100,10 @@ class TestAVXFloat32Transcendental:
x_f64 = np.float64(x_f32)
assert_array_max_ulp(np.sin(x_f32), np.float32(np.sin(x_f64)), maxulp=2)
assert_array_max_ulp(np.cos(x_f32), np.float32(np.cos(x_f64)), maxulp=2)
+        # test aliasing (issue #17761)
+ tx_f32 = x_f32.copy()
+ assert_array_max_ulp(np.sin(x_f32, out=x_f32), np.float32(np.sin(x_f64)), maxulp=2)
+ assert_array_max_ulp(np.cos(tx_f32, out=tx_f32), np.float32(np.cos(x_f64)), maxulp=2)
def test_strided_float32(self):
np.random.seed(42)
@@ -2444,7 +2665,7 @@ class TestSpecialMethods:
assert_raises(ValueError, inner1d, a, a, out=())
def test_ufunc_override_with_super(self):
- # NOTE: this class is given as an example in doc/subclassing.py;
+ # NOTE: this class is used in doc/source/user/basics.subclassing.rst
# if you make any changes here, do update it there too.
class A(np.ndarray):
def __array_ufunc__(self, ufunc, method, *inputs, out=None, **kwargs):
@@ -3270,3 +3491,39 @@ def test_outer_subclass_preserve(arr):
class foo(np.ndarray): pass
actual = np.multiply.outer(arr.view(foo), arr.view(foo))
assert actual.__class__.__name__ == 'foo'
+
+def test_outer_bad_subclass():
+ class BadArr1(np.ndarray):
+ def __array_finalize__(self, obj):
+ # The outer call reshapes to 3 dims, try to do a bad reshape.
+ if self.ndim == 3:
+ self.shape = self.shape + (1,)
+
+ def __array_prepare__(self, obj, context=None):
+ return obj
+
+ class BadArr2(np.ndarray):
+ def __array_finalize__(self, obj):
+ if isinstance(obj, BadArr2):
+ # outer inserts 1-sized dims. In that case disturb them.
+ if self.shape[-1] == 1:
+ self.shape = self.shape[::-1]
+
+ def __array_prepare__(self, obj, context=None):
+ return obj
+
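+    # Shape sketch for reference: np.add.outer on shapes (2, 3) and (2,)
+    # yields shape (2, 3, 2), roughly a.reshape(2, 3, 1) + b.reshape(1, 1, 2);
+    # the subclasses above corrupt that reshaped view in __array_finalize__,
+    # which ufunc.outer should reject.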
+ for cls in [BadArr1, BadArr2]:
+ arr = np.ones((2, 3)).view(cls)
+        with assert_raises(TypeError):
+ # The first array gets reshaped (not the second one)
+ np.add.outer(arr, [1, 2])
+
+ # This actually works, since we only see the reshaping error:
+ arr = np.ones((2, 3)).view(cls)
+ assert type(np.add.outer([1, 2], arr)) is cls
+
+def test_outer_exceeds_maxdims():
+ deep = np.ones((1,) * 17)
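+    # Two 17-d operands would need a 34-d result, which exceeds NumPy's
+    # 32-dimension limit (NPY_MAXDIMS), so outer must raise.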
+ with assert_raises(ValueError):
+ np.add.outer(deep, deep)
+