119 files changed, 5879 insertions, 1690 deletions
diff --git a/numpy/core/__init__.py b/numpy/core/__init__.py
index dad9293e1..b89e27f0f 100644
--- a/numpy/core/__init__.py
+++ b/numpy/core/__init__.py
@@ -9,6 +9,7 @@ are available in the main ``numpy`` namespace - use that instead.
 from numpy.version import version as __version__
 
 import os
+import warnings
 
 # disables OpenBLAS affinity setting of the main thread that limits
 # python threads or processes to one core
@@ -80,8 +81,8 @@ from .memmap import *
 from .defchararray import chararray
 from . import function_base
 from .function_base import *
-from . import machar
-from .machar import *
+from . import _machar
+from ._machar import *
 from . import getlimits
 from .getlimits import *
 from . import shape_base
@@ -105,11 +106,9 @@ from . import _methods
 
 __all__ = ['char', 'rec', 'memmap']
 __all__ += numeric.__all__
-__all__ += fromnumeric.__all__
 __all__ += ['record', 'recarray', 'format_parser']
 __all__ += ['chararray']
 __all__ += function_base.__all__
-__all__ += machar.__all__
 __all__ += getlimits.__all__
 __all__ += shape_base.__all__
 __all__ += einsumfunc.__all__
@@ -151,6 +150,17 @@ def _DType_reduce(DType):
     return _DType_reconstruct, (scalar_type,)
 
 
+def __getattr__(name):
+    # Deprecated 2021-10-20, NumPy 1.22
+    if name == "machar":
+        warnings.warn(
+            "The `np.core.machar` module is deprecated (NumPy 1.22)",
+            DeprecationWarning, stacklevel=2,
+        )
+        return _machar
+    raise AttributeError(f"Module {__name__!r} has no attribute {name!r}")
+
+
 import copyreg
 
 copyreg.pickle(ufunc, _ufunc_reduce)
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index 7467be80f..078c58976 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -328,7 +328,7 @@ add_newdoc('numpy.core', 'nditer',
     ...     with it:
     ...         for (a, b, c) in it:
     ...             addop(a, b, out=c)
-    ...     return it.operands[2]
+    ...         return it.operands[2]
 
     Here is the same function, but following the C-style pattern:
 
@@ -1573,6 +1573,19 @@ add_newdoc('numpy.core.multiarray', 'frombuffer',
         array_function_like_doc,
     ))
 
+add_newdoc('numpy.core.multiarray', '_from_dlpack',
+    """
+    _from_dlpack(x, /)
+
+    Create a NumPy array from an object implementing the ``__dlpack__``
+    protocol.
+
+    See Also
+    --------
+    `Array API documentation
+    <https://data-apis.org/array-api/latest/design_topics/data_interchange.html#syntax-for-data-interchange-with-dlpack>`_
+    """)
+
 add_newdoc('numpy.core', 'fastCopyAndTranspose',
     """_fastCopyAndTranspose(a)""")
 
@@ -2263,6 +2276,15 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('__array_priority__',
 add_newdoc('numpy.core.multiarray', 'ndarray', ('__array_struct__',
     """Array protocol: C-struct side."""))
 
+add_newdoc('numpy.core.multiarray', 'ndarray', ('__dlpack__',
+    """a.__dlpack__(*, stream=None)
+    
+    DLPack Protocol: Part of the Array API."""))
+
+add_newdoc('numpy.core.multiarray', 'ndarray', ('__dlpack_device__',
+    """a.__dlpack_device__()
+    
+    DLPack Protocol: Part of the Array API."""))
 
 add_newdoc('numpy.core.multiarray', 'ndarray', ('base',
     """
@@ -2819,7 +2841,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('__class_getitem__',
     >>> import numpy as np
 
     >>> np.ndarray[Any, np.dtype[Any]]
-    numpy.ndarray[typing.Any, numpy.dtype[Any]]
+    numpy.ndarray[typing.Any, numpy.dtype[typing.Any]]
 
     Notes
     -----
@@ -4727,6 +4749,26 @@ add_newdoc('numpy.core.umath', '_add_newdoc_ufunc',
     and then throwing away the ufunc.
     """)
 
+add_newdoc('numpy.core.multiarray', 'get_handler_name',
+    """
+    get_handler_name(a: ndarray) -> str,None
+
+    Return the name of the memory handler used by `a`. If not provided, return
+    the name of the memory handler that will be used to allocate data for the
+    next `ndarray` in this context. May return None if `a` does not own its
+    memory, in which case you can traverse ``a.base`` for a memory handler.
+    """)
+
+add_newdoc('numpy.core.multiarray', 'get_handler_version',
+    """
+    get_handler_version(a: ndarray) -> int,None
+
+    Return the version of the memory handler used by `a`. If not provided,
+    return the version of the memory handler that will be used to allocate data
+    for the next `ndarray` in this context. May return None if `a` does not own
+    its memory, in which case you can traverse ``a.base`` for a memory handler.
+    """)
+
 add_newdoc('numpy.core.multiarray', '_set_madvise_hugepage',
     """
     _set_madvise_hugepage(enabled: bool) -> bool
diff --git a/numpy/core/_add_newdocs_scalars.py b/numpy/core/_add_newdocs_scalars.py
index 8773d6c96..94859a9d5 100644
--- a/numpy/core/_add_newdocs_scalars.py
+++ b/numpy/core/_add_newdocs_scalars.py
@@ -290,3 +290,22 @@ for float_name in ('half', 'single', 'double', 'longdouble'):
         >>> np.{float_name}(3.2).is_integer()
         False
         """))
+
+for int_name in ('int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32',
+        'int64', 'uint64', 'int64', 'uint64', 'int64', 'uint64'):
+    # Add negative examples for signed cases by checking typecode
+    add_newdoc('numpy.core.numerictypes', int_name, ('bit_count',
+        f"""
+        {int_name}.bit_count() -> int
+
+        Computes the number of 1-bits in the absolute value of the input.
+        Analogous to the builtin `int.bit_count` or ``popcount`` in C++.
+
+        Examples
+        --------
+        >>> np.{int_name}(127).bit_count()
+        7""" +
+        (f"""
+        >>> np.{int_name}(-127).bit_count()
+        7
+        """ if dtype(int_name).char.islower() else "")))
diff --git a/numpy/core/machar.py b/numpy/core/_machar.py
index c77be793f..ace19a429 100644
--- a/numpy/core/machar.py
+++ b/numpy/core/_machar.py
@@ -13,6 +13,7 @@ from numpy.core.overrides import set_module
 
 # Need to speed this up...especially for longfloat
 
+# Deprecated 2021-10-20, NumPy 1.22
 @set_module('numpy')
 class MachAr:
     """
diff --git a/numpy/core/_ufunc_config.pyi b/numpy/core/_ufunc_config.pyi
index 9c8cc8ab6..cd7129bcb 100644
--- a/numpy/core/_ufunc_config.pyi
+++ b/numpy/core/_ufunc_config.pyi
@@ -1,11 +1,10 @@
-from typing import Optional, Union, Callable, Any, Literal, Protocol, TypedDict
+from typing import Optional, Union, Callable, Any, Literal, TypedDict
+
+from numpy import _SupportsWrite
 
 _ErrKind = Literal["ignore", "warn", "raise", "call", "print", "log"]
 _ErrFunc = Callable[[str, int], Any]
 
-class _SupportsWrite(Protocol):
-    def write(self, msg: str, /) -> Any: ...
-
 class _ErrDict(TypedDict):
     divide: _ErrKind
     over: _ErrKind
@@ -30,8 +29,8 @@ def geterr() -> _ErrDict: ...
 def setbufsize(size: int) -> int: ...
 def getbufsize() -> int: ...
 def seterrcall(
-    func: Union[None, _ErrFunc, _SupportsWrite]
-) -> Union[None, _ErrFunc, _SupportsWrite]: ...
-def geterrcall() -> Union[None, _ErrFunc, _SupportsWrite]: ...
+    func: Union[None, _ErrFunc, _SupportsWrite[str]]
+) -> Union[None, _ErrFunc, _SupportsWrite[str]]: ...
+def geterrcall() -> Union[None, _ErrFunc, _SupportsWrite[str]]: ...
 
 # See `numpy/__init__.pyi` for the `errstate` class
diff --git a/numpy/core/code_generators/cversions.txt b/numpy/core/code_generators/cversions.txt
index a02c7153a..e1ee8a860 100644
--- a/numpy/core/code_generators/cversions.txt
+++ b/numpy/core/code_generators/cversions.txt
@@ -1,6 +1,8 @@
 # Hash below were defined from numpy_api_order.txt and ufunc_api_order.txt
 # When adding a new version here for a new minor release, also add the same
-# version as NPY_x_y_API_VERSION in numpyconfig.h
+# version as NPY_x_y_API_VERSION in numpyconfig.h and C_API_VERSION in
+# setup_common.py.
+
 0x00000001 = 603580d224763e58c5e7147f804dc0f5
 0x00000002 = 8ecb29306758515ae69749c803a75da1
 0x00000003 = bf22c0d05b31625d2a7015988d61ce5a
@@ -56,5 +58,8 @@
 # DType related API additions.
 # A new field was added to the end of PyArrayObject_fields.
 # Version 14 (NumPy 1.21) No change.
-# Version 14 (NumPy 1.22) No change.
 0x0000000e = 17a0f366e55ec05e5c5c149123478452
+
+# Version 15 (NumPy 1.22) Configurable memory allocations
+# Version 14 (NumPy 1.23) No change.
+0x0000000f = b8783365b873681cd204be50cdfb448d
diff --git a/numpy/core/code_generators/genapi.py b/numpy/core/code_generators/genapi.py
index c2458c2b5..b401ee6a5 100644
--- a/numpy/core/code_generators/genapi.py
+++ b/numpy/core/code_generators/genapi.py
@@ -41,6 +41,7 @@ API_FILES = [join('multiarray', 'alloc.c'),
              join('multiarray', 'datetime_busdaycal.c'),
              join('multiarray', 'datetime_strings.c'),
              join('multiarray', 'descriptor.c'),
+             join('multiarray', 'dlpack.c'),
              join('multiarray', 'dtypemeta.c'),
              join('multiarray', 'einsum.c.src'),
              join('multiarray', 'flagsobject.c'),
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 9fa87a11e..dc71fc5c9 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -829,7 +829,7 @@ defdict = {
           docstrings.get('numpy.core.umath.ceil'),
           None,
           TD('e', f='ceil', astype={'e': 'f'}),
-          TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
+          TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
           TD('fdg', f='ceil'),
           TD(O, f='npy_ObjectCeil'),
           ),
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index fbd323368..d12d62d8f 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -19,6 +19,7 @@ from code_generators.genapi import StealRef, NonNull
 multiarray_global_vars = {
     'NPY_NUMUSERTYPES':             (7, 'int'),
     'NPY_DEFAULT_ASSIGN_CASTING':   (292, 'NPY_CASTING'),
+    'PyDataMem_DefaultHandler':     (306, 'PyObject*'),
 }
 
 multiarray_scalar_bool_values = {
@@ -76,9 +77,9 @@ multiarray_types_api = {
     # End 1.6 API
 }
 
-#define NPY_NUMUSERTYPES (*(int *)PyArray_API[6])
-#define PyBoolArrType_Type (*(PyTypeObject *)PyArray_API[7])
-#define _PyArrayScalar_BoolValues ((PyBoolScalarObject *)PyArray_API[8])
+# define NPY_NUMUSERTYPES (*(int *)PyArray_API[6])
+# define PyBoolArrType_Type (*(PyTypeObject *)PyArray_API[7])
+# define _PyArrayScalar_BoolValues ((PyBoolScalarObject *)PyArray_API[8])
 
 multiarray_funcs_api = {
     'PyArray_GetNDArrayCVersion':           (0,),
@@ -350,6 +351,9 @@ multiarray_funcs_api = {
     'PyArray_ResolveWritebackIfCopy':       (302,),
     'PyArray_SetWritebackIfCopyBase':       (303,),
     # End 1.14 API
+    'PyDataMem_SetHandler':                 (304,),
+    'PyDataMem_GetHandler':                 (305,),
+    # End 1.21 API
 }
 
 ufunc_types_api = {
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index 4e1182de6..cd584eea7 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -1420,7 +1420,7 @@ add_newdoc('numpy.core.umath', 'floor_divide',
 
 add_newdoc('numpy.core.umath', 'fmod',
     """
-    Return the element-wise remainder of division.
+    Returns the element-wise remainder of division.
 
     This is the NumPy implementation of the C library function fmod, the
     remainder has the same sign as the dividend `x1`. It is equivalent to
@@ -1678,7 +1678,7 @@ add_newdoc('numpy.core.umath', 'invert',
 
 add_newdoc('numpy.core.umath', 'isfinite',
     """
-    Test element-wise for finiteness (not infinity or not Not a Number).
+    Test element-wise for finiteness (not infinity and not Not a Number).
 
     The result is returned as a boolean array.
 
@@ -3065,8 +3065,14 @@ add_newdoc('numpy.core.umath', 'power',
     First array elements raised to powers from second array, element-wise.
 
     Raise each base in `x1` to the positionally-corresponding power in
-    `x2`.  `x1` and `x2` must be broadcastable to the same shape. Note that an
-    integer type raised to a negative integer power will raise a ValueError.
+    `x2`.  `x1` and `x2` must be broadcastable to the same shape.
+
+    An integer type raised to a negative integer power will raise a
+    ``ValueError``.
+
+    Negative values raised to a non-integral value will return ``nan``.
+    To get complex results, cast the input to complex, or specify the
+    ``dtype`` to be ``complex`` (see the example below).
 
     Parameters
     ----------
@@ -3121,6 +3127,21 @@ add_newdoc('numpy.core.umath', 'power',
     >>> x1 ** x2
     array([ 0,  1,  8, 27, 16,  5])
 
+    Negative values raised to a non-integral value will result in ``nan``
+    (and a warning will be generated).
+
+    >>> x3 = np.array([-1.0, -4.0])
+    >>> with np.errstate(invalid='ignore'):
+    ...     p = np.power(x3, 1.5)
+    ...
+    >>> p
+    array([nan, nan])
+
+    To get complex results, give the argument ``dtype=complex``.
+
+    >>> np.power(x3, 1.5, dtype=complex)
+    array([-1.83697020e-16-1.j, -1.46957616e-15-8.j])
+
     """)
 
 add_newdoc('numpy.core.umath', 'float_power',
@@ -3134,6 +3155,10 @@ add_newdoc('numpy.core.umath', 'float_power',
     inexact.  The intent is that the function will return a usable result for
     negative powers and seldom overflow for positive powers.
 
+    Negative values raised to a non-integral value will return ``nan``.
+    To get complex results, cast the input to complex, or specify the
+    ``dtype`` to be ``complex`` (see the example below).
+
     .. versionadded:: 1.12.0
 
     Parameters
@@ -3181,6 +3206,21 @@ add_newdoc('numpy.core.umath', 'float_power',
     array([[  0.,   1.,   8.,  27.,  16.,   5.],
            [  0.,   1.,   8.,  27.,  16.,   5.]])
 
+    Negative values raised to a non-integral value will result in ``nan``
+    (and a warning will be generated).
+
+    >>> x3 = np.array([-1, -4])
+    >>> with np.errstate(invalid='ignore'):
+    ...     p = np.float_power(x3, 1.5)
+    ...
+    >>> p
+    array([nan, nan])
+
+    To get complex results, give the argument ``dtype=complex``.
+
+    >>> np.float_power(x3, 1.5, dtype=complex)
+    array([-1.83697020e-16-1.j, -1.46957616e-15-8.j])
+
     """)
 
 add_newdoc('numpy.core.umath', 'radians',
@@ -3292,7 +3332,7 @@ add_newdoc('numpy.core.umath', 'reciprocal',
 
 add_newdoc('numpy.core.umath', 'remainder',
     """
-    Return element-wise remainder of division.
+    Returns the element-wise remainder of division.
 
     Computes the remainder complementary to the `floor_divide` function.  It is
     equivalent to the Python modulus operator``x1 % x2`` and has the same sign
@@ -3787,6 +3827,7 @@ add_newdoc('numpy.core.umath', 'sqrt',
     --------
     lib.scimath.sqrt
         A version which returns complex numbers when given negative reals.
+        Note: 0.0 and -0.0 are handled differently for complex inputs.
 
     Notes
     -----
diff --git a/numpy/core/getlimits.py b/numpy/core/getlimits.py
index 0f7031bac..ab4a4d2be 100644
--- a/numpy/core/getlimits.py
+++ b/numpy/core/getlimits.py
@@ -5,13 +5,12 @@ __all__ = ['finfo', 'iinfo']
 
 import warnings
 
-from .machar import MachAr
+from ._machar import MachAr
 from .overrides import set_module
 from . import numeric
 from . import numerictypes as ntypes
 from .numeric import array, inf, NaN
 from .umath import log10, exp2, nextafter, isnan
-from . import umath
 
 
 def _fr0(a):
@@ -386,6 +385,8 @@ class finfo:
     machar : MachAr
         The object which calculated these parameters and holds more
         detailed information.
+
+        .. deprecated:: 1.22
     machep : int
         The exponent that yields `eps`.
     max : floating point number of the appropriate type
@@ -502,7 +503,7 @@ class finfo:
         self.eps = machar.eps.flat[0]
         self.nexp = machar.iexp
         self.nmant = machar.it
-        self.machar = machar
+        self._machar = machar
         self._str_tiny = machar._str_xmin.strip()
         self._str_max = machar._str_xmax.strip()
         self._str_epsneg = machar._str_epsneg.strip()
@@ -552,11 +553,11 @@ class finfo:
         """
         # This check is necessary because the value for smallest_normal is
         # platform dependent for longdouble types.
-        if isnan(self.machar.smallest_normal.flat[0]):
+        if isnan(self._machar.smallest_normal.flat[0]):
             warnings.warn(
                 'The value of smallest normal is undefined for double double',
                 UserWarning, stacklevel=2)
-        return self.machar.smallest_normal.flat[0]
+        return self._machar.smallest_normal.flat[0]
 
     @property
     def tiny(self):
@@ -575,6 +576,20 @@ class finfo:
         """
         return self.smallest_normal
 
+    @property
+    def machar(self):
+        """The object which calculated these parameters and holds more
+        detailed information.
+
+        .. deprecated:: 1.22
+        """
+        # Deprecated 2021-10-27, NumPy 1.22
+        warnings.warn(
+            "`finfo.machar` is deprecated (NumPy 1.22)",
+            DeprecationWarning, stacklevel=2,
+        )
+        return self._machar
+
 
 @set_module('numpy')
 class iinfo:
diff --git a/numpy/core/getlimits.pyi b/numpy/core/getlimits.pyi
index ca22e18f7..66d062995 100644
--- a/numpy/core/getlimits.pyi
+++ b/numpy/core/getlimits.pyi
@@ -1,58 +1,8 @@
-from typing import Any, Generic, List, Type, TypeVar
+from typing import List
 
 from numpy import (
     finfo as finfo,
     iinfo as iinfo,
-    floating,
-    signedinteger,
 )
 
-from numpy.typing import NBitBase, NDArray
-
-_NBit = TypeVar("_NBit", bound=NBitBase)
-
 __all__: List[str]
-
-class MachArLike(Generic[_NBit]):
-    def __init__(
-        self,
-        ftype: Type[floating[_NBit]],
-        *,
-        eps: floating[Any],
-        epsneg: floating[Any],
-        huge: floating[Any],
-        tiny: floating[Any],
-        ibeta: int,
-        smallest_subnormal: None | floating[Any] = ...,
-        # Expand `**kwargs` into keyword-only arguments
-        machep: int,
-        negep: int,
-        minexp: int,
-        maxexp: int,
-        it: int,
-        iexp: int,
-        irnd: int,
-        ngrd: int,
-    ) -> None: ...
-    @property
-    def smallest_subnormal(self) -> NDArray[floating[_NBit]]: ...
-    eps: NDArray[floating[_NBit]]
-    epsilon: NDArray[floating[_NBit]]
-    epsneg: NDArray[floating[_NBit]]
-    huge: NDArray[floating[_NBit]]
-    ibeta: signedinteger[_NBit]
-    iexp: int
-    irnd: int
-    it: int
-    machep: int
-    maxexp: int
-    minexp: int
-    negep: int
-    ngrd: int
-    precision: int
-    resolution: NDArray[floating[_NBit]]
-    smallest_normal: NDArray[floating[_NBit]]
-    tiny: NDArray[floating[_NBit]]
-    title: str
-    xmax: NDArray[floating[_NBit]]
-    xmin: NDArray[floating[_NBit]]
diff --git a/numpy/core/include/numpy/experimental_dtype_api.h b/numpy/core/include/numpy/experimental_dtype_api.h
index 22854a725..554c7fb6c 100644
--- a/numpy/core/include/numpy/experimental_dtype_api.h
+++ b/numpy/core/include/numpy/experimental_dtype_api.h
@@ -16,13 +16,47 @@
  * in your module init.  (A version mismatch will be reported, just update
  * to the correct one, this will alert you of possible changes.)
  *
- * The two main symbols exported are:
+ * The following lists the main symbols currently exported.  Please do not
+ * hesitate to ask for help or clarification:
  *
- * - PyUFunc_AddLoopFromSpec  (Register a new loop for a ufunc)
- * - PyArrayInitDTypeMeta_FromSpec  (Create a new DType)
+ * - PyUFunc_AddLoopFromSpec:
  *
- * Please check the in-line documentation for details and do not hesitate to
- * ask for help.
+ *     Register a new loop for a ufunc.  This uses the `PyArrayMethod_Spec`
+ *     which must be filled in (see in-line comments).
+ *
+ * - PyUFunc_AddPromoter:
+ *
+ *     Register a new promoter for a ufunc.  A promoter is a function stored
+ *     in a PyCapsule (see in-line comments).  It is passed the operation and
+ *     requested DType signatures and can mutate it to attempt a new search
+ *     for a matching loop/promoter.
+ *     I.e. for Numba a promoter could even add the desired loop.
+ *
+ * - PyArrayInitDTypeMeta_FromSpec:
+ *
+ *     Initialize a new DType.  It must currently be a static Python C type
+ *     that is declared as `PyArray_DTypeMeta` and not `PyTypeObject`.
+ *     Further, it must subclass `np.dtype` and set its type to
+ *     `PyArrayDTypeMeta_Type` (before calling `PyType_Read()`).
+ *
+ * - PyArray_CommonDType:
+ *
+ *     Find the common-dtype ("promotion") for two DType classes.  Similar
+ *     to `np.result_type`, but works on the classes and not instances.
+ *
+ * - PyArray_PromoteDTypeSequence:
+ *
+ *     Same as CommonDType, but works with an arbitrary number of DTypes.
+ *     This function is smarter and can often return successful and unambiguous
+ *     results when `common_dtype(common_dtype(dt1, dt2), dt3)` would
+ *     depend on the operation order or fail.  Nevertheless, DTypes should
+ *     aim to ensure that their common-dtype implementation is associative
+ *     and commutative!  (Mainly, unsigned and signed integers are not.)
+ *
+ *     For guaranteed consistent results DTypes must implement common-Dtype
+ *     "transitively".  If A promotes B and B promotes C, than A must generally
+ *     also promote C; where "promotes" means implements the promotion.
+ *     (There are some exceptions for abstract DTypes)
  *
  * WARNING
  * =======
@@ -67,11 +101,28 @@ __not_imported(void)
     printf("*****\nCritical error, dtype API not imported\n*****\n");
 }
 static void *__uninitialized_table[] = {
+        &__not_imported, &__not_imported, &__not_imported, &__not_imported,
         &__not_imported, &__not_imported, &__not_imported, &__not_imported};
 
 
 static void **__experimental_dtype_api_table = __uninitialized_table;
 
+
+/*
+ * DTypeMeta struct, the content may be made fully opaque (except the size).
+ * We may also move everything into a single `void *dt_slots`.
+ */
+typedef struct {
+    PyHeapTypeObject super;
+    PyArray_Descr *singleton;
+    int type_num;
+    PyTypeObject *scalar_type;
+    npy_uint64 flags;
+    void *dt_slots;
+    void *reserved[3];
+} PyArray_DTypeMeta;
+
+
 /*
  * ******************************************************
  *         ArrayMethod API (Casting and UFuncs)
@@ -128,6 +179,28 @@ typedef PyObject *_ufunc_addloop_fromspec_func(
 
 
 /*
+ * Type of the C promoter function, which must be wrapped into a
+ * PyCapsule with name "numpy._ufunc_promoter".
+ */
+typedef int promoter_function(PyObject *ufunc,
+        PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[],
+        PyArray_DTypeMeta *new_op_dtypes[]);
+
+/*
+ * Function to register a promoter.
+ *
+ * @param ufunc The ufunc object to register the promoter with.
+ * @param DType_tuple A Python tuple containing DTypes or None matching the
+ *        number of inputs and outputs of the ufunc.
+ * @param promoter A PyCapsule with name "numpy._ufunc_promoter" containing
+ *        a pointer to a `promoter_function`.
+ */
+typedef int _ufunc_addpromoter_func(
+        PyObject *ufunc, PyObject *DType_tuple, PyObject *promoter);
+#define PyUFunc_AddPromoter \
+    (*(_ufunc_addpromoter_func *)(__experimental_dtype_api_table[1]))
+
+/*
  * In addition to the normal casting levels, NPY_CAST_IS_VIEW indicates
  * that no cast operation is necessary at all (although a copy usually will be)
  *
@@ -221,24 +294,8 @@ typedef struct{
 } PyArrayDTypeMeta_Spec;
 
 
-/*
- * DTypeMeta struct, the content may be made fully opaque (except the size).
- * We may also move everything into a single `void *dt_slots`.
- */
-typedef struct {
-    PyHeapTypeObject super;
-    PyArray_Descr *singleton;
-    int type_num;
-    PyTypeObject *scalar_type;
-    npy_uint64 flags;
-    void *dt_slots;
-    void *reserved[3];
-} PyArray_DTypeMeta;
-
-
 #define PyArrayDTypeMeta_Type \
-    (&(PyTypeObject *)__experimental_dtype_api_table[1])
-
+    (*(PyTypeObject *)__experimental_dtype_api_table[2])
 typedef int __dtypemeta_fromspec(
         PyArray_DTypeMeta *DType, PyArrayDTypeMeta_Spec *dtype_spec);
 /*
@@ -250,8 +307,25 @@ typedef int __dtypemeta_fromspec(
  * uses `PyArray_DTypeMeta` defined above as the C-structure.
  */
 #define PyArrayInitDTypeMeta_FromSpec \
-    ((__dtypemeta_fromspec *)(__experimental_dtype_api_table[2]))
+    ((__dtypemeta_fromspec *)(__experimental_dtype_api_table[3]))
+
+
+/*
+ * *************************************
+ *          WORKING WITH DTYPES
+ * *************************************
+ */
+
+typedef PyArray_DTypeMeta *__common_dtype(
+        PyArray_DTypeMeta *DType1, PyArray_DTypeMeta *DType2);
+#define PyArray_CommonDType \
+    ((__common_dtype *)(__experimental_dtype_api_table[4]))
+
 
+typedef PyArray_DTypeMeta *__promote_dtype_sequence(
+        npy_intp num, PyArray_DTypeMeta *DTypes[]);
+#define PyArray_PromoteDTypeSequence \
+    ((__promote_dtype_sequence *)(__experimental_dtype_api_table[5]))
 
 
 /*
@@ -264,7 +338,7 @@ typedef int __dtypemeta_fromspec(
  * runtime-check this.
  * You must call this function to use the symbols defined in this file.
  */
-#define __EXPERIMENTAL_DTYPE_VERSION 1
+#define __EXPERIMENTAL_DTYPE_VERSION 2
 
 static int
 import_experimental_dtype_api(int version)
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index 8d810fa64..6240adc0c 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -355,12 +355,10 @@ struct NpyAuxData_tag {
 #define NPY_ERR(str) fprintf(stderr, #str); fflush(stderr);
 #define NPY_ERR2(str) fprintf(stderr, str); fflush(stderr);
 
-  /*
-   * Macros to define how array, and dimension/strides data is
-   * allocated.
-   */
-
-  /* Data buffer - PyDataMem_NEW/FREE/RENEW are in multiarraymodule.c */
+/*
+* Macros to define how array, and dimension/strides data is
+* allocated. These should be made private
+*/
 
 #define NPY_USE_PYMEM 1
 
@@ -667,6 +665,29 @@ typedef struct _arr_descr {
 } PyArray_ArrayDescr;
 
 /*
+ * Memory handler structure for array data.
+ */
+/* The declaration of free differs from PyMemAllocatorEx */
+typedef struct {
+    void *ctx;
+    void* (*malloc) (void *ctx, size_t size);
+    void* (*calloc) (void *ctx, size_t nelem, size_t elsize);
+    void* (*realloc) (void *ctx, void *ptr, size_t new_size);
+    void (*free) (void *ctx, void *ptr, size_t size);
+    /*
+     * This is the end of the version=1 struct. Only add new fields after
+     * this line
+     */
+} PyDataMemAllocator;
+
+typedef struct {
+    char name[127];  /* multiple of 64 to keep the struct aligned */
+    uint8_t version; /* currently 1 */
+    PyDataMemAllocator allocator;
+} PyDataMem_Handler;
+
+
+/*
  * The main array object structure.
  *
  * It has been recommended to use the inline functions defined below
@@ -716,6 +737,10 @@ typedef struct tagPyArrayObject_fields {
     /* For weak references */
     PyObject *weakreflist;
     void *_buffer_info;  /* private buffer info, tagged to allow warning */
+    /*
+     * For malloc/calloc/realloc/free per object
+     */
+    PyObject *mem_handler;
 } PyArrayObject_fields;
 
 /*
@@ -843,7 +868,7 @@ typedef int (PyArray_FinalizeFunc)(PyArrayObject *, PyObject *);
 
 /*
  * Always copy the array. Returned arrays are always CONTIGUOUS,
- * ALIGNED, and WRITEABLE.
+ * ALIGNED, and WRITEABLE. See also: NPY_ARRAY_ENSURENOCOPY = 0x4000.
  *
  * This flag may be requested in constructor functions.
  */
@@ -913,6 +938,13 @@ typedef int (PyArray_FinalizeFunc)(PyArrayObject *, PyObject *);
 #define NPY_ARRAY_WRITEBACKIFCOPY 0x2000
 
 /*
+ * No copy may be made while converting from an object/array (result is a view)
+ *
+ * This flag may be requested in constructor functions.
+ */
+#define NPY_ARRAY_ENSURENOCOPY 0x4000
+
+/*
  * NOTE: there are also internal flags defined in multiarray/arrayobject.h,
  * which start at bit 31 and work down.
  */
@@ -1659,6 +1691,12 @@ PyArray_CLEARFLAGS(PyArrayObject *arr, int flags)
     ((PyArrayObject_fields *)arr)->flags &= ~flags;
 }
 
+static NPY_INLINE NPY_RETURNS_BORROWED_REF PyObject *
+PyArray_HANDLER(PyArrayObject *arr)
+{
+    return ((PyArrayObject_fields *)arr)->mem_handler;
+}
+
 #define PyTypeNum_ISBOOL(type) ((type) == NPY_BOOL)
 
 #define PyTypeNum_ISUNSIGNED(type) (((type) == NPY_UBYTE) ||   \
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 12a3e725a..88794ca07 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -356,14 +356,31 @@ typedef unsigned long npy_ulonglong;
 typedef unsigned char npy_bool;
 #define NPY_FALSE 0
 #define NPY_TRUE 1
-
-
+/*
+ * `NPY_SIZEOF_LONGDOUBLE` isn't usually equal to sizeof(long double).
+ * In some certain cases, it may forced to be equal to sizeof(double)
+ * even against the compiler implementation and the same goes for
+ * `complex long double`.
+ *
+ * Therefore, avoid `long double`, use `npy_longdouble` instead,
+ * and when it comes to standard math functions make sure of using
+ * the double version when `NPY_SIZEOF_LONGDOUBLE` == `NPY_SIZEOF_DOUBLE`.
+ * For example:
+ *   npy_longdouble *ptr, x;
+ *   #if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+ *       npy_longdouble r = modf(x, ptr);
+ *   #else
+ *       npy_longdouble r = modfl(x, ptr);
+ *   #endif
+ *
+ * See https://github.com/numpy/numpy/issues/20348
+ */
 #if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
-        typedef double npy_longdouble;
-        #define NPY_LONGDOUBLE_FMT "g"
+    #define NPY_LONGDOUBLE_FMT "g"
+    typedef double npy_longdouble;
 #else
-        typedef long double npy_longdouble;
-        #define NPY_LONGDOUBLE_FMT "Lg"
+    #define NPY_LONGDOUBLE_FMT "Lg"
+    typedef long double npy_longdouble;
 #endif
 
 #ifndef Py_USING_UNICODE
diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h
index b1e6363e3..bead0dc14 100644
--- a/numpy/core/include/numpy/npy_math.h
+++ b/numpy/core/include/numpy/npy_math.h
@@ -150,6 +150,17 @@ NPY_INPLACE npy_long npy_lshiftl(npy_long a, npy_long b);
 NPY_INPLACE npy_longlong npy_rshiftll(npy_longlong a, npy_longlong b);
 NPY_INPLACE npy_longlong npy_lshiftll(npy_longlong a, npy_longlong b);
 
+NPY_INPLACE uint8_t npy_popcountuhh(npy_ubyte a);
+NPY_INPLACE uint8_t npy_popcountuh(npy_ushort a);
+NPY_INPLACE uint8_t npy_popcountu(npy_uint a);
+NPY_INPLACE uint8_t npy_popcountul(npy_ulong a);
+NPY_INPLACE uint8_t npy_popcountull(npy_ulonglong a);
+NPY_INPLACE uint8_t npy_popcounthh(npy_byte a);
+NPY_INPLACE uint8_t npy_popcounth(npy_short a);
+NPY_INPLACE uint8_t npy_popcount(npy_int a);
+NPY_INPLACE uint8_t npy_popcountl(npy_long a);
+NPY_INPLACE uint8_t npy_popcountll(npy_longlong a);
+
 /*
  * C99 double math funcs
  */
diff --git a/numpy/core/include/numpy/numpyconfig.h b/numpy/core/include/numpy/numpyconfig.h
index b2ce66244..4eac083e7 100644
--- a/numpy/core/include/numpy/numpyconfig.h
+++ b/numpy/core/include/numpy/numpyconfig.h
@@ -19,6 +19,19 @@
         #define NPY_SIZEOF_LONG         4
         #define NPY_SIZEOF_PY_INTPTR_T  4
     #endif
+
+    #undef NPY_SIZEOF_LONGDOUBLE
+    #undef NPY_SIZEOF_COMPLEX_LONGDOUBLE
+
+    #ifdef __x86_64
+        #define NPY_SIZEOF_LONGDOUBLE         16
+        #define NPY_SIZEOF_COMPLEX_LONGDOUBLE 32
+    #elif defined(__arm64__)
+        #define NPY_SIZEOF_LONGDOUBLE         8
+        #define NPY_SIZEOF_COMPLEX_LONGDOUBLE 16
+    #else
+        #error "unknown architecture"
+    #endif
 #endif
 
 /**
@@ -43,6 +56,7 @@
 #define NPY_1_19_API_VERSION 0x00000008
 #define NPY_1_20_API_VERSION 0x0000000e
 #define NPY_1_21_API_VERSION 0x0000000e
-#define NPY_1_22_API_VERSION 0x0000000e
+#define NPY_1_22_API_VERSION 0x0000000f
+#define NPY_1_23_API_VERSION 0x0000000f
 
 #endif  /* NUMPY_CORE_INCLUDE_NUMPY_NPY_NUMPYCONFIG_H_ */
diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h
index 3f184bd45..1d7050bbe 100644
--- a/numpy/core/include/numpy/ufuncobject.h
+++ b/numpy/core/include/numpy/ufuncobject.h
@@ -173,11 +173,8 @@ typedef struct _tagPyUFuncObject {
          * but this was never implemented. (This is also why the above
          * selector is called the "legacy" selector.)
          */
-    #if PY_VERSION_HEX >= 0x03080000
         vectorcallfunc vectorcall;
-    #else
-        void *reserved2;
-    #endif
+
         /* Was previously the `PyUFunc_MaskedInnerLoopSelectionFunc` */
         void *_always_null_previously_masked_innerloop_selector;
 
diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py
index 154df6f4d..f88d75978 100644
--- a/numpy/core/multiarray.py
+++ b/numpy/core/multiarray.py
@@ -14,8 +14,9 @@ from ._multiarray_umath import *  # noqa: F403
 # do not change them. issue gh-15518
 # _get_ndarray_c_version is semi-public, on purpose not added to __all__
 from ._multiarray_umath import (
-    _fastCopyAndTranspose, _flagdict, _insert, _reconstruct, _vec_string,
-    _ARRAY_API, _monotonicity, _get_ndarray_c_version, _set_madvise_hugepage,
+    _fastCopyAndTranspose, _flagdict, _from_dlpack, _insert, _reconstruct,
+    _vec_string, _ARRAY_API, _monotonicity, _get_ndarray_c_version,
+    _set_madvise_hugepage,
     )
 
 __all__ = [
@@ -23,29 +24,30 @@ __all__ = [
     'ITEM_HASOBJECT', 'ITEM_IS_POINTER', 'LIST_PICKLE', 'MAXDIMS',
     'MAY_SHARE_BOUNDS', 'MAY_SHARE_EXACT', 'NEEDS_INIT', 'NEEDS_PYAPI',
     'RAISE', 'USE_GETITEM', 'USE_SETITEM', 'WRAP', '_fastCopyAndTranspose',
-    '_flagdict', '_insert', '_reconstruct', '_vec_string', '_monotonicity',
-    'add_docstring', 'arange', 'array', 'asarray', 'asanyarray',
-    'ascontiguousarray', 'asfortranarray', 'bincount', 'broadcast',
-    'busday_count', 'busday_offset', 'busdaycalendar', 'can_cast',
+    '_flagdict', '_from_dlpack', '_insert', '_reconstruct', '_vec_string',
+    '_monotonicity', 'add_docstring', 'arange', 'array', 'asarray',
+    'asanyarray', 'ascontiguousarray', 'asfortranarray', 'bincount',
+    'broadcast', 'busday_count', 'busday_offset', 'busdaycalendar', 'can_cast',
     'compare_chararrays', 'concatenate', 'copyto', 'correlate', 'correlate2',
     'count_nonzero', 'c_einsum', 'datetime_as_string', 'datetime_data',
     'dot', 'dragon4_positional', 'dragon4_scientific', 'dtype',
     'empty', 'empty_like', 'error', 'flagsobj', 'flatiter', 'format_longfloat',
-    'frombuffer', 'fromfile', 'fromiter', 'fromstring', 'inner',
-    'interp', 'interp_complex', 'is_busday', 'lexsort',
-    'matmul', 'may_share_memory', 'min_scalar_type', 'ndarray', 'nditer',
-    'nested_iters', 'normalize_axis_index', 'packbits',
-    'promote_types', 'putmask', 'ravel_multi_index', 'result_type', 'scalar',
-    'set_datetimeparse_function', 'set_legacy_print_mode', 'set_numeric_ops',
-    'set_string_function', 'set_typeDict', 'shares_memory',
-    'tracemalloc_domain', 'typeinfo', 'unpackbits', 'unravel_index', 'vdot',
-    'where', 'zeros']
+    'frombuffer', 'fromfile', 'fromiter', 'fromstring',
+    'get_handler_name', 'get_handler_version', 'inner', 'interp',
+    'interp_complex', 'is_busday', 'lexsort', 'matmul', 'may_share_memory',
+    'min_scalar_type', 'ndarray', 'nditer', 'nested_iters',
+    'normalize_axis_index', 'packbits', 'promote_types', 'putmask',
+    'ravel_multi_index', 'result_type', 'scalar', 'set_datetimeparse_function',
+    'set_legacy_print_mode', 'set_numeric_ops', 'set_string_function',
+    'set_typeDict', 'shares_memory', 'tracemalloc_domain', 'typeinfo',
+    'unpackbits', 'unravel_index', 'vdot', 'where', 'zeros']
 
 # For backward compatibility, make sure pickle imports these functions from here
 _reconstruct.__module__ = 'numpy.core.multiarray'
 scalar.__module__ = 'numpy.core.multiarray'
 
 
+_from_dlpack.__module__ = 'numpy'
 arange.__module__ = 'numpy'
 array.__module__ = 'numpy'
 asarray.__module__ = 'numpy'
diff --git a/numpy/core/multiarray.pyi b/numpy/core/multiarray.pyi
index 1f3792ecb..a9f68e181 100644
--- a/numpy/core/multiarray.pyi
+++ b/numpy/core/multiarray.pyi
@@ -50,6 +50,7 @@ from numpy import (
     _ModeKind,
     _SupportsBuffer,
     _IOProtocol,
+    _CopyMode,
     _NDIterFlagsKind,
     _NDIterOpFlagsKind,
 )
@@ -177,7 +178,7 @@ def array(
     object: _ArrayType,
     dtype: None = ...,
     *,
-    copy: bool = ...,
+    copy: bool | _CopyMode = ...,
     order: _OrderKACF = ...,
     subok: L[True],
     ndmin: int = ...,
@@ -188,7 +189,7 @@ def array(
     object: _ArrayLike[_SCT],
     dtype: None = ...,
     *,
-    copy: bool = ...,
+    copy: bool | _CopyMode = ...,
     order: _OrderKACF = ...,
     subok: bool = ...,
     ndmin: int = ...,
@@ -199,7 +200,7 @@ def array(
     object: object,
     dtype: None = ...,
     *,
-    copy: bool = ...,
+    copy: bool | _CopyMode = ...,
     order: _OrderKACF = ...,
     subok: bool = ...,
     ndmin: int = ...,
@@ -210,7 +211,7 @@ def array(
     object: Any,
     dtype: _DTypeLike[_SCT],
     *,
-    copy: bool = ...,
+    copy: bool | _CopyMode = ...,
     order: _OrderKACF = ...,
     subok: bool = ...,
     ndmin: int = ...,
@@ -221,7 +222,7 @@ def array(
     object: Any,
     dtype: DTypeLike,
     *,
-    copy: bool = ...,
+    copy: bool | _CopyMode = ...,
     order: _OrderKACF = ...,
     subok: bool = ...,
     ndmin: int = ...,
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index d8a0cf9a6..014fa0a39 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -13,8 +13,8 @@ from .multiarray import (
     WRAP, arange, array, asarray, asanyarray, ascontiguousarray,
     asfortranarray, broadcast, can_cast, compare_chararrays,
     concatenate, copyto, dot, dtype, empty,
-    empty_like, flatiter, frombuffer, fromfile, fromiter, fromstring,
-    inner, lexsort, matmul, may_share_memory,
+    empty_like, flatiter, frombuffer, _from_dlpack, fromfile, fromiter,
+    fromstring, inner, lexsort, matmul, may_share_memory,
     min_scalar_type, ndarray, nditer, nested_iters, promote_types,
     putmask, result_type, set_numeric_ops, shares_memory, vdot, where,
     zeros, normalize_axis_index)
@@ -41,7 +41,7 @@ __all__ = [
     'newaxis', 'ndarray', 'flatiter', 'nditer', 'nested_iters', 'ufunc',
     'arange', 'array', 'asarray', 'asanyarray', 'ascontiguousarray',
     'asfortranarray', 'zeros', 'count_nonzero', 'empty', 'broadcast', 'dtype',
-    'fromstring', 'fromfile', 'frombuffer', 'where',
+    'fromstring', 'fromfile', 'frombuffer', '_from_dlpack', 'where',
     'argwhere', 'copyto', 'concatenate', 'fastCopyAndTranspose', 'lexsort',
     'set_numeric_ops', 'can_cast', 'promote_types', 'min_scalar_type',
     'result_type', 'isfortran', 'empty_like', 'zeros_like', 'ones_like',
@@ -1184,7 +1184,7 @@ def roll(a, shift, axis=None):
     >>> np.roll(x, -2)
     array([2, 3, 4, 5, 6, 7, 8, 9, 0, 1])
 
-    >>> x2 = np.reshape(x, (2,5))
+    >>> x2 = np.reshape(x, (2, 5))
     >>> x2
     array([[0, 1, 2, 3, 4],
            [5, 6, 7, 8, 9]])
@@ -1206,6 +1206,12 @@ def roll(a, shift, axis=None):
     >>> np.roll(x2, -1, axis=1)
     array([[1, 2, 3, 4, 0],
            [6, 7, 8, 9, 5]])
+    >>> np.roll(x2, (1, 1), axis=(1, 0))
+    array([[9, 5, 6, 7, 8],
+           [4, 0, 1, 2, 3]])
+    >>> np.roll(x2, (2, 1), axis=(1, 0))
+    array([[8, 9, 5, 6, 7],
+           [3, 4, 0, 1, 2]])
 
     """
     a = asanyarray(a)
@@ -1823,6 +1829,14 @@ def fromfunction(function, shape, *, dtype=float, like=None, **kwargs):
 
     Examples
     --------
+    >>> np.fromfunction(lambda i, j: i, (2, 2), dtype=float)
+    array([[0., 0.],
+           [1., 1.]])
+           
+    >>> np.fromfunction(lambda i, j: j, (2, 2), dtype=float)    
+    array([[0., 1.],
+           [0., 1.]])
+           
     >>> np.fromfunction(lambda i, j: i == j, (3, 3), dtype=int)
     array([[ True, False, False],
            [False,  True, False],
diff --git a/numpy/core/numeric.pyi b/numpy/core/numeric.pyi
index 54ab4b7c8..d7ec30351 100644
--- a/numpy/core/numeric.pyi
+++ b/numpy/core/numeric.pyi
@@ -1,6 +1,5 @@
 from typing import (
     Any,
-    Optional,
     Union,
     Sequence,
     Tuple,
@@ -8,18 +7,64 @@ from typing import (
     List,
     overload,
     TypeVar,
-    Iterable,
     Literal,
+    Type,
+    SupportsAbs,
+    SupportsIndex,
+    NoReturn,
 )
+from typing_extensions import TypeGuard
 
-from numpy import ndarray, generic, dtype, bool_, signedinteger, _OrderKACF, _OrderCF
-from numpy.typing import ArrayLike, DTypeLike, _ShapeLike
+from numpy import (
+    ComplexWarning as ComplexWarning,
+    dtype,
+    generic,
+    unsignedinteger,
+    signedinteger,
+    floating,
+    complexfloating,
+    bool_,
+    int_,
+    intp,
+    float64,
+    timedelta64,
+    object_,
+    _OrderKACF,
+    _OrderCF,
+)
+
+from numpy.typing import (
+    ArrayLike,
+    NDArray,
+    DTypeLike,
+    _ShapeLike,
+    _SupportsDType,
+    _FiniteNestedSequence,
+    _SupportsArray,
+    _ScalarLike_co,
+    _ArrayLikeBool_co,
+    _ArrayLikeUInt_co,
+    _ArrayLikeInt_co,
+    _ArrayLikeFloat_co,
+    _ArrayLikeComplex_co,
+    _ArrayLikeTD64_co,
+    _ArrayLikeObject_co,
+)
 
 _T = TypeVar("_T")
-_ArrayType = TypeVar("_ArrayType", bound=ndarray)
+_SCT = TypeVar("_SCT", bound=generic)
+_ArrayType = TypeVar("_ArrayType", bound=NDArray[Any])
 
+_DTypeLike = Union[
+    dtype[_SCT],
+    Type[_SCT],
+    _SupportsDType[dtype[_SCT]],
+]
+_ArrayLike = _FiniteNestedSequence[_SupportsArray[dtype[_SCT]]]
 _CorrelateMode = Literal["valid", "same", "full"]
 
+__all__: List[str]
+
 @overload
 def zeros_like(
     a: _ArrayType,
@@ -30,20 +75,61 @@ def zeros_like(
 ) -> _ArrayType: ...
 @overload
 def zeros_like(
-    a: ArrayLike,
-    dtype: DTypeLike = ...,
+    a: _ArrayLike[_SCT],
+    dtype: None = ...,
     order: _OrderKACF = ...,
     subok: bool = ...,
-    shape: Optional[_ShapeLike] = ...,
-) -> ndarray: ...
+    shape: None | _ShapeLike = ...,
+) -> NDArray[_SCT]: ...
+@overload
+def zeros_like(
+    a: object,
+    dtype: None = ...,
+    order: _OrderKACF = ...,
+    subok: bool = ...,
+    shape: None | _ShapeLike= ...,
+) -> NDArray[Any]: ...
+@overload
+def zeros_like(
+    a: Any,
+    dtype: _DTypeLike[_SCT],
+    order: _OrderKACF = ...,
+    subok: bool = ...,
+    shape: None | _ShapeLike= ...,
+) -> NDArray[_SCT]: ...
+@overload
+def zeros_like(
+    a: Any,
+    dtype: DTypeLike,
+    order: _OrderKACF = ...,
+    subok: bool = ...,
+    shape: None | _ShapeLike= ...,
+) -> NDArray[Any]: ...
 
+@overload
 def ones(
     shape: _ShapeLike,
-    dtype: DTypeLike = ...,
+    dtype: None = ...,
+    order: _OrderCF = ...,
+    *,
+    like: ArrayLike = ...,
+) -> NDArray[float64]: ...
+@overload
+def ones(
+    shape: _ShapeLike,
+    dtype: _DTypeLike[_SCT],
     order: _OrderCF = ...,
     *,
     like: ArrayLike = ...,
-) -> ndarray: ...
+) -> NDArray[_SCT]: ...
+@overload
+def ones(
+    shape: _ShapeLike,
+    dtype: DTypeLike,
+    order: _OrderCF = ...,
+    *,
+    like: ArrayLike = ...,
+) -> NDArray[Any]: ...
 
 @overload
 def ones_like(
@@ -55,21 +141,64 @@ def ones_like(
 ) -> _ArrayType: ...
 @overload
 def ones_like(
-    a: ArrayLike,
-    dtype: DTypeLike = ...,
+    a: _ArrayLike[_SCT],
+    dtype: None = ...,
     order: _OrderKACF = ...,
     subok: bool = ...,
-    shape: Optional[_ShapeLike] = ...,
-) -> ndarray: ...
+    shape: None | _ShapeLike = ...,
+) -> NDArray[_SCT]: ...
+@overload
+def ones_like(
+    a: object,
+    dtype: None = ...,
+    order: _OrderKACF = ...,
+    subok: bool = ...,
+    shape: None | _ShapeLike= ...,
+) -> NDArray[Any]: ...
+@overload
+def ones_like(
+    a: Any,
+    dtype: _DTypeLike[_SCT],
+    order: _OrderKACF = ...,
+    subok: bool = ...,
+    shape: None | _ShapeLike= ...,
+) -> NDArray[_SCT]: ...
+@overload
+def ones_like(
+    a: Any,
+    dtype: DTypeLike,
+    order: _OrderKACF = ...,
+    subok: bool = ...,
+    shape: None | _ShapeLike= ...,
+) -> NDArray[Any]: ...
 
+@overload
 def full(
     shape: _ShapeLike,
     fill_value: Any,
-    dtype: DTypeLike = ...,
+    dtype: None = ...,
+    order: _OrderCF = ...,
+    *,
+    like: ArrayLike = ...,
+) -> NDArray[Any]: ...
+@overload
+def full(
+    shape: _ShapeLike,
+    fill_value: Any,
+    dtype: _DTypeLike[_SCT],
     order: _OrderCF = ...,
     *,
     like: ArrayLike = ...,
-) -> ndarray: ...
+) -> NDArray[_SCT]: ...
+@overload
+def full(
+    shape: _ShapeLike,
+    fill_value: Any,
+    dtype: DTypeLike,
+    order: _OrderCF = ...,
+    *,
+    like: ArrayLike = ...,
+) -> NDArray[Any]: ...
 
 @overload
 def full_like(
@@ -82,13 +211,40 @@ def full_like(
 ) -> _ArrayType: ...
 @overload
 def full_like(
-    a: ArrayLike,
+    a: _ArrayLike[_SCT],
     fill_value: Any,
-    dtype: DTypeLike = ...,
+    dtype: None = ...,
+    order: _OrderKACF = ...,
+    subok: bool = ...,
+    shape: None | _ShapeLike = ...,
+) -> NDArray[_SCT]: ...
+@overload
+def full_like(
+    a: object,
+    fill_value: Any,
+    dtype: None = ...,
+    order: _OrderKACF = ...,
+    subok: bool = ...,
+    shape: None | _ShapeLike= ...,
+) -> NDArray[Any]: ...
+@overload
+def full_like(
+    a: Any,
+    fill_value: Any,
+    dtype: _DTypeLike[_SCT],
+    order: _OrderKACF = ...,
+    subok: bool = ...,
+    shape: None | _ShapeLike= ...,
+) -> NDArray[_SCT]: ...
+@overload
+def full_like(
+    a: Any,
+    fill_value: Any,
+    dtype: DTypeLike,
     order: _OrderKACF = ...,
     subok: bool = ...,
-    shape: Optional[_ShapeLike] = ...,
-) -> ndarray: ...
+    shape: None | _ShapeLike= ...,
+) -> NDArray[Any]: ...
 
 @overload
 def count_nonzero(
@@ -105,78 +261,306 @@ def count_nonzero(
     keepdims: bool = ...,
 ) -> Any: ...  # TODO: np.intp or ndarray[np.intp]
 
-def isfortran(a: Union[ndarray, generic]) -> bool: ...
+def isfortran(a: NDArray[Any] | generic) -> bool: ...
 
-def argwhere(a: ArrayLike) -> ndarray: ...
+def argwhere(a: ArrayLike) -> NDArray[intp]: ...
 
-def flatnonzero(a: ArrayLike) -> ndarray: ...
+def flatnonzero(a: ArrayLike) -> NDArray[intp]: ...
 
+@overload
 def correlate(
-    a: ArrayLike,
-    v: ArrayLike,
+    a: _ArrayLikeBool_co,
+    v: _ArrayLikeBool_co,
+    mode: _CorrelateMode = ...,
+) -> NDArray[bool_]: ...
+@overload
+def correlate(
+    a: _ArrayLikeUInt_co,
+    v: _ArrayLikeUInt_co,
+    mode: _CorrelateMode = ...,
+) -> NDArray[unsignedinteger[Any]]: ...
+@overload
+def correlate(
+    a: _ArrayLikeInt_co,
+    v: _ArrayLikeInt_co,
     mode: _CorrelateMode = ...,
-) -> ndarray: ...
+) -> NDArray[signedinteger[Any]]: ...
+@overload
+def correlate(
+    a: _ArrayLikeFloat_co,
+    v: _ArrayLikeFloat_co,
+    mode: _CorrelateMode = ...,
+) -> NDArray[floating[Any]]: ...
+@overload
+def correlate(
+    a: _ArrayLikeComplex_co,
+    v: _ArrayLikeComplex_co,
+    mode: _CorrelateMode = ...,
+) -> NDArray[complexfloating[Any, Any]]: ...
+@overload
+def correlate(
+    a: _ArrayLikeTD64_co,
+    v: _ArrayLikeTD64_co,
+    mode: _CorrelateMode = ...,
+) -> NDArray[timedelta64]: ...
+@overload
+def correlate(
+    a: _ArrayLikeObject_co,
+    v: _ArrayLikeObject_co,
+    mode: _CorrelateMode = ...,
+) -> NDArray[object_]: ...
 
+@overload
 def convolve(
-    a: ArrayLike,
-    v: ArrayLike,
+    a: _ArrayLikeBool_co,
+    v: _ArrayLikeBool_co,
     mode: _CorrelateMode = ...,
-) -> ndarray: ...
+) -> NDArray[bool_]: ...
+@overload
+def convolve(
+    a: _ArrayLikeUInt_co,
+    v: _ArrayLikeUInt_co,
+    mode: _CorrelateMode = ...,
+) -> NDArray[unsignedinteger[Any]]: ...
+@overload
+def convolve(
+    a: _ArrayLikeInt_co,
+    v: _ArrayLikeInt_co,
+    mode: _CorrelateMode = ...,
+) -> NDArray[signedinteger[Any]]: ...
+@overload
+def convolve(
+    a: _ArrayLikeFloat_co,
+    v: _ArrayLikeFloat_co,
+    mode: _CorrelateMode = ...,
+) -> NDArray[floating[Any]]: ...
+@overload
+def convolve(
+    a: _ArrayLikeComplex_co,
+    v: _ArrayLikeComplex_co,
+    mode: _CorrelateMode = ...,
+) -> NDArray[complexfloating[Any, Any]]: ...
+@overload
+def convolve(
+    a: _ArrayLikeTD64_co,
+    v: _ArrayLikeTD64_co,
+    mode: _CorrelateMode = ...,
+) -> NDArray[timedelta64]: ...
+@overload
+def convolve(
+    a: _ArrayLikeObject_co,
+    v: _ArrayLikeObject_co,
+    mode: _CorrelateMode = ...,
+) -> NDArray[object_]: ...
 
 @overload
 def outer(
-    a: ArrayLike,
-    b: ArrayLike,
+    a: _ArrayLikeBool_co,
+    b: _ArrayLikeBool_co,
     out: None = ...,
-) -> ndarray: ...
+) -> NDArray[bool_]: ...
 @overload
 def outer(
-    a: ArrayLike,
-    b: ArrayLike,
-    out: _ArrayType = ...,
+    a: _ArrayLikeUInt_co,
+    b: _ArrayLikeUInt_co,
+    out: None = ...,
+) -> NDArray[unsignedinteger[Any]]: ...
+@overload
+def outer(
+    a: _ArrayLikeInt_co,
+    b: _ArrayLikeInt_co,
+    out: None = ...,
+) -> NDArray[signedinteger[Any]]: ...
+@overload
+def outer(
+    a: _ArrayLikeFloat_co,
+    b: _ArrayLikeFloat_co,
+    out: None = ...,
+) -> NDArray[floating[Any]]: ...
+@overload
+def outer(
+    a: _ArrayLikeComplex_co,
+    b: _ArrayLikeComplex_co,
+    out: None = ...,
+) -> NDArray[complexfloating[Any, Any]]: ...
+@overload
+def outer(
+    a: _ArrayLikeTD64_co,
+    b: _ArrayLikeTD64_co,
+    out: None = ...,
+) -> NDArray[timedelta64]: ...
+@overload
+def outer(
+    a: _ArrayLikeObject_co,
+    b: _ArrayLikeObject_co,
+    out: None = ...,
+) -> NDArray[object_]: ...
+@overload
+def outer(
+    a: _ArrayLikeComplex_co | _ArrayLikeTD64_co | _ArrayLikeObject_co,
+    b: _ArrayLikeComplex_co | _ArrayLikeTD64_co | _ArrayLikeObject_co,
+    out: _ArrayType,
 ) -> _ArrayType: ...
 
+@overload
 def tensordot(
-    a: ArrayLike,
-    b: ArrayLike,
-    axes: Union[int, Tuple[_ShapeLike, _ShapeLike]] = ...,
-) -> ndarray: ...
+    a: _ArrayLikeBool_co,
+    b: _ArrayLikeBool_co,
+    axes: int | Tuple[_ShapeLike, _ShapeLike] = ...,
+) -> NDArray[bool_]: ...
+@overload
+def tensordot(
+    a: _ArrayLikeUInt_co,
+    b: _ArrayLikeUInt_co,
+    axes: int | Tuple[_ShapeLike, _ShapeLike] = ...,
+) -> NDArray[unsignedinteger[Any]]: ...
+@overload
+def tensordot(
+    a: _ArrayLikeInt_co,
+    b: _ArrayLikeInt_co,
+    axes: int | Tuple[_ShapeLike, _ShapeLike] = ...,
+) -> NDArray[signedinteger[Any]]: ...
+@overload
+def tensordot(
+    a: _ArrayLikeFloat_co,
+    b: _ArrayLikeFloat_co,
+    axes: int | Tuple[_ShapeLike, _ShapeLike] = ...,
+) -> NDArray[floating[Any]]: ...
+@overload
+def tensordot(
+    a: _ArrayLikeComplex_co,
+    b: _ArrayLikeComplex_co,
+    axes: int | Tuple[_ShapeLike, _ShapeLike] = ...,
+) -> NDArray[complexfloating[Any, Any]]: ...
+@overload
+def tensordot(
+    a: _ArrayLikeTD64_co,
+    b: _ArrayLikeTD64_co,
+    axes: int | Tuple[_ShapeLike, _ShapeLike] = ...,
+) -> NDArray[timedelta64]: ...
+@overload
+def tensordot(
+    a: _ArrayLikeObject_co,
+    b: _ArrayLikeObject_co,
+    axes: int | Tuple[_ShapeLike, _ShapeLike] = ...,
+) -> NDArray[object_]: ...
 
+@overload
+def roll(
+    a: _ArrayLike[_SCT],
+    shift: _ShapeLike,
+    axis: None | _ShapeLike = ...,
+) -> NDArray[_SCT]: ...
+@overload
 def roll(
     a: ArrayLike,
     shift: _ShapeLike,
-    axis: Optional[_ShapeLike] = ...,
-) -> ndarray: ...
+    axis: None | _ShapeLike = ...,
+) -> NDArray[Any]: ...
 
-def rollaxis(a: ndarray, axis: int, start: int = ...) -> ndarray: ...
+def rollaxis(
+    a: NDArray[_SCT],
+    axis: int,
+    start: int = ...,
+) -> NDArray[_SCT]: ...
 
 def moveaxis(
-    a: ndarray,
+    a: NDArray[_SCT],
     source: _ShapeLike,
     destination: _ShapeLike,
-) -> ndarray: ...
+) -> NDArray[_SCT]: ...
 
+@overload
 def cross(
-    a: ArrayLike,
-    b: ArrayLike,
+    a: _ArrayLikeBool_co,
+    b: _ArrayLikeBool_co,
     axisa: int = ...,
     axisb: int = ...,
     axisc: int = ...,
-    axis: Optional[int] = ...,
-) -> ndarray: ...
+    axis: None | int = ...,
+) -> NoReturn: ...
+@overload
+def cross(
+    a: _ArrayLikeUInt_co,
+    b: _ArrayLikeUInt_co,
+    axisa: int = ...,
+    axisb: int = ...,
+    axisc: int = ...,
+    axis: None | int = ...,
+) -> NDArray[unsignedinteger[Any]]: ...
+@overload
+def cross(
+    a: _ArrayLikeInt_co,
+    b: _ArrayLikeInt_co,
+    axisa: int = ...,
+    axisb: int = ...,
+    axisc: int = ...,
+    axis: None | int = ...,
+) -> NDArray[signedinteger[Any]]: ...
+@overload
+def cross(
+    a: _ArrayLikeFloat_co,
+    b: _ArrayLikeFloat_co,
+    axisa: int = ...,
+    axisb: int = ...,
+    axisc: int = ...,
+    axis: None | int = ...,
+) -> NDArray[floating[Any]]: ...
+@overload
+def cross(
+    a: _ArrayLikeComplex_co,
+    b: _ArrayLikeComplex_co,
+    axisa: int = ...,
+    axisb: int = ...,
+    axisc: int = ...,
+    axis: None | int = ...,
+) -> NDArray[complexfloating[Any, Any]]: ...
+@overload
+def cross(
+    a: _ArrayLikeObject_co,
+    b: _ArrayLikeObject_co,
+    axisa: int = ...,
+    axisb: int = ...,
+    axisc: int = ...,
+    axis: None | int = ...,
+) -> NDArray[object_]: ...
 
 @overload
 def indices(
     dimensions: Sequence[int],
-    dtype: DTypeLike = ...,
+    dtype: Type[int] = ...,
     sparse: Literal[False] = ...,
-) -> ndarray: ...
+) -> NDArray[int_]: ...
 @overload
 def indices(
     dimensions: Sequence[int],
-    dtype: DTypeLike = ...,
+    dtype: Type[int] = ...,
     sparse: Literal[True] = ...,
-) -> Tuple[ndarray, ...]: ...
+) -> Tuple[NDArray[int_], ...]: ...
+@overload
+def indices(
+    dimensions: Sequence[int],
+    dtype: _DTypeLike[_SCT],
+    sparse: Literal[False] = ...,
+) -> NDArray[_SCT]: ...
+@overload
+def indices(
+    dimensions: Sequence[int],
+    dtype: _DTypeLike[_SCT],
+    sparse: Literal[True],
+) -> Tuple[NDArray[_SCT], ...]: ...
+@overload
+def indices(
+    dimensions: Sequence[int],
+    dtype: DTypeLike,
+    sparse: Literal[False] = ...,
+) -> NDArray[Any]: ...
+@overload
+def indices(
+    dimensions: Sequence[int],
+    dtype: DTypeLike,
+    sparse: Literal[True],
+) -> Tuple[NDArray[Any], ...]: ...
 
 def fromfunction(
     function: Callable[..., _T],
@@ -187,18 +571,39 @@ def fromfunction(
     **kwargs: Any,
 ) -> _T: ...
 
-def isscalar(element: Any) -> bool: ...
+def isscalar(element: object) -> TypeGuard[
+    generic | bool | int | float | complex | str | bytes | memoryview
+]: ...
 
-def binary_repr(num: int, width: Optional[int] = ...) -> str: ...
+def binary_repr(num: int, width: None | int = ...) -> str: ...
 
-def base_repr(number: int, base: int = ..., padding: int = ...) -> str: ...
+def base_repr(
+    number: SupportsAbs[float],
+    base: float = ...,
+    padding: SupportsIndex = ...,
+) -> str: ...
 
+@overload
 def identity(
     n: int,
-    dtype: DTypeLike = ...,
+    dtype: None = ...,
+    *,
+    like: ArrayLike = ...,
+) -> NDArray[float64]: ...
+@overload
+def identity(
+    n: int,
+    dtype: _DTypeLike[_SCT],
+    *,
+    like: ArrayLike = ...,
+) -> NDArray[_SCT]: ...
+@overload
+def identity(
+    n: int,
+    dtype: DTypeLike,
     *,
     like: ArrayLike = ...,
-) -> ndarray: ...
+) -> NDArray[Any]: ...
 
 def allclose(
     a: ArrayLike,
@@ -208,13 +613,22 @@ def allclose(
     equal_nan: bool = ...,
 ) -> bool: ...
 
+@overload
+def isclose(
+    a: _ScalarLike_co,
+    b: _ScalarLike_co,
+    rtol: float = ...,
+    atol: float = ...,
+    equal_nan: bool = ...,
+) -> bool_: ...
+@overload
 def isclose(
     a: ArrayLike,
     b: ArrayLike,
     rtol: float = ...,
     atol: float = ...,
     equal_nan: bool = ...,
-) -> Any: ...
+) -> NDArray[bool_]: ...
 
 def array_equal(a1: ArrayLike, a2: ArrayLike, equal_nan: bool = ...) -> bool: ...
 
diff --git a/numpy/core/numerictypes.py b/numpy/core/numerictypes.py
index 12f424fd4..8e5de852b 100644
--- a/numpy/core/numerictypes.py
+++ b/numpy/core/numerictypes.py
@@ -80,12 +80,10 @@ Exported symbols include:
 
 """
 import numbers
-import warnings
 
 from numpy.core.multiarray import (
-        typeinfo, ndarray, array, empty, dtype, datetime_data,
-        datetime_as_string, busday_offset, busday_count, is_busday,
-        busdaycalendar
+        ndarray, array, dtype, datetime_data, datetime_as_string,
+        busday_offset, busday_count, is_busday, busdaycalendar
         )
 from numpy.core.overrides import set_module
 
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 2b0e33244..1ec178445 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -672,16 +672,38 @@ def configuration(parent_package='',top_path=None):
         # but we cannot use add_installed_pkg_config here either, so we only
         # update the substitution dictionary during npymath build
         config_cmd = config.get_config_cmd()
-
         # Check that the toolchain works, to fail early if it doesn't
         # (avoid late errors with MATHLIB which are confusing if the
         # compiler does not work).
-        st = config_cmd.try_link('int main(void) { return 0;}')
-        if not st:
-            # rerun the failing command in verbose mode
-            config_cmd.compiler.verbose = True
-            config_cmd.try_link('int main(void) { return 0;}')
-            raise RuntimeError("Broken toolchain: cannot link a simple C program")
+        for lang, test_code, note in (
+            ('c', 'int main(void) { return 0;}', ''),
+            ('c++', (
+                    'int main(void)'
+                    '{ auto x = 0.0; return static_cast<int>(x); }'
+                ), (
+                    'note: A compiler with support for C++11 language '
+                    'features is required.'
+                )
+             ),
+        ):
+            is_cpp = lang == 'c++'
+            if is_cpp:
+                # this a workround to get rid of invalid c++ flags
+                # without doing big changes to config.
+                # c tested first, compiler should be here
+                bk_c = config_cmd.compiler
+                config_cmd.compiler = bk_c.cxx_compiler()
+            st = config_cmd.try_link(test_code, lang=lang)
+            if not st:
+                # rerun the failing command in verbose mode
+                config_cmd.compiler.verbose = True
+                config_cmd.try_link(test_code, lang=lang)
+                raise RuntimeError(
+                    f"Broken toolchain: cannot link a simple {lang.upper()} "
+                    f"program. {note}"
+                )
+            if is_cpp:
+                config_cmd.compiler = bk_c
         mlibs = check_mathlib(config_cmd)
 
         posix_mlib = ' '.join(['-l%s' % l for l in mlibs])
@@ -696,16 +718,24 @@ def configuration(parent_package='',top_path=None):
                        join('src', 'npymath', 'halffloat.c')
                        ]
 
-    # Must be true for CRT compilers but not MinGW/cygwin. See gh-9977.
-    # Intel and Clang also don't seem happy with /GL
-    is_msvc = (platform.platform().startswith('Windows') and
-               platform.python_compiler().startswith('MS'))
+    def gl_if_msvc(build_cmd):
+        """ Add flag if we are using MSVC compiler
+
+        We can't see this in our scope, because we have not initialized the
+        distutils build command, so use this deferred calculation to run when
+        we are building the library.
+        """
+        if build_cmd.compiler.compiler_type == 'msvc':
+            # explicitly disable whole-program optimization
+            return ['/GL-']
+        return []
+
     config.add_installed_library('npymath',
             sources=npymath_sources + [get_mathlib_info],
             install_dir='lib',
             build_info={
                 'include_dirs' : [],  # empty list required for creating npy_math_internal.h
-                'extra_compiler_args' : (['/GL-'] if is_msvc else []),
+                'extra_compiler_args': [gl_if_msvc],
             })
     config.add_npy_pkg_config("npymath.ini.in", "lib/npy-pkg-config",
             subst_dict)
@@ -732,6 +762,7 @@ def configuration(parent_package='',top_path=None):
     #######################################################################
 
     common_deps = [
+            join('src', 'common', 'dlpack', 'dlpack.h'),
             join('src', 'common', 'array_assign.h'),
             join('src', 'common', 'binop_override.h'),
             join('src', 'common', 'cblasfuncs.h'),
@@ -741,6 +772,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'common', 'npy_cblas.h'),
             join('src', 'common', 'npy_config.h'),
             join('src', 'common', 'npy_ctypes.h'),
+            join('src', 'common', 'npy_dlpack.h'),
             join('src', 'common', 'npy_extint128.h'),
             join('src', 'common', 'npy_import.h'),
             join('src', 'common', 'npy_hashtable.h'),
@@ -873,6 +905,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'multiarray', 'datetime_busday.c'),
             join('src', 'multiarray', 'datetime_busdaycal.c'),
             join('src', 'multiarray', 'descriptor.c'),
+            join('src', 'multiarray', 'dlpack.c'),
             join('src', 'multiarray', 'dtypemeta.c'),
             join('src', 'multiarray', 'dragon4.c'),
             join('src', 'multiarray', 'dtype_transfer.c'),
@@ -909,7 +942,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'npysort', 'mergesort.c.src'),
             join('src', 'npysort', 'timsort.c.src'),
             join('src', 'npysort', 'heapsort.c.src'),
-            join('src', 'npysort', 'radixsort.c.src'),
+            join('src', 'npysort', 'radixsort.cpp'),
             join('src', 'common', 'npy_partition.h.src'),
             join('src', 'npysort', 'selection.c.src'),
             join('src', 'common', 'npy_binsearch.h.src'),
@@ -949,8 +982,8 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
             join('src', 'umath', 'matmul.h.src'),
             join('src', 'umath', 'matmul.c.src'),
-            join('src', 'umath', 'clip.h.src'),
-            join('src', 'umath', 'clip.c.src'),
+            join('src', 'umath', 'clip.h'),
+            join('src', 'umath', 'clip.cpp'),
             join('src', 'umath', 'dispatching.c'),
             join('src', 'umath', 'legacy_array_method.c'),
             join('src', 'umath', 'ufunc_object.c'),
@@ -980,6 +1013,9 @@ def configuration(parent_package='',top_path=None):
         svml_objs = glob.glob(svml_path + '/**/*.s', recursive=True)
 
     config.add_extension('_multiarray_umath',
+                         # Forcing C language even though we have C++ sources.
+                         # It forces the C linker and don't link C++ runtime.
+                         language = 'c',
                          sources=multiarray_src + umath_src +
                                  common_src +
                                  [generate_config_h,
@@ -994,7 +1030,11 @@ def configuration(parent_package='',top_path=None):
                                 common_deps,
                          libraries=['npymath'],
                          extra_objects=svml_objs,
-                         extra_info=extra_info)
+                         extra_info=extra_info,
+                         extra_cxx_compile_args=['-std=c++11',
+                                                 '-D__STDC_VERSION__=0',
+                                                 '-fno-exceptions',
+                                                 '-fno-rtti'])
 
     #######################################################################
     #                        umath_tests module                           #
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 85c8f16d1..772c87c96 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -43,8 +43,9 @@ C_ABI_VERSION = 0x01000009
 # 0x0000000d - 1.19.x
 # 0x0000000e - 1.20.x
 # 0x0000000e - 1.21.x
-# 0x0000000e - 1.22.x
-C_API_VERSION = 0x0000000e
+# 0x0000000f - 1.22.x
+# 0x0000000f - 1.23.x
+C_API_VERSION = 0x0000000f
 
 class MismatchCAPIWarning(Warning):
     pass
diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src
index 54770959c..84de9a059 100644
--- a/numpy/core/src/_simd/_simd.dispatch.c.src
+++ b/numpy/core/src/_simd/_simd.dispatch.c.src
@@ -381,7 +381,7 @@ SIMD_IMPL_INTRIN_1(sumup_@sfx@, @esfx@, v@sfx@)
  ***************************/
 #if @fp_only@
 /**begin repeat1
- * #intrin = sqrt, recip, abs, square#
+ * #intrin = sqrt, recip, abs, square, ceil, trunc#
  */
 SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, v@sfx@)
 /**end repeat1**/
@@ -615,7 +615,7 @@ SIMD_INTRIN_DEF(sumup_@sfx@)
  ***************************/
 #if @fp_only@
 /**begin repeat1
- * #intrin = sqrt, recip, abs, square#
+ * #intrin = sqrt, recip, abs, square, ceil, trunc#
  */
 SIMD_INTRIN_DEF(@intrin@_@sfx@)
 /**end repeat1**/
diff --git a/numpy/core/src/common/dlpack/dlpack.h b/numpy/core/src/common/dlpack/dlpack.h
new file mode 100644
index 000000000..29209aee1
--- /dev/null
+++ b/numpy/core/src/common/dlpack/dlpack.h
@@ -0,0 +1,201 @@
+// Taken from:
+// https://github.com/dmlc/dlpack/blob/9b6176fdecb55e9bf39b16f08b96913ed3f275b4/include/dlpack/dlpack.h
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file dlpack.h
+ * \brief The common header of DLPack.
+ */
+#ifndef DLPACK_DLPACK_H_
+#define DLPACK_DLPACK_H_
+
+#ifdef __cplusplus
+#define DLPACK_EXTERN_C extern "C"
+#else
+#define DLPACK_EXTERN_C
+#endif
+
+/*! \brief The current version of dlpack */
+#define DLPACK_VERSION 050
+
+/*! \brief DLPACK_DLL prefix for windows */
+#ifdef _WIN32
+#ifdef DLPACK_EXPORTS
+#define DLPACK_DLL __declspec(dllexport)
+#else
+#define DLPACK_DLL __declspec(dllimport)
+#endif
+#else
+#define DLPACK_DLL
+#endif
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!
+ * \brief The device type in DLDevice.
+ */
+typedef enum {
+  /*! \brief CPU device */
+  kDLCPU = 1,
+  /*! \brief CUDA GPU device */
+  kDLCUDA = 2,
+  /*!
+   * \brief Pinned CUDA CPU memory by cudaMallocHost
+   */
+  kDLCUDAHost = 3,
+  /*! \brief OpenCL devices. */
+  kDLOpenCL = 4,
+  /*! \brief Vulkan buffer for next generation graphics. */
+  kDLVulkan = 7,
+  /*! \brief Metal for Apple GPU. */
+  kDLMetal = 8,
+  /*! \brief Verilog simulator buffer */
+  kDLVPI = 9,
+  /*! \brief ROCm GPUs for AMD GPUs */
+  kDLROCM = 10,
+  /*!
+   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
+   */
+  kDLROCMHost = 11,
+  /*!
+   * \brief Reserved extension device type,
+   * used for quickly test extension device
+   * The semantics can differ depending on the implementation.
+   */
+  kDLExtDev = 12,
+  /*!
+   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
+   */
+  kDLCUDAManaged = 13,
+} DLDeviceType;
+
+/*!
+ * \brief A Device for Tensor and operator.
+ */
+typedef struct {
+  /*! \brief The device type used in the device. */
+  DLDeviceType device_type;
+  /*!
+   * \brief The device index.
+   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
+   */
+  int device_id;
+} DLDevice;
+
+/*!
+ * \brief The type code options DLDataType.
+ */
+typedef enum {
+  /*! \brief signed integer */
+  kDLInt = 0U,
+  /*! \brief unsigned integer */
+  kDLUInt = 1U,
+  /*! \brief IEEE floating point */
+  kDLFloat = 2U,
+  /*!
+   * \brief Opaque handle type, reserved for testing purposes.
+   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
+   */
+  kDLOpaqueHandle = 3U,
+  /*! \brief bfloat16 */
+  kDLBfloat = 4U,
+  /*!
+   * \brief complex number
+   * (C/C++/Python layout: compact struct per complex number)
+   */
+  kDLComplex = 5U,
+} DLDataTypeCode;
+
+/*!
+ * \brief The data type the tensor can hold.
+ *
+ *  Examples
+ *   - float: type_code = 2, bits = 32, lanes=1
+ *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
+ *   - int8: type_code = 0, bits = 8, lanes=1
+ *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ */
+typedef struct {
+  /*!
+   * \brief Type code of base types.
+   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+   * footprint, but the value should be one of DLDataTypeCode enum values.
+   * */
+  uint8_t code;
+  /*!
+   * \brief Number of bits, common choices are 8, 16, 32.
+   */
+  uint8_t bits;
+  /*! \brief Number of lanes in the type, used for vector types. */
+  uint16_t lanes;
+} DLDataType;
+
+/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */
+typedef struct {
+  /*!
+   * \brief The opaque data pointer points to the allocated data. This will be
+   * CUDA device pointer or cl_mem handle in OpenCL. This pointer is always
+   * aligned to 256 bytes as in CUDA.
+   *
+   * For given DLTensor, the size of memory required to store the contents of
+   * data is calculated as follows:
+   *
+   * \code{.c}
+   * static inline size_t GetDataSize(const DLTensor* t) {
+   *   size_t size = 1;
+   *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
+   *     size *= t->shape[i];
+   *   }
+   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+   *   return size;
+   * }
+   * \endcode
+   */
+  void* data;
+  /*! \brief The device of the tensor */
+  DLDevice device;
+  /*! \brief Number of dimensions */
+  int ndim;
+  /*! \brief The data type of the pointer*/
+  DLDataType dtype;
+  /*! \brief The shape of the tensor */
+  int64_t* shape;
+  /*!
+   * \brief strides of the tensor (in number of elements, not bytes)
+   *  can be NULL, indicating tensor is compact and row-majored.
+   */
+  int64_t* strides;
+  /*! \brief The offset in bytes to the beginning pointer to data */
+  uint64_t byte_offset;
+} DLTensor;
+
+/*!
+ * \brief C Tensor object, manage memory of DLTensor. This data structure is
+ *  intended to facilitate the borrowing of DLTensor by another framework. It is
+ *  not meant to transfer the tensor. When the borrowing framework doesn't need
+ *  the tensor, it should call the deleter to notify the host that the resource
+ *  is no longer needed.
+ */
+typedef struct DLManagedTensor {
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+  /*! \brief the context of the original host framework of DLManagedTensor in
+   *   which DLManagedTensor is used in the framework. It can also be NULL.
+   */
+  void * manager_ctx;
+  /*! \brief Destructor signature void (*)(void*) - this should be called
+   *   to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
+   *   if there is no way for the caller to provide a reasonable destructor.
+   *   The destructors deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensor * self);
+} DLManagedTensor;
+#ifdef __cplusplus
+}  // DLPACK_EXTERN_C
+#endif
+#endif  // DLPACK_DLPACK_H_
diff --git a/numpy/core/src/common/npy_dlpack.h b/numpy/core/src/common/npy_dlpack.h
new file mode 100644
index 000000000..14ca352c0
--- /dev/null
+++ b/numpy/core/src/common/npy_dlpack.h
@@ -0,0 +1,28 @@
+#include "Python.h"
+#include "dlpack/dlpack.h"
+
+#ifndef NPY_DLPACK_H
+#define NPY_DLPACK_H
+
+// Part of the Array API specification.
+#define NPY_DLPACK_CAPSULE_NAME "dltensor"
+#define NPY_DLPACK_USED_CAPSULE_NAME "used_dltensor"
+
+// Used internally by NumPy to store a base object
+// as it has to release a reference to the original
+// capsule.
+#define NPY_DLPACK_INTERNAL_CAPSULE_NAME "numpy_dltensor"
+
+PyObject *
+array_dlpack(PyArrayObject *self, PyObject *const *args, Py_ssize_t len_args,
+             PyObject *kwnames);
+
+
+PyObject *
+array_dlpack_device(PyArrayObject *self, PyObject *NPY_UNUSED(args));
+
+
+NPY_NO_EXPORT PyObject *
+_from_dlpack(PyObject *NPY_UNUSED(self), PyObject *obj);
+
+#endif
diff --git a/numpy/core/src/common/npy_sort.h.src b/numpy/core/src/common/npy_sort.h.src
index ddbde0c9b..b4a1e9b0c 100644
--- a/numpy/core/src/common/npy_sort.h.src
+++ b/numpy/core/src/common/npy_sort.h.src
@@ -49,9 +49,14 @@ NPY_NO_EXPORT int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *
  * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
  *         longlong, ulonglong#
  */
-
+#ifdef __cplusplus
+extern "C" {
+#endif
 NPY_NO_EXPORT int radixsort_@suff@(void *vec, npy_intp cnt, void *null);
 NPY_NO_EXPORT int aradixsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}
+#endif
 
 /**end repeat**/
 
diff --git a/numpy/core/src/common/numpy_tag.h b/numpy/core/src/common/numpy_tag.h
new file mode 100644
index 000000000..dc8d5286b
--- /dev/null
+++ b/numpy/core/src/common/numpy_tag.h
@@ -0,0 +1,78 @@
+#ifndef _NPY_COMMON_TAG_H_
+#define _NPY_COMMON_TAG_H_
+
+namespace npy {
+
+struct integral_tag {
+};
+struct floating_point_tag {
+};
+struct complex_tag {
+};
+struct date_tag {
+};
+
+struct bool_tag : integral_tag {
+    using type = npy_bool;
+};
+struct byte_tag : integral_tag {
+    using type = npy_byte;
+};
+struct ubyte_tag : integral_tag {
+    using type = npy_ubyte;
+};
+struct short_tag : integral_tag {
+    using type = npy_short;
+};
+struct ushort_tag : integral_tag {
+    using type = npy_ushort;
+};
+struct int_tag : integral_tag {
+    using type = npy_int;
+};
+struct uint_tag : integral_tag {
+    using type = npy_uint;
+};
+struct long_tag : integral_tag {
+    using type = npy_long;
+};
+struct ulong_tag : integral_tag {
+    using type = npy_ulong;
+};
+struct longlong_tag : integral_tag {
+    using type = npy_longlong;
+};
+struct ulonglong_tag : integral_tag {
+    using type = npy_ulonglong;
+};
+struct half_tag {
+    using type = npy_half;
+};
+struct float_tag : floating_point_tag {
+    using type = npy_float;
+};
+struct double_tag : floating_point_tag {
+    using type = npy_double;
+};
+struct longdouble_tag : floating_point_tag {
+    using type = npy_longdouble;
+};
+struct cfloat_tag : complex_tag {
+    using type = npy_cfloat;
+};
+struct cdouble_tag : complex_tag {
+    using type = npy_cdouble;
+};
+struct clongdouble_tag : complex_tag {
+    using type = npy_clongdouble;
+};
+struct datetime_tag : date_tag {
+    using type = npy_datetime;
+};
+struct timedelta_tag : date_tag {
+    using type = npy_timedelta;
+};
+
+}  // namespace npy
+
+#endif
diff --git a/numpy/core/src/common/simd/avx2/math.h b/numpy/core/src/common/simd/avx2/math.h
index 9460183df..ec15e50e1 100644
--- a/numpy/core/src/common/simd/avx2/math.h
+++ b/numpy/core/src/common/simd/avx2/math.h
@@ -105,4 +105,12 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     return _mm256_blendv_epi8(a, b, _mm256_cmpgt_epi64(a, b));
 }
 
+// ceil
+#define npyv_ceil_f32 _mm256_ceil_ps
+#define npyv_ceil_f64 _mm256_ceil_pd
+
+// trunc
+#define npyv_trunc_f32(A) _mm256_round_ps(A, _MM_FROUND_TO_ZERO)
+#define npyv_trunc_f64(A) _mm256_round_pd(A, _MM_FROUND_TO_ZERO)
+
 #endif // _NPY_SIMD_AVX2_MATH_H
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h
index e27bf15fe..5891a270a 100644
--- a/numpy/core/src/common/simd/avx2/memory.h
+++ b/numpy/core/src/common/simd/avx2/memory.h
@@ -87,7 +87,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
 #if 0 // slower
 NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
 {
-    const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
+    const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
     return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
 }
 NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
@@ -170,9 +170,9 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
 NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
-    const __m256i vfill = _mm256_set1_epi64x(fill);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane  = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+    const __m256i vfill = npyv_setall_s64(fill);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane  = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask    = _mm256_cmpgt_epi64(vnlane, steps);
     __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
     return _mm256_blendv_epi8(vfill, payload, mask);
@@ -181,8 +181,8 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
 NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
 {
     assert(nlane > 0);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane  = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane  = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask    = _mm256_cmpgt_epi64(vnlane, steps);
     return _mm256_maskload_epi64((const void*)ptr, mask);
 }
@@ -211,10 +211,10 @@ NPY_FINLINE npyv_s64
 npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
-    const __m256i vfill = _mm256_set1_epi64x(fill);
-    const __m256i idx   = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane  = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+    const __m256i vfill = npyv_setall_s64(fill);
+    const __m256i idx   = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane  = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask    = _mm256_cmpgt_epi64(vnlane, steps);
     return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
 }
@@ -238,8 +238,8 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a
 NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
 {
     assert(nlane > 0);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane = _mm256_set1_epi64x(nlane > 8 ? 8 : (int)nlane);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
     __m256i mask   = _mm256_cmpgt_epi64(vnlane, steps);
     _mm256_maskstore_epi64((void*)ptr, mask, a);
 }
diff --git a/numpy/core/src/common/simd/avx2/misc.h b/numpy/core/src/common/simd/avx2/misc.h
index e96696dc9..5e91e91b3 100644
--- a/numpy/core/src/common/simd/avx2/misc.h
+++ b/numpy/core/src/common/simd/avx2/misc.h
@@ -24,11 +24,27 @@
 #define npyv_setall_s16(VAL) _mm256_set1_epi16((short)VAL)
 #define npyv_setall_u32(VAL) _mm256_set1_epi32((int)VAL)
 #define npyv_setall_s32(VAL) _mm256_set1_epi32(VAL)
-#define npyv_setall_u64(VAL) _mm256_set1_epi64x(VAL)
-#define npyv_setall_s64(VAL) _mm256_set1_epi64x(VAL)
 #define npyv_setall_f32(VAL) _mm256_set1_ps(VAL)
 #define npyv_setall_f64(VAL) _mm256_set1_pd(VAL)
 
+NPY_FINLINE __m256i npyv__setr_epi64(npy_int64, npy_int64, npy_int64, npy_int64);
+NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
+{
+    npy_int64 ai = (npy_int64)a;
+#if defined(_MSC_VER) && defined(_M_IX86) 
+    return npyv__setr_epi64(ai, ai, ai, ai);
+#else
+    return _mm256_set1_epi64x(ai);
+#endif
+}
+NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64(a, a, a, a);
+#else
+    return _mm256_set1_epi64x(a);
+#endif
+}
 /*
  * vector with specific values set to each lane and
  * set a specific value to all remained lanes
@@ -59,7 +75,14 @@ NPY_FINLINE __m256i npyv__setr_epi32(int i0, int i1, int i2, int i3, int i4, int
 }
 NPY_FINLINE __m256i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3)
 {
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return _mm256_setr_epi32(
+        (int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
+        (int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32)
+    );
+#else
     return _mm256_setr_epi64x(i0, i1, i2, i3);
+#endif
 }
 
 NPY_FINLINE __m256 npyv__setr_ps(float i0, float i1, float i2, float i3, float i4, float i5,
diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h
index 0141396d0..f30e50ad0 100644
--- a/numpy/core/src/common/simd/avx512/math.h
+++ b/numpy/core/src/common/simd/avx512/math.h
@@ -35,7 +35,7 @@ NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a)
     return _mm512_range_pd(a, a, 8);
 #else
     return npyv_and_f64(
-        a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL))
+        a, _mm512_castsi512_pd(npyv_setall_s64(0x7fffffffffffffffLL))
     );
 #endif
 }
@@ -112,4 +112,12 @@ NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b)
 #define npyv_min_u64 _mm512_min_epu64
 #define npyv_min_s64 _mm512_min_epi64
 
+// ceil
+#define npyv_ceil_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_POS_INF)
+#define npyv_ceil_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_POS_INF)
+
+// trunc
+#define npyv_trunc_f32(A) _mm512_roundscale_ps(A, _MM_FROUND_TO_ZERO)
+#define npyv_trunc_f64(A) _mm512_roundscale_pd(A, _MM_FROUND_TO_ZERO)
+
 #endif // _NPY_SIMD_AVX512_MATH_H
diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h
index bffd6e907..47095bf72 100644
--- a/numpy/core/src/common/simd/avx512/memory.h
+++ b/numpy/core/src/common/simd/avx512/memory.h
@@ -110,7 +110,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
 //// 64
 NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
 {
-    const __m512i idx = _mm512_setr_epi64(
+    const __m512i idx = npyv_set_s64(
         0*stride, 1*stride, 2*stride, 3*stride,
         4*stride, 5*stride, 6*stride, 7*stride
     );
@@ -140,7 +140,7 @@ NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a)
 //// 64
 NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
 {
-    const __m512i idx = _mm512_setr_epi64(
+    const __m512i idx = npyv_set_s64(
         0*stride, 1*stride, 2*stride, 3*stride,
         4*stride, 5*stride, 6*stride, 7*stride
     );
@@ -173,7 +173,7 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
 NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
-    const __m512i vfill = _mm512_set1_epi64(fill);
+    const __m512i vfill = npyv_setall_s64(fill);
     const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
     return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
 }
@@ -210,11 +210,11 @@ NPY_FINLINE npyv_s64
 npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
-    const __m512i idx = _mm512_setr_epi64(
+    const __m512i idx = npyv_set_s64(
         0*stride, 1*stride, 2*stride, 3*stride,
         4*stride, 5*stride, 6*stride, 7*stride
     );
-    const __m512i vfill = _mm512_set1_epi64(fill);
+    const __m512i vfill = npyv_setall_s64(fill);
     const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1;
     return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
 }
@@ -258,7 +258,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp
 NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
 {
     assert(nlane > 0);
-    const __m512i idx = _mm512_setr_epi64(
+    const __m512i idx = npyv_set_s64(
         0*stride, 1*stride, 2*stride, 3*stride,
         4*stride, 5*stride, 6*stride, 7*stride
     );
diff --git a/numpy/core/src/common/simd/avx512/misc.h b/numpy/core/src/common/simd/avx512/misc.h
index 4b6729b05..c3039ecfe 100644
--- a/numpy/core/src/common/simd/avx512/misc.h
+++ b/numpy/core/src/common/simd/avx512/misc.h
@@ -24,11 +24,30 @@
 #define npyv_setall_s16(VAL) _mm512_set1_epi16((short)VAL)
 #define npyv_setall_u32(VAL) _mm512_set1_epi32((int)VAL)
 #define npyv_setall_s32(VAL) _mm512_set1_epi32(VAL)
-#define npyv_setall_u64(VAL) _mm512_set1_epi64(VAL)
-#define npyv_setall_s64(VAL) _mm512_set1_epi64(VAL)
 #define npyv_setall_f32(VAL) _mm512_set1_ps(VAL)
 #define npyv_setall_f64(VAL) _mm512_set1_pd(VAL)
 
+NPY_FINLINE __m512i npyv__setr_epi64(
+    npy_int64, npy_int64, npy_int64, npy_int64,
+    npy_int64, npy_int64, npy_int64, npy_int64
+);
+NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
+{
+    npy_int64 ai = (npy_int64)a;
+#if defined(_MSC_VER) && defined(_M_IX86) 
+    return npyv__setr_epi64(ai, ai, ai, ai, ai, ai, ai, ai);
+#else
+    return _mm512_set1_epi64(ai);
+#endif
+}
+NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64(a, a, a, a, a, a, a, a);
+#else
+    return _mm512_set1_epi64(a);
+#endif
+}
 /**
  * vector with specific values set to each lane and
  * set a specific value to all remained lanes
@@ -76,7 +95,16 @@ NPY_FINLINE __m512i npyv__setr_epi32(
 NPY_FINLINE __m512i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3,
                                      npy_int64 i4, npy_int64 i5, npy_int64 i6, npy_int64 i7)
 {
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return _mm512_setr_epi32(
+        (int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32),
+        (int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32),
+        (int)i4, (int)(i4 >> 32), (int)i5, (int)(i5 >> 32),
+        (int)i6, (int)(i6 >> 32), (int)i7, (int)(i7 >> 32)
+    );
+#else
     return _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7);
+#endif
 }
 
 NPY_FINLINE __m512 npyv__setr_ps(
diff --git a/numpy/core/src/common/simd/avx512/utils.h b/numpy/core/src/common/simd/avx512/utils.h
index 8066283c6..c3079283f 100644
--- a/numpy/core/src/common/simd/avx512/utils.h
+++ b/numpy/core/src/common/simd/avx512/utils.h
@@ -26,7 +26,7 @@
     #define npyv512_combine_ps256(A, B) _mm512_insertf32x8(_mm512_castps256_ps512(A), B, 1)
 #else
     #define npyv512_combine_ps256(A, B) \
-        _mm512_castsi512_ps(npyv512_combine_si256(_mm512_castps_si512(A), _mm512_castps_si512(B)))
+        _mm512_castsi512_ps(npyv512_combine_si256(_mm256_castps_si256(A), _mm256_castps_si256(B)))
 #endif
 
 #define NPYV_IMPL_AVX512_FROM_AVX2_1ARG(FN_NAME, INTRIN) \
@@ -39,6 +39,26 @@
         return npyv512_combine_si256(l_a, h_a);          \
     }
 
+#define NPYV_IMPL_AVX512_FROM_AVX2_PS_1ARG(FN_NAME, INTRIN) \
+    NPY_FINLINE __m512 FN_NAME(__m512 a)                    \
+    {                                                       \
+        __m256 l_a  = npyv512_lower_ps256(a);               \
+        __m256 h_a  = npyv512_higher_ps256(a);              \
+        l_a = INTRIN(l_a);                                  \
+        h_a = INTRIN(h_a);                                  \
+        return npyv512_combine_ps256(l_a, h_a);             \
+    }
+
+#define NPYV_IMPL_AVX512_FROM_AVX2_PD_1ARG(FN_NAME, INTRIN) \
+    NPY_FINLINE __m512d FN_NAME(__m512d a)                  \
+    {                                                       \
+        __m256d l_a  = npyv512_lower_pd256(a);              \
+        __m256d h_a  = npyv512_higher_pd256(a);             \
+        l_a = INTRIN(l_a);                                  \
+        h_a = INTRIN(h_a);                                  \
+        return npyv512_combine_pd256(l_a, h_a);             \
+    }
+
 #define NPYV_IMPL_AVX512_FROM_AVX2_2ARG(FN_NAME, INTRIN) \
     NPY_FINLINE __m512i FN_NAME(__m512i a, __m512i b)    \
     {                                                    \
diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h
index 5d2ab2906..a7a461721 100644
--- a/numpy/core/src/common/simd/intdiv.h
+++ b/numpy/core/src/common/simd/intdiv.h
@@ -162,11 +162,12 @@ NPY_FINLINE npy_uint64 npyv__divh128_u64(npy_uint64 high, npy_uint64 divisor)
     npy_uint32 divisor_hi  = divisor >> 32;
     npy_uint32 divisor_lo  = divisor & 0xFFFFFFFF;
     // compute high quotient digit
-    npy_uint32 quotient_hi = (npy_uint32)(high / divisor_hi);
+    npy_uint64 quotient_hi = high / divisor_hi;
     npy_uint64 remainder   = high - divisor_hi * quotient_hi;
     npy_uint64 base32      = 1ULL << 32;
     while (quotient_hi >= base32 || quotient_hi*divisor_lo > base32*remainder) {
-        remainder += --divisor_hi;
+        --quotient_hi;
+        remainder += divisor_hi;
         if (remainder >= base32) {
             break;
         }
@@ -200,7 +201,7 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
     default:
         l   = npyv__bitscan_revnz_u32(d - 1) + 1;  // ceil(log2(d))
         l2  = (npy_uint8)(1 << l);                 // 2^l, overflow to 0 if l = 8
-        m   = ((l2 - d) << 8) / d + 1;             // multiplier
+        m   = ((npy_uint16)((l2 - d) << 8)) / d + 1; // multiplier
         sh1 = 1;  sh2 = l - 1;                     // shift counts
     }
     npyv_u8x3 divisor;
diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h
index 19ea6f22f..19e5cd846 100644
--- a/numpy/core/src/common/simd/neon/math.h
+++ b/numpy/core/src/common/simd/neon/math.h
@@ -88,16 +88,16 @@ NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a)
 #define npyv_max_f64 vmaxq_f64
 // Maximum, supports IEEE floating-point arithmetic (IEC 60559),
 // - If one of the two vectors contains NaN, the equivalent element of the other vector is set
-// - Only if both corresponded elements are NaN, NaN is set. 
+// - Only if both corresponded elements are NaN, NaN is set.
 #ifdef NPY_HAVE_ASIMD
     #define npyv_maxp_f32 vmaxnmq_f32
 #else
     NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b)
-    { 
+    {
         npyv_u32 nn_a = vceqq_f32(a, a);
         npyv_u32 nn_b = vceqq_f32(b, b);
         return vmaxq_f32(vbslq_f32(nn_a, a, b), vbslq_f32(nn_b, b, a));
-    } 
+    }
 #endif
 #if NPY_SIMD_F64
     #define npyv_maxp_f64 vmaxnmq_f64
@@ -123,16 +123,16 @@ NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b)
 #define npyv_min_f64 vminq_f64
 // Minimum, supports IEEE floating-point arithmetic (IEC 60559),
 // - If one of the two vectors contains NaN, the equivalent element of the other vector is set
-// - Only if both corresponded elements are NaN, NaN is set. 
+// - Only if both corresponded elements are NaN, NaN is set.
 #ifdef NPY_HAVE_ASIMD
     #define npyv_minp_f32 vminnmq_f32
 #else
     NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b)
-    { 
+    {
         npyv_u32 nn_a = vceqq_f32(a, a);
         npyv_u32 nn_b = vceqq_f32(b, b);
         return vminq_f32(vbslq_f32(nn_a, a, b), vbslq_f32(nn_b, b, a));
-    } 
+    }
 #endif
 #if NPY_SIMD_F64
     #define npyv_minp_f64 vminnmq_f64
@@ -153,4 +153,74 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     return vbslq_s64(npyv_cmplt_s64(a, b), a, b);
 }
 
+// ceil
+#ifdef NPY_HAVE_ASIMD
+    #define npyv_ceil_f32 vrndpq_f32
+#else
+   NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
+   {
+        const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+        const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
+        const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+        /**
+         * On armv7, vcvtq.f32 handles special cases as follows:
+         *  NaN return 0
+         * +inf or +outrange return 0x80000000(-0.0f)
+         * -inf or -outrange return 0x7fffffff(nan)
+         */
+        npyv_s32 roundi = vcvtq_s32_f32(a);
+        npyv_f32 round = vcvtq_f32_s32(roundi);
+        npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32(
+            vandq_u32(vcltq_f32(round, a), one))
+        );
+        // respect signed zero, e.g. -0.5 -> -0.0
+        npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
+            vreinterpretq_s32_f32(ceil),
+            vandq_s32(vreinterpretq_s32_f32(a), szero)
+        ));
+        // if nan or overflow return a
+        npyv_u32 nnan = npyv_notnan_f32(a);
+        npyv_u32 overflow = vorrq_u32(
+            vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
+        );
+        return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+   }
+#endif
+#if NPY_SIMD_F64
+    #define npyv_ceil_f64 vrndpq_f64
+#endif // NPY_SIMD_F64
+
+// trunc
+#ifdef NPY_HAVE_ASIMD
+    #define npyv_trunc_f32 vrndq_f32
+#else
+   NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
+   {
+        const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
+        const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
+        /**
+         * On armv7, vcvtq.f32 handles special cases as follows:
+         *  NaN return 0
+         * +inf or +outrange return 0x80000000(-0.0f)
+         * -inf or -outrange return 0x7fffffff(nan)
+         */
+        npyv_s32 roundi = vcvtq_s32_f32(a);
+        npyv_f32 round = vcvtq_f32_s32(roundi);
+        // respect signed zero, e.g. -0.5 -> -0.0
+        npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32(
+            vreinterpretq_s32_f32(round),
+            vandq_s32(vreinterpretq_s32_f32(a), szero)
+        ));
+        // if nan or overflow return a
+        npyv_u32 nnan = npyv_notnan_f32(a);
+        npyv_u32 overflow = vorrq_u32(
+            vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int)
+        );
+        return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a);
+   }
+#endif
+#if NPY_SIMD_F64
+    #define npyv_trunc_f64 vrndq_f64
+#endif // NPY_SIMD_F64
+
 #endif // _NPY_SIMD_NEON_MATH_H
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index a3e2b95de..08b2a7d00 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -27,6 +27,25 @@ typedef npy_int64  npyv_lanetype_s64;
 typedef float      npyv_lanetype_f32;
 typedef double     npyv_lanetype_f64;
 
+#if defined(_MSC_VER) && defined(_M_IX86)
+/*
+ * Avoid using any of the following intrinsics with MSVC 32-bit,
+ * even if they are apparently work on newer versions.
+ * They had bad impact on the generated instructions,
+ * sometimes the compiler deal with them without the respect
+ * of 32-bit mode which lead to crush due to execute 64-bit
+ * instructions and other times generate bad emulated instructions. 
+ */
+    #undef _mm512_set1_epi64
+    #undef _mm256_set1_epi64x
+    #undef _mm_set1_epi64x
+    #undef _mm512_setr_epi64x
+    #undef _mm256_setr_epi64x
+    #undef _mm_setr_epi64x
+    #undef _mm512_set_epi64x
+    #undef _mm256_set_epi64x
+    #undef _mm_set_epi64x
+#endif
 #if defined(NPY_HAVE_AVX512F) && !defined(NPY_SIMD_FORCE_256) && !defined(NPY_SIMD_FORCE_128)
     #include "avx512/avx512.h"
 #elif defined(NPY_HAVE_AVX2) && !defined(NPY_SIMD_FORCE_128)
diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h
index 97d35afc5..5daf7711e 100644
--- a/numpy/core/src/common/simd/sse/math.h
+++ b/numpy/core/src/common/simd/sse/math.h
@@ -143,4 +143,63 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b)
     return npyv_select_s64(npyv_cmplt_s64(a, b), a, b);
 }
 
+// ceil
+#ifdef NPY_HAVE_SSE41
+    #define npyv_ceil_f32 _mm_ceil_ps
+    #define npyv_ceil_f64 _mm_ceil_pd
+#else
+    NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
+    {
+        const npyv_f32 szero = _mm_set1_ps(-0.0f);
+        const npyv_f32 one = _mm_set1_ps(1.0f);
+        npyv_s32 roundi = _mm_cvttps_epi32(a);
+        npyv_f32 round = _mm_cvtepi32_ps(roundi);
+        npyv_f32 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, a), one));
+        // respect signed zero, e.g. -0.5 -> -0.0
+        npyv_f32 rzero = _mm_or_ps(ceil, _mm_and_ps(a, szero));
+        // if overflow return a
+        return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero);
+    }
+    NPY_FINLINE npyv_f64 npyv_ceil_f64(npyv_f64 a)
+    {
+        const npyv_f64 szero = _mm_set1_pd(-0.0);
+        const npyv_f64 one = _mm_set1_pd(1.0);
+        const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
+        npyv_f64 sign_two52 = _mm_or_pd(two_power_52, _mm_and_pd(a, szero));
+        // round by add magic number 2^52
+        npyv_f64 round = _mm_sub_pd(_mm_add_pd(a, sign_two52), sign_two52);
+        npyv_f64 ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, a), one));
+        // respect signed zero, e.g. -0.5 -> -0.0
+        return _mm_or_pd(ceil, _mm_and_pd(a, szero));
+    }
+#endif
+
+// trunc
+#ifdef NPY_HAVE_SSE41
+    #define npyv_trunc_f32(A) _mm_round_ps(A, _MM_FROUND_TO_ZERO)
+    #define npyv_trunc_f64(A) _mm_round_pd(A, _MM_FROUND_TO_ZERO)
+#else
+    NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
+    {
+        const npyv_f32 szero = _mm_set1_ps(-0.0f);
+        npyv_s32 roundi = _mm_cvttps_epi32(a);
+        npyv_f32 trunc = _mm_cvtepi32_ps(roundi);
+        // respect signed zero, e.g. -0.5 -> -0.0
+        npyv_f32 rzero = _mm_or_ps(trunc, _mm_and_ps(a, szero));
+        // if overflow return a
+        return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero);
+    }
+    NPY_FINLINE npyv_f64 npyv_trunc_f64(npyv_f64 a)
+    {
+        const npyv_f64 szero = _mm_set1_pd(-0.0);
+        const npyv_f64 one = _mm_set1_pd(1.0);
+        const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000);
+        npyv_f64 abs_a = npyv_abs_f64(a);
+        // round by add magic number 2^52
+        npyv_f64 abs_round = _mm_sub_pd(_mm_add_pd(abs_a, two_power_52), two_power_52);
+        npyv_f64 subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_a), one);
+        return _mm_or_pd(_mm_sub_pd(abs_round, subtrahend), _mm_and_pd(a, szero));
+    }
+#endif
+
 #endif // _NPY_SIMD_SSE_MATH_H
diff --git a/numpy/core/src/common/simd/sse/misc.h b/numpy/core/src/common/simd/sse/misc.h
index 1099c491d..7d13fbf55 100644
--- a/numpy/core/src/common/simd/sse/misc.h
+++ b/numpy/core/src/common/simd/sse/misc.h
@@ -24,11 +24,28 @@
 #define npyv_setall_s16(VAL) _mm_set1_epi16((short)(VAL))
 #define npyv_setall_u32(VAL) _mm_set1_epi32((int)(VAL))
 #define npyv_setall_s32(VAL) _mm_set1_epi32((int)(VAL))
-#define npyv_setall_u64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
-#define npyv_setall_s64(VAL) _mm_set1_epi64x((npy_int64)(VAL))
 #define npyv_setall_f32 _mm_set1_ps
 #define npyv_setall_f64 _mm_set1_pd
 
+NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1);
+
+NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64((npy_int64)a, (npy_int64)a);
+#else
+    return _mm_set1_epi64x((npy_int64)a);
+#endif
+}
+NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a)
+{
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return npyv__setr_epi64(a, a);
+#else
+    return _mm_set1_epi64x((npy_int64)a);
+#endif
+}
+
 /**
  * vector with specific values set to each lane and
  * set a specific value to all remained lanes
@@ -53,7 +70,11 @@ NPY_FINLINE __m128i npyv__setr_epi32(int i0, int i1, int i2, int i3)
 }
 NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1)
 {
+#if defined(_MSC_VER) && defined(_M_IX86)
+    return _mm_setr_epi32((int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32));
+#else
     return _mm_set_epi64x(i1, i0);
+#endif
 }
 NPY_FINLINE __m128 npyv__setr_ps(float i0, float i1, float i2, float i3)
 {
diff --git a/numpy/core/src/common/simd/vsx/math.h b/numpy/core/src/common/simd/vsx/math.h
index b2e393c7c..d138cae8a 100644
--- a/numpy/core/src/common/simd/vsx/math.h
+++ b/numpy/core/src/common/simd/vsx/math.h
@@ -69,4 +69,12 @@ NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a)
 #define npyv_min_u64 vec_min
 #define npyv_min_s64 vec_min
 
+// ceil
+#define npyv_ceil_f32 vec_ceil
+#define npyv_ceil_f64 vec_ceil
+
+// trunc
+#define npyv_trunc_f32 vec_trunc
+#define npyv_trunc_f64 vec_trunc
+
 #endif // _NPY_SIMD_VSX_MATH_H
diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src
index e945d0771..9486b7cff 100644
--- a/numpy/core/src/multiarray/_multiarray_tests.c.src
+++ b/numpy/core/src/multiarray/_multiarray_tests.c.src
@@ -2193,7 +2193,7 @@ PrintFloat_Printf_g(PyObject *obj, int precision)
     }
     else if (PyArray_IsScalar(obj, LongDouble)) {
         npy_longdouble x = PyArrayScalar_VAL(obj, LongDouble);
-        PyOS_snprintf(str, sizeof(str), "%.*Lg", precision, x);
+        PyOS_snprintf(str, sizeof(str), "%.*" NPY_LONGDOUBLE_FMT, precision, x);
     }
     else{
         double val = PyFloat_AsDouble(obj);
@@ -2363,6 +2363,17 @@ run_intp_converter(PyObject* NPY_UNUSED(self), PyObject *args)
     return tup;
 }
 
+/* used to test NPY_ARRAY_ENSURENOCOPY raises ValueError */
+static PyObject*
+npy_ensurenocopy(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+    int flags = NPY_ARRAY_ENSURENOCOPY;
+    if (!PyArray_CheckFromAny(args, NULL, 0, 0, flags, NULL)) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
 static PyMethodDef Multiarray_TestsMethods[] = {
     {"argparse_example_function",
          (PyCFunction)argparse_example_function,
@@ -2424,6 +2435,9 @@ static PyMethodDef Multiarray_TestsMethods[] = {
     {"npy_discard",
         npy_discard,
         METH_O, NULL},
+    {"npy_ensurenocopy",
+        npy_ensurenocopy,
+        METH_O, NULL},
     {"get_buffer_info",
         get_buffer_info,
         METH_VARARGS, NULL},
diff --git a/numpy/core/src/multiarray/alloc.c b/numpy/core/src/multiarray/alloc.c
index adb4ae128..94a7daa83 100644
--- a/numpy/core/src/multiarray/alloc.c
+++ b/numpy/core/src/multiarray/alloc.c
@@ -133,9 +133,10 @@ npy_alloc_cache(npy_uintp sz)
 
 /* zero initialized data, sz is number of bytes to allocate */
 NPY_NO_EXPORT void *
-npy_alloc_cache_zero(npy_uintp sz)
+npy_alloc_cache_zero(size_t nmemb, size_t size)
 {
     void * p;
+    size_t sz = nmemb * size;
     NPY_BEGIN_THREADS_DEF;
     if (sz < NBUCKETS) {
         p = _npy_alloc_cache(sz, 1, NBUCKETS, datacache, &PyDataMem_NEW);
@@ -145,7 +146,7 @@ npy_alloc_cache_zero(npy_uintp sz)
         return p;
     }
     NPY_BEGIN_THREADS;
-    p = PyDataMem_NEW_ZEROED(sz, 1);
+    p = PyDataMem_NEW_ZEROED(nmemb, size);
     NPY_END_THREADS;
     return p;
 }
@@ -185,10 +186,28 @@ npy_free_cache_dim(void * p, npy_uintp sz)
                     &PyArray_free);
 }
 
+/* Similar to array_dealloc in arrayobject.c */
+static NPY_INLINE void
+WARN_NO_RETURN(PyObject* warning, const char * msg) {
+    if (PyErr_WarnEx(warning, msg, 1) < 0) {
+        PyObject * s;
+
+        s = PyUnicode_FromString("PyDataMem_UserFREE");
+        if (s) {
+            PyErr_WriteUnraisable(s);
+            Py_DECREF(s);
+        }
+        else {
+            PyErr_WriteUnraisable(Py_None);
+        }
+    }
+}
+
+
 
 /* malloc/free/realloc hook */
-NPY_NO_EXPORT PyDataMem_EventHookFunc *_PyDataMem_eventhook;
-NPY_NO_EXPORT void *_PyDataMem_eventhook_user_data;
+NPY_NO_EXPORT PyDataMem_EventHookFunc *_PyDataMem_eventhook = NULL;
+NPY_NO_EXPORT void *_PyDataMem_eventhook_user_data = NULL;
 
 /*NUMPY_API
  * Sets the allocation event hook for numpy array data.
@@ -209,6 +228,8 @@ NPY_NO_EXPORT void *_PyDataMem_eventhook_user_data;
  * operations that might cause new allocation events (such as the
  * creation/destruction numpy objects, or creating/destroying Python
  * objects which might cause a gc)
+ *
+ * Deprecated in 1.23
  */
 NPY_NO_EXPORT PyDataMem_EventHookFunc *
 PyDataMem_SetEventHook(PyDataMem_EventHookFunc *newhook,
@@ -217,6 +238,10 @@ PyDataMem_SetEventHook(PyDataMem_EventHookFunc *newhook,
     PyDataMem_EventHookFunc *temp;
     NPY_ALLOW_C_API_DEF
     NPY_ALLOW_C_API
+    /* 2021-11-18, 1.23 */
+    WARN_NO_RETURN(PyExc_DeprecationWarning,
+                     "PyDataMem_SetEventHook is deprecated, use tracemalloc "
+                     "and the 'np.lib.tracemalloc_domain' domain");
     temp = _PyDataMem_eventhook;
     _PyDataMem_eventhook = newhook;
     if (old_data != NULL) {
@@ -254,21 +279,21 @@ PyDataMem_NEW(size_t size)
  * Allocates zeroed memory for array data.
  */
 NPY_NO_EXPORT void *
-PyDataMem_NEW_ZEROED(size_t size, size_t elsize)
+PyDataMem_NEW_ZEROED(size_t nmemb, size_t size)
 {
     void *result;
 
-    result = calloc(size, elsize);
+    result = calloc(nmemb, size);
     if (_PyDataMem_eventhook != NULL) {
         NPY_ALLOW_C_API_DEF
         NPY_ALLOW_C_API
         if (_PyDataMem_eventhook != NULL) {
-            (*_PyDataMem_eventhook)(NULL, result, size * elsize,
+            (*_PyDataMem_eventhook)(NULL, result, nmemb * size,
                                     _PyDataMem_eventhook_user_data);
         }
         NPY_DISABLE_C_API
     }
-    PyTraceMalloc_Track(NPY_TRACE_DOMAIN, (npy_uintp)result, size);
+    PyTraceMalloc_Track(NPY_TRACE_DOMAIN, (npy_uintp)result, nmemb * size);
     return result;
 }
 
@@ -316,3 +341,325 @@ PyDataMem_RENEW(void *ptr, size_t size)
     }
     return result;
 }
+
+// The default data mem allocator malloc routine does not make use of a ctx.
+// It should be called only through PyDataMem_UserNEW
+// since itself does not handle eventhook and tracemalloc logic.
+static NPY_INLINE void *
+default_malloc(void *NPY_UNUSED(ctx), size_t size)
+{
+    return _npy_alloc_cache(size, 1, NBUCKETS, datacache, &malloc);
+}
+
+// The default data mem allocator calloc routine does not make use of a ctx.
+// It should be called only through PyDataMem_UserNEW_ZEROED
+// since itself does not handle eventhook and tracemalloc logic.
+static NPY_INLINE void *
+default_calloc(void *NPY_UNUSED(ctx), size_t nelem, size_t elsize)
+{
+    void * p;
+    size_t sz = nelem * elsize;
+    NPY_BEGIN_THREADS_DEF;
+    if (sz < NBUCKETS) {
+        p = _npy_alloc_cache(sz, 1, NBUCKETS, datacache, &malloc);
+        if (p) {
+            memset(p, 0, sz);
+        }
+        return p;
+    }
+    NPY_BEGIN_THREADS;
+    p = calloc(nelem, elsize);
+    NPY_END_THREADS;
+    return p;
+}
+
+// The default data mem allocator realloc routine does not make use of a ctx.
+// It should be called only through PyDataMem_UserRENEW
+// since itself does not handle eventhook and tracemalloc logic.
+static NPY_INLINE void *
+default_realloc(void *NPY_UNUSED(ctx), void *ptr, size_t new_size)
+{
+    return realloc(ptr, new_size);
+}
+
+// The default data mem allocator free routine does not make use of a ctx.
+// It should be called only through PyDataMem_UserFREE
+// since itself does not handle eventhook and tracemalloc logic.
+static NPY_INLINE void
+default_free(void *NPY_UNUSED(ctx), void *ptr, size_t size)
+{
+    _npy_free_cache(ptr, size, NBUCKETS, datacache, &free);
+}
+
+/* Memory handler global default */
+PyDataMem_Handler default_handler = {
+    "default_allocator",
+    1,
+    {
+        NULL,            /* ctx */
+        default_malloc,  /* malloc */
+        default_calloc,  /* calloc */
+        default_realloc, /* realloc */
+        default_free     /* free */
+    }
+};
+/* singleton capsule of the default handler */
+PyObject *PyDataMem_DefaultHandler;
+
+#if (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x07030600)
+PyObject *current_handler;
+#endif
+
+int uo_index=0;   /* user_override index */
+
+/* Wrappers for the default or any user-assigned PyDataMem_Handler */
+
+NPY_NO_EXPORT void *
+PyDataMem_UserNEW(size_t size, PyObject *mem_handler)
+{
+    void *result;
+    PyDataMem_Handler *handler = (PyDataMem_Handler *) PyCapsule_GetPointer(mem_handler, "mem_handler");
+    if (handler == NULL) {
+        return NULL;
+    }
+    assert(size != 0);
+    result = handler->allocator.malloc(handler->allocator.ctx, size);
+    if (_PyDataMem_eventhook != NULL) {
+        NPY_ALLOW_C_API_DEF
+        NPY_ALLOW_C_API
+        if (_PyDataMem_eventhook != NULL) {
+            (*_PyDataMem_eventhook)(NULL, result, size,
+                                    _PyDataMem_eventhook_user_data);
+        }
+        NPY_DISABLE_C_API
+    }
+    PyTraceMalloc_Track(NPY_TRACE_DOMAIN, (npy_uintp)result, size);
+    return result;
+}
+
+NPY_NO_EXPORT void *
+PyDataMem_UserNEW_ZEROED(size_t nmemb, size_t size, PyObject *mem_handler)
+{
+    void *result;
+    PyDataMem_Handler *handler = (PyDataMem_Handler *) PyCapsule_GetPointer(mem_handler, "mem_handler");
+    if (handler == NULL) {
+        return NULL;
+    }
+    result = handler->allocator.calloc(handler->allocator.ctx, nmemb, size);
+    if (_PyDataMem_eventhook != NULL) {
+        NPY_ALLOW_C_API_DEF
+        NPY_ALLOW_C_API
+        if (_PyDataMem_eventhook != NULL) {
+            (*_PyDataMem_eventhook)(NULL, result, nmemb * size,
+                                    _PyDataMem_eventhook_user_data);
+        }
+        NPY_DISABLE_C_API
+    }
+    PyTraceMalloc_Track(NPY_TRACE_DOMAIN, (npy_uintp)result, nmemb * size);
+    return result;
+}
+
+
+NPY_NO_EXPORT void
+PyDataMem_UserFREE(void *ptr, size_t size, PyObject *mem_handler)
+{
+    PyDataMem_Handler *handler = (PyDataMem_Handler *) PyCapsule_GetPointer(mem_handler, "mem_handler");
+    if (handler == NULL) {
+        WARN_NO_RETURN(PyExc_RuntimeWarning,
+                     "Could not get pointer to 'mem_handler' from PyCapsule");
+        return;
+    }
+    PyTraceMalloc_Untrack(NPY_TRACE_DOMAIN, (npy_uintp)ptr);
+    handler->allocator.free(handler->allocator.ctx, ptr, size);
+    if (_PyDataMem_eventhook != NULL) {
+        NPY_ALLOW_C_API_DEF
+        NPY_ALLOW_C_API
+        if (_PyDataMem_eventhook != NULL) {
+            (*_PyDataMem_eventhook)(ptr, NULL, 0,
+                                    _PyDataMem_eventhook_user_data);
+        }
+        NPY_DISABLE_C_API
+    }
+}
+
+NPY_NO_EXPORT void *
+PyDataMem_UserRENEW(void *ptr, size_t size, PyObject *mem_handler)
+{
+    void *result;
+    PyDataMem_Handler *handler = (PyDataMem_Handler *) PyCapsule_GetPointer(mem_handler, "mem_handler");
+    if (handler == NULL) {
+        return NULL;
+    }
+
+    assert(size != 0);
+    result = handler->allocator.realloc(handler->allocator.ctx, ptr, size);
+    if (result != ptr) {
+        PyTraceMalloc_Untrack(NPY_TRACE_DOMAIN, (npy_uintp)ptr);
+    }
+    PyTraceMalloc_Track(NPY_TRACE_DOMAIN, (npy_uintp)result, size);
+    if (_PyDataMem_eventhook != NULL) {
+        NPY_ALLOW_C_API_DEF
+        NPY_ALLOW_C_API
+        if (_PyDataMem_eventhook != NULL) {
+            (*_PyDataMem_eventhook)(ptr, result, size,
+                                    _PyDataMem_eventhook_user_data);
+        }
+        NPY_DISABLE_C_API
+    }
+    return result;
+}
+
+/*NUMPY_API
+ * Set a new allocation policy. If the input value is NULL, will reset
+ * the policy to the default. Return the previous policy, or
+ * return NULL if an error has occurred. We wrap the user-provided
+ * functions so they will still call the python and numpy
+ * memory management callback hooks.
+ */
+NPY_NO_EXPORT PyObject *
+PyDataMem_SetHandler(PyObject *handler)
+{
+    PyObject *old_handler;
+#if (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x07030600)
+    PyObject *token;
+    if (PyContextVar_Get(current_handler, NULL, &old_handler)) {
+        return NULL;
+    }
+    if (handler == NULL) {
+        handler = PyDataMem_DefaultHandler;
+    }
+    token = PyContextVar_Set(current_handler, handler);
+    if (token == NULL) {
+        Py_DECREF(old_handler);
+        return NULL;
+    }
+    Py_DECREF(token);
+    return old_handler;
+#else
+    PyObject *p;
+    p = PyThreadState_GetDict();
+    if (p == NULL) {
+        return NULL;
+    }
+    old_handler = PyDict_GetItemString(p, "current_allocator");
+    if (old_handler == NULL) {
+        old_handler = PyDataMem_DefaultHandler
+    }
+    Py_INCREF(old_handler);
+    if (handler == NULL) {
+        handler = PyDataMem_DefaultHandler;
+    }
+    const int error = PyDict_SetItemString(p, "current_allocator", handler);
+    if (error) {
+        Py_DECREF(old_handler);
+        return NULL;
+    }
+    return old_handler;
+#endif
+}
+
+/*NUMPY_API
+ * Return the policy that will be used to allocate data
+ * for the next PyArrayObject. On failure, return NULL.
+ */
+NPY_NO_EXPORT PyObject *
+PyDataMem_GetHandler()
+{
+    PyObject *handler;
+#if (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x07030600)
+    if (PyContextVar_Get(current_handler, NULL, &handler)) {
+        return NULL;
+    }
+    return handler;
+#else
+    PyObject *p = PyThreadState_GetDict();
+    if (p == NULL) {
+        return NULL;
+    }
+    handler = PyDict_GetItemString(p, "current_allocator");
+    if (handler == NULL) {
+        handler = PyCapsule_New(&default_handler, "mem_handler", NULL);
+        if (handler == NULL) {
+            return NULL;
+        }
+    }
+    else {
+        Py_INCREF(handler);
+    }
+    return handler;
+#endif
+}
+
+NPY_NO_EXPORT PyObject *
+get_handler_name(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arr=NULL;
+    if (!PyArg_ParseTuple(args, "|O:get_handler_name", &arr)) {
+        return NULL;
+    }
+    if (arr != NULL && !PyArray_Check(arr)) {
+         PyErr_SetString(PyExc_ValueError, "if supplied, argument must be an ndarray");
+         return NULL;
+    }
+    PyObject *mem_handler;
+    PyDataMem_Handler *handler;
+    PyObject *name;
+    if (arr != NULL) {
+        mem_handler = PyArray_HANDLER((PyArrayObject *) arr);
+        if (mem_handler == NULL) {
+            Py_RETURN_NONE;
+        }
+        Py_INCREF(mem_handler);
+    }
+    else {
+        mem_handler = PyDataMem_GetHandler();
+        if (mem_handler == NULL) {
+            return NULL;
+        }
+    }
+    handler = (PyDataMem_Handler *) PyCapsule_GetPointer(mem_handler, "mem_handler");
+    if (handler == NULL) {
+        Py_DECREF(mem_handler);
+        return NULL;
+    }
+    name = PyUnicode_FromString(handler->name);
+    Py_DECREF(mem_handler);
+    return name;
+}
+
+NPY_NO_EXPORT PyObject *
+get_handler_version(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arr=NULL;
+    if (!PyArg_ParseTuple(args, "|O:get_handler_version", &arr)) {
+        return NULL;
+    }
+    if (arr != NULL && !PyArray_Check(arr)) {
+         PyErr_SetString(PyExc_ValueError, "if supplied, argument must be an ndarray");
+         return NULL;
+    }
+    PyObject *mem_handler;
+    PyDataMem_Handler *handler;
+    PyObject *version;
+    if (arr != NULL) {
+        mem_handler = PyArray_HANDLER((PyArrayObject *) arr);
+        if (mem_handler == NULL) {
+            Py_RETURN_NONE;
+        }
+        Py_INCREF(mem_handler);
+    }
+    else {
+        mem_handler = PyDataMem_GetHandler();
+        if (mem_handler == NULL) {
+            return NULL;
+        }
+    }
+    handler = (PyDataMem_Handler *) PyCapsule_GetPointer(mem_handler, "mem_handler");
+    if (handler == NULL) {
+        Py_DECREF(mem_handler);
+        return NULL;
+    }
+    version = PyLong_FromLong(handler->version);
+    Py_DECREF(mem_handler);
+    return version;
+}
diff --git a/numpy/core/src/multiarray/alloc.h b/numpy/core/src/multiarray/alloc.h
index 1259abca5..13c828458 100644
--- a/numpy/core/src/multiarray/alloc.h
+++ b/numpy/core/src/multiarray/alloc.h
@@ -11,13 +11,16 @@ NPY_NO_EXPORT PyObject *
 _set_madvise_hugepage(PyObject *NPY_UNUSED(self), PyObject *enabled_obj);
 
 NPY_NO_EXPORT void *
-npy_alloc_cache(npy_uintp sz);
+PyDataMem_UserNEW(npy_uintp sz, PyObject *mem_handler);
 
 NPY_NO_EXPORT void *
-npy_alloc_cache_zero(npy_uintp sz);
+PyDataMem_UserNEW_ZEROED(size_t nmemb, size_t size, PyObject *mem_handler);
 
 NPY_NO_EXPORT void
-npy_free_cache(void * p, npy_uintp sd);
+PyDataMem_UserFREE(void * p, npy_uintp sd, PyObject *mem_handler);
+
+NPY_NO_EXPORT void *
+PyDataMem_UserRENEW(void *ptr, size_t size, PyObject *mem_handler);
 
 NPY_NO_EXPORT void *
 npy_alloc_cache_dim(npy_uintp sz);
@@ -37,4 +40,14 @@ npy_free_cache_dim_array(PyArrayObject * arr)
     npy_free_cache_dim(PyArray_DIMS(arr), PyArray_NDIM(arr));
 }
 
+extern PyDataMem_Handler default_handler;
+#if (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x07030600)
+extern PyObject *current_handler; /* PyContextVar/PyCapsule */
+#endif
+
+NPY_NO_EXPORT PyObject *
+get_handler_name(PyObject *NPY_UNUSED(self), PyObject *obj);
+NPY_NO_EXPORT PyObject *
+get_handler_version(PyObject *NPY_UNUSED(self), PyObject *obj);
+
 #endif  /* NUMPY_CORE_SRC_MULTIARRAY_ALLOC_H_ */
diff --git a/numpy/core/src/multiarray/array_coercion.c b/numpy/core/src/multiarray/array_coercion.c
index 847bdafc3..2598e4bde 100644
--- a/numpy/core/src/multiarray/array_coercion.c
+++ b/numpy/core/src/multiarray/array_coercion.c
@@ -555,6 +555,7 @@ npy_new_coercion_cache(
         cache = PyMem_Malloc(sizeof(coercion_cache_obj));
     }
     if (cache == NULL) {
+        Py_DECREF(arr_or_sequence);
         PyErr_NoMemory();
         return -1;
     }
@@ -857,6 +858,7 @@ PyArray_AdaptDescriptorToArray(PyArrayObject *arr, PyObject *dtype)
  *        (Initially it is a pointer to the user-provided head pointer).
  * @param fixed_DType User provided fixed DType class
  * @param flags Discovery flags (reporting and behaviour flags, see def.)
+ * @param never_copy Specifies if a copy is allowed during array creation.
  * @return The updated number of maximum dimensions (i.e. scalars will set
  *         this to the current dimensions).
  */
@@ -865,7 +867,8 @@ PyArray_DiscoverDTypeAndShape_Recursive(
         PyObject *obj, int curr_dims, int max_dims, PyArray_Descr**out_descr,
         npy_intp out_shape[NPY_MAXDIMS],
         coercion_cache_obj ***coercion_cache_tail_ptr,
-        PyArray_DTypeMeta *fixed_DType, enum _dtype_discovery_flags *flags)
+        PyArray_DTypeMeta *fixed_DType, enum _dtype_discovery_flags *flags,
+        int never_copy)
 {
     PyArrayObject *arr = NULL;
     PyObject *seq;
@@ -923,7 +926,7 @@ PyArray_DiscoverDTypeAndShape_Recursive(
             requested_descr = *out_descr;
         }
         arr = (PyArrayObject *)_array_from_array_like(obj,
-                requested_descr, 0, NULL);
+                requested_descr, 0, NULL, never_copy);
         if (arr == NULL) {
             return -1;
         }
@@ -1117,7 +1120,7 @@ PyArray_DiscoverDTypeAndShape_Recursive(
         max_dims = PyArray_DiscoverDTypeAndShape_Recursive(
                 objects[i], curr_dims + 1, max_dims,
                 out_descr, out_shape, coercion_cache_tail_ptr, fixed_DType,
-                flags);
+                flags, never_copy);
 
         if (max_dims < 0) {
             return -1;
@@ -1157,6 +1160,7 @@ PyArray_DiscoverDTypeAndShape_Recursive(
  *        The result may be unchanged (remain NULL) when converting a
  *        sequence with no elements. In this case it is callers responsibility
  *        to choose a default.
+ * @param never_copy Specifies that a copy is not allowed.
  * @return dimensions of the discovered object or -1 on error.
  *         WARNING: If (and only if) the output is a single array, the ndim
  *         returned _can_ exceed the maximum allowed number of dimensions.
@@ -1169,7 +1173,7 @@ PyArray_DiscoverDTypeAndShape(
         npy_intp out_shape[NPY_MAXDIMS],
         coercion_cache_obj **coercion_cache,
         PyArray_DTypeMeta *fixed_DType, PyArray_Descr *requested_descr,
-        PyArray_Descr **out_descr)
+        PyArray_Descr **out_descr, int never_copy)
 {
     coercion_cache_obj **coercion_cache_head = coercion_cache;
     *coercion_cache = NULL;
@@ -1214,7 +1218,7 @@ PyArray_DiscoverDTypeAndShape(
 
     int ndim = PyArray_DiscoverDTypeAndShape_Recursive(
             obj, 0, max_dims, out_descr, out_shape, &coercion_cache,
-            fixed_DType, &flags);
+            fixed_DType, &flags, never_copy);
     if (ndim < 0) {
         goto fail;
     }
@@ -1499,7 +1503,7 @@ _discover_array_parameters(PyObject *NPY_UNUSED(self),
     int ndim = PyArray_DiscoverDTypeAndShape(
             obj, NPY_MAXDIMS, shape,
             &coercion_cache,
-            fixed_DType, fixed_descriptor, (PyArray_Descr **)&out_dtype);
+            fixed_DType, fixed_descriptor, (PyArray_Descr **)&out_dtype, 0);
     Py_XDECREF(fixed_DType);
     Py_XDECREF(fixed_descriptor);
     if (ndim < 0) {
diff --git a/numpy/core/src/multiarray/array_coercion.h b/numpy/core/src/multiarray/array_coercion.h
index db0e479fe..f2482cecc 100644
--- a/numpy/core/src/multiarray/array_coercion.h
+++ b/numpy/core/src/multiarray/array_coercion.h
@@ -31,7 +31,7 @@ PyArray_DiscoverDTypeAndShape(
         npy_intp out_shape[NPY_MAXDIMS],
         coercion_cache_obj **coercion_cache,
         PyArray_DTypeMeta *fixed_DType, PyArray_Descr *requested_descr,
-        PyArray_Descr **out_descr);
+        PyArray_Descr **out_descr, int never_copy);
 
 NPY_NO_EXPORT int
 PyArray_ExtractDTypeAndDescriptor(PyObject *dtype,
diff --git a/numpy/core/src/multiarray/array_method.c b/numpy/core/src/multiarray/array_method.c
index 406b0c6ff..d93dac506 100644
--- a/numpy/core/src/multiarray/array_method.c
+++ b/numpy/core/src/multiarray/array_method.c
@@ -780,6 +780,13 @@ _masked_stridedloop_data_free(NpyAuxData *auxdata)
  * This function wraps a regular unmasked strided-loop as a
  * masked strided-loop, only calling the function for elements
  * where the mask is True.
+ *
+ * TODO: Reductions also use this code to implement masked reductions.
+ *       Before consolidating them, reductions had a special case for
+ *       broadcasts: when the mask stride was 0 the code does not check all
+ *       elements as `npy_memchr` currently does.
+ *       It may be worthwhile to add such an optimization again if broadcasted
+ *       masks are common enough.
  */
 static int
 generic_masked_strided_loop(PyArrayMethod_Context *context,
diff --git a/numpy/core/src/multiarray/array_method.h b/numpy/core/src/multiarray/array_method.h
index b29c7c077..7b7372bd0 100644
--- a/numpy/core/src/multiarray/array_method.h
+++ b/numpy/core/src/multiarray/array_method.h
@@ -21,6 +21,17 @@ typedef enum {
     NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 2,
     /* Whether the method supports unaligned access (not runtime) */
     NPY_METH_SUPPORTS_UNALIGNED = 1 << 3,
+    /*
+     * Private flag for now for *logic* functions.  The logical functions
+     * `logical_or` and `logical_and` can always cast the inputs to booleans
+     * "safely" (because that is how the cast to bool is defined).
+     * @seberg: I am not sure this is the best way to handle this, so its
+     * private for now (also it is very limited anyway).
+     * There is one "exception". NA aware dtypes cannot cast to bool
+     * (hopefully), so the `??->?` loop should error even with this flag.
+     * But a second NA fallback loop will be necessary.
+     */
+    _NPY_METH_FORCE_CAST_INPUTS = 1 << 17,
 
     /* All flags which can change at runtime */
     NPY_METH_RUNTIME_FLAGS = (
diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c
index 9b9df08f2..1b197d0f2 100644
--- a/numpy/core/src/multiarray/arrayobject.c
+++ b/numpy/core/src/multiarray/arrayobject.c
@@ -263,7 +263,7 @@ PyArray_CopyObject(PyArrayObject *dest, PyObject *src_object)
      */
     ndim = PyArray_DiscoverDTypeAndShape(src_object,
             PyArray_NDIM(dest), dims, &cache,
-            NPY_DTYPE(PyArray_DESCR(dest)), PyArray_DESCR(dest), &dtype);
+            NPY_DTYPE(PyArray_DESCR(dest)), PyArray_DESCR(dest), &dtype, 0);
     if (ndim < 0) {
         return -1;
     }
@@ -493,7 +493,28 @@ array_dealloc(PyArrayObject *self)
         if (PyDataType_FLAGCHK(fa->descr, NPY_ITEM_REFCOUNT)) {
             PyArray_XDECREF(self);
         }
-        npy_free_cache(fa->data, PyArray_NBYTES(self));
+        /*
+         * Allocation will never be 0, see comment in ctors.c
+         * line 820
+         */
+        size_t nbytes = PyArray_NBYTES(self);
+        if (nbytes == 0) {
+            nbytes = fa->descr->elsize ? fa->descr->elsize : 1;
+        }
+        if (fa->mem_handler == NULL) {
+            char *env = getenv("NUMPY_WARN_IF_NO_MEM_POLICY");
+            if ((env != NULL) && (strncmp(env, "1", 1) == 0)) {
+                char const * msg = "Trying to dealloc data, but a memory policy "
+                    "is not set. If you take ownership of the data, you must "
+                    "set a base owning the data (e.g. a PyCapsule).";
+                WARN_IN_DEALLOC(PyExc_RuntimeWarning, msg);
+            }
+            // Guess at malloc/free ???
+            free(fa->data);
+        } else {
+            PyDataMem_UserFREE(fa->data, nbytes, fa->mem_handler);
+            Py_DECREF(fa->mem_handler);
+        }
     }
 
     /* must match allocation in PyArray_NewFromDescr */
@@ -1705,22 +1726,6 @@ array_iter(PyArrayObject *arr)
     return PySeqIter_New((PyObject *)arr);
 }
 
-static PyObject *
-array_alloc(PyTypeObject *type, Py_ssize_t NPY_UNUSED(nitems))
-{
-    /* nitems will always be 0 */
-    PyObject *obj = PyObject_Malloc(type->tp_basicsize);
-    PyObject_Init(obj, type);
-    return obj;
-}
-
-static void
-array_free(PyObject * v)
-{
-    /* avoid same deallocator as PyBaseObject, see gentype_free */
-    PyObject_Free(v);
-}
-
 
 NPY_NO_EXPORT PyTypeObject PyArray_Type = {
     PyVarObject_HEAD_INIT(NULL, 0)
@@ -1741,7 +1746,5 @@ NPY_NO_EXPORT PyTypeObject PyArray_Type = {
     .tp_iter = (getiterfunc)array_iter,
     .tp_methods = array_methods,
     .tp_getset = array_getsetlist,
-    .tp_alloc = (allocfunc)array_alloc,
     .tp_new = (newfunc)array_new,
-    .tp_free = (freefunc)array_free,
 };
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 15782a91b..71808cc48 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -2759,10 +2759,10 @@ VOID_nonzero (char *ip, PyArrayObject *ap)
             dummy_fields.descr = new;
             if ((new->alignment > 1) && !__ALIGNED(ip + offset,
                         new->alignment)) {
-                PyArray_CLEARFLAGS(ap, NPY_ARRAY_ALIGNED);
+                PyArray_CLEARFLAGS(dummy_arr, NPY_ARRAY_ALIGNED);
             }
             else {
-                PyArray_ENABLEFLAGS(ap, NPY_ARRAY_ALIGNED);
+                PyArray_ENABLEFLAGS(dummy_arr, NPY_ARRAY_ALIGNED);
             }
             if (new->f->nonzero(ip+offset, dummy_arr)) {
                 nonz = NPY_TRUE;
@@ -3093,6 +3093,10 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap)
     if (!PyArray_HASFIELDS(ap)) {
         return STRING_compare(ip1, ip2, ap);
     }
+    PyObject *mem_handler = PyDataMem_GetHandler();
+    if (mem_handler == NULL) {
+        goto finish;
+    }
     descr = PyArray_DESCR(ap);
     /*
      * Compare on the first-field.  If equal, then
@@ -3107,15 +3111,19 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap)
         if (_unpack_field(tup, &new, &offset) < 0) {
             goto finish;
         }
-        /* descr is the only field checked by compare or copyswap */
+        /* Set the fields needed by compare or copyswap */
         dummy_struct.descr = new;
+
         swap = PyArray_ISBYTESWAPPED(dummy);
         nip1 = ip1 + offset;
         nip2 = ip2 + offset;
         if (swap || new->alignment > 1) {
             if (swap || !npy_is_aligned(nip1, new->alignment)) {
-                /* create buffer and copy */
-                nip1 = npy_alloc_cache(new->elsize);
+                /*
+                 * create temporary buffer and copy,
+                 * always use the current handler for internal allocations
+                 */
+                nip1 = PyDataMem_UserNEW(new->elsize, mem_handler);
                 if (nip1 == NULL) {
                     goto finish;
                 }
@@ -3124,11 +3132,15 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap)
                     new->f->copyswap(nip1, NULL, swap, dummy);
             }
             if (swap || !npy_is_aligned(nip2, new->alignment)) {
-                /* create buffer and copy */
-                nip2 = npy_alloc_cache(new->elsize);
+                /*
+                 * create temporary buffer and copy,
+                 * always use the current handler for internal allocations
+                 */
+                nip2 = PyDataMem_UserNEW(new->elsize, mem_handler);
                 if (nip2 == NULL) {
                     if (nip1 != ip1 + offset) {
-                        npy_free_cache(nip1, new->elsize);
+                        /* destroy temporary buffer */
+                        PyDataMem_UserFREE(nip1, new->elsize, mem_handler);
                     }
                     goto finish;
                 }
@@ -3140,10 +3152,12 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap)
         res = new->f->compare(nip1, nip2, dummy);
         if (swap || new->alignment > 1) {
             if (nip1 != ip1 + offset) {
-                npy_free_cache(nip1, new->elsize);
+                /* destroy temporary buffer */
+                PyDataMem_UserFREE(nip1, new->elsize, mem_handler);
             }
             if (nip2 != ip2 + offset) {
-                npy_free_cache(nip2, new->elsize);
+                /* destroy temporary buffer */
+                PyDataMem_UserFREE(nip2, new->elsize, mem_handler);
             }
         }
         if (res != 0) {
@@ -3152,6 +3166,7 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap)
     }
 
 finish:
+    Py_XDECREF(mem_handler);
     return res;
 }
 
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index 82d34193d..aa95d285a 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -119,7 +119,7 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims, PyArray_Descr **out_dtype)
     int ndim;
 
     ndim = PyArray_DiscoverDTypeAndShape(
-            obj, maxdims, shape, &cache, NULL, NULL, out_dtype);
+            obj, maxdims, shape, &cache, NULL, NULL, out_dtype, 0);
     if (ndim < 0) {
         return -1;
     }
diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c
index 9910fffe6..5853e068b 100644
--- a/numpy/core/src/multiarray/compiled_base.c
+++ b/numpy/core/src/multiarray/compiled_base.c
@@ -1393,7 +1393,7 @@ arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args)
 {
     PyObject *obj;
     PyObject *str;
-    #if PY_VERSION_HEX >= 0x030700A2 && (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM > 0x07030300)
+    #if !defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM > 0x07030300
     const char *docstr;
     #else
     char *docstr;
diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c
index 6de764fb1..ef101a78b 100644
--- a/numpy/core/src/multiarray/conversion_utils.c
+++ b/numpy/core/src/multiarray/conversion_utils.c
@@ -163,6 +163,41 @@ PyArray_OptionalIntpConverter(PyObject *obj, PyArray_Dims *seq)
     return PyArray_IntpConverter(obj, seq);
 }
 
+NPY_NO_EXPORT int
+PyArray_CopyConverter(PyObject *obj, _PyArray_CopyMode *copymode) {
+    if (obj == Py_None) {
+        PyErr_SetString(PyExc_ValueError,
+                        "NoneType copy mode not allowed.");
+        return NPY_FAIL;
+    }
+
+    int int_copymode;
+    PyObject* numpy_CopyMode = NULL;
+    npy_cache_import("numpy", "_CopyMode", &numpy_CopyMode);
+
+    if (numpy_CopyMode != NULL && (PyObject *)Py_TYPE(obj) == numpy_CopyMode) {
+        PyObject* mode_value = PyObject_GetAttrString(obj, "value");
+        if (mode_value == NULL) {
+            return NPY_FAIL;
+        }
+
+        int_copymode = (int)PyLong_AsLong(mode_value);
+        if (error_converting(int_copymode)) {
+            return NPY_FAIL;
+        }
+    }
+    else {
+        npy_bool bool_copymode;
+        if (!PyArray_BoolConverter(obj, &bool_copymode)) {
+            return NPY_FAIL;
+        }
+        int_copymode = (int)bool_copymode;
+    }
+
+    *copymode = (_PyArray_CopyMode)int_copymode;
+    return NPY_SUCCEED;
+}
+
 /*NUMPY_API
  * Get buffer chunk from object
  *
diff --git a/numpy/core/src/multiarray/conversion_utils.h b/numpy/core/src/multiarray/conversion_utils.h
index 89cf2ef27..4072841ee 100644
--- a/numpy/core/src/multiarray/conversion_utils.h
+++ b/numpy/core/src/multiarray/conversion_utils.h
@@ -9,6 +9,15 @@ PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq);
 NPY_NO_EXPORT int
 PyArray_OptionalIntpConverter(PyObject *obj, PyArray_Dims *seq);
 
+typedef enum {
+    NPY_COPY_IF_NEEDED = 0,
+    NPY_COPY_ALWAYS = 1,
+    NPY_COPY_NEVER = 2,
+} _PyArray_CopyMode;
+
+NPY_NO_EXPORT int
+PyArray_CopyConverter(PyObject *obj, _PyArray_CopyMode *copyflag);
+
 NPY_NO_EXPORT int
 PyArray_BufferConverter(PyObject *obj, PyArray_Chunk *buf);
 
diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c
index eeadad374..3135d6989 100644
--- a/numpy/core/src/multiarray/convert_datatype.c
+++ b/numpy/core/src/multiarray/convert_datatype.c
@@ -2119,7 +2119,7 @@ PyArray_ObjectType(PyObject *op, int minimum_type)
  * This function is only used in one place within NumPy and should
  * generally be avoided. It is provided mainly for backward compatibility.
  *
- * The user of the function has to free the returned array.
+ * The user of the function has to free the returned array with PyDataMem_FREE.
  */
 NPY_NO_EXPORT PyArrayObject **
 PyArray_ConvertToCommonType(PyObject *op, int *retn)
diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c
index 9da75fb8a..b62426854 100644
--- a/numpy/core/src/multiarray/ctors.c
+++ b/numpy/core/src/multiarray/ctors.c
@@ -726,6 +726,7 @@ PyArray_NewFromDescr_int(
     fa->nd = nd;
     fa->dimensions = NULL;
     fa->data = NULL;
+    fa->mem_handler = NULL;
 
     if (data == NULL) {
         fa->flags = NPY_ARRAY_DEFAULT;
@@ -805,12 +806,19 @@ PyArray_NewFromDescr_int(
         fa->flags |= NPY_ARRAY_C_CONTIGUOUS|NPY_ARRAY_F_CONTIGUOUS;
     }
 
+
     if (data == NULL) {
+        /* Store the handler in case the default is modified */
+        fa->mem_handler = PyDataMem_GetHandler();
+        if (fa->mem_handler == NULL) {
+            goto fail;
+        }
         /*
          * Allocate something even for zero-space arrays
          * e.g. shape=(0,) -- otherwise buffer exposure
          * (a.data) doesn't work as it should.
          * Could probably just allocate a few bytes here. -- Chuck
+         * Note: always sync this with calls to PyDataMem_UserFREE
          */
         if (nbytes == 0) {
             nbytes = descr->elsize ? descr->elsize : 1;
@@ -820,21 +828,23 @@ PyArray_NewFromDescr_int(
          * which could also be sub-fields of a VOID array
          */
         if (zeroed || PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT)) {
-            data = npy_alloc_cache_zero(nbytes);
+            data = PyDataMem_UserNEW_ZEROED(nbytes, 1, fa->mem_handler);
         }
         else {
-            data = npy_alloc_cache(nbytes);
+            data = PyDataMem_UserNEW(nbytes, fa->mem_handler);
         }
         if (data == NULL) {
             raise_memory_error(fa->nd, fa->dimensions, descr);
             goto fail;
         }
+
         fa->flags |= NPY_ARRAY_OWNDATA;
     }
     else {
+        /* The handlers should never be called in this case */
+        fa->mem_handler = NULL;
         /*
-         * If data is passed in, this object won't own it by default.
-         * Caller must arrange for this to be reset if truly desired
+         * If data is passed in, this object won't own it.
          */
         fa->flags &= ~NPY_ARRAY_OWNDATA;
     }
@@ -902,6 +912,7 @@ PyArray_NewFromDescr_int(
     return (PyObject *)fa;
 
  fail:
+    Py_XDECREF(fa->mem_handler);
     Py_DECREF(fa);
     return NULL;
 }
@@ -1273,6 +1284,7 @@ fail:
  *                       DType may be used, but is not enforced.
  * @param writeable whether the result must be writeable.
  * @param context Unused parameter, must be NULL (should be removed later).
+ * @param never_copy Specifies that a copy is not allowed.
  *
  * @returns The array object, Py_NotImplemented if op is not array-like,
  *          or NULL with an error set. (A new reference to Py_NotImplemented
@@ -1280,7 +1292,8 @@ fail:
  */
 NPY_NO_EXPORT PyObject *
 _array_from_array_like(PyObject *op,
-        PyArray_Descr *requested_dtype, npy_bool writeable, PyObject *context) {
+        PyArray_Descr *requested_dtype, npy_bool writeable, PyObject *context,
+        int never_copy) {
     PyObject* tmp;
 
     /*
@@ -1336,7 +1349,7 @@ _array_from_array_like(PyObject *op,
      *      this should be changed!
      */
     if (!writeable && tmp == Py_NotImplemented) {
-        tmp = PyArray_FromArrayAttr(op, requested_dtype, context);
+        tmp = PyArray_FromArrayAttr_int(op, requested_dtype, never_copy);
         if (tmp == NULL) {
             return NULL;
         }
@@ -1436,7 +1449,7 @@ setArrayFromSequence(PyArrayObject *a, PyObject *s,
     }
 
     /* Try __array__ before using s as a sequence */
-    PyObject *tmp = _array_from_array_like(s, NULL, 0, NULL);
+    PyObject *tmp = _array_from_array_like(s, NULL, 0, NULL, 0);
     if (tmp == NULL) {
         goto fail;
     }
@@ -1564,7 +1577,8 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
     Py_XDECREF(newtype);
 
     ndim = PyArray_DiscoverDTypeAndShape(op,
-            NPY_MAXDIMS, dims, &cache, fixed_DType, fixed_descriptor, &dtype);
+            NPY_MAXDIMS, dims, &cache, fixed_DType, fixed_descriptor, &dtype,
+            flags & NPY_ARRAY_ENSURENOCOPY);
 
     Py_XDECREF(fixed_descriptor);
     Py_XDECREF(fixed_DType);
@@ -1689,7 +1703,17 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
                 ((PyVoidScalarObject *)op)->flags,
                 NULL, op);
     }
-    else if (cache == 0 && newtype != NULL &&
+    /*
+     * If we got this far, we definitely have to create a copy, since we are
+     * converting either from a scalar (cache == NULL) or a (nested) sequence.
+     */
+    if (flags & NPY_ARRAY_ENSURENOCOPY ) {
+        PyErr_SetString(PyExc_ValueError,
+                "Unable to avoid copy while creating an array.");
+        return NULL;
+    }
+
+    if (cache == 0 && newtype != NULL &&
             PyDataType_ISSIGNED(newtype) && PyArray_IsScalar(op, Generic)) {
         assert(ndim == 0);
         /*
@@ -1790,7 +1814,8 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
  * NPY_ARRAY_WRITEBACKIFCOPY,
  * NPY_ARRAY_FORCECAST,
  * NPY_ARRAY_ENSUREARRAY,
- * NPY_ARRAY_ELEMENTSTRIDES
+ * NPY_ARRAY_ELEMENTSTRIDES,
+ * NPY_ARRAY_ENSURENOCOPY
  *
  * or'd (|) together
  *
@@ -1851,9 +1876,15 @@ PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int min_depth,
     if (obj == NULL) {
         return NULL;
     }
-    if ((requires & NPY_ARRAY_ELEMENTSTRIDES) &&
-        !PyArray_ElementStrides(obj)) {
+
+    if ((requires & NPY_ARRAY_ELEMENTSTRIDES)
+            && !PyArray_ElementStrides(obj)) {
         PyObject *ret;
+        if (requires & NPY_ARRAY_ENSURENOCOPY) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Unable to avoid copy while creating a new array.");
+            return NULL;
+        }
         ret = PyArray_NewCopy((PyArrayObject *)obj, NPY_ANYORDER);
         Py_DECREF(obj);
         obj = ret;
@@ -1928,6 +1959,12 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags)
            !PyArray_EquivTypes(oldtype, newtype);
 
     if (copy) {
+        if (flags & NPY_ARRAY_ENSURENOCOPY ) {
+            PyErr_SetString(PyExc_ValueError,
+                    "Unable to avoid copy while creating an array from given array.");
+            return NULL;
+        }
+
         NPY_ORDER order = NPY_KEEPORDER;
         int subok = 1;
 
@@ -2000,7 +2037,6 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags)
             if (flags & NPY_ARRAY_ENSUREARRAY) {
                 subtype = &PyArray_Type;
             }
-
             ret = (PyArrayObject *)PyArray_View(arr, NULL, subtype);
             if (ret == NULL) {
                 return NULL;
@@ -2425,18 +2461,30 @@ PyArray_FromInterface(PyObject *origin)
     return NULL;
 }
 
-/*NUMPY_API
+
+/**
+ * Check for an __array__ attribute and call it when it exists.
+ *
+ *  .. warning:
+ *      If returned, `NotImplemented` is borrowed and must not be Decref'd
+ *
+ * @param op The Python object to convert to an array.
+ * @param descr The desired `arr.dtype`, passed into the `__array__` call,
+ *        as information but is not checked/enforced!
+ * @param never_copy Specifies that a copy is not allowed.
+ *        NOTE: Currently, this means an error is raised instead of calling
+ *        `op.__array__()`.  In the future we could call for example call
+ *        `op.__array__(never_copy=True)` instead.
+ * @returns NotImplemented if `__array__` is not defined or a NumPy array
+ *          (or subclass).  On error, return NULL.
  */
 NPY_NO_EXPORT PyObject *
-PyArray_FromArrayAttr(PyObject *op, PyArray_Descr *typecode, PyObject *context)
+PyArray_FromArrayAttr_int(
+        PyObject *op, PyArray_Descr *descr, int never_copy)
 {
     PyObject *new;
     PyObject *array_meth;
 
-    if (context != NULL) {
-        PyErr_SetString(PyExc_RuntimeError, "'context' must be NULL");
-        return NULL;
-    }
     array_meth = PyArray_LookupSpecial_OnInstance(op, "__array__");
     if (array_meth == NULL) {
         if (PyErr_Occurred()) {
@@ -2452,6 +2500,16 @@ PyArray_FromArrayAttr(PyObject *op, PyArray_Descr *typecode, PyObject *context)
         }
         return Py_NotImplemented;
     }
+    if (never_copy) {
+        /* Currently, we must always assume that `__array__` returns a copy */
+        PyErr_SetString(PyExc_ValueError,
+                "Unable to avoid copy while converting from an object "
+                "implementing the `__array__` protocol.  NumPy cannot ensure "
+                "that no copy will be made.");
+        Py_DECREF(array_meth);
+        return NULL;
+    }
+
     if (PyType_Check(op) && PyObject_HasAttrString(array_meth, "__get__")) {
         /*
          * If the input is a class `array_meth` may be a property-like object.
@@ -2462,11 +2520,11 @@ PyArray_FromArrayAttr(PyObject *op, PyArray_Descr *typecode, PyObject *context)
         Py_DECREF(array_meth);
         return Py_NotImplemented;
     }
-    if (typecode == NULL) {
+    if (descr == NULL) {
         new = PyObject_CallFunction(array_meth, NULL);
     }
     else {
-        new = PyObject_CallFunction(array_meth, "O", typecode);
+        new = PyObject_CallFunction(array_meth, "O", descr);
     }
     Py_DECREF(array_meth);
     if (new == NULL) {
@@ -2482,6 +2540,21 @@ PyArray_FromArrayAttr(PyObject *op, PyArray_Descr *typecode, PyObject *context)
     return new;
 }
 
+
+/*NUMPY_API
+ */
+NPY_NO_EXPORT PyObject *
+PyArray_FromArrayAttr(PyObject *op, PyArray_Descr *typecode, PyObject *context)
+{
+    if (context != NULL) {
+        PyErr_SetString(PyExc_RuntimeError, "'context' must be NULL");
+        return NULL;
+    }
+
+    return PyArray_FromArrayAttr_int(op, typecode, 0);
+}
+
+
 /*NUMPY_API
 * new reference -- accepts NULL for mintype
 */
@@ -3409,7 +3482,9 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char const *sep, size_t *nre
         dptr += dtype->elsize;
         if (num < 0 && thisbuf == size) {
             totalbytes += bytes;
-            tmp = PyDataMem_RENEW(PyArray_DATA(r), totalbytes);
+            /* The handler is always valid */
+            tmp = PyDataMem_UserRENEW(PyArray_DATA(r), totalbytes,
+                                  PyArray_HANDLER(r));
             if (tmp == NULL) {
                 err = 1;
                 break;
@@ -3431,7 +3506,9 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char const *sep, size_t *nre
         const size_t nsize = PyArray_MAX(*nread,1)*dtype->elsize;
 
         if (nsize != 0) {
-            tmp = PyDataMem_RENEW(PyArray_DATA(r), nsize);
+            /* The handler is always valid */
+            tmp = PyDataMem_UserRENEW(PyArray_DATA(r), nsize,
+                                  PyArray_HANDLER(r));
             if (tmp == NULL) {
                 err = 1;
             }
@@ -3536,7 +3613,9 @@ PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, npy_intp num, char *sep)
         const size_t nsize = PyArray_MAX(nread,1) * dtype->elsize;
         char *tmp;
 
-        if ((tmp = PyDataMem_RENEW(PyArray_DATA(ret), nsize)) == NULL) {
+        /* The handler is always valid */
+        if((tmp = PyDataMem_UserRENEW(PyArray_DATA(ret), nsize,
+                                     PyArray_HANDLER(ret))) == NULL) {
             Py_DECREF(dtype);
             Py_DECREF(ret);
             return PyErr_NoMemory();
@@ -3820,7 +3899,9 @@ PyArray_FromIter(PyObject *obj, PyArray_Descr *dtype, npy_intp count)
             */
             elcount = (i >> 1) + (i < 4 ? 4 : 2) + i;
             if (!npy_mul_with_overflow_intp(&nbytes, elcount, elsize)) {
-                new_data = PyDataMem_RENEW(PyArray_DATA(ret), nbytes);
+                /* The handler is always valid */
+                new_data = PyDataMem_UserRENEW(PyArray_DATA(ret), nbytes,
+                                  PyArray_HANDLER(ret));
             }
             else {
                 new_data = NULL;
@@ -3858,10 +3939,12 @@ PyArray_FromIter(PyObject *obj, PyArray_Descr *dtype, npy_intp count)
      * (assuming realloc is reasonably good about reusing space...)
      */
     if (i == 0 || elsize == 0) {
-        /* The size cannot be zero for PyDataMem_RENEW. */
+        /* The size cannot be zero for realloc. */
         goto done;
     }
-    new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * elsize);
+    /* The handler is always valid */
+    new_data = PyDataMem_UserRENEW(PyArray_DATA(ret), i * elsize,
+                                   PyArray_HANDLER(ret));
     if (new_data == NULL) {
         PyErr_SetString(PyExc_MemoryError,
                 "cannot allocate array memory");
diff --git a/numpy/core/src/multiarray/ctors.h b/numpy/core/src/multiarray/ctors.h
index e59e86e8b..98160b1cc 100644
--- a/numpy/core/src/multiarray/ctors.h
+++ b/numpy/core/src/multiarray/ctors.h
@@ -32,7 +32,8 @@ PyArray_New(
 
 NPY_NO_EXPORT PyObject *
 _array_from_array_like(PyObject *op,
-        PyArray_Descr *requested_dtype, npy_bool writeable, PyObject *context);
+        PyArray_Descr *requested_dtype, npy_bool writeable, PyObject *context,
+        int never_copy);
 
 NPY_NO_EXPORT PyObject *
 PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth,
@@ -52,6 +53,10 @@ NPY_NO_EXPORT PyObject *
 PyArray_FromInterface(PyObject *input);
 
 NPY_NO_EXPORT PyObject *
+PyArray_FromArrayAttr_int(
+        PyObject *op, PyArray_Descr *descr, int never_copy);
+
+NPY_NO_EXPORT PyObject *
 PyArray_FromArrayAttr(PyObject *op, PyArray_Descr *typecode,
                       PyObject *context);
 
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index 6a09f92ac..0c539053c 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -1326,7 +1326,7 @@ _convert_from_dict(PyObject *obj, int align)
             goto fail;
         }
         /* If align is set, make sure the alignment divides into the size */
-        if (align && itemsize % new->alignment != 0) {
+        if (align && new->alignment > 0 && itemsize % new->alignment != 0) {
             PyErr_Format(PyExc_ValueError,
                     "NumPy dtype descriptor requires alignment of %d bytes, "
                     "which is not divisible into the specified itemsize %d",
@@ -2305,8 +2305,9 @@ arraydescr_new(PyTypeObject *subtype,
 {
     if (subtype != &PyArrayDescr_Type) {
         if (Py_TYPE(subtype) == &PyArrayDTypeMeta_Type &&
-                !(PyType_GetFlags(Py_TYPE(subtype)) & Py_TPFLAGS_HEAPTYPE) &&
-                (NPY_DT_SLOTS((PyArray_DTypeMeta *)subtype)) != NULL) {
+                (NPY_DT_SLOTS((PyArray_DTypeMeta *)subtype)) != NULL &&
+                !NPY_DT_is_legacy((PyArray_DTypeMeta *)subtype) &&
+                subtype->tp_new != PyArrayDescr_Type.tp_new) {
             /*
              * Appears to be a properly initialized user DType. Allocate
              * it and initialize the main part as best we can.
@@ -2333,7 +2334,9 @@ arraydescr_new(PyTypeObject *subtype,
         }
         /* The DTypeMeta class should prevent this from happening. */
         PyErr_Format(PyExc_SystemError,
-                "'%S' must not inherit np.dtype.__new__().", subtype);
+                "'%S' must not inherit np.dtype.__new__(). User DTypes should "
+                "currently call `PyArrayDescr_Type.tp_new` from their new.",
+                subtype);
         return NULL;
     }
 
diff --git a/numpy/core/src/multiarray/dlpack.c b/numpy/core/src/multiarray/dlpack.c
new file mode 100644
index 000000000..291e60a22
--- /dev/null
+++ b/numpy/core/src/multiarray/dlpack.c
@@ -0,0 +1,408 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <dlpack/dlpack.h>
+
+#include "numpy/arrayobject.h"
+#include "common/npy_argparse.h"
+
+#include "common/dlpack/dlpack.h"
+#include "common/npy_dlpack.h"
+
+static void
+array_dlpack_deleter(DLManagedTensor *self)
+{
+    PyArrayObject *array = (PyArrayObject *)self->manager_ctx;
+    // This will also free the strides as it's one allocation.
+    PyMem_Free(self->dl_tensor.shape);
+    PyMem_Free(self);
+    Py_XDECREF(array);
+}
+
+/* This is exactly as mandated by dlpack */
+static void dlpack_capsule_deleter(PyObject *self) {
+    if (PyCapsule_IsValid(self, NPY_DLPACK_USED_CAPSULE_NAME)) {
+        return;
+    }
+
+    /* an exception may be in-flight, we must save it in case we create another one */
+    PyObject *type, *value, *traceback;
+    PyErr_Fetch(&type, &value, &traceback);
+
+    DLManagedTensor *managed =
+        (DLManagedTensor *)PyCapsule_GetPointer(self, NPY_DLPACK_CAPSULE_NAME);
+    if (managed == NULL) {
+        PyErr_WriteUnraisable(self);
+        goto done;
+    }
+    /*
+     *  the spec says the deleter can be NULL if there is no way for the caller
+     * to provide a reasonable destructor.
+     */
+    if (managed->deleter) {
+        managed->deleter(managed);
+        /* TODO: is the deleter allowed to set a python exception? */
+        assert(!PyErr_Occurred());
+    }
+
+done:
+    PyErr_Restore(type, value, traceback);
+}
+
+/* used internally, almost identical to dlpack_capsule_deleter() */
+static void array_dlpack_internal_capsule_deleter(PyObject *self)
+{
+    /* an exception may be in-flight, we must save it in case we create another one */
+    PyObject *type, *value, *traceback;
+    PyErr_Fetch(&type, &value, &traceback);
+
+    DLManagedTensor *managed =
+        (DLManagedTensor *)PyCapsule_GetPointer(self, NPY_DLPACK_INTERNAL_CAPSULE_NAME);
+    if (managed == NULL) {
+        PyErr_WriteUnraisable(self);
+        goto done;
+    }
+    /*
+     *  the spec says the deleter can be NULL if there is no way for the caller
+     * to provide a reasonable destructor.
+     */
+    if (managed->deleter) {
+        managed->deleter(managed);
+        /* TODO: is the deleter allowed to set a python exception? */
+        assert(!PyErr_Occurred());
+    }
+
+done:
+    PyErr_Restore(type, value, traceback);
+}
+
+
+// This function cannot return NULL, but it can fail,
+// So call PyErr_Occurred to check if it failed after
+// calling it.
+static DLDevice
+array_get_dl_device(PyArrayObject *self) {
+    DLDevice ret;
+    ret.device_type = kDLCPU;
+    ret.device_id = 0;
+    PyObject *base = PyArray_BASE(self);
+    // The outer if is due to the fact that NumPy arrays are on the CPU
+    // by default (if not created from DLPack).
+    if (PyCapsule_IsValid(base, NPY_DLPACK_INTERNAL_CAPSULE_NAME)) {
+        DLManagedTensor *managed = PyCapsule_GetPointer(
+                base, NPY_DLPACK_INTERNAL_CAPSULE_NAME);
+        if (managed == NULL) {
+            return ret;
+        }
+        return managed->dl_tensor.device;
+    }
+    return ret;
+}
+
+
+PyObject *
+array_dlpack(PyArrayObject *self,
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+    PyObject *stream = Py_None;
+    NPY_PREPARE_ARGPARSER;
+    if (npy_parse_arguments("__dlpack__", args, len_args, kwnames,
+            "$stream", NULL, &stream, NULL, NULL, NULL)) {
+        return NULL;
+    }
+
+    if (stream != Py_None) {
+        PyErr_SetString(PyExc_RuntimeError, "NumPy only supports "
+                "stream=None.");
+        return NULL;
+    }
+
+    if ( !(PyArray_FLAGS(self) & NPY_ARRAY_WRITEABLE)) {
+        PyErr_SetString(PyExc_TypeError, "NumPy currently only supports "
+                "dlpack for writeable arrays");
+        return NULL;
+    }
+
+    npy_intp itemsize = PyArray_ITEMSIZE(self);
+    int ndim = PyArray_NDIM(self);
+    npy_intp *strides = PyArray_STRIDES(self);
+    npy_intp *shape = PyArray_SHAPE(self);
+
+    if (!PyArray_IS_C_CONTIGUOUS(self) && PyArray_SIZE(self) != 1) {
+        for (int i = 0; i < ndim; ++i) {
+            if (strides[i] % itemsize != 0) {
+                PyErr_SetString(PyExc_RuntimeError,
+                        "DLPack only supports strides which are a multiple of "
+                        "itemsize.");
+                return NULL;
+            }
+        }
+    }
+
+    DLDataType managed_dtype;
+    PyArray_Descr *dtype = PyArray_DESCR(self);
+
+    if (PyDataType_ISBYTESWAPPED(dtype)) {
+        PyErr_SetString(PyExc_TypeError, "DLPack only supports native "
+                    "byte swapping.");
+            return NULL;
+    }
+
+    managed_dtype.bits = 8 * itemsize;
+    managed_dtype.lanes = 1;
+
+    if (PyDataType_ISSIGNED(dtype)) {
+        managed_dtype.code = kDLInt;
+    }
+    else if (PyDataType_ISUNSIGNED(dtype)) {
+        managed_dtype.code = kDLUInt;
+    }
+    else if (PyDataType_ISFLOAT(dtype)) {
+        // We can't be sure that the dtype is
+        // IEEE or padded.
+        if (itemsize > 8) {
+            PyErr_SetString(PyExc_TypeError, "DLPack only supports IEEE "
+                    "floating point types without padding.");
+            return NULL;
+        }
+        managed_dtype.code = kDLFloat;
+    }
+    else if (PyDataType_ISCOMPLEX(dtype)) {
+        // We can't be sure that the dtype is
+        // IEEE or padded.
+        if (itemsize > 16) {
+            PyErr_SetString(PyExc_TypeError, "DLPack only supports IEEE "
+                    "complex point types without padding.");
+            return NULL;
+        }
+        managed_dtype.code = kDLComplex;
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError,
+                        "DLPack only supports signed/unsigned integers, float "
+                        "and complex dtypes.");
+        return NULL;
+    }
+
+    DLDevice device = array_get_dl_device(self);
+    if (PyErr_Occurred()) {
+        return NULL;
+    }
+
+    DLManagedTensor *managed = PyMem_Malloc(sizeof(DLManagedTensor));
+    if (managed == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    /*
+     * Note: the `dlpack.h` header suggests/standardizes that `data` must be
+     * 256-byte aligned.  We ignore this intentionally, because `__dlpack__`
+     * standardizes that `byte_offset` must be 0 (for now) to not break pytorch:
+     * https://github.com/data-apis/array-api/issues/293#issuecomment-964111413
+     *
+     * We further assume that exporting fully unaligned data is OK even without
+     * `byte_offset` since the standard does not reject it.
+     * Presumably, pytorch will support importing `byte_offset != 0` and NumPy
+     * can choose to use it starting about 2023.  At that point, it may be
+     * that NumPy MUST use `byte_offset` to adhere to the standard (as
+     * specified in the header)!
+     */
+    managed->dl_tensor.data = PyArray_DATA(self);
+    managed->dl_tensor.byte_offset = 0;
+    managed->dl_tensor.device = device;
+    managed->dl_tensor.dtype = managed_dtype;
+
+    int64_t *managed_shape_strides = PyMem_Malloc(sizeof(int64_t) * ndim * 2);
+    if (managed_shape_strides == NULL) {
+        PyErr_NoMemory();
+        PyMem_Free(managed);
+        return NULL;
+    }
+
+    int64_t *managed_shape = managed_shape_strides;
+    int64_t *managed_strides = managed_shape_strides + ndim;
+    for (int i = 0; i < ndim; ++i) {
+        managed_shape[i] = shape[i];
+        // Strides in DLPack are items; in NumPy are bytes.
+        managed_strides[i] = strides[i] / itemsize;
+    }
+
+    managed->dl_tensor.ndim = ndim;
+    managed->dl_tensor.shape = managed_shape;
+    managed->dl_tensor.strides = NULL;
+    if (PyArray_SIZE(self) != 1 && !PyArray_IS_C_CONTIGUOUS(self)) {
+        managed->dl_tensor.strides = managed_strides;
+    }
+    managed->dl_tensor.byte_offset = 0;
+    managed->manager_ctx = self;
+    managed->deleter = array_dlpack_deleter;
+
+    PyObject *capsule = PyCapsule_New(managed, NPY_DLPACK_CAPSULE_NAME,
+            dlpack_capsule_deleter);
+    if (capsule == NULL) {
+        PyMem_Free(managed);
+        PyMem_Free(managed_shape_strides);
+        return NULL;
+    }
+
+    // the capsule holds a reference
+    Py_INCREF(self);
+    return capsule;
+}
+
+PyObject *
+array_dlpack_device(PyArrayObject *self, PyObject *NPY_UNUSED(args))
+{
+    DLDevice device = array_get_dl_device(self);
+    if (PyErr_Occurred()) {
+        return NULL;
+    }
+    return Py_BuildValue("ii", device.device_type, device.device_id);
+}
+
+NPY_NO_EXPORT PyObject *
+_from_dlpack(PyObject *NPY_UNUSED(self), PyObject *obj) {
+    PyObject *capsule = PyObject_CallMethod((PyObject *)obj->ob_type,
+            "__dlpack__", "O", obj);
+    if (capsule == NULL) {
+        return NULL;
+    }
+
+    DLManagedTensor *managed =
+        (DLManagedTensor *)PyCapsule_GetPointer(capsule,
+        NPY_DLPACK_CAPSULE_NAME);
+
+    if (managed == NULL) {
+        Py_DECREF(capsule);
+        return NULL;
+    }
+
+    const int ndim = managed->dl_tensor.ndim;
+    if (ndim > NPY_MAXDIMS) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "maxdims of DLPack tensor is higher than the supported "
+                "maxdims.");
+        Py_DECREF(capsule);
+        return NULL;
+    }
+
+    DLDeviceType device_type = managed->dl_tensor.device.device_type;
+    if (device_type != kDLCPU &&
+            device_type != kDLCUDAHost &&
+            device_type != kDLROCMHost &&
+            device_type != kDLCUDAManaged) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "Unsupported device in DLTensor.");
+        Py_DECREF(capsule);
+        return NULL;
+    }
+
+    if (managed->dl_tensor.dtype.lanes != 1) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "Unsupported lanes in DLTensor dtype.");
+        Py_DECREF(capsule);
+        return NULL;
+    }
+
+    int typenum = -1;
+    const uint8_t bits = managed->dl_tensor.dtype.bits;
+    const npy_intp itemsize = bits / 8;
+    switch (managed->dl_tensor.dtype.code) {
+    case kDLInt:
+        switch (bits)
+        {
+            case 8: typenum = NPY_INT8; break;
+            case 16: typenum = NPY_INT16; break;
+            case 32: typenum = NPY_INT32; break;
+            case 64: typenum = NPY_INT64; break;
+        }
+        break;
+    case kDLUInt:
+        switch (bits)
+        {
+            case 8: typenum = NPY_UINT8; break;
+            case 16: typenum = NPY_UINT16; break;
+            case 32: typenum = NPY_UINT32; break;
+            case 64: typenum = NPY_UINT64; break;
+        }
+        break;
+    case kDLFloat:
+        switch (bits)
+        {
+            case 16: typenum = NPY_FLOAT16; break;
+            case 32: typenum = NPY_FLOAT32; break;
+            case 64: typenum = NPY_FLOAT64; break;
+        }
+        break;
+    case kDLComplex:
+        switch (bits)
+        {
+            case 64: typenum = NPY_COMPLEX64; break;
+            case 128: typenum = NPY_COMPLEX128; break;
+        }
+        break;
+    }
+
+    if (typenum == -1) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "Unsupported dtype in DLTensor.");
+        Py_DECREF(capsule);
+        return NULL;
+    }
+
+    npy_intp shape[NPY_MAXDIMS];
+    npy_intp strides[NPY_MAXDIMS];
+
+    for (int i = 0; i < ndim; ++i) {
+        shape[i] = managed->dl_tensor.shape[i];
+        // DLPack has elements as stride units, NumPy has bytes.
+        if (managed->dl_tensor.strides != NULL) {
+            strides[i] = managed->dl_tensor.strides[i] * itemsize;
+        }
+    }
+
+    char *data = (char *)managed->dl_tensor.data +
+            managed->dl_tensor.byte_offset;
+
+    PyArray_Descr *descr = PyArray_DescrFromType(typenum);
+    if (descr == NULL) {
+        Py_DECREF(capsule);
+        return NULL;
+    }
+
+    PyObject *ret = PyArray_NewFromDescr(&PyArray_Type, descr, ndim, shape,
+            managed->dl_tensor.strides != NULL ? strides : NULL, data, 0, NULL);
+    if (ret == NULL) {
+        Py_DECREF(capsule);
+        return NULL;
+    }
+
+    PyObject *new_capsule = PyCapsule_New(managed,
+            NPY_DLPACK_INTERNAL_CAPSULE_NAME,
+            array_dlpack_internal_capsule_deleter);
+    if (new_capsule == NULL) {
+        Py_DECREF(capsule);
+        Py_DECREF(ret);
+        return NULL;
+    }
+
+    if (PyArray_SetBaseObject((PyArrayObject *)ret, new_capsule) < 0) {
+        Py_DECREF(capsule);
+        Py_DECREF(ret);
+        return NULL;
+    }
+
+    if (PyCapsule_SetName(capsule, NPY_DLPACK_USED_CAPSULE_NAME) < 0) {
+        Py_DECREF(capsule);
+        Py_DECREF(ret);
+        return NULL;
+    }
+
+    Py_DECREF(capsule);
+    return ret;
+}
+
+
diff --git a/numpy/core/src/multiarray/dtypemeta.h b/numpy/core/src/multiarray/dtypemeta.h
index 05e9e2394..2a61fe39d 100644
--- a/numpy/core/src/multiarray/dtypemeta.h
+++ b/numpy/core/src/multiarray/dtypemeta.h
@@ -74,9 +74,9 @@ typedef struct {
 #define NPY_DTYPE(descr) ((PyArray_DTypeMeta *)Py_TYPE(descr))
 #define NPY_DT_SLOTS(dtype) ((NPY_DType_Slots *)(dtype)->dt_slots)
 
-#define NPY_DT_is_legacy(dtype) ((dtype)->flags & NPY_DT_LEGACY)
-#define NPY_DT_is_abstract(dtype) ((dtype)->flags & NPY_DT_ABSTRACT)
-#define NPY_DT_is_parametric(dtype) ((dtype)->flags & NPY_DT_PARAMETRIC)
+#define NPY_DT_is_legacy(dtype) (((dtype)->flags & NPY_DT_LEGACY) != 0)
+#define NPY_DT_is_abstract(dtype) (((dtype)->flags & NPY_DT_ABSTRACT) != 0)
+#define NPY_DT_is_parametric(dtype) (((dtype)->flags & NPY_DT_PARAMETRIC) != 0)
 
 /*
  * Macros for convenient classmethod calls, since these require
diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
index 29ceabd71..3114a5896 100644
--- a/numpy/core/src/multiarray/einsum_sumprod.c.src
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -337,13 +337,13 @@ static NPY_GCC_OPT_3 void
         /**begin repeat2
          * #i = 0, 1, 2, 3#
          */
-        const @type@ b@i@ = @from@(data[@i@]);
-        const @type@ c@i@ = @from@(data_out[@i@]);
+        const @temptype@ b@i@ = @from@(data[@i@]);
+        const @temptype@ c@i@ = @from@(data_out[@i@]);
         /**end repeat2**/
         /**begin repeat2
          * #i = 0, 1, 2, 3#
          */
-        const @type@ abc@i@ = scalar * b@i@ + c@i@;
+        const @temptype@ abc@i@ = scalar * b@i@ + c@i@;
         /**end repeat2**/
         /**begin repeat2
          * #i = 0, 1, 2, 3#
@@ -353,8 +353,8 @@ static NPY_GCC_OPT_3 void
     }
 #endif // !NPY_DISABLE_OPTIMIZATION
     for (; count > 0; --count, ++data, ++data_out) {
-        const @type@ b = @from@(*data);
-        const @type@ c = @from@(*data_out);
+        const @temptype@ b = @from@(*data);
+        const @temptype@ c = @from@(*data_out);
         *data_out = @to@(scalar * b + c);
     }
 #endif // NPYV check for @type@
@@ -417,14 +417,14 @@ static void
         /**begin repeat2
          * #i = 0, 1, 2, 3#
          */
-        const @type@ a@i@ = @from@(data0[@i@]);
-        const @type@ b@i@ = @from@(data1[@i@]);
-        const @type@ c@i@ = @from@(data_out[@i@]);
+        const @temptype@ a@i@ = @from@(data0[@i@]);
+        const @temptype@ b@i@ = @from@(data1[@i@]);
+        const @temptype@ c@i@ = @from@(data_out[@i@]);
         /**end repeat2**/
         /**begin repeat2
          * #i = 0, 1, 2, 3#
          */
-        const @type@ abc@i@ = a@i@ * b@i@ + c@i@;
+        const @temptype@ abc@i@ = a@i@ * b@i@ + c@i@;
         /**end repeat2**/
         /**begin repeat2
          * #i = 0, 1, 2, 3#
@@ -434,9 +434,9 @@ static void
     }
 #endif // !NPY_DISABLE_OPTIMIZATION
     for (; count > 0; --count, ++data0, ++data1, ++data_out) {
-        const @type@ a = @from@(*data0);
-        const @type@ b = @from@(*data1);
-        const @type@ c = @from@(*data_out);
+        const @temptype@ a = @from@(*data0);
+        const @temptype@ b = @from@(*data1);
+        const @temptype@ c = @from@(*data_out);
         *data_out = @to@(a * b + c);
     }
 #endif // NPYV check for @type@
@@ -521,14 +521,14 @@ static NPY_GCC_OPT_3 void
         /**begin repeat2
          * #i = 0, 1, 2, 3#
          */
-        const @type@ ab@i@ = @from@(data0[@i@]) * @from@(data1[@i@]);
+        const @temptype@ ab@i@ = @from@(data0[@i@]) * @from@(data1[@i@]);
         /**end repeat2**/
         accum += ab0 + ab1 + ab2 + ab3;
     }
 #endif // !NPY_DISABLE_OPTIMIZATION
     for (; count > 0; --count, ++data0, ++data1) {
-        const @type@ a = @from@(*data0);
-        const @type@ b = @from@(*data1);
+        const @temptype@ a = @from@(*data0);
+        const @temptype@ b = @from@(*data1);
         accum += a * b;
     }
 #endif // NPYV check for @type@
diff --git a/numpy/core/src/multiarray/experimental_public_dtype_api.c b/numpy/core/src/multiarray/experimental_public_dtype_api.c
index 1e8abe9d6..4b9c7199b 100644
--- a/numpy/core/src/multiarray/experimental_public_dtype_api.c
+++ b/numpy/core/src/multiarray/experimental_public_dtype_api.c
@@ -13,9 +13,10 @@
 #include "dtypemeta.h"
 #include "array_coercion.h"
 #include "convert_datatype.h"
+#include "common_dtype.h"
 
 
-#define EXPERIMENTAL_DTYPE_API_VERSION 1
+#define EXPERIMENTAL_DTYPE_API_VERSION 2
 
 
 typedef struct{
@@ -130,6 +131,14 @@ PyArrayInitDTypeMeta_FromSpec(
         return -1;
     }
 
+    if (((PyTypeObject *)DType)->tp_repr == PyArrayDescr_Type.tp_repr
+            || ((PyTypeObject *)DType)->tp_str == PyArrayDescr_Type.tp_str) {
+        PyErr_SetString(PyExc_TypeError,
+                "A custom DType must implement `__repr__` and `__str__` since "
+                "the default inherited version (currently) fails.");
+        return -1;
+    }
+
     if (spec->typeobj == NULL || !PyType_Check(spec->typeobj)) {
         PyErr_SetString(PyExc_TypeError,
                 "Not giving a type object is currently not supported, but "
@@ -324,13 +333,41 @@ PyUFunc_AddLoopFromSpec(PyObject *ufunc, PyArrayMethod_Spec *spec)
 }
 
 
+static int
+PyUFunc_AddPromoter(
+        PyObject *ufunc, PyObject *DType_tuple, PyObject *promoter)
+{
+    if (!PyObject_TypeCheck(ufunc, &PyUFunc_Type)) {
+        PyErr_SetString(PyExc_TypeError,
+                "ufunc object passed is not a ufunc!");
+        return -1;
+    }
+    if (!PyCapsule_CheckExact(promoter)) {
+        PyErr_SetString(PyExc_TypeError,
+                "promoter must (currently) be a PyCapsule.");
+        return -1;
+    }
+    if (PyCapsule_GetPointer(promoter, "numpy._ufunc_promoter") == NULL) {
+        return -1;
+    }
+    PyObject *info = PyTuple_Pack(2, DType_tuple, promoter);
+    if (info == NULL) {
+        return -1;
+    }
+    return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0);
+}
+
+
 NPY_NO_EXPORT PyObject *
 _get_experimental_dtype_api(PyObject *NPY_UNUSED(mod), PyObject *arg)
 {
     static void *experimental_api_table[] = {
             &PyUFunc_AddLoopFromSpec,
+            &PyUFunc_AddPromoter,
             &PyArrayDTypeMeta_Type,
             &PyArrayInitDTypeMeta_FromSpec,
+            &PyArray_CommonDType,
+            &PyArray_PromoteDTypeSequence,
             NULL,
     };
 
diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c
index 2c8d1b3b4..e81ca2947 100644
--- a/numpy/core/src/multiarray/getset.c
+++ b/numpy/core/src/multiarray/getset.c
@@ -384,7 +384,23 @@ array_data_set(PyArrayObject *self, PyObject *op, void *NPY_UNUSED(ignored))
     }
     if (PyArray_FLAGS(self) & NPY_ARRAY_OWNDATA) {
         PyArray_XDECREF(self);
-        PyDataMem_FREE(PyArray_DATA(self));
+        size_t nbytes = PyArray_NBYTES(self);
+        /*
+         * Allocation will never be 0, see comment in ctors.c
+         * line 820
+         */
+        if (nbytes == 0) {
+            PyArray_Descr *dtype = PyArray_DESCR(self);
+            nbytes = dtype->elsize ? dtype->elsize : 1;
+        }
+        PyObject *handler = PyArray_HANDLER(self);
+        if (handler == NULL) {
+            /* This can happen if someone arbitrarily sets NPY_ARRAY_OWNDATA */
+            PyErr_SetString(PyExc_RuntimeError,
+                            "no memory handler found but OWNDATA flag set");
+            return -1;
+        }
+        PyDataMem_UserFREE(PyArray_DATA(self), nbytes, handler);
     }
     if (PyArray_BASE(self)) {
         if ((PyArray_FLAGS(self) & NPY_ARRAY_WRITEBACKIFCOPY) ||
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index ee66378a9..086b674c8 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -776,6 +776,7 @@ PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis)
     return NULL;
 }
 
+
 /*NUMPY_API
  */
 NPY_NO_EXPORT PyObject *
@@ -907,7 +908,7 @@ PyArray_Choose(PyArrayObject *ip, PyObject *op, PyArrayObject *out,
         Py_XDECREF(mps[i]);
     }
     Py_DECREF(ap);
-    npy_free_cache(mps, n * sizeof(mps[0]));
+    PyDataMem_FREE(mps);
     if (out != NULL && out != obj) {
         Py_INCREF(out);
         PyArray_ResolveWritebackIfCopy(obj);
@@ -922,7 +923,7 @@ PyArray_Choose(PyArrayObject *ip, PyObject *op, PyArrayObject *out,
         Py_XDECREF(mps[i]);
     }
     Py_XDECREF(ap);
-    npy_free_cache(mps, n * sizeof(mps[0]));
+    PyDataMem_FREE(mps);
     PyArray_DiscardWritebackIfCopy(obj);
     Py_XDECREF(obj);
     return NULL;
@@ -962,14 +963,19 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
         return 0;
     }
 
+    PyObject *mem_handler = PyDataMem_GetHandler();
+    if (mem_handler == NULL) {
+        return -1;
+    }
     it = (PyArrayIterObject *)PyArray_IterAllButAxis((PyObject *)op, &axis);
     if (it == NULL) {
+        Py_DECREF(mem_handler);
         return -1;
     }
     size = it->size;
 
     if (needcopy) {
-        buffer = npy_alloc_cache(N * elsize);
+        buffer = PyDataMem_UserNEW(N * elsize, mem_handler);
         if (buffer == NULL) {
             ret = -1;
             goto fail;
@@ -1053,12 +1059,14 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort,
 
 fail:
     NPY_END_THREADS_DESCR(PyArray_DESCR(op));
-    npy_free_cache(buffer, N * elsize);
+    /* cleanup internal buffer */
+    PyDataMem_UserFREE(buffer, N * elsize, mem_handler);
     if (ret < 0 && !PyErr_Occurred()) {
         /* Out of memory during sorting or buffer creation */
         PyErr_NoMemory();
     }
     Py_DECREF(it);
+    Py_DECREF(mem_handler);
 
     return ret;
 }
@@ -1090,11 +1098,16 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
 
     NPY_BEGIN_THREADS_DEF;
 
+    PyObject *mem_handler = PyDataMem_GetHandler();
+    if (mem_handler == NULL) {
+        return NULL;
+    }
     rop = (PyArrayObject *)PyArray_NewFromDescr(
             Py_TYPE(op), PyArray_DescrFromType(NPY_INTP),
             PyArray_NDIM(op), PyArray_DIMS(op), NULL, NULL,
             0, (PyObject *)op);
     if (rop == NULL) {
+        Py_DECREF(mem_handler);
         return NULL;
     }
     rstride = PyArray_STRIDE(rop, axis);
@@ -1102,6 +1115,7 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
 
     /* Check if there is any argsorting to do */
     if (N <= 1 || PyArray_SIZE(op) == 0) {
+        Py_DECREF(mem_handler);
         memset(PyArray_DATA(rop), 0, PyArray_NBYTES(rop));
         return (PyObject *)rop;
     }
@@ -1115,7 +1129,7 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
     size = it->size;
 
     if (needcopy) {
-        valbuffer = npy_alloc_cache(N * elsize);
+        valbuffer = PyDataMem_UserNEW(N * elsize, mem_handler);
         if (valbuffer == NULL) {
             ret = -1;
             goto fail;
@@ -1123,7 +1137,8 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
     }
 
     if (needidxbuffer) {
-        idxbuffer = (npy_intp *)npy_alloc_cache(N * sizeof(npy_intp));
+        idxbuffer = (npy_intp *)PyDataMem_UserNEW(N * sizeof(npy_intp),
+                                                  mem_handler);
         if (idxbuffer == NULL) {
             ret = -1;
             goto fail;
@@ -1212,8 +1227,9 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort,
 
 fail:
     NPY_END_THREADS_DESCR(PyArray_DESCR(op));
-    npy_free_cache(valbuffer, N * elsize);
-    npy_free_cache(idxbuffer, N * sizeof(npy_intp));
+    /* cleanup internal buffers */
+    PyDataMem_UserFREE(valbuffer, N * elsize, mem_handler);
+    PyDataMem_UserFREE(idxbuffer, N * sizeof(npy_intp), mem_handler);
     if (ret < 0) {
         if (!PyErr_Occurred()) {
             /* Out of memory during sorting or buffer creation */
@@ -1224,6 +1240,7 @@ fail:
     }
     Py_XDECREF(it);
     Py_XDECREF(rit);
+    Py_DECREF(mem_handler);
 
     return (PyObject *)rop;
 }
@@ -2398,19 +2415,14 @@ PyArray_CountNonzero(PyArrayObject *self)
     npy_intp *strideptr, *innersizeptr;
     NPY_BEGIN_THREADS_DEF;
 
-    // Special low-overhead version specific to the boolean/int types
     dtype = PyArray_DESCR(self);
-    switch(dtype->kind) {
-        case 'u':
-        case 'i':
-        case 'b':
-            if (dtype->elsize > 8) {
-                break;
-            }
-            return count_nonzero_int(
-                PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self),
-                PyArray_STRIDES(self), dtype->elsize
-            );
+    /* Special low-overhead version specific to the boolean/int types */
+    if (PyArray_ISALIGNED(self) && (
+            PyDataType_ISBOOL(dtype) || PyDataType_ISINTEGER(dtype))) {
+        return count_nonzero_int(
+            PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self),
+            PyArray_STRIDES(self), dtype->elsize
+        );
     }
 
     nonzero = PyArray_DESCR(self)->f->nonzero;
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index 391e65f6a..b0b6f42f1 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -26,6 +26,7 @@
 #include "shape.h"
 #include "strfuncs.h"
 #include "array_assign.h"
+#include "npy_dlpack.h"
 
 #include "methods.h"
 #include "alloc.h"
@@ -833,15 +834,15 @@ array_astype(PyArrayObject *self,
      */
     NPY_CASTING casting = NPY_UNSAFE_CASTING;
     NPY_ORDER order = NPY_KEEPORDER;
-    int forcecopy = 1, subok = 1;
+    _PyArray_CopyMode forcecopy = 1;
+    int subok = 1;
     NPY_PREPARE_ARGPARSER;
-
     if (npy_parse_arguments("astype", args, len_args, kwnames,
             "dtype", &PyArray_DescrConverter, &dtype,
             "|order", &PyArray_OrderConverter, &order,
             "|casting", &PyArray_CastingConverter, &casting,
             "|subok", &PyArray_PythonPyIntFromInt, &subok,
-            "|copy", &PyArray_PythonPyIntFromInt, &forcecopy,
+            "|copy", &PyArray_CopyConverter, &forcecopy,
             NULL, NULL, NULL) < 0) {
         Py_XDECREF(dtype);
         return NULL;
@@ -858,20 +859,29 @@ array_astype(PyArrayObject *self,
      * and it's not a subtype if subok is False, then we
      * can skip the copy.
      */
-    if (!forcecopy && (order == NPY_KEEPORDER ||
-                       (order == NPY_ANYORDER &&
-                            (PyArray_IS_C_CONTIGUOUS(self) ||
-                            PyArray_IS_F_CONTIGUOUS(self))) ||
-                       (order == NPY_CORDER &&
-                            PyArray_IS_C_CONTIGUOUS(self)) ||
-                       (order == NPY_FORTRANORDER &&
-                            PyArray_IS_F_CONTIGUOUS(self))) &&
-                    (subok || PyArray_CheckExact(self)) &&
-                    PyArray_EquivTypes(dtype, PyArray_DESCR(self))) {
+    if (forcecopy != NPY_COPY_ALWAYS && 
+                    (order == NPY_KEEPORDER ||
+                    (order == NPY_ANYORDER &&
+                        (PyArray_IS_C_CONTIGUOUS(self) ||
+                        PyArray_IS_F_CONTIGUOUS(self))) ||
+                    (order == NPY_CORDER &&
+                        PyArray_IS_C_CONTIGUOUS(self)) ||
+                    (order == NPY_FORTRANORDER &&
+                        PyArray_IS_F_CONTIGUOUS(self))) &&
+                (subok || PyArray_CheckExact(self)) &&
+                PyArray_EquivTypes(dtype, PyArray_DESCR(self))) {
         Py_DECREF(dtype);
         Py_INCREF(self);
         return (PyObject *)self;
     }
+
+    if (forcecopy == NPY_COPY_NEVER) {
+        PyErr_SetString(PyExc_ValueError,
+                "Unable to avoid copy while casting in never copy mode.");
+        Py_DECREF(dtype);
+        return NULL;
+    }
+    
     if (!PyArray_CanCastArrayTo(self, dtype, casting)) {
         PyErr_Clear();
         npy_set_invalid_cast_error(
@@ -1821,22 +1831,8 @@ array_reduce_ex_picklebuffer(PyArrayObject *self, int protocol)
 
     descr = PyArray_DESCR(self);
 
-    /* if the python version is below 3.8, the pickle module does not provide
-     * built-in support for protocol 5. We try importing the pickle5
-     * backport instead */
-#if PY_VERSION_HEX >= 0x03080000
     /* we expect protocol 5 to be available in Python 3.8 */
     pickle_module = PyImport_ImportModule("pickle");
-#else
-    pickle_module = PyImport_ImportModule("pickle5");
-    if (pickle_module == NULL) {
-        /* for protocol 5, raise a clear ImportError if pickle5 is not found
-         */
-        PyErr_SetString(PyExc_ImportError, "Using pickle protocol 5 "
-                "requires the pickle5 module for Python >=3.6 and <3.8");
-        return NULL;
-    }
-#endif
     if (pickle_module == NULL){
         return NULL;
     }
@@ -1975,6 +1971,16 @@ array_setstate(PyArrayObject *self, PyObject *args)
         return NULL;
     }
 
+    /*
+     * Reassigning fa->descr messes with the reallocation strategy,
+     * since fa could be a 0-d or scalar, and then
+     * PyDataMem_UserFREE will be confused
+     */
+    size_t n_tofree = PyArray_NBYTES(self);
+    if (n_tofree == 0) {
+        PyArray_Descr *dtype = PyArray_DESCR(self);
+        n_tofree = dtype->elsize ? dtype->elsize : 1;
+    }
     Py_XDECREF(PyArray_DESCR(self));
     fa->descr = typecode;
     Py_INCREF(typecode);
@@ -2041,7 +2047,18 @@ array_setstate(PyArrayObject *self, PyObject *args)
     }
 
     if ((PyArray_FLAGS(self) & NPY_ARRAY_OWNDATA)) {
-        PyDataMem_FREE(PyArray_DATA(self));
+        /*
+         * Allocation will never be 0, see comment in ctors.c
+         * line 820
+         */
+        PyObject *handler = PyArray_HANDLER(self);
+        if (handler == NULL) {
+            /* This can happen if someone arbitrarily sets NPY_ARRAY_OWNDATA */
+            PyErr_SetString(PyExc_RuntimeError,
+                            "no memory handler found but OWNDATA flag set");
+            return NULL;
+        }
+        PyDataMem_UserFREE(PyArray_DATA(self), n_tofree, handler);
         PyArray_CLEARFLAGS(self, NPY_ARRAY_OWNDATA);
     }
     Py_XDECREF(PyArray_BASE(self));
@@ -2077,7 +2094,6 @@ array_setstate(PyArrayObject *self, PyObject *args)
 
     if (!PyDataType_FLAGCHK(typecode, NPY_LIST_PICKLE)) {
         int swap = PyArray_ISBYTESWAPPED(self);
-        fa->data = datastr;
         /* Bytes should always be considered immutable, but we just grab the
          * pointer if they are large, to save memory. */
         if (!IsAligned(self) || swap || (len <= 1000)) {
@@ -2086,8 +2102,16 @@ array_setstate(PyArrayObject *self, PyObject *args)
                 Py_DECREF(rawdata);
                 Py_RETURN_NONE;
             }
-            fa->data = PyDataMem_NEW(num);
+            /* Store the handler in case the default is modified */
+            Py_XDECREF(fa->mem_handler);
+            fa->mem_handler = PyDataMem_GetHandler();
+            if (fa->mem_handler == NULL) {
+                Py_DECREF(rawdata);
+                return NULL;
+            }
+            fa->data = PyDataMem_UserNEW(num, PyArray_HANDLER(self));
             if (PyArray_DATA(self) == NULL) {
+                Py_DECREF(fa->mem_handler);
                 Py_DECREF(rawdata);
                 return PyErr_NoMemory();
             }
@@ -2123,7 +2147,12 @@ array_setstate(PyArrayObject *self, PyObject *args)
             Py_DECREF(rawdata);
         }
         else {
+            /* The handlers should never be called in this case */
+            Py_XDECREF(fa->mem_handler);
+            fa->mem_handler = NULL;
+            fa->data = datastr;
             if (PyArray_SetBaseObject(self, rawdata) < 0) {
+                Py_DECREF(rawdata);
                 return NULL;
             }
         }
@@ -2134,8 +2163,15 @@ array_setstate(PyArrayObject *self, PyObject *args)
         if (num == 0 || elsize == 0) {
             Py_RETURN_NONE;
         }
-        fa->data = PyDataMem_NEW(num);
+        /* Store the functions in case the default handler is modified */
+        Py_XDECREF(fa->mem_handler);
+        fa->mem_handler = PyDataMem_GetHandler();
+        if (fa->mem_handler == NULL) {
+            return NULL;
+        }
+        fa->data = PyDataMem_UserNEW(num, PyArray_HANDLER(self));
         if (PyArray_DATA(self) == NULL) {
+            Py_DECREF(fa->mem_handler);
             return PyErr_NoMemory();
         }
         if (PyDataType_FLAGCHK(PyArray_DESCR(self), NPY_NEEDS_INIT)) {
@@ -2144,6 +2180,7 @@ array_setstate(PyArrayObject *self, PyObject *args)
         PyArray_ENABLEFLAGS(self, NPY_ARRAY_OWNDATA);
         fa->base = NULL;
         if (_setlist_pkl(self, rawdata) < 0) {
+            Py_DECREF(fa->mem_handler);
             return NULL;
         }
     }
@@ -2209,7 +2246,7 @@ array_dumps(PyArrayObject *self, PyObject *args, PyObject *kwds)
 
 
 static PyObject *
-array_sizeof(PyArrayObject *self)
+array_sizeof(PyArrayObject *self, PyObject *NPY_UNUSED(args))
 {
     /* object + dimension and strides */
     Py_ssize_t nbytes = Py_TYPE(self)->tp_basicsize +
@@ -2948,5 +2985,13 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = {
     {"view",
         (PyCFunction)array_view,
         METH_FASTCALL | METH_KEYWORDS, NULL},
+    // For data interchange between libraries
+    {"__dlpack__",
+        (PyCFunction)array_dlpack,
+        METH_FASTCALL | METH_KEYWORDS, NULL},
+
+    {"__dlpack_device__",
+        (PyCFunction)array_dlpack_device,
+        METH_NOARGS, NULL},
     {NULL, NULL, 0, NULL}           /* sentinel */
 };
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index d211f01bc..cf0160a2b 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -70,6 +70,8 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 #include "get_attr_string.h"
 #include "experimental_public_dtype_api.h"  /* _get_experimental_dtype_api */
 
+#include "npy_dlpack.h"
+
 /*
  *****************************************************************************
  **                    INCLUDE GENERATED CODE                               **
@@ -1560,7 +1562,7 @@ _prepend_ones(PyArrayObject *arr, int nd, int ndmin, NPY_ORDER order)
 
 static NPY_INLINE PyObject *
 _array_fromobject_generic(
-        PyObject *op, PyArray_Descr *type, npy_bool copy, NPY_ORDER order,
+        PyObject *op, PyArray_Descr *type, _PyArray_CopyMode copy, NPY_ORDER order,
         npy_bool subok, int ndmin)
 {
     PyArrayObject *oparr = NULL, *ret = NULL;
@@ -1577,12 +1579,17 @@ _array_fromobject_generic(
     if (PyArray_CheckExact(op) || (subok && PyArray_Check(op))) {
         oparr = (PyArrayObject *)op;
         if (type == NULL) {
-            if (!copy && STRIDING_OK(oparr, order)) {
+            if (copy != NPY_COPY_ALWAYS && STRIDING_OK(oparr, order)) {
                 ret = oparr;
                 Py_INCREF(ret);
                 goto finish;
             }
             else {
+                if (copy == NPY_COPY_NEVER) {
+                    PyErr_SetString(PyExc_ValueError,
+                            "Unable to avoid copy while creating a new array.");
+                    return NULL;
+                }
                 ret = (PyArrayObject *)PyArray_NewCopy(oparr, order);
                 goto finish;
             }
@@ -1590,12 +1597,17 @@ _array_fromobject_generic(
         /* One more chance */
         oldtype = PyArray_DESCR(oparr);
         if (PyArray_EquivTypes(oldtype, type)) {
-            if (!copy && STRIDING_OK(oparr, order)) {
+            if (copy != NPY_COPY_ALWAYS && STRIDING_OK(oparr, order)) {
                 Py_INCREF(op);
                 ret = oparr;
                 goto finish;
             }
             else {
+                if (copy == NPY_COPY_NEVER) {
+                    PyErr_SetString(PyExc_ValueError,
+                            "Unable to avoid copy while creating a new array.");
+                    return NULL;
+                }
                 ret = (PyArrayObject *)PyArray_NewCopy(oparr, order);
                 if (oldtype == type || ret == NULL) {
                     goto finish;
@@ -1608,9 +1620,12 @@ _array_fromobject_generic(
         }
     }
 
-    if (copy) {
+    if (copy == NPY_COPY_ALWAYS) {
         flags = NPY_ARRAY_ENSURECOPY;
     }
+    else if (copy == NPY_COPY_NEVER ) {
+        flags = NPY_ARRAY_ENSURENOCOPY;
+    }
     if (order == NPY_CORDER) {
         flags |= NPY_ARRAY_C_CONTIGUOUS;
     }
@@ -1654,7 +1669,7 @@ array_array(PyObject *NPY_UNUSED(ignored),
 {
     PyObject *op;
     npy_bool subok = NPY_FALSE;
-    npy_bool copy = NPY_TRUE;
+    _PyArray_CopyMode copy = NPY_COPY_ALWAYS;
     int ndmin = 0;
     PyArray_Descr *type = NULL;
     NPY_ORDER order = NPY_KEEPORDER;
@@ -1665,7 +1680,7 @@ array_array(PyObject *NPY_UNUSED(ignored),
         if (npy_parse_arguments("array", args, len_args, kwnames,
                 "object", NULL, &op,
                 "|dtype", &PyArray_DescrConverter2, &type,
-                "$copy", &PyArray_BoolConverter, &copy,
+                "$copy", &PyArray_CopyConverter, &copy,
                 "$order", &PyArray_OrderConverter, &order,
                 "$subok", &PyArray_BoolConverter, &subok,
                 "$ndmin", &PyArray_PythonPyIntFromInt, &ndmin,
@@ -4197,7 +4212,7 @@ normalize_axis_index(PyObject *NPY_UNUSED(self),
 
 
 static PyObject *
-_reload_guard(PyObject *NPY_UNUSED(self)) {
+_reload_guard(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args)) {
     static int initialized = 0;
 
 #if !defined(PYPY_VERSION)
@@ -4231,7 +4246,6 @@ _reload_guard(PyObject *NPY_UNUSED(self)) {
     Py_RETURN_NONE;
 }
 
-
 static struct PyMethodDef array_module_methods[] = {
     {"_get_implementing_args",
         (PyCFunction)array__get_implementing_args,
@@ -4433,6 +4447,12 @@ static struct PyMethodDef array_module_methods[] = {
     {"geterrobj",
         (PyCFunction) ufunc_geterr,
         METH_VARARGS, NULL},
+    {"get_handler_name",
+        (PyCFunction) get_handler_name,
+        METH_VARARGS, NULL},
+    {"get_handler_version",
+        (PyCFunction) get_handler_version,
+        METH_VARARGS, NULL},
     {"_add_newdoc_ufunc", (PyCFunction)add_newdoc_ufunc,
         METH_VARARGS, NULL},
     {"_get_sfloat_dtype",
@@ -4442,6 +4462,8 @@ static struct PyMethodDef array_module_methods[] = {
     {"_reload_guard", (PyCFunction)_reload_guard,
         METH_NOARGS,
         "Give a warning on reload and big warning in sub-interpreters."},
+    {"_from_dlpack", (PyCFunction)_from_dlpack,
+        METH_O, NULL},
     {NULL, NULL, 0, NULL}                /* sentinel */
 };
 
@@ -4672,14 +4694,14 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
     PyObject *m, *d, *s;
     PyObject *c_api;
 
-    /* Initialize CPU features */
-    if (npy_cpu_init() < 0) {
-        goto err;
-    }
-
     /* Create the module and add the functions */
     m = PyModule_Create(&moduledef);
     if (!m) {
+        return NULL;
+    }
+
+    /* Initialize CPU features */
+    if (npy_cpu_init() < 0) {
         goto err;
     }
 
@@ -4910,6 +4932,23 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
     if (initumath(m) != 0) {
         goto err;
     }
+    /*
+     * Initialize the default PyDataMem_Handler capsule singleton.
+     */
+    PyDataMem_DefaultHandler = PyCapsule_New(&default_handler, "mem_handler", NULL);
+    if (PyDataMem_DefaultHandler == NULL) {
+        goto err;
+    }
+#if (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x07030600)
+    /*
+     * Initialize the context-local current handler
+     * with the default PyDataMem_Handler capsule.
+    */
+    current_handler = PyContextVar_New("current_allocator", PyDataMem_DefaultHandler);
+    if (current_handler == NULL) {
+        goto err;
+    }
+#endif
     return m;
 
  err:
@@ -4917,5 +4956,6 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
         PyErr_SetString(PyExc_RuntimeError,
                         "cannot load multiarray module.");
     }
+    Py_DECREF(m);
     return NULL;
 }
diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c
index 8e072d5f4..2675496ab 100644
--- a/numpy/core/src/multiarray/nditer_pywrap.c
+++ b/numpy/core/src/multiarray/nditer_pywrap.c
@@ -1190,7 +1190,7 @@ npyiter_resetbasepointers(NewNpyArrayIterObject *self)
 }
 
 static PyObject *
-npyiter_reset(NewNpyArrayIterObject *self)
+npyiter_reset(NewNpyArrayIterObject *self, PyObject *NPY_UNUSED(args))
 {
     if (self->iter == NULL) {
         PyErr_SetString(PyExc_ValueError,
@@ -1227,7 +1227,7 @@ npyiter_reset(NewNpyArrayIterObject *self)
  * copied.
  */
 static PyObject *
-npyiter_copy(NewNpyArrayIterObject *self)
+npyiter_copy(NewNpyArrayIterObject *self, PyObject *NPY_UNUSED(args))
 {
     NewNpyArrayIterObject *iter;
 
@@ -1263,7 +1263,7 @@ npyiter_copy(NewNpyArrayIterObject *self)
 }
 
 static PyObject *
-npyiter_iternext(NewNpyArrayIterObject *self)
+npyiter_iternext(NewNpyArrayIterObject *self, PyObject *NPY_UNUSED(args))
 {
     if (self->iter != NULL && self->iternext != NULL &&
                         !self->finished && self->iternext(self->iter)) {
@@ -1320,7 +1320,8 @@ npyiter_remove_axis(NewNpyArrayIterObject *self, PyObject *args)
 }
 
 static PyObject *
-npyiter_remove_multi_index(NewNpyArrayIterObject *self)
+npyiter_remove_multi_index(
+    NewNpyArrayIterObject *self, PyObject *NPY_UNUSED(args))
 {
     if (self->iter == NULL) {
         PyErr_SetString(PyExc_ValueError,
@@ -1345,7 +1346,8 @@ npyiter_remove_multi_index(NewNpyArrayIterObject *self)
 }
 
 static PyObject *
-npyiter_enable_external_loop(NewNpyArrayIterObject *self)
+npyiter_enable_external_loop(
+    NewNpyArrayIterObject *self, PyObject *NPY_UNUSED(args))
 {
     if (self->iter == NULL) {
         PyErr_SetString(PyExc_ValueError,
@@ -1370,7 +1372,7 @@ npyiter_enable_external_loop(NewNpyArrayIterObject *self)
 }
 
 static PyObject *
-npyiter_debug_print(NewNpyArrayIterObject *self)
+npyiter_debug_print(NewNpyArrayIterObject *self, PyObject *NPY_UNUSED(args))
 {
     if (self->iter != NULL) {
         NpyIter_DebugPrint(self->iter);
@@ -2315,7 +2317,7 @@ npyiter_ass_subscript(NewNpyArrayIterObject *self, PyObject *op,
 }
 
 static PyObject *
-npyiter_enter(NewNpyArrayIterObject *self)
+npyiter_enter(NewNpyArrayIterObject *self, PyObject *NPY_UNUSED(args))
 {
     if (self->iter == NULL) {
         PyErr_SetString(PyExc_RuntimeError, "operation on non-initialized iterator");
@@ -2326,7 +2328,7 @@ npyiter_enter(NewNpyArrayIterObject *self)
 }
 
 static PyObject *
-npyiter_close(NewNpyArrayIterObject *self)
+npyiter_close(NewNpyArrayIterObject *self, PyObject *NPY_UNUSED(args))
 {
     NpyIter *iter = self->iter;
     int ret;
@@ -2347,7 +2349,7 @@ static PyObject *
 npyiter_exit(NewNpyArrayIterObject *self, PyObject *NPY_UNUSED(args))
 {
     /* even if called via exception handling, writeback any data */
-    return npyiter_close(self);
+    return npyiter_close(self, NULL);
 }
 
 static PyMethodDef npyiter_methods[] = {
diff --git a/numpy/core/src/multiarray/scalarapi.c b/numpy/core/src/multiarray/scalarapi.c
index e409e9874..564352f1f 100644
--- a/numpy/core/src/multiarray/scalarapi.c
+++ b/numpy/core/src/multiarray/scalarapi.c
@@ -233,8 +233,12 @@ PyArray_CastScalarToCtype(PyObject *scalar, void *ctypeptr,
     PyArray_VectorUnaryFunc* castfunc;
 
     descr = PyArray_DescrFromScalar(scalar);
+    if (descr == NULL) {
+        return -1;
+    }
     castfunc = PyArray_GetCastFunc(descr, outcode->type_num);
     if (castfunc == NULL) {
+        Py_DECREF(descr);
         return -1;
     }
     if (PyTypeNum_ISEXTENDED(descr->type_num) ||
@@ -254,6 +258,7 @@ PyArray_CastScalarToCtype(PyObject *scalar, void *ctypeptr,
                     NPY_ARRAY_CARRAY, NULL);
         if (aout == NULL) {
             Py_DECREF(ain);
+            Py_DECREF(descr);
             return -1;
         }
         castfunc(PyArray_DATA(ain), PyArray_DATA(aout), 1, ain, aout);
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index 56f17431a..db1e49db8 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -34,6 +34,16 @@
 
 #include "binop_override.h"
 
+/*
+ * used for allocating a single scalar, so use the default numpy
+ * memory allocators instead of the (maybe) user overrides
+ */
+NPY_NO_EXPORT void *
+npy_alloc_cache_zero(size_t nmemb, size_t size);
+
+NPY_NO_EXPORT void
+npy_free_cache(void * p, npy_uintp sz);
+
 NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[] = {
     {PyObject_HEAD_INIT(&PyBoolArrType_Type) 0},
     {PyObject_HEAD_INIT(&PyBoolArrType_Type) 1},
@@ -209,6 +219,27 @@ gentype_multiply(PyObject *m1, PyObject *m2)
 }
 
 /**begin repeat
+ * #TYPE    = BYTE, UBYTE, SHORT, USHORT, INT, UINT,
+ *            LONG, ULONG, LONGLONG, ULONGLONG#
+ * #type    = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
+ *            npy_long, npy_ulong, npy_longlong, npy_ulonglong#
+ * #c       = hh, uhh, h, uh,, u, l, ul, ll, ull#
+ * #Name    = Byte, UByte, Short, UShort, Int, UInt,
+ *            Long, ULong, LongLong, ULongLong#
+ * #convert = Long*8, LongLong*2#
+ */
+static PyObject *
+@type@_bit_count(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    @type@ scalar = PyArrayScalar_VAL(self, @Name@);
+    uint8_t count = npy_popcount@c@(scalar);
+    PyObject *result = PyLong_From@convert@(count);
+
+    return result;
+}
+/**end repeat**/
+
+/**begin repeat
  *
  * #name = positive, negative, absolute, invert, int, float#
  */
@@ -1129,7 +1160,7 @@ gentype_size_get(PyObject *NPY_UNUSED(self), void *NPY_UNUSED(ignored))
 }
 
 static PyObject *
-gentype_sizeof(PyObject *self)
+gentype_sizeof(PyObject *self, PyObject *NPY_UNUSED(args))
 {
     Py_ssize_t nbytes;
     PyObject * isz = gentype_itemsize_get(self, NULL);
@@ -1321,7 +1352,7 @@ gentype_imag_get(PyObject *self, void *NPY_UNUSED(ignored))
         int elsize;
         typecode = PyArray_DescrFromScalar(self);
         elsize = typecode->elsize;
-        temp = npy_alloc_cache_zero(elsize);
+        temp = npy_alloc_cache_zero(1, elsize);
         ret = PyArray_Scalar(temp, typecode, NULL);
         npy_free_cache(temp, elsize);
     }
@@ -1887,7 +1918,7 @@ static PyObject *
  */
 /* Heavily copied from the builtin float.as_integer_ratio */
 static PyObject *
-@name@_as_integer_ratio(PyObject *self)
+@name@_as_integer_ratio(PyObject *self, PyObject *NPY_UNUSED(args))
 {
 #if @is_half@
     npy_double val = npy_half_to_double(PyArrayScalar_VAL(self, @Name@));
@@ -1968,7 +1999,7 @@ error:
  *  #c    = f, f, , l#
  */
 static PyObject *
-@name@_is_integer(PyObject *self)
+@name@_is_integer(PyObject *self, PyObject *NPY_UNUSED(args))
 {
 #if @is_half@
     npy_double val = npy_half_to_double(PyArrayScalar_VAL(self, @Name@));
@@ -1991,7 +2022,7 @@ static PyObject *
 /**end repeat**/
 
 static PyObject *
-integer_is_integer(PyObject *self) {
+integer_is_integer(PyObject *self, PyObject *NPY_UNUSED(args)) {
     Py_RETURN_TRUE;
 }
 
@@ -2306,8 +2337,7 @@ static PyMethodDef @name@type_methods[] = {
 /**end repeat**/
 
 /**begin repeat
- * #name = byte, short, int, long, longlong, ubyte, ushort,
- *         uint, ulong, ulonglong, timedelta, cdouble#
+ * #name = timedelta, cdouble#
  */
 static PyMethodDef @name@type_methods[] = {
     /* for typing; requires python >= 3.9 */
@@ -2318,6 +2348,23 @@ static PyMethodDef @name@type_methods[] = {
 };
 /**end repeat**/
 
+/**begin repeat
+ * #name = byte, ubyte, short, ushort, int, uint,
+ *         long, ulong, longlong, ulonglong#
+ */
+static PyMethodDef @name@type_methods[] = {
+    /* for typing; requires python >= 3.9 */
+    {"__class_getitem__",
+        (PyCFunction)numbertype_class_getitem,
+        METH_CLASS | METH_O, NULL},
+    {"bit_count",
+        (PyCFunction)npy_@name@_bit_count,
+        METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+/**end repeat**/
+
+
 /************* As_mapping functions for void array scalar ************/
 
 static Py_ssize_t
@@ -3151,7 +3198,10 @@ void_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
                     (int) NPY_MAX_INT);
             return NULL;
         }
-        destptr = npy_alloc_cache_zero(memu);
+        if (memu == 0) {
+            memu = 1;
+        }
+        destptr = npy_alloc_cache_zero(memu, 1);
         if (destptr == NULL) {
             return PyErr_NoMemory();
         }
@@ -4092,6 +4142,17 @@ initialize_numeric_types(void)
     /**end repeat**/
 
     /**begin repeat
+     * #name = byte, short, int, long, longlong,
+     *         ubyte, ushort, uint, ulong, ulonglong#
+     * #Name = Byte, Short, Int, Long, LongLong,
+     *         UByte, UShort, UInt, ULong, ULongLong#
+     */
+
+    Py@Name@ArrType_Type.tp_methods = @name@type_methods;
+
+    /**end repeat**/
+
+    /**begin repeat
      * #name = half, float, double, longdouble#
      * #Name = Half, Float, Double, LongDouble#
      */
diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c
index 5a4e8c0f3..162abd6a4 100644
--- a/numpy/core/src/multiarray/shape.c
+++ b/numpy/core/src/multiarray/shape.c
@@ -121,8 +121,16 @@ PyArray_Resize(PyArrayObject *self, PyArray_Dims *newshape, int refcheck,
         }
 
         /* Reallocate space if needed - allocating 0 is forbidden */
-        new_data = PyDataMem_RENEW(
-            PyArray_DATA(self), newnbytes == 0 ? elsize : newnbytes);
+        PyObject *handler = PyArray_HANDLER(self);
+        if (handler == NULL) {
+            /* This can happen if someone arbitrarily sets NPY_ARRAY_OWNDATA */
+            PyErr_SetString(PyExc_RuntimeError,
+                            "no memory handler found but OWNDATA flag set");
+            return NULL;
+        }
+        new_data = PyDataMem_UserRENEW(PyArray_DATA(self),
+                                       newnbytes == 0 ? elsize : newnbytes,
+                                       handler);
         if (new_data == NULL) {
             PyErr_SetString(PyExc_MemoryError,
                     "cannot allocate memory for array");
diff --git a/numpy/core/src/npymath/npy_math_internal.h.src b/numpy/core/src/npymath/npy_math_internal.h.src
index cae84befe..5b418342f 100644
--- a/numpy/core/src/npymath/npy_math_internal.h.src
+++ b/numpy/core/src/npymath/npy_math_internal.h.src
@@ -55,6 +55,29 @@
  */
 #include "npy_math_private.h"
 
+/* Magic binary numbers used by bit_count
+ * For type T, the magic numbers are computed as follows:
+ * Magic[0]: 01 01 01 01 01 01... = (T)~(T)0/3
+ * Magic[1]: 0011 0011 0011...    = (T)~(T)0/15  * 3
+ * Magic[2]: 00001111 00001111... = (T)~(T)0/255 * 15
+ * Magic[3]: 00000001 00000001... = (T)~(T)0/255
+ *
+ * Counting bits set, in parallel
+ * Based on: http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ *
+ * Generic Algorithm for type T:
+ * a = a - ((a >> 1) & (T)~(T)0/3);
+ * a = (a & (T)~(T)0/15*3) + ((a >> 2) & (T)~(T)0/15*3);
+ * a = (a + (a >> 4)) & (T)~(T)0/255*15;
+ * c = (T)(a * ((T)~(T)0/255)) >> (sizeof(T) - 1) * CHAR_BIT;
+*/
+
+static const npy_uint8  MAGIC8[]  = {0x55u,                 0x33u,                 0x0Fu,                 0x01u};
+static const npy_uint16 MAGIC16[] = {0x5555u,               0x3333u,               0x0F0Fu,               0x0101u};
+static const npy_uint32 MAGIC32[] = {0x55555555ul,          0x33333333ul,          0x0F0F0F0Ful,          0x01010101ul};
+static const npy_uint64 MAGIC64[] = {0x5555555555555555ull, 0x3333333333333333ull, 0x0F0F0F0F0F0F0F0Full, 0x0101010101010101ull};
+
+
 /*
  *****************************************************************************
  **                     BASIC MATH FUNCTIONS                                **
@@ -454,10 +477,16 @@ NPY_INPLACE @type@ npy_frexp@c@(@type@ x, int* exp)
 
 /**begin repeat
  * #type = npy_longdouble, npy_double, npy_float#
+ * #TYPE = LONGDOUBLE, DOUBLE, FLOAT#
  * #c = l,,f#
  * #C = L,,F#
  */
-
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_@TYPE@ == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, @c@)
+#endif
 /*
  * On arm64 macOS, there's a bug with sin, cos, and tan where they don't
  * raise "invalid" when given INFINITY as input.
@@ -483,7 +512,7 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x)
         return (x - x);
     }
 #endif
-    return @kind@@c@(x);
+    return NPY__FP_SFX(@kind@)(x);
 }
 #endif
 
@@ -498,7 +527,7 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x)
 #ifdef HAVE_@KIND@@C@
 NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
 {
-    return @kind@@c@(x, y);
+    return NPY__FP_SFX(@kind@)(x, y);
 }
 #endif
 /**end repeat1**/
@@ -506,21 +535,21 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
 #ifdef HAVE_MODF@C@
 NPY_INPLACE @type@ npy_modf@c@(@type@ x, @type@ *iptr)
 {
-    return modf@c@(x, iptr);
+    return NPY__FP_SFX(modf)(x, iptr);
 }
 #endif
 
 #ifdef HAVE_LDEXP@C@
 NPY_INPLACE @type@ npy_ldexp@c@(@type@ x, int exp)
 {
-    return ldexp@c@(x, exp);
+    return NPY__FP_SFX(ldexp)(x, exp);
 }
 #endif
 
 #ifdef HAVE_FREXP@C@
 NPY_INPLACE @type@ npy_frexp@c@(@type@ x, int* exp)
 {
-    return frexp@c@(x, exp);
+    return NPY__FP_SFX(frexp)(x, exp);
 }
 #endif
 
@@ -543,10 +572,10 @@ NPY_INPLACE @type@ npy_cbrt@c@(@type@ x)
 #else
 NPY_INPLACE @type@ npy_cbrt@c@(@type@ x)
 {
-    return cbrt@c@(x);
+    return NPY__FP_SFX(cbrt)(x);
 }
 #endif
-
+#undef NPY__FP_SFX
 /**end repeat**/
 
 
@@ -556,10 +585,16 @@ NPY_INPLACE @type@ npy_cbrt@c@(@type@ x)
 
 /**begin repeat
  * #type = npy_float, npy_double, npy_longdouble#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
  * #c = f, ,l#
  * #C = F, ,L#
  */
-
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_@TYPE@ == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, @c@)
+#endif
 @type@ npy_heaviside@c@(@type@ x, @type@ h0)
 {
     if (npy_isnan(x)) {
@@ -576,10 +611,10 @@ NPY_INPLACE @type@ npy_cbrt@c@(@type@ x)
     }
 }
 
-#define LOGE2    NPY_LOGE2@c@
-#define LOG2E    NPY_LOG2E@c@
-#define RAD2DEG  (180.0@c@/NPY_PI@c@)
-#define DEG2RAD  (NPY_PI@c@/180.0@c@)
+#define LOGE2    NPY__FP_SFX(NPY_LOGE2)
+#define LOG2E    NPY__FP_SFX(NPY_LOG2E)
+#define RAD2DEG  (NPY__FP_SFX(180.0)/NPY__FP_SFX(NPY_PI))
+#define DEG2RAD  (NPY__FP_SFX(NPY_PI)/NPY__FP_SFX(180.0))
 
 NPY_INPLACE @type@ npy_rad2deg@c@(@type@ x)
 {
@@ -733,7 +768,7 @@ npy_divmod@c@(@type@ a, @type@ b, @type@ *modulus)
 #undef LOG2E
 #undef RAD2DEG
 #undef DEG2RAD
-
+#undef NPY__FP_SFX
 /**end repeat**/
 
 /**begin repeat
@@ -814,3 +849,66 @@ npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b)
 }
 /**end repeat1**/
 /**end repeat**/
+
+
+#define __popcnt32 __popcnt
+/**begin repeat
+ *
+ * #type  = ubyte, ushort, uint, ulong, ulonglong#
+ * #STYPE = BYTE,  SHORT,  INT,  LONG,  LONGLONG#
+ * #c     = hh,    h,      ,     l,     ll#
+ */
+#undef TO_BITS_LEN
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_BITSOF_@STYPE@ == @len@
+    #define TO_BITS_LEN(X) X##@len@
+/**end repeat1**/
+#endif
+
+
+NPY_INPLACE uint8_t
+npy_popcount_parallel@c@(npy_@type@ a)
+{
+    a = a - ((a >> 1) & (npy_@type@) TO_BITS_LEN(MAGIC)[0]);
+    a = ((a & (npy_@type@) TO_BITS_LEN(MAGIC)[1])) + ((a >> 2) & (npy_@type@) TO_BITS_LEN(MAGIC)[1]);
+    a = (a + (a >> 4)) & (npy_@type@) TO_BITS_LEN(MAGIC)[2];
+    return (npy_@type@) (a * (npy_@type@) TO_BITS_LEN(MAGIC)[3]) >> ((NPY_SIZEOF_@STYPE@ - 1) * CHAR_BIT);
+}
+
+NPY_INPLACE uint8_t
+npy_popcountu@c@(npy_@type@ a)
+{
+/* use built-in popcount if present, else use our implementation */
+#if (defined(__clang__) || defined(__GNUC__)) && NPY_BITSOF_@STYPE@ >= 32
+    return __builtin_popcount@c@(a);
+#elif defined(_MSC_VER) && NPY_BITSOF_@STYPE@ >= 16
+    /* no builtin __popcnt64 for 32 bits */
+    #if defined(_WIN64) || (defined(_WIN32) && NPY_BITSOF_@STYPE@ != 64)
+        return TO_BITS_LEN(__popcnt)(a);
+    /* split 64 bit number into two 32 bit ints and return sum of counts */
+    #elif (defined(_WIN32) && NPY_BITSOF_@STYPE@ == 64)
+        npy_uint32 left  = (npy_uint32) (a>>32);
+        npy_uint32 right = (npy_uint32) a;
+        return __popcnt32(left) + __popcnt32(right);
+    #endif
+#else
+    return npy_popcount_parallel@c@(a);
+#endif
+}
+/**end repeat**/
+
+/**begin repeat
+ *
+ * #type = byte, short, int, long, longlong#
+ * #c    = hh,   h,     ,    l,    ll#
+ */
+NPY_INPLACE uint8_t
+npy_popcount@c@(npy_@type@ a)
+{
+    /* Return popcount of abs(a) */
+    return npy_popcountu@c@(a < 0 ? -a : a);
+}
+/**end repeat**/
diff --git a/numpy/core/src/npymath/npy_math_private.h b/numpy/core/src/npymath/npy_math_private.h
index 212d11a0b..7ca0c5ba0 100644
--- a/numpy/core/src/npymath/npy_math_private.h
+++ b/numpy/core/src/npymath/npy_math_private.h
@@ -19,7 +19,13 @@
 #define _NPY_MATH_PRIVATE_H_
 
 #include <Python.h>
+#ifdef __cplusplus
+#include <cmath>
+using std::isgreater;
+using std::isless;
+#else
 #include <math.h>
+#endif
 
 #include "npy_config.h"
 #include "npy_fpmath.h"
@@ -507,17 +513,29 @@ typedef union {
 #else /* !_MSC_VER */
 typedef union {
         npy_cdouble npy_z;
+#ifdef __cplusplus
+        std::complex<double> c99z;
+#else
         complex double c99_z;
+#endif
 } __npy_cdouble_to_c99_cast;
 
 typedef union {
         npy_cfloat npy_z;
+#ifdef __cplusplus
+        std::complex<float> c99z;
+#else
         complex float c99_z;
+#endif
 } __npy_cfloat_to_c99_cast;
 
 typedef union {
         npy_clongdouble npy_z;
+#ifdef __cplusplus
+        std::complex<long double> c99_z;
+#else
         complex long double c99_z;
+#endif
 } __npy_clongdouble_to_c99_cast;
 #endif /* !_MSC_VER */
 
diff --git a/numpy/core/src/npysort/radixsort.c.src b/numpy/core/src/npysort/radixsort.c.src
deleted file mode 100644
index 99d8ed42a..000000000
--- a/numpy/core/src/npysort/radixsort.c.src
+++ /dev/null
@@ -1,231 +0,0 @@
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#include "npy_sort.h"
-#include "npysort_common.h"
-#include <stdlib.h>
-
-/*
- *****************************************************************************
- **                            INTEGER SORTS                                **
- *****************************************************************************
- */
-
-
-/**begin repeat
- *
- * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG,
- *         LONGLONG, ULONGLONG#
- * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong,
- *         longlong, ulonglong#
- * #type = npy_ubyte, npy_ubyte, npy_ubyte, npy_ushort, npy_ushort, npy_uint,
- *         npy_uint, npy_ulong, npy_ulong, npy_ulonglong, npy_ulonglong#
- * #sign = 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0#
- * #floating = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0#
- */
-
-// Reference: https://github.com/eloj/radix-sorting#-key-derivation
-#if @sign@
-    // Floating-point is currently disabled.
-    // Floating-point tests succeed for double and float on macOS but not on Windows/Linux.
-    // Basic sorting tests succeed but others relying on sort fail.
-    // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure.
-    #if @floating@
-        // For floats, we invert the key if the sign bit is set, else we invert the sign bit.
-        #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(@type@) * 8 - 1)) | ((@type@)1 << (sizeof(@type@) * 8 - 1))))
-    #else
-        // For signed ints, we flip the sign bit so the negatives are below the positives.
-        #define KEY_OF(x) ((x) ^ ((@type@)1 << (sizeof(@type@) * 8 - 1)))
-    #endif
-#else
-    // For unsigned ints, the key is as-is
-    #define KEY_OF(x) (x)
-#endif
-
-static inline npy_ubyte
-nth_byte_@suff@(@type@ key, npy_intp l) {
-    return (key >> (l << 3)) & 0xFF;
-}
-
-static @type@*
-radixsort0_@suff@(@type@ *arr, @type@ *aux, npy_intp num)
-{
-    npy_intp cnt[sizeof(@type@)][1 << 8] = { { 0 } };
-    npy_intp i;
-    size_t l;
-    @type@ key0 = KEY_OF(arr[0]);
-    size_t ncols = 0;
-    npy_ubyte cols[sizeof(@type@)];
-
-    for (i = 0; i < num; i++) {
-        @type@ k = KEY_OF(arr[i]);
-
-        for (l = 0; l < sizeof(@type@); l++) {
-            cnt[l][nth_byte_@suff@(k, l)]++;
-        }
-    }
-
-    for (l = 0; l < sizeof(@type@); l++) {
-	    if (cnt[l][nth_byte_@suff@(key0, l)] != num) {
-	        cols[ncols++] = l;
-        }
-    }
-
-    for (l = 0; l < ncols; l++) {
-        npy_intp a = 0;
-        for (i = 0; i < 256; i++) {
-            npy_intp b = cnt[cols[l]][i];
-            cnt[cols[l]][i] = a;
-            a += b;
-        }
-    }
-
-    for (l = 0; l < ncols; l++) {
-        @type@* temp;
-        for (i = 0; i < num; i++) {
-            @type@ k = KEY_OF(arr[i]);
-            npy_intp dst = cnt[cols[l]][nth_byte_@suff@(k, cols[l])]++;
-            aux[dst] = arr[i];
-        }
-
-        temp = aux;
-        aux = arr;
-        arr = temp;
-    }
-
-    return arr;
-}
-
-NPY_NO_EXPORT int
-radixsort_@suff@(void *start, npy_intp num, void *NPY_UNUSED(varr))
-{
-    void *sorted;
-    @type@ *aux;
-    @type@ *arr = start;
-    @type@ k1, k2;
-    npy_bool all_sorted = 1;
-
-    if (num < 2) {
-        return 0;
-    }
-
-    k1 = KEY_OF(arr[0]);
-    for (npy_intp i = 1; i < num; i++) {
-        k2 = KEY_OF(arr[i]);
-        if (k1 > k2) {
-            all_sorted = 0;
-            break;
-        }
-        k1 = k2;
-    }
-
-    if (all_sorted) {
-        return 0;
-    }
-
-    aux = malloc(num * sizeof(@type@));
-    if (aux == NULL) {
-        return -NPY_ENOMEM;
-    }
-
-    sorted = radixsort0_@suff@(start, aux, num);
-    if (sorted != start) {
-        memcpy(start, sorted, num * sizeof(@type@));
-    }
-
-    free(aux);
-    return 0;
-}
-
-static npy_intp*
-aradixsort0_@suff@(@type@ *arr, npy_intp *aux, npy_intp *tosort, npy_intp num)
-{
-    npy_intp cnt[sizeof(@type@)][1 << 8] = { { 0 } };
-    npy_intp i;
-    size_t l;
-    @type@ key0 = KEY_OF(arr[0]);
-    size_t ncols = 0;
-    npy_ubyte cols[sizeof(@type@)];
-
-    for (i = 0; i < num; i++) {
-        @type@ k = KEY_OF(arr[i]);
-
-        for (l = 0; l < sizeof(@type@); l++) {
-            cnt[l][nth_byte_@suff@(k, l)]++;
-        }
-    }
-
-    for (l = 0; l < sizeof(@type@); l++) {
-        if (cnt[l][nth_byte_@suff@(key0, l)] != num) {
-            cols[ncols++] = l;
-        }
-    }
-
-    for (l = 0; l < ncols; l++) {
-        npy_intp a = 0;
-        for (i = 0; i < 256; i++) {
-            npy_intp b = cnt[cols[l]][i];
-            cnt[cols[l]][i] = a;
-            a += b;
-        }
-    }
-
-    for (l = 0; l < ncols; l++) {
-        npy_intp* temp;
-        for (i = 0; i < num; i++) {
-            @type@ k = KEY_OF(arr[tosort[i]]);
-            npy_intp dst = cnt[cols[l]][nth_byte_@suff@(k, cols[l])]++;
-            aux[dst] = tosort[i];
-        }
-
-        temp = aux;
-        aux = tosort;
-        tosort = temp;
-    }
-
-    return tosort;
-}
-
-NPY_NO_EXPORT int
-aradixsort_@suff@(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr))
-{
-    npy_intp *sorted;
-    npy_intp *aux;
-    @type@ *arr = start;
-    @type@ k1, k2;
-    npy_bool all_sorted = 1;
-
-    if (num < 2) {
-        return 0;
-    }
-
-    k1 = KEY_OF(arr[tosort[0]]);
-    for (npy_intp i = 1; i < num; i++) {
-        k2 = KEY_OF(arr[tosort[i]]);
-        if (k1 > k2) {
-            all_sorted = 0;
-            break;
-        }
-        k1 = k2;
-    }
-
-    if (all_sorted) {
-        return 0;
-    }
-
-    aux = malloc(num * sizeof(npy_intp));
-    if (aux == NULL) {
-        return -NPY_ENOMEM;
-    }
-
-    sorted = aradixsort0_@suff@(start, aux, tosort, num);
-    if (sorted != tosort) {
-        memcpy(tosort, sorted, num * sizeof(npy_intp));
-    }
-
-    free(aux);
-    return 0;
-}
-
-#undef KEY_OF
-
-/**end repeat**/
diff --git a/numpy/core/src/npysort/radixsort.cpp b/numpy/core/src/npysort/radixsort.cpp
new file mode 100644
index 000000000..017ea43b6
--- /dev/null
+++ b/numpy/core/src/npysort/radixsort.cpp
@@ -0,0 +1,354 @@
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "npy_sort.h"
+#include "npysort_common.h"
+
+#include "../common/numpy_tag.h"
+#include <stdlib.h>
+#include <type_traits>
+
+/*
+ *****************************************************************************
+ **                            INTEGER SORTS                                **
+ *****************************************************************************
+ */
+
+// Reference: https://github.com/eloj/radix-sorting#-key-derivation
+template <class T>
+T
+KEY_OF(T x)
+{
+    // Floating-point is currently disabled.
+    // Floating-point tests succeed for double and float on macOS but not on
+    // Windows/Linux. Basic sorting tests succeed but others relying on sort
+    // fail. Possibly related to floating-point normalisation or multiple NaN
+    // reprs? Not sure.
+    if (std::is_floating_point<T>::value) {
+        // For floats, we invert the key if the sign bit is set, else we invert
+        // the sign bit.
+        return ((x) ^ (-((x) >> (sizeof(T) * 8 - 1)) |
+                       ((T)1 << (sizeof(T) * 8 - 1))));
+    }
+    else if (std::is_signed<T>::value) {
+        // For signed ints, we flip the sign bit so the negatives are below the
+        // positives.
+        return ((x) ^ ((T)1 << (sizeof(T) * 8 - 1)));
+    }
+    else {
+        return x;
+    }
+}
+
+template <class T>
+static inline npy_ubyte
+nth_byte(T key, npy_intp l)
+{
+    return (key >> (l << 3)) & 0xFF;
+}
+
+template <class T>
+static T *
+radixsort0(T *start, T *aux, npy_intp num)
+{
+    npy_intp cnt[sizeof(T)][1 << 8] = {{0}};
+    T key0 = KEY_OF(start[0]);
+
+    for (npy_intp i = 0; i < num; i++) {
+        T k = KEY_OF(start[i]);
+
+        for (size_t l = 0; l < sizeof(T); l++) {
+            cnt[l][nth_byte(k, l)]++;
+        }
+    }
+
+    size_t ncols = 0;
+    npy_ubyte cols[sizeof(T)];
+    for (size_t l = 0; l < sizeof(T); l++) {
+        if (cnt[l][nth_byte(key0, l)] != num) {
+            cols[ncols++] = l;
+        }
+    }
+
+    for (size_t l = 0; l < ncols; l++) {
+        npy_intp a = 0;
+        for (npy_intp i = 0; i < 256; i++) {
+            npy_intp b = cnt[cols[l]][i];
+            cnt[cols[l]][i] = a;
+            a += b;
+        }
+    }
+
+    for (size_t l = 0; l < ncols; l++) {
+        T *temp;
+        for (npy_intp i = 0; i < num; i++) {
+            T k = KEY_OF(start[i]);
+            npy_intp dst = cnt[cols[l]][nth_byte(k, cols[l])]++;
+            aux[dst] = start[i];
+        }
+
+        temp = aux;
+        aux = start;
+        start = temp;
+    }
+
+    return start;
+}
+
+template <class T>
+static int
+radixsort_(T *start, npy_intp num)
+{
+    if (num < 2) {
+        return 0;
+    }
+
+    npy_bool all_sorted = 1;
+    T k1 = KEY_OF(start[0]), k2;
+    for (npy_intp i = 1; i < num; i++) {
+        k2 = KEY_OF(start[i]);
+        if (k1 > k2) {
+            all_sorted = 0;
+            break;
+        }
+        k1 = k2;
+    }
+
+    if (all_sorted) {
+        return 0;
+    }
+
+    T *aux = (T *)malloc(num * sizeof(T));
+    if (aux == nullptr) {
+        return -NPY_ENOMEM;
+    }
+
+    T *sorted = radixsort0(start, aux, num);
+    if (sorted != start) {
+        memcpy(start, sorted, num * sizeof(T));
+    }
+
+    free(aux);
+    return 0;
+}
+
+template <class T>
+static int
+radixsort(void *start, npy_intp num)
+{
+    return radixsort_((T *)start, num);
+}
+
+template <class T>
+static npy_intp *
+aradixsort0(T *start, npy_intp *aux, npy_intp *tosort, npy_intp num)
+{
+    npy_intp cnt[sizeof(T)][1 << 8] = {{0}};
+    T key0 = KEY_OF(start[0]);
+
+    for (npy_intp i = 0; i < num; i++) {
+        T k = KEY_OF(start[i]);
+
+        for (size_t l = 0; l < sizeof(T); l++) {
+            cnt[l][nth_byte(k, l)]++;
+        }
+    }
+
+    size_t ncols = 0;
+    npy_ubyte cols[sizeof(T)];
+    for (size_t l = 0; l < sizeof(T); l++) {
+        if (cnt[l][nth_byte(key0, l)] != num) {
+            cols[ncols++] = l;
+        }
+    }
+
+    for (size_t l = 0; l < ncols; l++) {
+        npy_intp a = 0;
+        for (npy_intp i = 0; i < 256; i++) {
+            npy_intp b = cnt[cols[l]][i];
+            cnt[cols[l]][i] = a;
+            a += b;
+        }
+    }
+
+    for (size_t l = 0; l < ncols; l++) {
+        npy_intp *temp;
+        for (npy_intp i = 0; i < num; i++) {
+            T k = KEY_OF(start[tosort[i]]);
+            npy_intp dst = cnt[cols[l]][nth_byte(k, cols[l])]++;
+            aux[dst] = tosort[i];
+        }
+
+        temp = aux;
+        aux = tosort;
+        tosort = temp;
+    }
+
+    return tosort;
+}
+
+template <class T>
+static int
+aradixsort_(T *start, npy_intp *tosort, npy_intp num)
+{
+    npy_intp *sorted;
+    npy_intp *aux;
+    T k1, k2;
+    npy_bool all_sorted = 1;
+
+    if (num < 2) {
+        return 0;
+    }
+
+    k1 = KEY_OF(start[tosort[0]]);
+    for (npy_intp i = 1; i < num; i++) {
+        k2 = KEY_OF(start[tosort[i]]);
+        if (k1 > k2) {
+            all_sorted = 0;
+            break;
+        }
+        k1 = k2;
+    }
+
+    if (all_sorted) {
+        return 0;
+    }
+
+    aux = (npy_intp *)malloc(num * sizeof(npy_intp));
+    if (aux == NULL) {
+        return -NPY_ENOMEM;
+    }
+
+    sorted = aradixsort0(start, aux, tosort, num);
+    if (sorted != tosort) {
+        memcpy(tosort, sorted, num * sizeof(npy_intp));
+    }
+
+    free(aux);
+    return 0;
+}
+
+template <class T>
+static int
+aradixsort(void *start, npy_intp *tosort, npy_intp num)
+{
+    return aradixsort_((T *)start, tosort, num);
+}
+
+extern "C" {
+NPY_NO_EXPORT int
+radixsort_bool(void *vec, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return radixsort<npy_bool>(vec, cnt);
+}
+NPY_NO_EXPORT int
+radixsort_byte(void *vec, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return radixsort<npy_byte>(vec, cnt);
+}
+NPY_NO_EXPORT int
+radixsort_ubyte(void *vec, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return radixsort<npy_ubyte>(vec, cnt);
+}
+NPY_NO_EXPORT int
+radixsort_short(void *vec, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return radixsort<npy_short>(vec, cnt);
+}
+NPY_NO_EXPORT int
+radixsort_ushort(void *vec, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return radixsort<npy_ushort>(vec, cnt);
+}
+NPY_NO_EXPORT int
+radixsort_int(void *vec, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return radixsort<npy_int>(vec, cnt);
+}
+NPY_NO_EXPORT int
+radixsort_uint(void *vec, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return radixsort<npy_uint>(vec, cnt);
+}
+NPY_NO_EXPORT int
+radixsort_long(void *vec, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return radixsort<npy_long>(vec, cnt);
+}
+NPY_NO_EXPORT int
+radixsort_ulong(void *vec, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return radixsort<npy_ulong>(vec, cnt);
+}
+NPY_NO_EXPORT int
+radixsort_longlong(void *vec, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return radixsort<npy_longlong>(vec, cnt);
+}
+NPY_NO_EXPORT int
+radixsort_ulonglong(void *vec, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return radixsort<npy_ulonglong>(vec, cnt);
+}
+NPY_NO_EXPORT int
+aradixsort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return aradixsort<npy_bool>(vec, ind, cnt);
+}
+NPY_NO_EXPORT int
+aradixsort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return aradixsort<npy_byte>(vec, ind, cnt);
+}
+NPY_NO_EXPORT int
+aradixsort_ubyte(void *vec, npy_intp *ind, npy_intp cnt,
+                 void *NPY_UNUSED(null))
+{
+    return aradixsort<npy_ubyte>(vec, ind, cnt);
+}
+NPY_NO_EXPORT int
+aradixsort_short(void *vec, npy_intp *ind, npy_intp cnt,
+                 void *NPY_UNUSED(null))
+{
+    return aradixsort<npy_short>(vec, ind, cnt);
+}
+NPY_NO_EXPORT int
+aradixsort_ushort(void *vec, npy_intp *ind, npy_intp cnt,
+                  void *NPY_UNUSED(null))
+{
+    return aradixsort<npy_ushort>(vec, ind, cnt);
+}
+NPY_NO_EXPORT int
+aradixsort_int(void *vec, npy_intp *ind, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return aradixsort<npy_int>(vec, ind, cnt);
+}
+NPY_NO_EXPORT int
+aradixsort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return aradixsort<npy_uint>(vec, ind, cnt);
+}
+NPY_NO_EXPORT int
+aradixsort_long(void *vec, npy_intp *ind, npy_intp cnt, void *NPY_UNUSED(null))
+{
+    return aradixsort<npy_long>(vec, ind, cnt);
+}
+NPY_NO_EXPORT int
+aradixsort_ulong(void *vec, npy_intp *ind, npy_intp cnt,
+                 void *NPY_UNUSED(null))
+{
+    return aradixsort<npy_ulong>(vec, ind, cnt);
+}
+NPY_NO_EXPORT int
+aradixsort_longlong(void *vec, npy_intp *ind, npy_intp cnt,
+                    void *NPY_UNUSED(null))
+{
+    return aradixsort<npy_longlong>(vec, ind, cnt);
+}
+NPY_NO_EXPORT int
+aradixsort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt,
+                     void *NPY_UNUSED(null))
+{
+    return aradixsort<npy_ulonglong>(vec, ind, cnt);
+}
+}
diff --git a/numpy/core/src/umath/_scaled_float_dtype.c b/numpy/core/src/umath/_scaled_float_dtype.c
index eeef33a3d..b6c19362a 100644
--- a/numpy/core/src/umath/_scaled_float_dtype.c
+++ b/numpy/core/src/umath/_scaled_float_dtype.c
@@ -398,6 +398,42 @@ float_to_from_sfloat_resolve_descriptors(
 }
 
 
+/*
+ * Cast to boolean (for testing the logical functions a bit better).
+ */
+static int
+cast_sfloat_to_bool(PyArrayMethod_Context *NPY_UNUSED(context),
+        char *const data[], npy_intp const dimensions[],
+        npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *in = data[0];
+    char *out = data[1];
+    for (npy_intp i = 0; i < N; i++) {
+        *(npy_bool *)out = *(double *)in != 0;
+        in += strides[0];
+        out += strides[1];
+    }
+    return 0;
+}
+
+static NPY_CASTING
+sfloat_to_bool_resolve_descriptors(
+        PyArrayMethodObject *NPY_UNUSED(self),
+        PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]),
+        PyArray_Descr *given_descrs[2],
+        PyArray_Descr *loop_descrs[2])
+{
+    Py_INCREF(given_descrs[0]);
+    loop_descrs[0] = given_descrs[0];
+    if (loop_descrs[0] == NULL) {
+        return -1;
+    }
+    loop_descrs[1] = PyArray_DescrFromType(NPY_BOOL);  /* cannot fail */
+    return NPY_UNSAFE_CASTING;
+}
+
+
 static int
 init_casts(void)
 {
@@ -453,6 +489,22 @@ init_casts(void)
         return -1;
     }
 
+    slots[0].slot = NPY_METH_resolve_descriptors;
+    slots[0].pfunc = &sfloat_to_bool_resolve_descriptors;
+    slots[1].slot = NPY_METH_strided_loop;
+    slots[1].pfunc = &cast_sfloat_to_bool;
+    slots[2].slot = 0;
+    slots[2].pfunc = NULL;
+
+    spec.name = "sfloat_to_bool_cast";
+    dtypes[0] = &PyArray_SFloatDType;
+    dtypes[1] = PyArray_DTypeFromTypeNum(NPY_BOOL);
+    Py_DECREF(dtypes[1]);  /* immortal anyway */
+
+    if (PyArray_AddCastingImplementation_FromSpec(&spec, 0)) {
+        return -1;
+    }
+
     return 0;
 }
 
diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src
index 33d8539d5..ce42fc271 100644
--- a/numpy/core/src/umath/_umath_tests.c.src
+++ b/numpy/core/src/umath/_umath_tests.c.src
@@ -400,6 +400,16 @@ addUfuncs(PyObject *dictionary) {
     }
     PyDict_SetItemString(dictionary, "always_error", f);
     Py_DECREF(f);
+    f = PyUFunc_FromFuncAndDataAndSignature(always_error_functions,
+            always_error_data, always_error_signatures, 1, 2, 1, PyUFunc_None,
+            "always_error_gufunc",
+            "simply, broken, gufunc that sets an error (but releases the GIL).",
+            0, "(i),()->()");
+    if (f == NULL) {
+        return -1;
+    }
+    PyDict_SetItemString(dictionary, "always_error_gufunc", f);
+    Py_DECREF(f);
     f = PyUFunc_FromFuncAndDataAndSignature(inner1d_functions, inner1d_data,
                     inner1d_signatures, 2, 2, 1, PyUFunc_None, "inner1d",
                     "inner on the last dimension and broadcast on the rest \n"
diff --git a/numpy/core/src/umath/clip.c.src b/numpy/core/src/umath/clip.c.src
deleted file mode 100644
index bc966b7ac..000000000
--- a/numpy/core/src/umath/clip.c.src
+++ /dev/null
@@ -1,120 +0,0 @@
-/**
- * This module provides the inner loops for the clip ufunc
- */
-#define PY_SSIZE_T_CLEAN
-#include <Python.h>
-
-#define _UMATHMODULE
-#define _MULTIARRAYMODULE
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#include "numpy/halffloat.h"
-#include "numpy/npy_math.h"
-#include "numpy/ndarraytypes.h"
-#include "numpy/npy_common.h"
-#include "numpy/utils.h"
-#include "fast_loop_macros.h"
-
-/*
- * Produce macros that perform nan/nat-propagating min and max
- */
-
-/**begin repeat
- * #name = BOOL,
- *         BYTE, UBYTE, SHORT, USHORT, INT, UINT,
- *         LONG, ULONG, LONGLONG, ULONGLONG#
- */
-#define _NPY_@name@_MIN(a, b) PyArray_MIN(a, b)
-#define _NPY_@name@_MAX(a, b) PyArray_MAX(a, b)
-/**end repeat**/
-
-#define _NPY_HALF_MIN(a, b) (npy_half_isnan(a) || npy_half_le(a, b) ? (a) : (b))
-#define _NPY_HALF_MAX(a, b) (npy_half_isnan(a) || npy_half_ge(a, b) ? (a) : (b))
-
-/**begin repeat
- * #name = FLOAT, DOUBLE, LONGDOUBLE#
- */
-#define _NPY_@name@_MIN(a, b) (npy_isnan(a) ? (a) : PyArray_MIN(a, b))
-#define _NPY_@name@_MAX(a, b) (npy_isnan(a) ? (a) : PyArray_MAX(a, b))
-/**end repeat**/
-
-/**begin repeat
- * #name = CFLOAT, CDOUBLE, CLONGDOUBLE#
- */
-#define _NPY_@name@_MIN(a, b) (npy_isnan((a).real) || npy_isnan((a).imag) || PyArray_CLT(a, b) ? (a) : (b))
-#define _NPY_@name@_MAX(a, b) (npy_isnan((a).real) || npy_isnan((a).imag) || PyArray_CGT(a, b) ? (a) : (b))
-/**end repeat**/
-
-/**begin repeat
- * #name = DATETIME, TIMEDELTA#
- */
-#define _NPY_@name@_MIN(a, b) ( \
-    (a) == NPY_DATETIME_NAT ? (a) : \
-    (b) == NPY_DATETIME_NAT ? (b) : \
-    (a) < (b) ? (a) : (b) \
-)
-#define _NPY_@name@_MAX(a, b) ( \
-    (a) == NPY_DATETIME_NAT ? (a) : \
-    (b) == NPY_DATETIME_NAT ? (b) : \
-    (a) > (b) ? (a) : (b) \
-)
-/**end repeat**/
-
-/**begin repeat
- *
- * #name = BOOL,
- *         BYTE, UBYTE, SHORT, USHORT, INT, UINT,
- *         LONG, ULONG, LONGLONG, ULONGLONG,
- *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
- *         CFLOAT, CDOUBLE, CLONGDOUBLE,
- *         DATETIME, TIMEDELTA#
- * #type = npy_bool,
- *         npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint,
- *         npy_long, npy_ulong, npy_longlong, npy_ulonglong,
- *         npy_half, npy_float, npy_double, npy_longdouble,
- *         npy_cfloat, npy_cdouble, npy_clongdouble,
- *         npy_datetime, npy_timedelta#
- */
-
-#define _NPY_CLIP(x, min, max) \
-    _NPY_@name@_MIN(_NPY_@name@_MAX((x), (min)), (max))
-
-NPY_NO_EXPORT void
-@name@_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (steps[1] == 0 && steps[2] == 0) {
-        /* min and max are constant throughout the loop, the most common case */
-        /* NOTE: it may be possible to optimize these checks for nan */
-        @type@ min_val = *(@type@ *)args[1];
-        @type@ max_val = *(@type@ *)args[2];
-
-        char *ip1 = args[0], *op1 = args[3];
-        npy_intp is1 = steps[0], os1 = steps[3];
-        npy_intp n = dimensions[0];
-
-        /* contiguous, branch to let the compiler optimize */
-        if (is1 == sizeof(@type@) && os1 == sizeof(@type@)) {
-            for(npy_intp i = 0; i < n; i++, ip1 += is1, op1 += os1) {
-                *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, min_val, max_val);
-            }
-        }
-        else {
-            for(npy_intp i = 0; i < n; i++, ip1 += is1, op1 += os1) {
-                *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, min_val, max_val);
-            }
-        }
-    }
-    else {
-        TERNARY_LOOP {
-            *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, *(@type@ *)ip2, *(@type@ *)ip3);
-        }
-    }
-    npy_clear_floatstatus_barrier((char*)dimensions);
-}
-
-// clean up the macros we defined above
-#undef _NPY_CLIP
-#undef _NPY_@name@_MAX
-#undef _NPY_@name@_MIN
-
-/**end repeat**/
diff --git a/numpy/core/src/umath/clip.cpp b/numpy/core/src/umath/clip.cpp
new file mode 100644
index 000000000..19d05c848
--- /dev/null
+++ b/numpy/core/src/umath/clip.cpp
@@ -0,0 +1,282 @@
+/**
+ * This module provides the inner loops for the clip ufunc
+ */
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#include "numpy/halffloat.h"
+#include "numpy/ndarraytypes.h"
+#include "numpy/npy_common.h"
+#include "numpy/npy_math.h"
+#include "numpy/utils.h"
+
+#include "fast_loop_macros.h"
+
+#include "../common/numpy_tag.h"
+
+template <class T>
+T
+_NPY_MIN(T a, T b, npy::integral_tag const &)
+{
+    return PyArray_MIN(a, b);
+}
+template <class T>
+T
+_NPY_MAX(T a, T b, npy::integral_tag const &)
+{
+    return PyArray_MAX(a, b);
+}
+
+npy_half
+_NPY_MIN(npy_half a, npy_half b, npy::half_tag const &)
+{
+    return npy_half_isnan(a) || npy_half_le(a, b) ? (a) : (b);
+}
+npy_half
+_NPY_MAX(npy_half a, npy_half b, npy::half_tag const &)
+{
+    return npy_half_isnan(a) || npy_half_ge(a, b) ? (a) : (b);
+}
+
+template <class T>
+T
+_NPY_MIN(T a, T b, npy::floating_point_tag const &)
+{
+    return npy_isnan(a) ? (a) : PyArray_MIN(a, b);
+}
+template <class T>
+T
+_NPY_MAX(T a, T b, npy::floating_point_tag const &)
+{
+    return npy_isnan(a) ? (a) : PyArray_MAX(a, b);
+}
+
+template <class T>
+T
+_NPY_MIN(T a, T b, npy::complex_tag const &)
+{
+    return npy_isnan((a).real) || npy_isnan((a).imag) || PyArray_CLT(a, b)
+                   ? (a)
+                   : (b);
+}
+template <class T>
+T
+_NPY_MAX(T a, T b, npy::complex_tag const &)
+{
+    return npy_isnan((a).real) || npy_isnan((a).imag) || PyArray_CGT(a, b)
+                   ? (a)
+                   : (b);
+}
+
+template <class T>
+T
+_NPY_MIN(T a, T b, npy::date_tag const &)
+{
+    return (a) == NPY_DATETIME_NAT   ? (a)
+           : (b) == NPY_DATETIME_NAT ? (b)
+           : (a) < (b)               ? (a)
+                                     : (b);
+}
+template <class T>
+T
+_NPY_MAX(T a, T b, npy::date_tag const &)
+{
+    return (a) == NPY_DATETIME_NAT   ? (a)
+           : (b) == NPY_DATETIME_NAT ? (b)
+           : (a) > (b)               ? (a)
+                                     : (b);
+}
+
+/* generic dispatcher */
+template <class Tag, class T = typename Tag::type>
+T
+_NPY_MIN(T const &a, T const &b)
+{
+    return _NPY_MIN(a, b, Tag{});
+}
+template <class Tag, class T = typename Tag::type>
+T
+_NPY_MAX(T const &a, T const &b)
+{
+    return _NPY_MAX(a, b, Tag{});
+}
+
+template <class Tag, class T>
+T
+_NPY_CLIP(T x, T min, T max)
+{
+    return _NPY_MIN<Tag>(_NPY_MAX<Tag>((x), (min)), (max));
+}
+
+template <class Tag, class T = typename Tag::type>
+static void
+_npy_clip_(T **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+    npy_intp n = dimensions[0];
+    if (steps[1] == 0 && steps[2] == 0) {
+        /* min and max are constant throughout the loop, the most common case
+         */
+        /* NOTE: it may be possible to optimize these checks for nan */
+        T min_val = *args[1];
+        T max_val = *args[2];
+
+        T *ip1 = args[0], *op1 = args[3];
+        npy_intp is1 = steps[0] / sizeof(T), os1 = steps[3] / sizeof(T);
+
+        /* contiguous, branch to let the compiler optimize */
+        if (is1 == 1 && os1 == 1) {
+            for (npy_intp i = 0; i < n; i++, ip1++, op1++) {
+                *op1 = _NPY_CLIP<Tag>(*ip1, min_val, max_val);
+            }
+        }
+        else {
+            for (npy_intp i = 0; i < n; i++, ip1 += is1, op1 += os1) {
+                *op1 = _NPY_CLIP<Tag>(*ip1, min_val, max_val);
+            }
+        }
+    }
+    else {
+        T *ip1 = args[0], *ip2 = args[1], *ip3 = args[2], *op1 = args[3];
+        npy_intp is1 = steps[0] / sizeof(T), is2 = steps[1] / sizeof(T),
+                 is3 = steps[2] / sizeof(T), os1 = steps[3] / sizeof(T);
+        for (npy_intp i = 0; i < n;
+             i++, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1)
+            *op1 = _NPY_CLIP<Tag>(*ip1, *ip2, *ip3);
+    }
+    npy_clear_floatstatus_barrier((char *)dimensions);
+}
+
+template <class Tag>
+static void
+_npy_clip(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+    using T = typename Tag::type;
+    return _npy_clip_<Tag>((T **)args, dimensions, steps);
+}
+
+extern "C" {
+NPY_NO_EXPORT void
+BOOL_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+          void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::bool_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+BYTE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+          void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::byte_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+UBYTE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+           void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::ubyte_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+SHORT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+           void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::short_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+USHORT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+            void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::ushort_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+INT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::int_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+UINT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+          void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::uint_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+LONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+          void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::long_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+ULONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+           void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::ulong_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+LONGLONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+              void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::longlong_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+ULONGLONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+               void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::ulonglong_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+HALF_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+          void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::half_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+FLOAT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+           void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::float_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+DOUBLE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+            void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::double_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+LONGDOUBLE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::longdouble_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+CFLOAT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+            void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::cfloat_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+CDOUBLE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+             void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::cdouble_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+CLONGDOUBLE_clip(char **args, npy_intp const *dimensions,
+                 npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::clongdouble_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+DATETIME_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+              void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::datetime_tag>(args, dimensions, steps);
+}
+NPY_NO_EXPORT void
+TIMEDELTA_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+               void *NPY_UNUSED(func))
+{
+    return _npy_clip<npy::timedelta_tag>(args, dimensions, steps);
+}
+}
diff --git a/numpy/core/src/umath/clip.h b/numpy/core/src/umath/clip.h
new file mode 100644
index 000000000..f69ebd1e3
--- /dev/null
+++ b/numpy/core/src/umath/clip.h
@@ -0,0 +1,73 @@
+#ifndef _NPY_UMATH_CLIP_H_
+#define _NPY_UMATH_CLIP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+NPY_NO_EXPORT void
+BOOL_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+          void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+BYTE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+          void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+UBYTE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+           void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+SHORT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+           void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+USHORT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+            void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+INT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+UINT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+          void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+          void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+ULONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+           void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+              void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+ULONGLONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+               void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+HALF_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+          void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+FLOAT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+           void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+DOUBLE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+            void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGDOUBLE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+CFLOAT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+            void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+CDOUBLE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+             void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+CLONGDOUBLE_clip(char **args, npy_intp const *dimensions,
+                 npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+DATETIME_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+              void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+TIMEDELTA_clip(char **args, npy_intp const *dimensions, npy_intp const *steps,
+               void *NPY_UNUSED(func));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/numpy/core/src/umath/clip.h.src b/numpy/core/src/umath/clip.h.src
deleted file mode 100644
index f16856cdf..000000000
--- a/numpy/core/src/umath/clip.h.src
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef _NPY_UMATH_CLIP_H_
-#define _NPY_UMATH_CLIP_H_
-
-
-/**begin repeat
- *
- * #name = BOOL,
- *         BYTE, UBYTE, SHORT, USHORT, INT, UINT,
- *         LONG, ULONG, LONGLONG, ULONGLONG,
- *         HALF, FLOAT, DOUBLE, LONGDOUBLE,
- *         CFLOAT, CDOUBLE, CLONGDOUBLE,
- *         DATETIME, TIMEDELTA#
- */
-NPY_NO_EXPORT void
-@name@_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-/**end repeat**/
-
-#endif
diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c
index 40de28754..8e99c0420 100644
--- a/numpy/core/src/umath/dispatching.c
+++ b/numpy/core/src/umath/dispatching.c
@@ -193,6 +193,10 @@ resolve_implementation_info(PyUFuncObject *ufunc,
                 /* Unspecified out always matches (see below for inputs) */
                 continue;
             }
+            if (resolver_dtype == (PyArray_DTypeMeta *)Py_None) {
+                /* always matches */
+                continue;
+            }
             if (given_dtype == resolver_dtype) {
                 continue;
             }
@@ -267,8 +271,39 @@ resolve_implementation_info(PyUFuncObject *ufunc,
                  *       the subclass should be considered a better match
                  *       (subclasses are always more specific).
                  */
+                /* Whether this (normally output) dtype was specified at all */
+                if (op_dtypes[i] == NULL) {
+                    /*
+                     * When DType is completely unspecified, prefer abstract
+                     * over concrete, assuming it will resolve.
+                     * Furthermore, we cannot decide which abstract/None
+                     * is "better", only concrete ones which are subclasses
+                     * of Abstract ones are defined as worse.
+                     */
+                    npy_bool prev_is_concrete = NPY_FALSE;
+                    npy_bool new_is_concrete = NPY_FALSE;
+                    if ((prev_dtype != Py_None) &&
+                            !NPY_DT_is_abstract((PyArray_DTypeMeta *)prev_dtype)) {
+                        prev_is_concrete = NPY_TRUE;
+                    }
+                    if ((new_dtype != Py_None) &&
+                            !NPY_DT_is_abstract((PyArray_DTypeMeta *)new_dtype)) {
+                        new_is_concrete = NPY_TRUE;
+                    }
+                    if (prev_is_concrete == new_is_concrete) {
+                        best = -1;
+                    }
+                    else if (prev_is_concrete) {
+                        unambiguously_equally_good = 0;
+                        best = 1;
+                    }
+                    else {
+                        unambiguously_equally_good = 0;
+                        best = 0;
+                    }
+                }
                 /* If either is None, the other is strictly more specific */
-                if (prev_dtype == Py_None) {
+                else if (prev_dtype == Py_None) {
                     unambiguously_equally_good = 0;
                     best = 1;
                 }
@@ -289,13 +324,29 @@ resolve_implementation_info(PyUFuncObject *ufunc,
                      */
                     best = -1;
                 }
+                else if (!NPY_DT_is_abstract((PyArray_DTypeMeta *)prev_dtype)) {
+                    /* old is not abstract, so better (both not possible) */
+                    unambiguously_equally_good = 0;
+                    best = 0;
+                }
+                else if (!NPY_DT_is_abstract((PyArray_DTypeMeta *)new_dtype)) {
+                    /* new is not abstract, so better (both not possible) */
+                    unambiguously_equally_good = 0;
+                    best = 1;
+                }
                 /*
-                 * TODO: Unreachable, but we will need logic for abstract
-                 *       DTypes to decide if one is a subclass of the other
-                 *       (And their subclass relation is well defined.)
+                 * TODO: This will need logic for abstract DTypes to decide if
+                 *       one is a subclass of the other (And their subclass
+                 *       relation is well defined).  For now, we bail out
+                 *       in cas someone manages to get here.
                  */
                 else {
-                    assert(0);
+                    PyErr_SetString(PyExc_NotImplementedError,
+                            "deciding which one of two abstract dtypes is "
+                            "a better match is not yet implemented.  This "
+                            "will pick the better (or bail) in the future.");
+                    *out_info = NULL;
+                    return -1;
                 }
 
                 if ((current_best != -1) && (current_best != best)) {
@@ -612,6 +663,35 @@ promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc,
             }
             return info;
         }
+        else if (info == NULL && op_dtypes[0] == NULL) {
+            /*
+             * If we have a reduction, fill in the unspecified input/array
+             * assuming it should have the same dtype as the operand input
+             * (or the output one if given).
+             * Then, try again.  In some cases, this will choose different
+             * paths, such as `ll->?` instead of an `??->?` loop for `np.equal`
+             * when the input is `.l->.` (`.` meaning undefined).  This will
+             * then cause an error.  But cast to `?` would always lose
+             * information, and in many cases important information:
+             *
+             * ```python
+             * from operator import eq
+             * from functools import reduce
+             *
+             * reduce(eq, [1, 2, 3]) != reduce(eq, [True, True, True])
+             * ```
+             *
+             * The special cases being `logical_(and|or|xor)` which can always
+             * cast to boolean ahead of time and still give the right answer
+             * (unsafe cast to bool is fine here). We special case these at
+             * the time of this comment (NumPy 1.21).
+             */
+            assert(ufunc->nin == 2 && ufunc->nout == 1);
+            op_dtypes[0] = op_dtypes[2] != NULL ? op_dtypes[2] : op_dtypes[1];
+            Py_INCREF(op_dtypes[0]);
+            return promote_and_get_info_and_ufuncimpl(ufunc,
+                    ops, signature, op_dtypes, allow_legacy_promotion, 1);
+        }
     }
 
     /*
@@ -743,3 +823,94 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
 
     return method;
 }
+
+
+/*
+ * Special promoter for the logical ufuncs.  The logical ufuncs can always
+ * use the ??->? and still get the correct output (as long as the output
+ * is not supposed to be `object`).
+ */
+static int
+logical_ufunc_promoter(PyUFuncObject *NPY_UNUSED(ufunc),
+        PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[],
+        PyArray_DTypeMeta *new_op_dtypes[])
+{
+    /*
+     * If we find any object DType at all, we currently force to object.
+     * However, if the output is specified and not object, there is no point,
+     * it should be just as well to cast the input rather than doing the
+     * unsafe out cast.
+     */
+    int force_object = 0;
+
+    for (int i = 0; i < 3; i++) {
+        PyArray_DTypeMeta *item;
+        if (signature[i] != NULL) {
+            item = signature[i];
+            Py_INCREF(item);
+            if (item->type_num == NPY_OBJECT) {
+                force_object = 1;
+            }
+        }
+        else {
+            /* Always override to boolean */
+            item = PyArray_DTypeFromTypeNum(NPY_BOOL);
+            if (op_dtypes[i] != NULL && op_dtypes[i]->type_num == NPY_OBJECT) {
+                force_object = 1;
+            }
+        }
+        new_op_dtypes[i] = item;
+    }
+
+    if (!force_object || (op_dtypes[2] != NULL
+                          && op_dtypes[2]->type_num != NPY_OBJECT)) {
+        return 0;
+    }
+    /*
+     * Actually, we have to use the OBJECT loop after all, set all we can
+     * to object (that might not work out, but try).
+     *
+     * NOTE: Change this to check for `op_dtypes[0] == NULL` to STOP
+     *       returning `object` for `np.logical_and.reduce(obj_arr)`
+     *       which will also affect `np.all` and `np.any`!
+     */
+    for (int i = 0; i < 3; i++) {
+        if (signature[i] != NULL) {
+            continue;
+        }
+        Py_SETREF(new_op_dtypes[i], PyArray_DTypeFromTypeNum(NPY_OBJECT));
+    }
+    return 0;
+}
+
+
+NPY_NO_EXPORT int
+install_logical_ufunc_promoter(PyObject *ufunc)
+{
+    if (PyObject_Type(ufunc) != (PyObject *)&PyUFunc_Type) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "internal numpy array, logical ufunc was not a ufunc?!");
+        return -1;
+    }
+    PyObject *dtype_tuple = PyTuple_Pack(3,
+            &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArrayDescr_Type, NULL);
+    if (dtype_tuple == NULL) {
+        return -1;
+    }
+    PyObject *promoter = PyCapsule_New(&logical_ufunc_promoter,
+            "numpy._ufunc_promoter", NULL);
+    if (promoter == NULL) {
+        Py_DECREF(dtype_tuple);
+        return -1;
+    }
+
+    PyObject *info = PyTuple_Pack(2, dtype_tuple, promoter);
+    Py_DECREF(dtype_tuple);
+    Py_DECREF(promoter);
+    if (info == NULL) {
+        return -1;
+    }
+
+    return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0);
+}
+
diff --git a/numpy/core/src/umath/dispatching.h b/numpy/core/src/umath/dispatching.h
index 8d116873c..2f314615d 100644
--- a/numpy/core/src/umath/dispatching.h
+++ b/numpy/core/src/umath/dispatching.h
@@ -26,4 +26,8 @@ NPY_NO_EXPORT PyObject *
 add_and_return_legacy_wrapping_ufunc_loop(PyUFuncObject *ufunc,
         PyArray_DTypeMeta *operation_dtypes[], int ignore_duplicate);
 
+NPY_NO_EXPORT int
+install_logical_ufunc_promoter(PyObject *ufunc);
+
+
 #endif  /*_NPY_DISPATCHING_H */
diff --git a/numpy/core/src/umath/legacy_array_method.c b/numpy/core/src/umath/legacy_array_method.c
index 77b1b9013..a423823d4 100644
--- a/numpy/core/src/umath/legacy_array_method.c
+++ b/numpy/core/src/umath/legacy_array_method.c
@@ -217,6 +217,25 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc,
      */
     int any_output_flexible = 0;
     NPY_ARRAYMETHOD_FLAGS flags = 0;
+    if (ufunc->nargs == 3 &&
+            signature[0]->type_num == NPY_BOOL &&
+            signature[1]->type_num == NPY_BOOL &&
+            signature[2]->type_num == NPY_BOOL && (
+                strcmp(ufunc->name, "logical_or") == 0 ||
+                strcmp(ufunc->name, "logical_and") == 0 ||
+                strcmp(ufunc->name, "logical_xor") == 0)) {
+        /*
+         * This is a logical ufunc, and the `??->?` loop`. It is always OK
+         * to cast any input to bool, because that cast is defined by
+         * truthiness.
+         * This allows to ensure two things:
+         * 1. `np.all`/`np.any` know that force casting the input is OK
+         *    (they must do this since there are no `?l->?`, etc. loops)
+         * 2. The logical functions automatically work for any DType
+         *    implementing a cast to boolean.
+         */
+        flags = _NPY_METH_FORCE_CAST_INPUTS;
+    }
 
     for (int i = 0; i < ufunc->nin+ufunc->nout; i++) {
         if (signature[i]->singleton->flags & (
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index fa7844014..6076e0b2d 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1506,8 +1506,8 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
  */
 
 /**begin repeat
- *  #func = rint, ceil, floor, trunc#
- *  #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
+ *  #func = rint, floor, trunc#
+ *  #scalarf = npy_rint, npy_floor, npy_trunc#
  */
 
 /**begin repeat1
@@ -1542,8 +1542,8 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
  */
 
 /**begin repeat2
- *  #func = rint, ceil, floor, trunc#
- *  #scalarf = npy_rint, npy_ceil, npy_floor, npy_trunc#
+ *  #func = rint, floor, trunc#
+ *  #scalarf = npy_rint, npy_floor, npy_trunc#
  */
 
 NPY_NO_EXPORT NPY_GCC_OPT_3 void
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 90115006f..3eafbdf66 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -186,7 +186,7 @@ NPY_NO_EXPORT void
  *  #TYPE = FLOAT, DOUBLE#
  */
 /**begin repeat1
- * #kind = sqrt, absolute, square, reciprocal#
+ * #kind = ceil, sqrt, absolute, square, reciprocal#
  */
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
@@ -227,7 +227,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
 /**end repeat**/
 
 /**begin repeat
- * #func = sin, cos# 
+ * #func = sin, cos#
  */
 
 NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@,
@@ -274,7 +274,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
 /**end repeat**/
 
 /**begin repeat
- *  #func = rint, ceil, floor, trunc#
+ *  #func = rint, floor, trunc#
  */
 
 /**begin repeat1
diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
index 8c2c83e7c..cd9b2ed54 100644
--- a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
+++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src
@@ -209,7 +209,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@)
     const npy_intp ssrc = steps[0] / lsize;
     const npy_intp sdst = steps[1] / lsize;
     npy_intp len = dimensions[0];
-    assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
 #if NPY_SIMD_FMA3
     if (is_mem_overlap(src, steps[0], dst, steps[1], len) ||
         !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)
diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
index 852604655..a8289fc51 100644
--- a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
@@ -96,7 +96,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
     const npy_intp ssrc = steps[0] / lsize;
     const npy_intp sdst = steps[1] / lsize;
     const npy_intp len = dimensions[0];
-    assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
     if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
         npyv_loadable_stride_@sfx@(ssrc) &&
         npyv_storable_stride_@sfx@(sdst)) {
@@ -125,7 +125,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
     const npy_intp ssrc = steps[0] / lsize;
     const npy_intp sdst = steps[1] / lsize;
     const npy_intp len = dimensions[0];
-    assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
     if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
         npyv_loadable_stride_f64(ssrc) &&
         npyv_storable_stride_f64(sdst)) {
diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
index 2d5917282..93761b98c 100644
--- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src
@@ -1,6 +1,8 @@
 /*@targets
  ** $maxopt baseline
- ** sse2 vsx2 neon
+ ** sse2 sse41
+ ** vsx2
+ ** neon asimd
  **/
 /**
  * Force use SSE only on x86, even if AVX2 or AVX512F are enabled
@@ -65,6 +67,9 @@ NPY_FINLINE double c_square_f64(double a)
     #define c_sqrt_f64 npy_sqrt
 #endif
 
+#define c_ceil_f32 npy_ceilf
+#define c_ceil_f64 npy_ceil
+
 /********************************************************************************
  ** Defining the SIMD kernels
  ********************************************************************************/
@@ -134,10 +139,10 @@ NPY_FINLINE double c_square_f64(double a)
  */
 #if @VCHK@
 /**begin repeat1
- * #kind     = sqrt, absolute, square, reciprocal#
- * #intr     = sqrt, abs,      square, recip#
- * #repl_0w1 = 0,    0,        0,      1#
- * #RECIP_WORKAROUND = 0, 0,   0,      WORKAROUND_CLANG_RECIPROCAL_BUG#
+ * #kind     = ceil, sqrt, absolute, square, reciprocal#
+ * #intr     = ceil, sqrt, abs,      square, recip#
+ * #repl_0w1 = 0,    0,    0,        0,      1#
+ * #RECIP_WORKAROUND = 0, 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG#
  */
 /**begin repeat2
  * #STYPE  = CONTIG, NCONTIG, CONTIG,  NCONTIG#
@@ -245,9 +250,9 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@
  * #VCHK = NPY_SIMD, NPY_SIMD_F64#
  */
 /**begin repeat1
- * #kind  = sqrt, absolute, square, reciprocal#
- * #intr  = sqrt, abs,      square, recip#
- * #clear = 0,    1,        0,      0#
+ * #kind  = ceil, sqrt, absolute, square, reciprocal#
+ * #intr  = ceil, sqrt, abs,      square, recip#
+ * #clear = 0,    0,    1,        0,      0#
  */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
@@ -258,7 +263,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
     npy_intp len = dimensions[0];
 #if @VCHK@
     const int lsize = sizeof(npyv_lanetype_@sfx@);
-    assert(src_step % lsize == 0 && dst_step % lsize == 0);
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0));
     if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
         goto no_unroll;
     }
diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c
index d5a251368..c28c8abd8 100644
--- a/numpy/core/src/umath/reduction.c
+++ b/numpy/core/src/umath/reduction.c
@@ -145,14 +145,12 @@ PyArray_CopyInitialReduceValues(
  * boilerplate code, just calling the appropriate inner loop function where
  * necessary.
  *
+ * context     : The ArrayMethod context (with ufunc, method, and descriptors).
  * operand     : The array to be reduced.
  * out         : NULL, or the array into which to place the result.
  * wheremask   : NOT YET SUPPORTED, but this parameter is placed here
  *               so that support can be added in the future without breaking
  *               API compatibility. Pass in NULL.
- * operand_dtype : The dtype the inner loop expects for the operand.
- * result_dtype : The dtype the inner loop expects for the result.
- * casting     : The casting rule to apply to the operands.
  * axis_flags  : Flags indicating the reduction axes of 'operand'.
  * reorderable : If True, the reduction being done is reorderable, which
  *               means specifying multiple axes of reduction at once is ok,
@@ -182,10 +180,8 @@ PyArray_CopyInitialReduceValues(
  * generalized ufuncs!)
  */
 NPY_NO_EXPORT PyArrayObject *
-PyUFunc_ReduceWrapper(
+PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
         PyArrayObject *operand, PyArrayObject *out, PyArrayObject *wheremask,
-        PyArray_Descr *operand_dtype, PyArray_Descr *result_dtype,
-        NPY_CASTING casting,
         npy_bool *axis_flags, int reorderable, int keepdims,
         PyObject *identity, PyArray_ReduceLoopFunc *loop,
         void *data, npy_intp buffersize, const char *funcname, int errormask)
@@ -199,6 +195,8 @@ PyUFunc_ReduceWrapper(
     PyArrayObject *op[3];
     PyArray_Descr *op_dtypes[3];
     npy_uint32 it_flags, op_flags[3];
+    /* Loop auxdata (must be freed on error) */
+    NpyAuxData *auxdata = NULL;
 
     /* More than one axis means multiple orders are possible */
     if (!reorderable && count_axes(PyArray_NDIM(operand), axis_flags) > 1) {
@@ -221,8 +219,8 @@ PyUFunc_ReduceWrapper(
     /* Set up the iterator */
     op[0] = out;
     op[1] = operand;
-    op_dtypes[0] = result_dtype;
-    op_dtypes[1] = operand_dtype;
+    op_dtypes[0] = context->descriptors[0];
+    op_dtypes[1] = context->descriptors[1];
 
     it_flags = NPY_ITER_BUFFERED |
             NPY_ITER_EXTERNAL_LOOP |
@@ -291,7 +289,7 @@ PyUFunc_ReduceWrapper(
     }
 
     iter = NpyIter_AdvancedNew(wheremask == NULL ? 2 : 3, op, it_flags,
-                               NPY_KEEPORDER, casting,
+                               NPY_KEEPORDER, NPY_UNSAFE_CASTING,
                                op_flags,
                                op_dtypes,
                                PyArray_NDIM(operand), op_axes, NULL, buffersize);
@@ -301,9 +299,29 @@ PyUFunc_ReduceWrapper(
 
     result = NpyIter_GetOperandArray(iter)[0];
 
-    int needs_api = NpyIter_IterationNeedsAPI(iter);
-    /* Start with the floating-point exception flags cleared */
-    npy_clear_floatstatus_barrier((char*)&iter);
+    PyArrayMethod_StridedLoop *strided_loop;
+    NPY_ARRAYMETHOD_FLAGS flags = 0;
+    npy_intp fixed_strides[3];
+    NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+    if (wheremask != NULL) {
+        if (PyArrayMethod_GetMaskedStridedLoop(context,
+                1, fixed_strides, &strided_loop, &auxdata, &flags) < 0) {
+            goto fail;
+        }
+    }
+    else {
+        if (context->method->get_strided_loop(context,
+                1, 0, fixed_strides, &strided_loop, &auxdata, &flags) < 0) {
+            goto fail;
+        }
+    }
+
+    int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0;
+    needs_api |= NpyIter_IterationNeedsAPI(iter);
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        /* Start with the floating-point exception flags cleared */
+        npy_clear_floatstatus_barrier((char*)&iter);
+    }
 
     /*
      * Initialize the result to the reduction unit if possible,
@@ -345,16 +363,18 @@ PyUFunc_ReduceWrapper(
         strideptr = NpyIter_GetInnerStrideArray(iter);
         countptr = NpyIter_GetInnerLoopSizePtr(iter);
 
-        if (loop(iter, dataptr, strideptr, countptr,
-                        iternext, needs_api, skip_first_count, data) < 0) {
+        if (loop(context, strided_loop, auxdata,
+                iter, dataptr, strideptr, countptr, iternext,
+                needs_api, skip_first_count) < 0) {
             goto fail;
         }
     }
 
-    /* Check whether any errors occurred during the loop */
-    if (PyErr_Occurred() ||
-            _check_ufunc_fperr(errormask, NULL, "reduce") < 0) {
-        goto fail;
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        /* NOTE: We could check float errors even on error */
+        if (_check_ufunc_fperr(errormask, NULL, "reduce") < 0) {
+            goto fail;
+        }
     }
 
     if (out != NULL) {
@@ -369,6 +389,7 @@ PyUFunc_ReduceWrapper(
     return result;
 
 fail:
+    NPY_AUXDATA_FREE(auxdata);
     if (iter != NULL) {
         NpyIter_Deallocate(iter);
     }
diff --git a/numpy/core/src/umath/reduction.h b/numpy/core/src/umath/reduction.h
index 372605dba..2170e27a7 100644
--- a/numpy/core/src/umath/reduction.h
+++ b/numpy/core/src/umath/reduction.h
@@ -19,93 +19,17 @@ typedef int (PyArray_AssignReduceIdentityFunc)(PyArrayObject *result,
                                                void *data);
 
 /*
- * This is a function for the reduce loop.
+ * Inner definition of the reduce loop, only used for a static function.
+ * At some point around NumPy 1.6, there was probably an intention to make
+ * the reduce loop customizable at this level (per ufunc?).
  *
- * The needs_api parameter indicates whether it's ok to release the GIL during
- * the loop, such as when the iternext() function never calls
- * a function which could raise a Python exception.
- *
- * The skip_first_count parameter indicates how many elements need to be
- * skipped based on NpyIter_IsFirstVisit checks. This can only be positive
- * when the 'assign_identity' parameter was NULL when calling
- * PyArray_ReduceWrapper.
- *
- * The loop gets two data pointers and two strides, and should
- * look roughly like this:
- *  {
- *      NPY_BEGIN_THREADS_DEF;
- *      if (!needs_api) {
- *          NPY_BEGIN_THREADS;
- *      }
- *      // This first-visit loop can be skipped if 'assign_identity' was non-NULL
- *      if (skip_first_count > 0) {
- *          do {
- *              char *data0 = dataptr[0], *data1 = dataptr[1];
- *              npy_intp stride0 = strideptr[0], stride1 = strideptr[1];
- *              npy_intp count = *countptr;
- *
- *              // Skip any first-visit elements
- *              if (NpyIter_IsFirstVisit(iter, 0)) {
- *                  if (stride0 == 0) {
- *                      --count;
- *                      --skip_first_count;
- *                      data1 += stride1;
- *                  }
- *                  else {
- *                      skip_first_count -= count;
- *                      count = 0;
- *                  }
- *              }
- *
- *              while (count--) {
- *                  *(result_t *)data0 = my_reduce_op(*(result_t *)data0,
- *                                                    *(operand_t *)data1);
- *                  data0 += stride0;
- *                  data1 += stride1;
- *              }
- *
- *              // Jump to the faster loop when skipping is done
- *              if (skip_first_count == 0) {
- *                  if (iternext(iter)) {
- *                      break;
- *                  }
- *                  else {
- *                      goto finish_loop;
- *                  }
- *              }
- *          } while (iternext(iter));
- *      }
- *      do {
- *          char *data0 = dataptr[0], *data1 = dataptr[1];
- *          npy_intp stride0 = strideptr[0], stride1 = strideptr[1];
- *          npy_intp count = *countptr;
- *
- *          while (count--) {
- *              *(result_t *)data0 = my_reduce_op(*(result_t *)data0,
- *                                                *(operand_t *)data1);
- *              data0 += stride0;
- *              data1 += stride1;
- *          }
- *      } while (iternext(iter));
- *  finish_loop:
- *      if (!needs_api) {
- *          NPY_END_THREADS;
- *      }
- *      return (needs_api && PyErr_Occurred()) ? -1 : 0;
- *  }
- *
- * If needs_api is True, this function should call PyErr_Occurred()
- * to check if an error occurred during processing, and return -1 for
- * error, 0 for success.
+ * TODO: This should be refactored/removed.
  */
-typedef int (PyArray_ReduceLoopFunc)(NpyIter *iter,
-                                            char **dataptr,
-                                            npy_intp const *strideptr,
-                                            npy_intp const *countptr,
-                                            NpyIter_IterNextFunc *iternext,
-                                            int needs_api,
-                                            npy_intp skip_first_count,
-                                            void *data);
+typedef int (PyArray_ReduceLoopFunc)(PyArrayMethod_Context *context,
+        PyArrayMethod_StridedLoop *strided_loop, NpyAuxData *auxdata,
+        NpyIter *iter, char **dataptrs, npy_intp const *strides,
+        npy_intp const *countptr, NpyIter_IterNextFunc *iternext,
+        int needs_api, npy_intp skip_first_count);
 
 /*
  * This function executes all the standard NumPy reduction function
@@ -138,16 +62,10 @@ typedef int (PyArray_ReduceLoopFunc)(NpyIter *iter,
  * errormask   : forwarded from _get_bufsize_errmask
  */
 NPY_NO_EXPORT PyArrayObject *
-PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
-                      PyArrayObject *wheremask,
-                      PyArray_Descr *operand_dtype,
-                      PyArray_Descr *result_dtype,
-                      NPY_CASTING casting,
-                      npy_bool *axis_flags, int reorderable,
-                      int keepdims,
-                      PyObject *identity,
-                      PyArray_ReduceLoopFunc *loop,
-                      void *data, npy_intp buffersize, const char *funcname,
-                      int errormask);
+PyUFunc_ReduceWrapper(PyArrayMethod_Context *context,
+        PyArrayObject *operand, PyArrayObject *out, PyArrayObject *wheremask,
+        npy_bool *axis_flags, int reorderable, int keepdims,
+        PyObject *identity, PyArray_ReduceLoopFunc *loop,
+        void *data, npy_intp buffersize, const char *funcname, int errormask);
 
 #endif
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index d47be9a30..0e2c1ab8b 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -169,7 +169,7 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c
  */
 
 /**begin repeat2
- *  #func = rint, floor, ceil, trunc#
+ *  #func = rint, floor, trunc#
  */
 
 #if defined @CHK@ && defined NPY_HAVE_SSE2_INTRINSICS
@@ -850,12 +850,6 @@ fma_floor_@vsub@(@vtype@ x)
 }
 
 NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
-fma_ceil_@vsub@(@vtype@ x)
-{
-    return _mm256_round_@vsub@(x, _MM_FROUND_TO_POS_INF);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@
 fma_trunc_@vsub@(@vtype@ x)
 {
     return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO);
@@ -988,12 +982,6 @@ avx512_floor_@vsub@(@vtype@ x)
 }
 
 NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_ceil_@vsub@(@vtype@ x)
-{
-    return _mm512_roundscale_@vsub@(x, 0x0A);
-}
-
-NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
 avx512_trunc_@vsub@(@vtype@ x)
 {
     return _mm512_roundscale_@vsub@(x, 0x0B);
@@ -1327,8 +1315,8 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
  */
 
 /**begin repeat1
- *  #func = rint, ceil, floor, trunc#
- *  #vectorf = rint, ceil, floor, trunc#
+ *  #func = rint, floor, trunc#
+ *  #vectorf = rint, floor, trunc#
  */
 
 #if defined @CHK@
@@ -1398,8 +1386,8 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
  */
 
 /**begin repeat1
- *  #func = rint, ceil, floor, trunc#
- *  #vectorf =  rint, ceil, floor, trunc#
+ *  #func = rint, floor, trunc#
+ *  #vectorf =  rint, floor, trunc#
  */
 
 #if defined @CHK@
diff --git a/numpy/core/src/umath/svml b/numpy/core/src/umath/svml
-Subproject 9f8af767ed6c75455d9a382af829048f8dd1806
+Subproject 1c5260a61e7dce6be48073dfa96291edb0a11d7
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 42290e8c9..186f18a62 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -616,9 +616,24 @@ _is_same_name(const char* s1, const char* s2)
 }
 
 /*
- * Sets core_num_dim_ix, core_num_dims, core_dim_ixs, core_offsets,
- * and core_signature in PyUFuncObject "ufunc".  Returns 0 unless an
- * error occurred.
+ * Sets the following fields in the PyUFuncObject 'ufunc':
+ *
+ * Field             Type                     Array Length
+ * core_enabled      int (effectively bool)   N/A
+ * core_num_dim_ix   int                      N/A
+ * core_dim_flags    npy_uint32 *             core_num_dim_ix
+ * core_dim_sizes    npy_intp *               core_num_dim_ix
+ * core_num_dims     int *                    nargs (i.e. nin+nout)
+ * core_offsets      int *                    nargs
+ * core_dim_ixs      int *                    sum(core_num_dims)
+ * core_signature    char *                   strlen(signature) + 1
+ *
+ * The function assumes that the values that are arrays have not
+ * been set already, and sets these pointers to memory allocated
+ * with PyArray_malloc.  These are freed when the ufunc dealloc
+ * method is called.
+ *
+ * Returns 0 unless an error occurred.
  */
 static int
 _parse_signature(PyUFuncObject *ufunc, const char *signature)
@@ -990,6 +1005,7 @@ convert_ufunc_arguments(PyUFuncObject *ufunc,
     }
 
     /* Convert and fill in output arguments */
+    memset(out_op_DTypes + nin, 0, nout * sizeof(*out_op_DTypes));
     if (full_args.out != NULL) {
         for (int i = 0; i < nout; i++) {
             obj = PyTuple_GET_ITEM(full_args.out, i);
@@ -1047,6 +1063,7 @@ check_for_trivial_loop(PyArrayMethodObject *ufuncimpl,
         PyArrayObject **op, PyArray_Descr **dtypes,
         NPY_CASTING casting, npy_intp buffersize)
 {
+    int force_cast_input = ufuncimpl->flags & _NPY_METH_FORCE_CAST_INPUTS;
     int i, nin = ufuncimpl->nin, nop = nin + ufuncimpl->nout;
 
     for (i = 0; i < nop; ++i) {
@@ -1070,7 +1087,13 @@ check_for_trivial_loop(PyArrayMethodObject *ufuncimpl,
                 must_copy = 1;
             }
 
-            if (PyArray_MinCastSafety(safety, casting) != casting) {
+            if (force_cast_input && i < nin) {
+                /*
+                 * ArrayMethod flagged to ignore casting (logical funcs
+                 * can  force cast to bool)
+                 */
+            }
+            else if (PyArray_MinCastSafety(safety, casting) != casting) {
                 return 0;  /* the cast is not safe enough */
             }
         }
@@ -1360,8 +1383,15 @@ validate_casting(PyArrayMethodObject *method, PyUFuncObject *ufunc,
          */
         return 0;
     }
-    if (PyUFunc_ValidateCasting(ufunc, casting, ops, descriptors) < 0) {
-        return -1;
+    if (method->flags & _NPY_METH_FORCE_CAST_INPUTS) {
+        if (PyUFunc_ValidateOutCasting(ufunc, casting, ops, descriptors) < 0) {
+            return -1;
+        }
+    }
+    else {
+        if (PyUFunc_ValidateCasting(ufunc, casting, ops, descriptors) < 0) {
+            return -1;
+        }
     }
     return 0;
 }
@@ -2470,9 +2500,9 @@ PyUFunc_GeneralizedFunctionInternal(PyUFuncObject *ufunc,
 
     /* Final preparation of the arraymethod call */
     PyArrayMethod_Context context = {
-            .caller = (PyObject *)ufunc,
-            .method = ufuncimpl,
-            .descriptors = operation_descrs,
+        .caller = (PyObject *)ufunc,
+        .method = ufuncimpl,
+        .descriptors = operation_descrs,
     };
     PyArrayMethod_StridedLoop *strided_loop;
     NPY_ARRAYMETHOD_FLAGS flags = 0;
@@ -2527,7 +2557,7 @@ PyUFunc_GeneralizedFunctionInternal(PyUFuncObject *ufunc,
 
     PyArray_free(inner_strides);
     NPY_AUXDATA_FREE(auxdata);
-    if (NpyIter_Deallocate(iter) < 0) {
+    if (!NpyIter_Deallocate(iter)) {
         retval = -1;
     }
 
@@ -2592,9 +2622,9 @@ PyUFunc_GenericFunctionInternal(PyUFuncObject *ufunc,
 
     /* Final preparation of the arraymethod call */
     PyArrayMethod_Context context = {
-            .caller = (PyObject *)ufunc,
-            .method = ufuncimpl,
-            .descriptors = operation_descrs,
+        .caller = (PyObject *)ufunc,
+        .method = ufuncimpl,
+        .descriptors = operation_descrs,
     };
 
     /* Do the ufunc loop */
@@ -2661,195 +2691,129 @@ PyUFunc_GenericFunction(PyUFuncObject *NPY_UNUSED(ufunc),
 
 
 /*
- * Given the output type, finds the specified binary op.  The
- * ufunc must have nin==2 and nout==1.  The function may modify
- * otype if the given type isn't found.
+ * Promote and resolve a reduction like operation.
  *
- * Returns 0 on success, -1 on failure.
+ * @param ufunc
+ * @param arr The operation array
+ * @param out The output array or NULL if not provided.  Note that NumPy always
+ *        used out to mean the same as `dtype=out.dtype` and never passed
+ *        the array itself to the type-resolution.
+ * @param signature The DType signature, which may already be set due to the
+ *        dtype passed in by the user, or the special cases (add, multiply).
+ *        (Contains strong references and may be modified.)
+ * @param enforce_uniform_args If `NPY_TRUE` fully uniform dtypes/descriptors
+ *        are enforced as required for accumulate and (currently) reduceat.
+ * @param out_descrs New references to the resolved descriptors (on success).
+ * @param method The ufunc method, "reduce", "reduceat", or "accumulate".
+
+ * @returns ufuncimpl The `ArrayMethod` implemention to use. Or NULL if an
+ *          error occurred.
  */
-static int
-get_binary_op_function(PyUFuncObject *ufunc, int *otype,
-                        PyUFuncGenericFunction *out_innerloop,
-                        void **out_innerloopdata)
+static PyArrayMethodObject *
+reducelike_promote_and_resolve(PyUFuncObject *ufunc,
+        PyArrayObject *arr, PyArrayObject *out,
+        PyArray_DTypeMeta *signature[3],
+        npy_bool enforce_uniform_args, PyArray_Descr *out_descrs[3],
+        char *method)
 {
-    int i;
-
-    NPY_UF_DBG_PRINT1("Getting binary op function for type number %d\n",
-                                *otype);
-
-    /* If the type is custom and there are userloops, search for it here */
-    if (ufunc->userloops != NULL && PyTypeNum_ISUSERDEF(*otype)) {
-        PyObject *key, *obj;
-        key = PyLong_FromLong(*otype);
-        if (key == NULL) {
-            return -1;
-        }
-        obj = PyDict_GetItemWithError(ufunc->userloops, key);
-        Py_DECREF(key);
-        if (obj == NULL && PyErr_Occurred()) {
-            return -1;
-        }
-        else if (obj != NULL) {
-            PyUFunc_Loop1d *funcdata = PyCapsule_GetPointer(obj, NULL);
-            if (funcdata == NULL) {
-                return -1;
-            }
-            while (funcdata != NULL) {
-                int *types = funcdata->arg_types;
-
-                if (types[0] == *otype && types[1] == *otype &&
-                                                types[2] == *otype) {
-                    *out_innerloop = funcdata->func;
-                    *out_innerloopdata = funcdata->data;
-                    return 0;
-                }
+    /*
+     * Note that the `ops` is not realy correct.  But legacy resolution
+     * cannot quite handle the correct ops (e.g. a NULL first item if `out`
+     * is NULL), and it should only matter in very strange cases.
+     */
+    PyArrayObject *ops[3] = {arr, arr, NULL};
+    /*
+     * TODO: If `out` is not provided, arguably `initial` could define
+     *       the first DType (and maybe also the out one), that way
+     *       `np.add.reduce([1, 2, 3], initial=3.4)` would return a float
+     *       value.  As of 1.20, it returned an integer, so that should
+     *       probably go to an error/warning first.
+     */
+    PyArray_DTypeMeta *operation_DTypes[3] = {
+            NULL, NPY_DTYPE(PyArray_DESCR(arr)), NULL};
+    Py_INCREF(operation_DTypes[1]);
 
-                funcdata = funcdata->next;
-            }
-        }
+    if (out != NULL) {
+        operation_DTypes[0] = NPY_DTYPE(PyArray_DESCR(out));
+        Py_INCREF(operation_DTypes[0]);
+        operation_DTypes[2] = operation_DTypes[0];
+        Py_INCREF(operation_DTypes[2]);
     }
 
-    /* Search for a function with compatible inputs */
-    for (i = 0; i < ufunc->ntypes; ++i) {
-        char *types = ufunc->types + i*ufunc->nargs;
-
-        NPY_UF_DBG_PRINT3("Trying loop with signature %d %d -> %d\n",
-                                types[0], types[1], types[2]);
-
-        if (PyArray_CanCastSafely(*otype, types[0]) &&
-                    types[0] == types[1] &&
-                    (*otype == NPY_OBJECT || types[0] != NPY_OBJECT)) {
-            /* If the signature is "xx->x", we found the loop */
-            if (types[2] == types[0]) {
-                *out_innerloop = ufunc->functions[i];
-                *out_innerloopdata = ufunc->data[i];
-                *otype = types[0];
-                return 0;
-            }
-            /*
-             * Otherwise, we found the natural type of the reduction,
-             * replace otype and search again
-             */
-            else {
-                *otype = types[2];
-                break;
-            }
-        }
+    PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc,
+            ops, signature, operation_DTypes, NPY_FALSE, NPY_TRUE);
+    Py_DECREF(operation_DTypes[1]);
+    if (out != NULL) {
+        Py_DECREF(operation_DTypes[0]);
+        Py_DECREF(operation_DTypes[2]);
     }
-
-    /* Search for the exact function */
-    for (i = 0; i < ufunc->ntypes; ++i) {
-        char *types = ufunc->types + i*ufunc->nargs;
-
-        if (PyArray_CanCastSafely(*otype, types[0]) &&
-                    types[0] == types[1] &&
-                    types[1] == types[2] &&
-                    (*otype == NPY_OBJECT || types[0] != NPY_OBJECT)) {
-            /* Since the signature is "xx->x", we found the loop */
-            *out_innerloop = ufunc->functions[i];
-            *out_innerloopdata = ufunc->data[i];
-            *otype = types[0];
-            return 0;
-        }
+    if (ufuncimpl == NULL) {
+        return NULL;
     }
 
-    return -1;
-}
-
-static int
-reduce_type_resolver(PyUFuncObject *ufunc, PyArrayObject *arr,
-                        PyArray_Descr *odtype, PyArray_Descr **out_dtype)
-{
-    int i, retcode;
-    PyArrayObject *op[3] = {arr, arr, NULL};
-    PyArray_Descr *dtypes[3] = {NULL, NULL, NULL};
-    const char *ufunc_name = ufunc_get_name_cstr(ufunc);
-    PyObject *type_tup = NULL;
-
-    *out_dtype = NULL;
-
     /*
-     * If odtype is specified, make a type tuple for the type
-     * resolution.
+     * Find the correct descriptors for the operation.  We use unsafe casting
+     * for historic reasons: The logic ufuncs required it to cast everything to
+     * boolean.  However, we now special case the logical ufuncs, so that the
+     * casting safety could in principle be set to the default same-kind.
+     * (although this should possibly happen through a deprecation)
      */
-    if (odtype != NULL) {
-        type_tup = PyTuple_Pack(3, odtype, odtype, Py_None);
-        if (type_tup == NULL) {
-            return -1;
-        }
-    }
-
-    /* Use the type resolution function to find our loop */
-    retcode = ufunc->type_resolver(
-                        ufunc, NPY_UNSAFE_CASTING,
-                        op, type_tup, dtypes);
-    Py_DECREF(type_tup);
-    if (retcode == -1) {
-        return -1;
-    }
-    else if (retcode == -2) {
-        PyErr_Format(PyExc_RuntimeError,
-                "type resolution returned NotImplemented to "
-                "reduce ufunc %s", ufunc_name);
-        return -1;
+    if (resolve_descriptors(3, ufunc, ufuncimpl,
+            ops, out_descrs, signature, NPY_UNSAFE_CASTING) < 0) {
+        return NULL;
     }
 
     /*
-     * The first two type should be equivalent. Because of how
-     * reduce has historically behaved in NumPy, the return type
-     * could be different, and it is the return type on which the
-     * reduction occurs.
+     * The first operand and output should be the same array, so they should
+     * be identical.  The second argument can be different for reductions,
+     * but is checked to be identical for accumulate and reduceat.
      */
-    if (!PyArray_EquivTypes(dtypes[0], dtypes[1])) {
-        for (i = 0; i < 3; ++i) {
-            Py_DECREF(dtypes[i]);
-        }
-        PyErr_Format(PyExc_RuntimeError,
-                "could not find a type resolution appropriate for "
-                "reduce ufunc %s", ufunc_name);
-        return -1;
+    if (out_descrs[0] != out_descrs[2] || (
+            enforce_uniform_args && out_descrs[0] != out_descrs[1])) {
+        PyErr_Format(PyExc_TypeError,
+                "the resolved dtypes are not compatible with %s.%s",
+                ufunc_get_name_cstr(ufunc), method);
+        goto fail;
+    }
+    /* TODO: This really should _not_ be unsafe casting (same above)! */
+    if (validate_casting(ufuncimpl,
+            ufunc, ops, out_descrs, NPY_UNSAFE_CASTING) < 0) {
+        goto fail;
     }
 
-    Py_DECREF(dtypes[0]);
-    Py_DECREF(dtypes[1]);
-    *out_dtype = dtypes[2];
+    return ufuncimpl;
 
-    return 0;
+  fail:
+    for (int i = 0; i < 3; ++i) {
+        Py_DECREF(out_descrs[i]);
+    }
+    return NULL;
 }
 
+
 static int
-reduce_loop(NpyIter *iter, char **dataptrs, npy_intp const *strides,
-            npy_intp const *countptr, NpyIter_IterNextFunc *iternext,
-            int needs_api, npy_intp skip_first_count, void *data)
+reduce_loop(PyArrayMethod_Context *context,
+        PyArrayMethod_StridedLoop *strided_loop, NpyAuxData *auxdata,
+        NpyIter *iter, char **dataptrs, npy_intp const *strides,
+        npy_intp const *countptr, NpyIter_IterNextFunc *iternext,
+        int needs_api, npy_intp skip_first_count)
 {
-    PyArray_Descr *dtypes[3], **iter_dtypes;
-    PyUFuncObject *ufunc = (PyUFuncObject *)data;
-    char *dataptrs_copy[3];
-    npy_intp strides_copy[3];
+    int retval;
+    char *dataptrs_copy[4];
+    npy_intp strides_copy[4];
     npy_bool masked;
 
-    /* The normal selected inner loop */
-    PyUFuncGenericFunction innerloop = NULL;
-    void *innerloopdata = NULL;
-
     NPY_BEGIN_THREADS_DEF;
     /* Get the number of operands, to determine whether "where" is used */
     masked = (NpyIter_GetNOp(iter) == 3);
 
-    /* Get the inner loop */
-    iter_dtypes = NpyIter_GetDescrArray(iter);
-    dtypes[0] = iter_dtypes[0];
-    dtypes[1] = iter_dtypes[1];
-    dtypes[2] = iter_dtypes[0];
-    if (ufunc->legacy_inner_loop_selector(ufunc, dtypes,
-                            &innerloop, &innerloopdata, &needs_api) < 0) {
-        return -1;
+    if (!needs_api) {
+        NPY_BEGIN_THREADS_THRESHOLDED(NpyIter_GetIterSize(iter));
     }
 
-    NPY_BEGIN_THREADS_NDITER(iter);
-
     if (skip_first_count > 0) {
-        do {
+        assert(!masked);  /* Path currently not available for masked */
+        while (1) {
             npy_intp count = *countptr;
 
             /* Skip any first-visit elements */
@@ -2872,27 +2836,23 @@ reduce_loop(NpyIter *iter, char **dataptrs, npy_intp const *strides,
             strides_copy[0] = strides[0];
             strides_copy[1] = strides[1];
             strides_copy[2] = strides[0];
-            innerloop(dataptrs_copy, &count,
-                        strides_copy, innerloopdata);
 
-            if (needs_api && PyErr_Occurred()) {
+            retval = strided_loop(context,
+                    dataptrs_copy, &count, strides_copy, auxdata);
+            if (retval < 0) {
                 goto finish_loop;
             }
 
-            /* Jump to the faster loop when skipping is done */
-            if (skip_first_count == 0) {
-                if (iternext(iter)) {
-                    break;
-                }
-                else {
-                    goto finish_loop;
-                }
+            /* Advance loop, and abort on error (or finish) */
+            if (!iternext(iter)) {
+                goto finish_loop;
             }
-        } while (iternext(iter));
-    }
 
-    if (needs_api && PyErr_Occurred()) {
-        goto finish_loop;
+            /* When skipping is done break and continue with faster loop */
+            if (skip_first_count == 0) {
+                break;
+            }
+        }
     }
 
     do {
@@ -2903,42 +2863,23 @@ reduce_loop(NpyIter *iter, char **dataptrs, npy_intp const *strides,
         strides_copy[0] = strides[0];
         strides_copy[1] = strides[1];
         strides_copy[2] = strides[0];
-
-        if (!masked) {
-            innerloop(dataptrs_copy, countptr,
-                      strides_copy, innerloopdata);
+        if (masked) {
+            dataptrs_copy[3] = dataptrs[2];
+            strides_copy[3] = strides[2];
         }
-        else {
-            npy_intp count = *countptr;
-            char *maskptr = dataptrs[2];
-            npy_intp mask_stride = strides[2];
-            /* Optimization for when the mask is broadcast */
-            npy_intp n = mask_stride == 0 ? count : 1;
-            while (count) {
-                char mask = *maskptr;
-                maskptr += mask_stride;
-                while (n < count && mask == *maskptr) {
-                    n++;
-                    maskptr += mask_stride;
-                }
-                /* If mask set, apply inner loop on this contiguous region */
-                if (mask) {
-                    innerloop(dataptrs_copy, &n,
-                              strides_copy, innerloopdata);
-                }
-                dataptrs_copy[0] += n * strides[0];
-                dataptrs_copy[1] += n * strides[1];
-                dataptrs_copy[2] = dataptrs_copy[0];
-                count -= n;
-                n = 1;
-            }
+
+        retval = strided_loop(context,
+                dataptrs_copy, countptr, strides_copy, auxdata);
+        if (retval < 0) {
+            goto finish_loop;
         }
-    } while (!(needs_api && PyErr_Occurred()) && iternext(iter));
+
+    } while (iternext(iter));
 
 finish_loop:
     NPY_END_THREADS;
 
-    return (needs_api && PyErr_Occurred()) ? -1 : 0;
+    return retval;
 }
 
 /*
@@ -2959,15 +2900,14 @@ finish_loop:
  * this function does not validate them.
  */
 static PyArrayObject *
-PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
-        int naxes, int *axes, PyArray_Descr *odtype, int keepdims,
+PyUFunc_Reduce(PyUFuncObject *ufunc,
+        PyArrayObject *arr, PyArrayObject *out,
+        int naxes, int *axes, PyArray_DTypeMeta *signature[3], int keepdims,
         PyObject *initial, PyArrayObject *wheremask)
 {
     int iaxes, ndim;
     npy_bool reorderable;
     npy_bool axis_flags[NPY_MAXDIMS];
-    PyArray_Descr *dtype;
-    PyArrayObject *result;
     PyObject *identity;
     const char *ufunc_name = ufunc_get_name_cstr(ufunc);
     /* These parameters come from a TLS global */
@@ -2994,6 +2934,7 @@ PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
     }
 
     /* Get the identity */
+    /* TODO: Both of these should be provided by the ArrayMethod! */
     identity = _get_identity(ufunc, &reorderable);
     if (identity == NULL) {
         return NULL;
@@ -3017,21 +2958,27 @@ PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
         Py_INCREF(initial);  /* match the reference count in the if above */
     }
 
-    /* Get the reduction dtype */
-    if (reduce_type_resolver(ufunc, arr, odtype, &dtype) < 0) {
+    PyArray_Descr *descrs[3];
+    PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc,
+            arr, out, signature, NPY_FALSE, descrs, "reduce");
+    if (ufuncimpl == NULL) {
         Py_DECREF(initial);
         return NULL;
     }
 
-    result = PyUFunc_ReduceWrapper(arr, out, wheremask, dtype, dtype,
-                                   NPY_UNSAFE_CASTING,
-                                   axis_flags, reorderable,
-                                   keepdims,
-                                   initial,
-                                   reduce_loop,
-                                   ufunc, buffersize, ufunc_name, errormask);
+    PyArrayMethod_Context context = {
+        .caller = (PyObject *)ufunc,
+        .method = ufuncimpl,
+        .descriptors = descrs,
+    };
 
-    Py_DECREF(dtype);
+    PyArrayObject *result = PyUFunc_ReduceWrapper(&context,
+            arr, out, wheremask, axis_flags, reorderable, keepdims,
+            initial, reduce_loop, ufunc, buffersize, ufunc_name, errormask);
+
+    for (int i = 0; i < 3; i++) {
+        Py_DECREF(descrs[i]);
+    }
     Py_DECREF(initial);
     return result;
 }
@@ -3039,23 +2986,21 @@ PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
 
 static PyObject *
 PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
-                   int axis, int otype)
+                   int axis, PyArray_DTypeMeta *signature[3])
 {
     PyArrayObject *op[2];
-    PyArray_Descr *op_dtypes[2] = {NULL, NULL};
     int op_axes_arrays[2][NPY_MAXDIMS];
     int *op_axes[2] = {op_axes_arrays[0], op_axes_arrays[1]};
     npy_uint32 op_flags[2];
-    int idim, ndim, otype_final;
+    int idim, ndim;
     int needs_api, need_outer_iterator;
 
-    NpyIter *iter = NULL;
+    int res = 0;
 
-    /* The selected inner loop */
-    PyUFuncGenericFunction innerloop = NULL;
-    void *innerloopdata = NULL;
+    PyArrayMethod_StridedLoop *strided_loop;
+    NpyAuxData *auxdata = NULL;
 
-    const char *ufunc_name = ufunc_get_name_cstr(ufunc);
+    NpyIter *iter = NULL;
 
     /* These parameters come from extobj= or from a TLS global */
     int buffersize = 0, errormask = 0;
@@ -3077,42 +3022,32 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
     /* Take a reference to out for later returning */
     Py_XINCREF(out);
 
-    otype_final = otype;
-    if (get_binary_op_function(ufunc, &otype_final,
-                                &innerloop, &innerloopdata) < 0) {
-        PyArray_Descr *dtype = PyArray_DescrFromType(otype);
-        PyErr_Format(PyExc_ValueError,
-                     "could not find a matching type for %s.accumulate, "
-                     "requested type has type code '%c'",
-                            ufunc_name, dtype ? dtype->type : '-');
-        Py_XDECREF(dtype);
-        goto fail;
+    PyArray_Descr *descrs[3];
+    PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc,
+            arr, out, signature, NPY_TRUE, descrs, "accumulate");
+    if (ufuncimpl == NULL) {
+        return NULL;
     }
 
-    ndim = PyArray_NDIM(arr);
+    /* The below code assumes that all descriptors are identical: */
+    assert(descrs[0] == descrs[1] && descrs[0] == descrs[2]);
 
-    /*
-     * Set up the output data type, using the input's exact
-     * data type if the type number didn't change to preserve
-     * metadata
-     */
-    if (PyArray_DESCR(arr)->type_num == otype_final) {
-        if (PyArray_ISNBO(PyArray_DESCR(arr)->byteorder)) {
-            op_dtypes[0] = PyArray_DESCR(arr);
-            Py_INCREF(op_dtypes[0]);
-        }
-        else {
-            op_dtypes[0] = PyArray_DescrNewByteorder(PyArray_DESCR(arr),
-                                                    NPY_NATIVE);
-        }
-    }
-    else {
-        op_dtypes[0] = PyArray_DescrFromType(otype_final);
-    }
-    if (op_dtypes[0] == NULL) {
+    if (PyDataType_REFCHK(descrs[2]) && descrs[2]->type_num != NPY_OBJECT) {
+        /* This can be removed, but the initial element copy needs fixing */
+        PyErr_SetString(PyExc_TypeError,
+                "accumulation currently only supports `object` dtype with "
+                "references");
         goto fail;
     }
 
+    PyArrayMethod_Context context = {
+        .caller = (PyObject *)ufunc,
+        .method = ufuncimpl,
+        .descriptors = descrs,
+    };
+
+    ndim = PyArray_NDIM(arr);
+
 #if NPY_UF_DBG_TRACING
     printf("Found %s.accumulate inner loop with dtype :  ", ufunc_name);
     PyObject_Print((PyObject *)op_dtypes[0], stdout, 0);
@@ -3138,9 +3073,9 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
     need_outer_iterator = (ndim > 1);
     /* We can't buffer, so must do UPDATEIFCOPY */
     if (!PyArray_ISALIGNED(arr) || (out && !PyArray_ISALIGNED(out)) ||
-            !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(arr)) ||
+            !PyArray_EquivTypes(descrs[1], PyArray_DESCR(arr)) ||
             (out &&
-             !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(out)))) {
+             !PyArray_EquivTypes(descrs[0], PyArray_DESCR(out)))) {
         need_outer_iterator = 1;
     }
     /* If input and output overlap in memory, use iterator to figure it out */
@@ -3153,7 +3088,6 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
         npy_uint32 flags = NPY_ITER_ZEROSIZE_OK|
                            NPY_ITER_REFS_OK|
                            NPY_ITER_COPY_IF_OVERLAP;
-        PyArray_Descr **op_dtypes_param = NULL;
 
         /*
          * The way accumulate is set up, we can't do buffering,
@@ -3170,13 +3104,11 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
          */
         op_flags[0] |= NPY_ITER_UPDATEIFCOPY|NPY_ITER_ALIGNED|NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE;
         op_flags[1] |= NPY_ITER_COPY|NPY_ITER_ALIGNED|NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE;
-        op_dtypes_param = op_dtypes;
-        op_dtypes[1] = op_dtypes[0];
+
         NPY_UF_DBG_PRINT("Allocating outer iterator\n");
         iter = NpyIter_AdvancedNew(2, op, flags,
                                    NPY_KEEPORDER, NPY_UNSAFE_CASTING,
-                                   op_flags,
-                                   op_dtypes_param,
+                                   op_flags, descrs,
                                    ndim_iter, op_axes, NULL, 0);
         if (iter == NULL) {
             goto fail;
@@ -3194,14 +3126,14 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
         }
     }
 
-    /* Get the output */
+    /* Get the output from the iterator if it was allocated */
     if (out == NULL) {
         if (iter) {
             op[0] = out = NpyIter_GetOperandArray(iter)[0];
             Py_INCREF(out);
         }
         else {
-            PyArray_Descr *dtype = op_dtypes[0];
+            PyArray_Descr *dtype = descrs[0];
             Py_INCREF(dtype);
             op[0] = out = (PyArrayObject *)PyArray_NewFromDescr(
                                     &PyArray_Type, dtype,
@@ -3210,10 +3142,31 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
             if (out == NULL) {
                 goto fail;
             }
-
         }
     }
 
+    npy_intp fixed_strides[3];
+    if (need_outer_iterator) {
+        NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+    }
+    else {
+        fixed_strides[0] = PyArray_STRIDES(op[0])[axis];
+        fixed_strides[1] = PyArray_STRIDES(op[1])[axis];
+        fixed_strides[2] = fixed_strides[0];
+    }
+
+
+    NPY_ARRAYMETHOD_FLAGS flags = 0;
+    if (ufuncimpl->get_strided_loop(&context,
+            1, 0, fixed_strides, &strided_loop, &auxdata, &flags) < 0) {
+        goto fail;
+    }
+    needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0;
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        /* Start with the floating-point exception flags cleared */
+        npy_clear_floatstatus_barrier((char*)&iter);
+    }
+
     /*
      * If the reduction axis has size zero, either return the reduction
      * unit for UFUNC_REDUCE, or return the zero-sized output array
@@ -3234,7 +3187,7 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
         NpyIter_IterNextFunc *iternext;
         char **dataptr;
 
-        int itemsize = op_dtypes[0]->elsize;
+        int itemsize = descrs[0]->elsize;
 
         /* Get the variables needed for the loop */
         iternext = NpyIter_GetIterNext(iter, NULL);
@@ -3242,8 +3195,7 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
             goto fail;
         }
         dataptr = NpyIter_GetDataPtrArray(iter);
-        needs_api = NpyIter_IterationNeedsAPI(iter);
-
+        needs_api |= NpyIter_IterationNeedsAPI(iter);
 
         /* Execute the loop with just the outer iterator */
         count_m1 = PyArray_DIM(op[1], axis)-1;
@@ -3257,7 +3209,9 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
         stride_copy[1] = stride1;
         stride_copy[2] = stride0;
 
-        NPY_BEGIN_THREADS_NDITER(iter);
+        if (!needs_api) {
+            NPY_BEGIN_THREADS_THRESHOLDED(NpyIter_GetIterSize(iter));
+        }
 
         do {
             dataptr_copy[0] = dataptr[0];
@@ -3270,7 +3224,7 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
              * Output (dataptr[0]) and input (dataptr[1]) may point to
              * the same memory, e.g. np.add.accumulate(a, out=a).
              */
-            if (otype == NPY_OBJECT) {
+            if (descrs[2]->type_num == NPY_OBJECT) {
                 /*
                  * Incref before decref to avoid the possibility of the
                  * reference count being zero temporarily.
@@ -3290,18 +3244,17 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
                 dataptr_copy[2] += stride0;
                 NPY_UF_DBG_PRINT1("iterator loop count %d\n",
                                                 (int)count_m1);
-                innerloop(dataptr_copy, &count_m1,
-                            stride_copy, innerloopdata);
+                res = strided_loop(&context,
+                        dataptr_copy, &count_m1, stride_copy, auxdata);
             }
-        } while (!(needs_api && PyErr_Occurred()) && iternext(iter));
+        } while (res == 0 && iternext(iter));
 
         NPY_END_THREADS;
     }
     else if (iter == NULL) {
         char *dataptr_copy[3];
-        npy_intp stride_copy[3];
 
-        int itemsize = op_dtypes[0]->elsize;
+        int itemsize = descrs[0]->elsize;
 
         /* Execute the loop with no iterators */
         npy_intp count = PyArray_DIM(op[1], axis);
@@ -3315,15 +3268,11 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
                                       PyArray_NDIM(op[0]))) {
             PyErr_SetString(PyExc_ValueError,
                     "provided out is the wrong size "
-                    "for the reduction");
+                    "for the accumulation.");
             goto fail;
         }
         stride0 = PyArray_STRIDE(op[0], axis);
 
-        stride_copy[0] = stride0;
-        stride_copy[1] = stride1;
-        stride_copy[2] = stride0;
-
         /* Turn the two items into three for the inner loop */
         dataptr_copy[0] = PyArray_BYTES(op[0]);
         dataptr_copy[1] = PyArray_BYTES(op[1]);
@@ -3335,7 +3284,7 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
          * Output (dataptr[0]) and input (dataptr[1]) may point to the
          * same memory, e.g. np.add.accumulate(a, out=a).
          */
-        if (otype == NPY_OBJECT) {
+        if (descrs[2]->type_num == NPY_OBJECT) {
             /*
              * Incref before decref to avoid the possibility of the
              * reference count being zero temporarily.
@@ -3356,25 +3305,34 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
 
             NPY_UF_DBG_PRINT1("iterator loop count %d\n", (int)count);
 
-            needs_api = PyDataType_REFCHK(op_dtypes[0]);
+            needs_api = PyDataType_REFCHK(descrs[0]);
 
             if (!needs_api) {
                 NPY_BEGIN_THREADS_THRESHOLDED(count);
             }
 
-            innerloop(dataptr_copy, &count,
-                        stride_copy, innerloopdata);
+            res = strided_loop(&context,
+                    dataptr_copy, &count, fixed_strides, auxdata);
 
             NPY_END_THREADS;
         }
     }
 
 finish:
-    Py_XDECREF(op_dtypes[0]);
-    int res = 0;
+    NPY_AUXDATA_FREE(auxdata);
+    Py_DECREF(descrs[0]);
+    Py_DECREF(descrs[1]);
+    Py_DECREF(descrs[2]);
+
     if (!NpyIter_Deallocate(iter)) {
         res = -1;
     }
+
+    if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        /* NOTE: We could check float errors even when `res < 0` */
+        res = _check_ufunc_fperr(errormask, NULL, "accumulate");
+    }
+
     if (res < 0) {
         Py_DECREF(out);
         return NULL;
@@ -3384,7 +3342,11 @@ finish:
 
 fail:
     Py_XDECREF(out);
-    Py_XDECREF(op_dtypes[0]);
+
+    NPY_AUXDATA_FREE(auxdata);
+    Py_XDECREF(descrs[0]);
+    Py_XDECREF(descrs[1]);
+    Py_XDECREF(descrs[2]);
 
     NpyIter_Deallocate(iter);
 
@@ -3409,28 +3371,31 @@ fail:
  * indices[1::2] = range(1,len(array))
  *
  * output shape is based on the size of indices
+ *
+ * TODO: Reduceat duplicates too much code from accumulate!
  */
 static PyObject *
 PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
-                 PyArrayObject *out, int axis, int otype)
+                 PyArrayObject *out, int axis, PyArray_DTypeMeta *signature[3])
 {
     PyArrayObject *op[3];
-    PyArray_Descr *op_dtypes[3] = {NULL, NULL, NULL};
     int op_axes_arrays[3][NPY_MAXDIMS];
     int *op_axes[3] = {op_axes_arrays[0], op_axes_arrays[1],
                             op_axes_arrays[2]};
     npy_uint32 op_flags[3];
-    int idim, ndim, otype_final;
-    int need_outer_iterator = 0;
+    int idim, ndim;
+    int needs_api, need_outer_iterator = 0;
+
+    int res = 0;
 
     NpyIter *iter = NULL;
 
+    PyArrayMethod_StridedLoop *strided_loop;
+    NpyAuxData *auxdata = NULL;
+
     /* The reduceat indices - ind must be validated outside this call */
     npy_intp *reduceat_ind;
     npy_intp i, ind_size, red_axis_size;
-    /* The selected inner loop */
-    PyUFuncGenericFunction innerloop = NULL;
-    void *innerloopdata = NULL;
 
     const char *ufunc_name = ufunc_get_name_cstr(ufunc);
     char *opname = "reduceat";
@@ -3470,42 +3435,32 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
     /* Take a reference to out for later returning */
     Py_XINCREF(out);
 
-    otype_final = otype;
-    if (get_binary_op_function(ufunc, &otype_final,
-                                &innerloop, &innerloopdata) < 0) {
-        PyArray_Descr *dtype = PyArray_DescrFromType(otype);
-        PyErr_Format(PyExc_ValueError,
-                     "could not find a matching type for %s.%s, "
-                     "requested type has type code '%c'",
-                            ufunc_name, opname, dtype ? dtype->type : '-');
-        Py_XDECREF(dtype);
-        goto fail;
+    PyArray_Descr *descrs[3];
+    PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc,
+            arr, out, signature, NPY_TRUE, descrs, "reduceat");
+    if (ufuncimpl == NULL) {
+        return NULL;
     }
 
-    ndim = PyArray_NDIM(arr);
+    /* The below code assumes that all descriptors are identical: */
+    assert(descrs[0] == descrs[1] && descrs[0] == descrs[2]);
 
-    /*
-     * Set up the output data type, using the input's exact
-     * data type if the type number didn't change to preserve
-     * metadata
-     */
-    if (PyArray_DESCR(arr)->type_num == otype_final) {
-        if (PyArray_ISNBO(PyArray_DESCR(arr)->byteorder)) {
-            op_dtypes[0] = PyArray_DESCR(arr);
-            Py_INCREF(op_dtypes[0]);
-        }
-        else {
-            op_dtypes[0] = PyArray_DescrNewByteorder(PyArray_DESCR(arr),
-                                                    NPY_NATIVE);
-        }
-    }
-    else {
-        op_dtypes[0] = PyArray_DescrFromType(otype_final);
-    }
-    if (op_dtypes[0] == NULL) {
+    if (PyDataType_REFCHK(descrs[2]) && descrs[2]->type_num != NPY_OBJECT) {
+        /* This can be removed, but the initial element copy needs fixing */
+        PyErr_SetString(PyExc_TypeError,
+                "reduceat currently only supports `object` dtype with "
+                "references");
         goto fail;
     }
 
+    PyArrayMethod_Context context = {
+        .caller = (PyObject *)ufunc,
+        .method = ufuncimpl,
+        .descriptors = descrs,
+    };
+
+    ndim = PyArray_NDIM(arr);
+
 #if NPY_UF_DBG_TRACING
     printf("Found %s.%s inner loop with dtype :  ", ufunc_name, opname);
     PyObject_Print((PyObject *)op_dtypes[0], stdout, 0);
@@ -3532,11 +3487,13 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
     op[2] = ind;
 
     if (out != NULL || ndim > 1 || !PyArray_ISALIGNED(arr) ||
-            !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(arr))) {
+            !PyArray_EquivTypes(descrs[0], PyArray_DESCR(arr))) {
         need_outer_iterator = 1;
     }
 
     if (need_outer_iterator) {
+        PyArray_Descr *op_dtypes[3] = {descrs[0], descrs[1], NULL};
+
         npy_uint32 flags = NPY_ITER_ZEROSIZE_OK|
                            NPY_ITER_REFS_OK|
                            NPY_ITER_MULTI_INDEX|
@@ -3565,8 +3522,7 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
         NPY_UF_DBG_PRINT("Allocating outer iterator\n");
         iter = NpyIter_AdvancedNew(3, op, flags,
                                    NPY_KEEPORDER, NPY_UNSAFE_CASTING,
-                                   op_flags,
-                                   op_dtypes,
+                                   op_flags, op_dtypes,
                                    ndim, op_axes, NULL, 0);
         if (iter == NULL) {
             goto fail;
@@ -3590,11 +3546,15 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
             Py_INCREF(out);
         }
     }
-    /* Allocate the output for when there's no outer iterator */
-    else if (out == NULL) {
-        Py_INCREF(op_dtypes[0]);
+    else {
+        /*
+         * Allocate the output for when there's no outer iterator, we always
+         * use the outer_iteration path when `out` is passed.
+         */
+        assert(out == NULL);
+        Py_INCREF(descrs[0]);
         op[0] = out = (PyArrayObject *)PyArray_NewFromDescr(
-                                    &PyArray_Type, op_dtypes[0],
+                                    &PyArray_Type, descrs[0],
                                     1, &ind_size, NULL, NULL,
                                     0, NULL);
         if (out == NULL) {
@@ -3602,6 +3562,28 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
         }
     }
 
+    npy_intp fixed_strides[3];
+    if (need_outer_iterator) {
+        NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+    }
+    else {
+        fixed_strides[1] = PyArray_STRIDES(op[1])[axis];
+    }
+    /* The reduce axis does not advance here in the strided-loop */
+    fixed_strides[0] = 0;
+    fixed_strides[2] = 0;
+
+    NPY_ARRAYMETHOD_FLAGS flags = 0;
+    if (ufuncimpl->get_strided_loop(&context,
+            1, 0, fixed_strides, &strided_loop, &auxdata, &flags) < 0) {
+        goto fail;
+    }
+    needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0;
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        /* Start with the floating-point exception flags cleared */
+        npy_clear_floatstatus_barrier((char*)&iter);
+    }
+
     /*
      * If the output has zero elements, return now.
      */
@@ -3619,8 +3601,8 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
         npy_intp stride0, stride1;
         npy_intp stride0_ind = PyArray_STRIDE(op[0], axis);
 
-        int itemsize = op_dtypes[0]->elsize;
-        int needs_api = NpyIter_IterationNeedsAPI(iter);
+        int itemsize = descrs[0]->elsize;
+        needs_api |= NpyIter_IterationNeedsAPI(iter);
 
         /* Get the variables needed for the loop */
         iternext = NpyIter_GetIterNext(iter, NULL);
@@ -3640,10 +3622,11 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
         stride_copy[1] = stride1;
         stride_copy[2] = stride0;
 
-        NPY_BEGIN_THREADS_NDITER(iter);
+        if (!needs_api) {
+            NPY_BEGIN_THREADS_THRESHOLDED(NpyIter_GetIterSize(iter));
+        }
 
         do {
-
             for (i = 0; i < ind_size; ++i) {
                 npy_intp start = reduceat_ind[i],
                         end = (i == ind_size-1) ? count_m1+1 :
@@ -3661,7 +3644,7 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
                  * to the same memory, e.g.
                  * np.add.reduceat(a, np.arange(len(a)), out=a).
                  */
-                if (otype == NPY_OBJECT) {
+                if (descrs[2]->type_num == NPY_OBJECT) {
                     /*
                      * Incref before decref to avoid the possibility of
                      * the reference count being zero temporarily.
@@ -3681,33 +3664,24 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
                     dataptr_copy[1] += stride1;
                     NPY_UF_DBG_PRINT1("iterator loop count %d\n",
                                                     (int)count);
-                    innerloop(dataptr_copy, &count,
-                                stride_copy, innerloopdata);
+                    res = strided_loop(&context,
+                            dataptr_copy, &count, stride_copy, auxdata);
                 }
             }
-        } while (!(needs_api && PyErr_Occurred()) && iternext(iter));
+        } while (res == 0 && iternext(iter));
 
         NPY_END_THREADS;
     }
     else if (iter == NULL) {
         char *dataptr_copy[3];
-        npy_intp stride_copy[3];
 
-        int itemsize = op_dtypes[0]->elsize;
+        int itemsize = descrs[0]->elsize;
 
         npy_intp stride0_ind = PyArray_STRIDE(op[0], axis);
-
-        /* Execute the loop with no iterators */
-        npy_intp stride0 = 0, stride1 = PyArray_STRIDE(op[1], axis);
-
-        int needs_api = PyDataType_REFCHK(op_dtypes[0]);
+        npy_intp stride1 = PyArray_STRIDE(op[1], axis);
 
         NPY_UF_DBG_PRINT("UFunc: Reduce loop with no iterators\n");
 
-        stride_copy[0] = stride0;
-        stride_copy[1] = stride1;
-        stride_copy[2] = stride0;
-
         if (!needs_api) {
             NPY_BEGIN_THREADS;
         }
@@ -3729,7 +3703,7 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
              * the same memory, e.g.
              * np.add.reduceat(a, np.arange(len(a)), out=a).
              */
-            if (otype == NPY_OBJECT) {
+            if (descrs[2]->type_num == NPY_OBJECT) {
                 /*
                  * Incref before decref to avoid the possibility of the
                  * reference count being zero temporarily.
@@ -3749,8 +3723,11 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
                 dataptr_copy[1] += stride1;
                 NPY_UF_DBG_PRINT1("iterator loop count %d\n",
                                                 (int)count);
-                innerloop(dataptr_copy, &count,
-                            stride_copy, innerloopdata);
+                res = strided_loop(&context,
+                        dataptr_copy, &count, fixed_strides, auxdata);
+                if (res != 0) {
+                    break;
+                }
             }
         }
 
@@ -3758,8 +3735,21 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind,
     }
 
 finish:
-    Py_XDECREF(op_dtypes[0]);
+    NPY_AUXDATA_FREE(auxdata);
+    Py_DECREF(descrs[0]);
+    Py_DECREF(descrs[1]);
+    Py_DECREF(descrs[2]);
+
     if (!NpyIter_Deallocate(iter)) {
+        res = -1;
+    }
+
+    if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        /* NOTE: We could check float errors even when `res < 0` */
+        res = _check_ufunc_fperr(errormask, NULL, "reduceat");
+    }
+
+    if (res < 0) {
         Py_DECREF(out);
         return NULL;
     }
@@ -3768,9 +3758,14 @@ finish:
 
 fail:
     Py_XDECREF(out);
-    Py_XDECREF(op_dtypes[0]);
+
+    NPY_AUXDATA_FREE(auxdata);
+    Py_XDECREF(descrs[0]);
+    Py_XDECREF(descrs[1]);
+    Py_XDECREF(descrs[2]);
 
     NpyIter_Deallocate(iter);
+
     return NULL;
 }
 
@@ -3868,7 +3863,7 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc,
     PyArrayObject *mp = NULL, *wheremask = NULL, *ret = NULL;
     PyObject *op = NULL;
     PyArrayObject *indices = NULL;
-    PyArray_Descr *otype = NULL;
+    PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL};
     PyArrayObject *out = NULL;
     int keepdims = 0;
     PyObject *initial = NULL;
@@ -4012,13 +4007,10 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc,
     }
     if (otype_obj && otype_obj != Py_None) {
         /* Use `_get_dtype` because `dtype` is a DType and not the instance */
-        PyArray_DTypeMeta *dtype = _get_dtype(otype_obj);
-        if (dtype == NULL) {
+        signature[0] = _get_dtype(otype_obj);
+        if (signature[0] == NULL) {
             goto fail;
         }
-        otype = dtype->singleton;
-        Py_INCREF(otype);
-        Py_DECREF(dtype);
     }
     if (out_obj && !PyArray_OutputConverter(out_obj, &out)) {
         goto fail;
@@ -4038,15 +4030,6 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc,
 
     ndim = PyArray_NDIM(mp);
 
-    /* Check to see that type (and otype) is not FLEXIBLE */
-    if (PyArray_ISFLEXIBLE(mp) ||
-        (otype && PyTypeNum_ISFLEXIBLE(otype->type_num))) {
-        PyErr_Format(PyExc_TypeError,
-                     "cannot perform %s with flexible type",
-                     _reduce_type[operation]);
-        goto fail;
-    }
-
     /* Convert the 'axis' parameter into a list of axes */
     if (axes_obj == NULL) {
         /* apply defaults */
@@ -4109,14 +4092,12 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc,
     }
 
      /*
-      * If out is specified it determines otype
-      * unless otype already specified.
+      * If no dtype is specified and out is not specified, we override the
+      * integer and bool dtype used for add and multiply.
+      *
+      * TODO: The following should be handled by a promoter!
       */
-    if (otype == NULL && out != NULL) {
-        otype = PyArray_DESCR(out);
-        Py_INCREF(otype);
-    }
-    if (otype == NULL) {
+    if (signature[0] == NULL && out == NULL) {
         /*
          * For integer types --- make sure at least a long
          * is used for add and multiply reduction to avoid overflow
@@ -4136,16 +4117,17 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc,
                     typenum = NPY_LONG;
                 }
             }
+            signature[0] = PyArray_DTypeFromTypeNum(typenum);
         }
-        otype = PyArray_DescrFromType(typenum);
     }
-
+    Py_XINCREF(signature[0]);
+    signature[2] = signature[0];
 
     switch(operation) {
     case UFUNC_REDUCE:
-        ret = PyUFunc_Reduce(ufunc, mp, out, naxes, axes,
-                             otype, keepdims, initial, wheremask);
-        Py_XDECREF(wheremask);
+        ret = PyUFunc_Reduce(ufunc,
+                mp, out, naxes, axes, signature, keepdims, initial, wheremask);
+        Py_XSETREF(wheremask, NULL);
         break;
     case UFUNC_ACCUMULATE:
         if (ndim == 0) {
@@ -4157,8 +4139,8 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc,
                         "accumulate does not allow multiple axes");
             goto fail;
         }
-        ret = (PyArrayObject *)PyUFunc_Accumulate(ufunc, mp, out, axes[0],
-                                                  otype->type_num);
+        ret = (PyArrayObject *)PyUFunc_Accumulate(ufunc,
+                mp, out, axes[0], signature);
         break;
     case UFUNC_REDUCEAT:
         if (ndim == 0) {
@@ -4171,19 +4153,22 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc,
             goto fail;
         }
         ret = (PyArrayObject *)PyUFunc_Reduceat(ufunc,
-                mp, indices, out, axes[0], otype->type_num);
+                mp, indices, out, axes[0], signature);
         Py_SETREF(indices, NULL);
         break;
     }
+    if (ret == NULL) {
+        goto fail;
+    }
+
+    Py_DECREF(signature[0]);
+    Py_DECREF(signature[1]);
+    Py_DECREF(signature[2]);
+
     Py_DECREF(mp);
-    Py_DECREF(otype);
     Py_XDECREF(full_args.in);
     Py_XDECREF(full_args.out);
 
-    if (ret == NULL) {
-        return NULL;
-    }
-
     /* Wrap and return the output */
     {
         /* Find __array_wrap__ - note that these rules are different to the
@@ -4211,7 +4196,10 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc,
     }
 
 fail:
-    Py_XDECREF(otype);
+    Py_XDECREF(signature[0]);
+    Py_XDECREF(signature[1]);
+    Py_XDECREF(signature[2]);
+
     Py_XDECREF(mp);
     Py_XDECREF(wheremask);
     Py_XDECREF(indices);
@@ -4938,65 +4926,6 @@ fail:
 
 
 /*
- * TODO: The implementation below can be replaced with PyVectorcall_Call
- *       when available (should be Python 3.8+).
- */
-static PyObject *
-ufunc_generic_call(
-        PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
-{
-    Py_ssize_t len_args = PyTuple_GET_SIZE(args);
-    /*
-     * Wrapper for tp_call to tp_fastcall, to support both on older versions
-     * of Python. (and generally simplifying support of both versions in the
-     * same codebase.
-     */
-    if (kwds == NULL) {
-        return ufunc_generic_fastcall(ufunc,
-                PySequence_Fast_ITEMS(args), len_args, NULL, NPY_FALSE);
-    }
-
-    PyObject *new_args[NPY_MAXARGS];
-    Py_ssize_t len_kwds = PyDict_Size(kwds);
-
-    if (NPY_UNLIKELY(len_args + len_kwds > NPY_MAXARGS)) {
-        /*
-         * We do not have enough scratch-space, so we have to abort;
-         * In practice this error should not be seen by users.
-         */
-        PyErr_Format(PyExc_ValueError,
-                "%s() takes from %d to %d positional arguments but "
-                "%zd were given",
-                ufunc_get_name_cstr(ufunc) , ufunc->nin, ufunc->nargs, len_args);
-        return NULL;
-    }
-
-    /* Copy args into the scratch space */
-    for (Py_ssize_t i = 0; i < len_args; i++) {
-        new_args[i] = PyTuple_GET_ITEM(args, i);
-    }
-
-    PyObject *kwnames = PyTuple_New(len_kwds);
-
-    PyObject *key, *value;
-    Py_ssize_t pos = 0;
-    Py_ssize_t i = 0;
-    while (PyDict_Next(kwds, &pos, &key, &value)) {
-        Py_INCREF(key);
-        PyTuple_SET_ITEM(kwnames, i, key);
-        new_args[i + len_args] = value;
-        i++;
-    }
-
-    PyObject *res = ufunc_generic_fastcall(ufunc,
-            new_args, len_args, kwnames, NPY_FALSE);
-    Py_DECREF(kwnames);
-    return res;
-}
-
-
-#if PY_VERSION_HEX >= 0x03080000
-/*
  * Implement vectorcallfunc which should be defined with Python 3.8+.
  * In principle this could be backported, but the speed gain seems moderate
  * since ufunc calls often do not have keyword arguments and always have
@@ -5013,7 +4942,6 @@ ufunc_generic_vectorcall(PyObject *ufunc,
     return ufunc_generic_fastcall((PyUFuncObject *)ufunc,
             args, PyVectorcall_NARGS(len_args), kwnames, NPY_FALSE);
 }
-#endif  /* PY_VERSION_HEX >= 0x03080000 */
 
 
 NPY_NO_EXPORT PyObject *
@@ -5190,11 +5118,7 @@ PyUFunc_FromFuncAndDataAndSignatureAndIdentity(PyUFuncGenericFunction *func, voi
     ufunc->core_dim_flags = NULL;
     ufunc->userloops = NULL;
     ufunc->ptr = NULL;
-#if PY_VERSION_HEX >= 0x03080000
     ufunc->vectorcall = &ufunc_generic_vectorcall;
-#else
-    ufunc->reserved2 = NULL;
-#endif
     ufunc->reserved1 = 0;
     ufunc->iter_flags = 0;
 
@@ -5892,15 +5816,13 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
     PyArrayObject *op2_array = NULL;
     PyArrayMapIterObject *iter = NULL;
     PyArrayIterObject *iter2 = NULL;
-    PyArray_Descr *dtypes[3] = {NULL, NULL, NULL};
     PyArrayObject *operands[3] = {NULL, NULL, NULL};
     PyArrayObject *array_operands[3] = {NULL, NULL, NULL};
 
-    int needs_api = 0;
+    PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL};
+    PyArray_DTypeMeta *operand_DTypes[3] = {NULL, NULL, NULL};
+    PyArray_Descr *operation_descrs[3] = {NULL, NULL, NULL};
 
-    PyUFuncGenericFunction innerloop;
-    void *innerloopdata;
-    npy_intp i;
     int nop;
 
     /* override vars */
@@ -5913,6 +5835,10 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
     int buffersize;
     int errormask = 0;
     char * err_msg = NULL;
+
+    PyArrayMethod_StridedLoop *strided_loop;
+    NpyAuxData *auxdata = NULL;
+
     NPY_BEGIN_THREADS_DEF;
 
     if (ufunc->nin > 2) {
@@ -6000,26 +5926,51 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
 
     /*
      * Create dtypes array for either one or two input operands.
-     * The output operand is set to the first input operand
+     * Compare to the logic in `convert_ufunc_arguments`.
+     * TODO: It may be good to review some of this behaviour, since the
+     *       operand array is special (it is written to) similar to reductions.
+     *       Using unsafe-casting as done here, is likely not desirable.
      */
     operands[0] = op1_array;
+    operand_DTypes[0] = NPY_DTYPE(PyArray_DESCR(op1_array));
+    Py_INCREF(operand_DTypes[0]);
+    int force_legacy_promotion = 0;
+    int allow_legacy_promotion = NPY_DT_is_legacy(operand_DTypes[0]);
+
     if (op2_array != NULL) {
         operands[1] = op2_array;
-        operands[2] = op1_array;
+        operand_DTypes[1] = NPY_DTYPE(PyArray_DESCR(op2_array));
+        Py_INCREF(operand_DTypes[1]);
+        allow_legacy_promotion &= NPY_DT_is_legacy(operand_DTypes[1]);
+        operands[2] = operands[0];
+        operand_DTypes[2] = operand_DTypes[0];
+        Py_INCREF(operand_DTypes[2]);
+
         nop = 3;
+        if (allow_legacy_promotion && ((PyArray_NDIM(op1_array) == 0)
+                                       != (PyArray_NDIM(op2_array) == 0))) {
+                /* both are legacy and only one is 0-D: force legacy */
+                force_legacy_promotion = should_use_min_scalar(2, operands, 0, NULL);
+            }
     }
     else {
-        operands[1] = op1_array;
+        operands[1] = operands[0];
+        operand_DTypes[1] = operand_DTypes[0];
+        Py_INCREF(operand_DTypes[1]);
         operands[2] = NULL;
         nop = 2;
     }
 
-    if (ufunc->type_resolver(ufunc, NPY_UNSAFE_CASTING,
-                            operands, NULL, dtypes) < 0) {
+    PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc,
+            operands, signature, operand_DTypes,
+            force_legacy_promotion, allow_legacy_promotion);
+    if (ufuncimpl == NULL) {
         goto fail;
     }
-    if (ufunc->legacy_inner_loop_selector(ufunc, dtypes,
-        &innerloop, &innerloopdata, &needs_api) < 0) {
+
+    /* Find the correct descriptors for the operation */
+    if (resolve_descriptors(nop, ufunc, ufuncimpl,
+            operands, operation_descrs, signature, NPY_UNSAFE_CASTING) < 0) {
         goto fail;
     }
 
@@ -6080,21 +6031,44 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
                         NPY_ITER_GROWINNER|
                         NPY_ITER_DELAY_BUFALLOC,
                         NPY_KEEPORDER, NPY_UNSAFE_CASTING,
-                        op_flags, dtypes,
+                        op_flags, operation_descrs,
                         -1, NULL, NULL, buffersize);
 
     if (iter_buffer == NULL) {
         goto fail;
     }
 
-    needs_api = needs_api | NpyIter_IterationNeedsAPI(iter_buffer);
-
     iternext = NpyIter_GetIterNext(iter_buffer, NULL);
     if (iternext == NULL) {
         NpyIter_Deallocate(iter_buffer);
         goto fail;
     }
 
+    PyArrayMethod_Context context = {
+            .caller = (PyObject *)ufunc,
+            .method = ufuncimpl,
+            .descriptors = operation_descrs,
+    };
+
+    NPY_ARRAYMETHOD_FLAGS flags;
+    /* Use contiguous strides; if there is such a loop it may be faster */
+    npy_intp strides[3] = {
+            operation_descrs[0]->elsize, operation_descrs[1]->elsize, 0};
+    if (nop == 3) {
+        strides[2] = operation_descrs[2]->elsize;
+    }
+
+    if (ufuncimpl->get_strided_loop(&context, 1, 0, strides,
+            &strided_loop, &auxdata, &flags) < 0) {
+        goto fail;
+    }
+    int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0;
+    needs_api |= NpyIter_IterationNeedsAPI(iter_buffer);
+    if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        /* Start with the floating-point exception flags cleared */
+        npy_clear_floatstatus_barrier((char*)&iter);
+    }
+
     if (!needs_api) {
         NPY_BEGIN_THREADS;
     }
@@ -6103,14 +6077,13 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
      * Iterate over first and second operands and call ufunc
      * for each pair of inputs
      */
-    i = iter->size;
-    while (i > 0)
+    int res = 0;
+    for (npy_intp i = iter->size; i > 0; i--)
     {
         char *dataptr[3];
         char **buffer_dataptr;
         /* one element at a time, no stride required but read by innerloop */
-        npy_intp count[3] = {1, 0xDEADBEEF, 0xDEADBEEF};
-        npy_intp stride[3] = {0xDEADBEEF, 0xDEADBEEF, 0xDEADBEEF};
+        npy_intp count = 1;
 
         /*
          * Set up data pointers for either one or two input operands.
@@ -6129,14 +6102,14 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
         /* Reset NpyIter data pointers which will trigger a buffer copy */
         NpyIter_ResetBasePointers(iter_buffer, dataptr, &err_msg);
         if (err_msg) {
+            res = -1;
             break;
         }
 
         buffer_dataptr = NpyIter_GetDataPtrArray(iter_buffer);
 
-        innerloop(buffer_dataptr, count, stride, innerloopdata);
-
-        if (needs_api && PyErr_Occurred()) {
+        res = strided_loop(&context, buffer_dataptr, &count, strides, auxdata);
+        if (res != 0) {
             break;
         }
 
@@ -6150,32 +6123,35 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
         if (iter2 != NULL) {
             PyArray_ITER_NEXT(iter2);
         }
-
-        i--;
     }
 
     NPY_END_THREADS;
 
-    if (err_msg) {
+    if (res != 0 && err_msg) {
         PyErr_SetString(PyExc_ValueError, err_msg);
     }
+    if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) {
+        /* NOTE: We could check float errors even when `res < 0` */
+        res = _check_ufunc_fperr(errormask, NULL, "at");
+    }
 
+    NPY_AUXDATA_FREE(auxdata);
     NpyIter_Deallocate(iter_buffer);
 
     Py_XDECREF(op2_array);
     Py_XDECREF(iter);
     Py_XDECREF(iter2);
-    for (i = 0; i < 3; i++) {
-        Py_XDECREF(dtypes[i]);
+    for (int i = 0; i < 3; i++) {
+        Py_XDECREF(operation_descrs[i]);
         Py_XDECREF(array_operands[i]);
     }
 
     /*
-     * An error should only be possible if needs_api is true, but this is not
-     * strictly correct for old-style ufuncs (e.g. `power` released the GIL
-     * but manually set an Exception).
+     * An error should only be possible if needs_api is true or `res != 0`,
+     * but this is not strictly correct for old-style ufuncs
+     * (e.g. `power` released the GIL but manually set an Exception).
      */
-    if (PyErr_Occurred()) {
+    if (res != 0 || PyErr_Occurred()) {
         return NULL;
     }
     else {
@@ -6190,10 +6166,11 @@ fail:
     Py_XDECREF(op2_array);
     Py_XDECREF(iter);
     Py_XDECREF(iter2);
-    for (i = 0; i < 3; i++) {
-        Py_XDECREF(dtypes[i]);
+    for (int i = 0; i < 3; i++) {
+        Py_XDECREF(operation_descrs[i]);
         Py_XDECREF(array_operands[i]);
     }
+    NPY_AUXDATA_FREE(auxdata);
 
     return NULL;
 }
@@ -6396,19 +6373,15 @@ NPY_NO_EXPORT PyTypeObject PyUFunc_Type = {
     .tp_basicsize = sizeof(PyUFuncObject),
     .tp_dealloc = (destructor)ufunc_dealloc,
     .tp_repr = (reprfunc)ufunc_repr,
-    .tp_call = (ternaryfunc)ufunc_generic_call,
+    .tp_call = &PyVectorcall_Call,
     .tp_str = (reprfunc)ufunc_repr,
     .tp_flags = Py_TPFLAGS_DEFAULT |
-#if PY_VERSION_HEX >= 0x03080000
         _Py_TPFLAGS_HAVE_VECTORCALL |
-#endif
         Py_TPFLAGS_HAVE_GC,
     .tp_traverse = (traverseproc)ufunc_traverse,
     .tp_methods = ufunc_methods,
     .tp_getset = ufunc_getset,
-#if PY_VERSION_HEX >= 0x03080000
     .tp_vectorcall_offset = offsetof(PyUFuncObject, vectorcall),
-#endif
 };
 
 /* End of code for ufunc objects */
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index 7e24bc493..9ed923cf5 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -247,6 +247,28 @@ PyUFunc_ValidateCasting(PyUFuncObject *ufunc,
 }
 
 
+/*
+ * Same as `PyUFunc_ValidateCasting` but only checks output casting.
+ */
+NPY_NO_EXPORT int
+PyUFunc_ValidateOutCasting(PyUFuncObject *ufunc,
+        NPY_CASTING casting, PyArrayObject **operands, PyArray_Descr **dtypes)
+{
+    int i, nin = ufunc->nin, nop = nin + ufunc->nout;
+
+    for (i = nin; i < nop; ++i) {
+        if (operands[i] == NULL) {
+            continue;
+        }
+        if (!PyArray_CanCastTypeTo(dtypes[i],
+                PyArray_DESCR(operands[i]), casting)) {
+            return raise_output_casting_error(
+                    ufunc, casting, dtypes[i], PyArray_DESCR(operands[i]), i);
+        }
+    }
+    return 0;
+}
+
 /*UFUNC_API
  *
  * This function applies the default type resolution rules
@@ -2142,6 +2164,10 @@ type_tuple_type_resolver(PyUFuncObject *self,
      * `signature=(None,)*nin + (dtype,)*nout`.  If the signature matches that
      * exactly (could be relaxed but that is not necessary for backcompat),
      * we also try `signature=(dtype,)*(nin+nout)`.
+     * Since reduction pass in `(dtype, None, dtype)` we broaden this to
+     * replacing all unspecified dtypes with the homogeneous output one.
+     * Note that this can (and often will) lead to unsafe casting.  This is
+     * normally rejected (but not currently for reductions!).
      * This used to be the main meaning for `dtype=dtype`, but some calls broke
      * the expectation, and changing it allows for `dtype=dtype` to be useful
      * for ufuncs like `np.ldexp` in the future while also normalizing it to
@@ -2160,13 +2186,12 @@ type_tuple_type_resolver(PyUFuncObject *self,
     if (homogeneous_type != NPY_NOTYPE) {
         for (int i = 0; i < nin; i++) {
             if (specified_types[i] != NPY_NOTYPE) {
-                homogeneous_type = NPY_NOTYPE;
-                break;
+                /* Never replace a specified type! */
+                continue;
             }
             specified_types[i] = homogeneous_type;
         }
-    }
-    if (homogeneous_type != NPY_NOTYPE) {
+
         /* Try again with the homogeneous specified types. */
         res = type_tuple_type_resolver_core(self,
                 op, input_casting, casting, specified_types, any_object,
diff --git a/numpy/core/src/umath/ufunc_type_resolution.h b/numpy/core/src/umath/ufunc_type_resolution.h
index dd88a081a..84a2593f4 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.h
+++ b/numpy/core/src/umath/ufunc_type_resolution.h
@@ -99,6 +99,10 @@ PyUFunc_DivmodTypeResolver(PyUFuncObject *ufunc,
                               PyObject *type_tup,
                               PyArray_Descr **out_dtypes);
 
+NPY_NO_EXPORT int
+PyUFunc_ValidateOutCasting(PyUFuncObject *ufunc,
+        NPY_CASTING casting, PyArrayObject **operands, PyArray_Descr **dtypes);
+
 /*
  * Does a linear search for the best inner loop of the ufunc.
  *
diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c
index a9954dfc1..272555704 100644
--- a/numpy/core/src/umath/umathmodule.c
+++ b/numpy/core/src/umath/umathmodule.c
@@ -22,6 +22,7 @@
 
 #include "numpy/npy_math.h"
 #include "number.h"
+#include "dispatching.h"
 
 static PyUFuncGenericFunction pyfunc_functions[] = {PyUFunc_On_Om};
 
@@ -305,5 +306,33 @@ int initumath(PyObject *m)
         return -1;
     }
 
+    /*
+     * Set up promoters for logical functions
+     * TODO: This should probably be done at a better place, or even in the
+     *       code generator directly.
+     */
+    s = _PyDict_GetItemStringWithError(d, "logical_and");
+    if (s == NULL) {
+        return -1;
+    }
+    if (install_logical_ufunc_promoter(s) < 0) {
+        return -1;
+    }
+
+    s = _PyDict_GetItemStringWithError(d, "logical_or");
+    if (s == NULL) {
+        return -1;
+    }
+    if (install_logical_ufunc_promoter(s) < 0) {
+        return -1;
+    }
+
+    s = _PyDict_GetItemStringWithError(d, "logical_xor");
+    if (s == NULL) {
+        return -1;
+    }
+    if (install_logical_ufunc_promoter(s) < 0) {
+        return -1;
+    }
     return 0;
 }
diff --git a/numpy/core/tests/data/generate_umath_validation_data.cpp b/numpy/core/tests/data/generate_umath_validation_data.cpp
index 9d97ff4ab..418eae670 100644
--- a/numpy/core/tests/data/generate_umath_validation_data.cpp
+++ b/numpy/core/tests/data/generate_umath_validation_data.cpp
@@ -1,41 +1,46 @@
-#include<math.h>
-#include<stdio.h>
-#include<iostream>
-#include<algorithm>
-#include<vector>
-#include<random>
-#include<fstream>
-#include<time.h>
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <math.h>
+#include <random>
+#include <stdio.h>
+#include <time.h>
+#include <vector>
 
 struct ufunc {
     std::string name;
-    double (*f32func) (double);
-    long double (*f64func) (long double);
+    double (*f32func)(double);
+    long double (*f64func)(long double);
     float f32ulp;
     float f64ulp;
 };
 
-template<typename T>
-T RandomFloat(T a, T b) {
-    T random = ((T) rand()) / (T) RAND_MAX;
+template <typename T>
+T
+RandomFloat(T a, T b)
+{
+    T random = ((T)rand()) / (T)RAND_MAX;
     T diff = b - a;
     T r = random * diff;
     return a + r;
 }
 
-template<typename T>
-void append_random_array(std::vector<T>& arr, T min, T max, size_t N)
+template <typename T>
+void
+append_random_array(std::vector<T> &arr, T min, T max, size_t N)
 {
     for (size_t ii = 0; ii < N; ++ii)
         arr.emplace_back(RandomFloat<T>(min, max));
 }
 
-template<typename T1, typename T2>
-std::vector<T1> computeTrueVal(const std::vector<T1>& in, T2(*mathfunc)(T2)) {
+template <typename T1, typename T2>
+std::vector<T1>
+computeTrueVal(const std::vector<T1> &in, T2 (*mathfunc)(T2))
+{
     std::vector<T1> out;
     for (T1 elem : in) {
-        T2 elem_d = (T2) elem;
-        T1 out_elem = (T1) mathfunc(elem_d);
+        T2 elem_d = (T2)elem;
+        T1 out_elem = (T1)mathfunc(elem_d);
         out.emplace_back(out_elem);
     }
     return out;
@@ -49,17 +54,20 @@ std::vector<T1> computeTrueVal(const std::vector<T1>& in, T2(*mathfunc)(T2)) {
 #define MINDEN std::numeric_limits<T>::denorm_min()
 #define MINFLT std::numeric_limits<T>::min()
 #define MAXFLT std::numeric_limits<T>::max()
-#define INF    std::numeric_limits<T>::infinity()
-#define qNAN   std::numeric_limits<T>::quiet_NaN()
-#define sNAN   std::numeric_limits<T>::signaling_NaN()
+#define INF std::numeric_limits<T>::infinity()
+#define qNAN std::numeric_limits<T>::quiet_NaN()
+#define sNAN std::numeric_limits<T>::signaling_NaN()
 
-template<typename T>
-std::vector<T> generate_input_vector(std::string func) {
-    std::vector<T> input = {MINDEN, -MINDEN, MINFLT, -MINFLT, MAXFLT, -MAXFLT,
-                            INF, -INF, qNAN, sNAN, -1.0, 1.0, 0.0, -0.0};
+template <typename T>
+std::vector<T>
+generate_input_vector(std::string func)
+{
+    std::vector<T> input = {MINDEN,  -MINDEN, MINFLT, -MINFLT, MAXFLT,
+                            -MAXFLT, INF,     -INF,   qNAN,    sNAN,
+                            -1.0,    1.0,     0.0,    -0.0};
 
     // [-1.0, 1.0]
-    if ((func == "arcsin") || (func == "arccos") || (func == "arctanh")){
+    if ((func == "arcsin") || (func == "arccos") || (func == "arctanh")) {
         append_random_array<T>(input, -1.0, 1.0, 700);
     }
     // (0.0, INF]
@@ -98,57 +106,62 @@ std::vector<T> generate_input_vector(std::string func) {
     return input;
 }
 
-int main() {
-    srand (42);
+int
+main()
+{
+    srand(42);
     std::vector<struct ufunc> umathfunc = {
-        {"sin",sin,sin,2.37,3.3},
-        {"cos",cos,cos,2.36,3.38},
-        {"tan",tan,tan,3.91,3.93},
-        {"arcsin",asin,asin,3.12,2.55},
-        {"arccos",acos,acos,2.1,1.67},
-        {"arctan",atan,atan,2.3,2.52},
-        {"sinh",sinh,sinh,1.55,1.89},
-        {"cosh",cosh,cosh,2.48,1.97},
-        {"tanh",tanh,tanh,1.38,1.19},
-        {"arcsinh",asinh,asinh,1.01,1.48},
-        {"arccosh",acosh,acosh,1.16,1.05},
-        {"arctanh",atanh,atanh,1.45,1.46},
-        {"cbrt",cbrt,cbrt,1.94,1.82},
-        //{"exp",exp,exp,3.76,1.53},
-        {"exp2",exp2,exp2,1.01,1.04},
-        {"expm1",expm1,expm1,2.62,2.1},
-        //{"log",log,log,1.84,1.67},
-        {"log10",log10,log10,3.5,1.92},
-        {"log1p",log1p,log1p,1.96,1.93},
-        {"log2",log2,log2,2.12,1.84},
+            {"sin", sin, sin, 2.37, 3.3},
+            {"cos", cos, cos, 2.36, 3.38},
+            {"tan", tan, tan, 3.91, 3.93},
+            {"arcsin", asin, asin, 3.12, 2.55},
+            {"arccos", acos, acos, 2.1, 1.67},
+            {"arctan", atan, atan, 2.3, 2.52},
+            {"sinh", sinh, sinh, 1.55, 1.89},
+            {"cosh", cosh, cosh, 2.48, 1.97},
+            {"tanh", tanh, tanh, 1.38, 1.19},
+            {"arcsinh", asinh, asinh, 1.01, 1.48},
+            {"arccosh", acosh, acosh, 1.16, 1.05},
+            {"arctanh", atanh, atanh, 1.45, 1.46},
+            {"cbrt", cbrt, cbrt, 1.94, 1.82},
+            //{"exp",exp,exp,3.76,1.53},
+            {"exp2", exp2, exp2, 1.01, 1.04},
+            {"expm1", expm1, expm1, 2.62, 2.1},
+            //{"log",log,log,1.84,1.67},
+            {"log10", log10, log10, 3.5, 1.92},
+            {"log1p", log1p, log1p, 1.96, 1.93},
+            {"log2", log2, log2, 2.12, 1.84},
     };
 
     for (int ii = 0; ii < umathfunc.size(); ++ii) {
-         // ignore sin/cos
+        // ignore sin/cos
         if ((umathfunc[ii].name != "sin") && (umathfunc[ii].name != "cos")) {
-            std::string fileName = "umath-validation-set-" + umathfunc[ii].name + ".csv";
+            std::string fileName =
+                    "umath-validation-set-" + umathfunc[ii].name + ".csv";
             std::ofstream txtOut;
-            txtOut.open (fileName, std::ofstream::trunc);
+            txtOut.open(fileName, std::ofstream::trunc);
             txtOut << "dtype,input,output,ulperrortol" << std::endl;
 
             // Single Precision
             auto f32in = generate_input_vector<float>(umathfunc[ii].name);
-            auto f32out = computeTrueVal<float, double>(f32in, umathfunc[ii].f32func);
+            auto f32out = computeTrueVal<float, double>(f32in,
+                                                        umathfunc[ii].f32func);
             for (int jj = 0; jj < f32in.size(); ++jj) {
-                txtOut << "np.float32" << std::hex <<
-                          ",0x" << *reinterpret_cast<uint32_t*>(&f32in[jj]) <<
-                          ",0x" << *reinterpret_cast<uint32_t*>(&f32out[jj]) <<
-                          "," << ceil(umathfunc[ii].f32ulp) << std::endl;
+                txtOut << "np.float32" << std::hex << ",0x"
+                       << *reinterpret_cast<uint32_t *>(&f32in[jj]) << ",0x"
+                       << *reinterpret_cast<uint32_t *>(&f32out[jj]) << ","
+                       << ceil(umathfunc[ii].f32ulp) << std::endl;
             }
 
             // Double Precision
             auto f64in = generate_input_vector<double>(umathfunc[ii].name);
-            auto f64out = computeTrueVal<double, long double>(f64in, umathfunc[ii].f64func);
+            auto f64out = computeTrueVal<double, long double>(
+                    f64in, umathfunc[ii].f64func);
             for (int jj = 0; jj < f64in.size(); ++jj) {
-                txtOut << "np.float64" << std::hex <<
-                          ",0x" << *reinterpret_cast<uint64_t*>(&f64in[jj]) <<
-                          ",0x" << *reinterpret_cast<uint64_t*>(&f64out[jj]) <<
-                          "," << ceil(umathfunc[ii].f64ulp) << std::endl;
+                txtOut << "np.float64" << std::hex << ",0x"
+                       << *reinterpret_cast<uint64_t *>(&f64in[jj]) << ",0x"
+                       << *reinterpret_cast<uint64_t *>(&f64out[jj]) << ","
+                       << ceil(umathfunc[ii].f64ulp) << std::endl;
             }
             txtOut.close();
         }
diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py
index 291cdae89..d3c7211cd 100644
--- a/numpy/core/tests/test_api.py
+++ b/numpy/core/tests/test_api.py
@@ -598,3 +598,31 @@ def test_broadcast_arrays():
 def test_full_from_list(shape, fill_value, expected_output):
     output = np.full(shape, fill_value)
     assert_equal(output, expected_output)
+
+def test_astype_copyflag():
+    # test the various copyflag options
+    arr = np.arange(10, dtype=np.intp)
+
+    res_true = arr.astype(np.intp, copy=True)
+    assert not np.may_share_memory(arr, res_true)
+    res_always = arr.astype(np.intp, copy=np._CopyMode.ALWAYS)
+    assert not np.may_share_memory(arr, res_always)
+
+    res_false = arr.astype(np.intp, copy=False)
+    # `res_false is arr` currently, but check `may_share_memory`.
+    assert np.may_share_memory(arr, res_false)
+    res_if_needed = arr.astype(np.intp, copy=np._CopyMode.IF_NEEDED)
+    # `res_if_needed is arr` currently, but check `may_share_memory`.
+    assert np.may_share_memory(arr, res_if_needed)
+
+    res_never = arr.astype(np.intp, copy=np._CopyMode.NEVER)
+    assert np.may_share_memory(arr, res_never)
+
+    # Simple tests for when a copy is necessary:
+    res_false = arr.astype(np.float64, copy=False)
+    assert_array_equal(res_false, arr)
+    res_if_needed = arr.astype(np.float64, 
+                               copy=np._CopyMode.IF_NEEDED)
+    assert_array_equal(res_if_needed, arr)
+    assert_raises(ValueError, arr.astype, np.float64,
+                  copy=np._CopyMode.NEVER)
diff --git a/numpy/core/tests/test_casting_unittests.py b/numpy/core/tests/test_casting_unittests.py
index b0d8ff503..cb4792090 100644
--- a/numpy/core/tests/test_casting_unittests.py
+++ b/numpy/core/tests/test_casting_unittests.py
@@ -9,7 +9,6 @@ than integration tests.
 import pytest
 import textwrap
 import enum
-import itertools
 import random
 
 import numpy as np
diff --git a/numpy/core/tests/test_custom_dtypes.py b/numpy/core/tests/test_custom_dtypes.py
index 5eb82bc93..6bcc45d6b 100644
--- a/numpy/core/tests/test_custom_dtypes.py
+++ b/numpy/core/tests/test_custom_dtypes.py
@@ -101,18 +101,52 @@ class TestSFloat:
         expected_view = a.view(np.float64) * b.view(np.float64)
         assert_array_equal(res.view(np.float64), expected_view)
 
+    def test_possible_and_impossible_reduce(self):
+        # For reductions to work, the first and last operand must have the
+        # same dtype.  For this parametric DType that is not necessarily true.
+        a = self._get_array(2.)
+        # Addition reductin works (as of writing requires to pass initial
+        # because setting a scaled-float from the default `0` fails).
+        res = np.add.reduce(a, initial=0.)
+        assert res == a.astype(np.float64).sum()
+
+        # But each multiplication changes the factor, so a reduction is not
+        # possible (the relaxed version of the old refusal to handle any
+        # flexible dtype).
+        with pytest.raises(TypeError,
+                match="the resolved dtypes are not compatible"):
+            np.multiply.reduce(a)
+
+    def test_basic_ufunc_at(self):
+        float_a = np.array([1., 2., 3.])
+        b = self._get_array(2.)
+
+        float_b = b.view(np.float64).copy()
+        np.multiply.at(float_b, [1, 1, 1], float_a)
+        np.multiply.at(b, [1, 1, 1], float_a)
+
+        assert_array_equal(b.view(np.float64), float_b)
+
     def test_basic_multiply_promotion(self):
         float_a = np.array([1., 2., 3.])
         b = self._get_array(2.)
 
         res1 = float_a * b
         res2 = b * float_a
+
         # one factor is one, so we get the factor of b:
         assert res1.dtype == res2.dtype == b.dtype
         expected_view = float_a * b.view(np.float64)
         assert_array_equal(res1.view(np.float64), expected_view)
         assert_array_equal(res2.view(np.float64), expected_view)
 
+        # Check that promotion works when `out` is used:
+        np.multiply(b, float_a, out=res2)
+        with pytest.raises(TypeError):
+            # The promoter accepts this (maybe it should not), but the SFloat
+            # result cannot be cast to integer:
+            np.multiply(b, float_a, out=np.arange(3))
+
     def test_basic_addition(self):
         a = self._get_array(2.)
         b = self._get_array(4.)
@@ -145,3 +179,23 @@ class TestSFloat:
         # Check that casting the output fails also (done by the ufunc here)
         with pytest.raises(TypeError):
             np.add(a, a, out=c, casting="safe")
+
+    @pytest.mark.parametrize("ufunc",
+            [np.logical_and, np.logical_or, np.logical_xor])
+    def test_logical_ufuncs_casts_to_bool(self, ufunc):
+        a = self._get_array(2.)
+        a[0] = 0.  # make sure first element is considered False.
+
+        float_equiv = a.astype(float)
+        expected = ufunc(float_equiv, float_equiv)
+        res = ufunc(a, a)
+        assert_array_equal(res, expected)
+
+        # also check that the same works for reductions:
+        expected = ufunc.reduce(float_equiv)
+        res = ufunc.reduce(a)
+        assert_array_equal(res, expected)
+
+        # The output casting does not match the bool, bool -> bool loop:
+        with pytest.raises(TypeError):
+            ufunc(a, a, out=np.empty(a.shape, dtype=int), casting="equiv")
diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py
index 69eba7ba0..b95d669a8 100644
--- a/numpy/core/tests/test_datetime.py
+++ b/numpy/core/tests/test_datetime.py
@@ -2029,6 +2029,21 @@ class TestDateTime:
         assert_equal(np.maximum.reduce(a),
                      np.timedelta64(7, 's'))
 
+    def test_datetime_no_subtract_reducelike(self):
+        # subtracting two datetime64 works, but we cannot reduce it, since
+        # the result of that subtraction will have a different dtype.
+        arr = np.array(["2021-12-02", "2019-05-12"], dtype="M8[ms]")
+        msg = r"the resolved dtypes are not compatible with subtract\."
+
+        with pytest.raises(TypeError, match=msg + "reduce"):
+            np.subtract.reduce(arr)
+
+        with pytest.raises(TypeError, match=msg + "accumulate"):
+            np.subtract.accumulate(arr)
+
+        with pytest.raises(TypeError, match=msg + "reduceat"):
+            np.subtract.reduceat(arr, [0])
+
     def test_datetime_busday_offset(self):
         # First Monday in June
         assert_equal(
diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py
index 898ff8075..e0b66defc 100644
--- a/numpy/core/tests/test_deprecations.py
+++ b/numpy/core/tests/test_deprecations.py
@@ -13,7 +13,8 @@ import sys
 
 import numpy as np
 from numpy.testing import (
-    assert_raises, assert_warns, assert_, assert_array_equal, SkipTest, KnownFailureException
+    assert_raises, assert_warns, assert_, assert_array_equal, SkipTest,
+    KnownFailureException, break_cycles,
     )
 
 from numpy.core._multiarray_tests import fromstring_null_term_c_api
@@ -1215,3 +1216,57 @@ class TestPartitionBoolIndex(_DeprecationTestCase):
     def test_not_deprecated(self, func):
         self.assert_not_deprecated(lambda: func(1))
         self.assert_not_deprecated(lambda: func([0, 1]))
+
+
+class TestMachAr(_DeprecationTestCase):
+    # Deprecated 2021-10-19, NumPy 1.22
+    warning_cls = DeprecationWarning
+
+    def test_deprecated(self):
+        self.assert_deprecated(lambda: np.MachAr)
+
+    def test_deprecated_module(self):
+        self.assert_deprecated(lambda: getattr(np.core, "machar"))
+
+    def test_deprecated_attr(self):
+        finfo = np.finfo(float)
+        self.assert_deprecated(lambda: getattr(finfo, "machar"))
+
+
+class TestQuantileInterpolationDeprecation(_DeprecationTestCase):
+    # Deprecated 2021-11-08, NumPy 1.22
+    @pytest.mark.parametrize("func",
+        [np.percentile, np.quantile, np.nanpercentile, np.nanquantile])
+    def test_deprecated(self, func):
+        self.assert_deprecated(
+            lambda: func([0., 1.], 0., interpolation="linear"))
+        self.assert_deprecated(
+            lambda: func([0., 1.], 0., interpolation="nearest"))
+
+    @pytest.mark.parametrize("func",
+            [np.percentile, np.quantile, np.nanpercentile, np.nanquantile])
+    def test_both_passed(self, func):
+        with warnings.catch_warnings():
+            # catch the DeprecationWarning so that it does not raise:
+            warnings.simplefilter("always", DeprecationWarning)
+            with pytest.raises(TypeError):
+                func([0., 1.], 0., interpolation="nearest", method="nearest")
+
+
+class TestMemEventHook(_DeprecationTestCase):
+    # Deprecated 2021-11-18, NumPy 1.23
+    def test_mem_seteventhook(self):
+        # The actual tests are within the C code in
+        # multiarray/_multiarray_tests.c.src
+        import numpy.core._multiarray_tests as ma_tests
+        with pytest.warns(DeprecationWarning,
+                          match='PyDataMem_SetEventHook is deprecated'):
+            ma_tests.test_pydatamem_seteventhook_start()
+        # force an allocation and free of a numpy array
+        # needs to be larger then limit of small memory cacher in ctors.c
+        a = np.zeros(1000)
+        del a
+        break_cycles()
+        with pytest.warns(DeprecationWarning,
+                          match='PyDataMem_SetEventHook is deprecated'):
+            ma_tests.test_pydatamem_seteventhook_end()
diff --git a/numpy/core/tests/test_dlpack.py b/numpy/core/tests/test_dlpack.py
new file mode 100644
index 000000000..f848b2008
--- /dev/null
+++ b/numpy/core/tests/test_dlpack.py
@@ -0,0 +1,109 @@
+import sys
+import pytest
+
+import numpy as np
+from numpy.testing import assert_array_equal, IS_PYPY
+
+
+class TestDLPack:
+    @pytest.mark.skipif(IS_PYPY, reason="PyPy can't get refcounts.")
+    def test_dunder_dlpack_refcount(self):
+        x = np.arange(5)
+        y = x.__dlpack__()
+        assert sys.getrefcount(x) == 3
+        del y
+        assert sys.getrefcount(x) == 2
+
+    def test_dunder_dlpack_stream(self):
+        x = np.arange(5)
+        x.__dlpack__(stream=None)
+
+        with pytest.raises(RuntimeError):
+            x.__dlpack__(stream=1)
+
+    def test_strides_not_multiple_of_itemsize(self):
+        dt = np.dtype([('int', np.int32), ('char', np.int8)])
+        y = np.zeros((5,), dtype=dt)
+        z = y['int']
+
+        with pytest.raises(RuntimeError):
+            np._from_dlpack(z)
+
+    @pytest.mark.skipif(IS_PYPY, reason="PyPy can't get refcounts.")
+    def test_from_dlpack_refcount(self):
+        x = np.arange(5)
+        y = np._from_dlpack(x)
+        assert sys.getrefcount(x) == 3
+        del y
+        assert sys.getrefcount(x) == 2
+
+    @pytest.mark.parametrize("dtype", [
+        np.int8, np.int16, np.int32, np.int64,
+        np.uint8, np.uint16, np.uint32, np.uint64,
+        np.float16, np.float32, np.float64,
+        np.complex64, np.complex128
+    ])
+    def test_dtype_passthrough(self, dtype):
+        x = np.arange(5, dtype=dtype)
+        y = np._from_dlpack(x)
+
+        assert y.dtype == x.dtype
+        assert_array_equal(x, y)
+
+    def test_invalid_dtype(self):
+        x = np.asarray(np.datetime64('2021-05-27'))
+
+        with pytest.raises(TypeError):
+            np._from_dlpack(x)
+
+    def test_invalid_byte_swapping(self):
+        dt = np.dtype('=i8').newbyteorder()
+        x = np.arange(5, dtype=dt)
+
+        with pytest.raises(TypeError):
+            np._from_dlpack(x)
+
+    def test_non_contiguous(self):
+        x = np.arange(25).reshape((5, 5))
+
+        y1 = x[0]
+        assert_array_equal(y1, np._from_dlpack(y1))
+
+        y2 = x[:, 0]
+        assert_array_equal(y2, np._from_dlpack(y2))
+
+        y3 = x[1, :]
+        assert_array_equal(y3, np._from_dlpack(y3))
+
+        y4 = x[1]
+        assert_array_equal(y4, np._from_dlpack(y4))
+
+        y5 = np.diagonal(x).copy()
+        assert_array_equal(y5, np._from_dlpack(y5))
+
+    @pytest.mark.parametrize("ndim", range(33))
+    def test_higher_dims(self, ndim):
+        shape = (1,) * ndim
+        x = np.zeros(shape, dtype=np.float64)
+
+        assert shape == np._from_dlpack(x).shape
+
+    def test_dlpack_device(self):
+        x = np.arange(5)
+        assert x.__dlpack_device__() == (1, 0)
+        assert np._from_dlpack(x).__dlpack_device__() == (1, 0)
+
+    def dlpack_deleter_exception(self):
+        x = np.arange(5)
+        _ = x.__dlpack__()
+        raise RuntimeError
+    
+    def test_dlpack_destructor_exception(self):
+        with pytest.raises(RuntimeError):
+            self.dlpack_deleter_exception()
+
+    def test_readonly(self):
+        x = np.arange(5)
+        x.flags.writeable = False
+        with pytest.raises(TypeError):
+            x.__dlpack__()
diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py
index 61dce2494..e49604e4d 100644
--- a/numpy/core/tests/test_dtype.py
+++ b/numpy/core/tests/test_dtype.py
@@ -3,7 +3,6 @@ import operator
 import pytest
 import ctypes
 import gc
-import warnings
 import types
 from typing import Any
 
@@ -628,6 +627,12 @@ class TestSubarray:
         t2 = np.dtype('2i4', align=True)
         assert_equal(t1.alignment, t2.alignment)
 
+    def test_aligned_empty(self):
+        # Mainly regression test for gh-19696: construction failed completely
+        dt = np.dtype([], align=True)
+        assert dt == np.dtype([])
+        dt = np.dtype({"names": [], "formats": [], "itemsize": 0}, align=True)
+        assert dt == np.dtype([])
 
 def iter_struct_object_dtypes():
     """
@@ -724,26 +729,30 @@ class TestStructuredObjectRefcounting:
     def test_structured_object_indexing(self, shape, index, items_changed,
                                         dt, pat, count, singleton):
         """Structured object reference counting for advanced indexing."""
-        zero = 0
-        one = 1
+        # Use two small negative values (should be singletons, but less likely
+        # to run into race-conditions).  This failed in some threaded envs
+        # When using 0 and 1.  If it fails again, should remove all explicit
+        # checks, and rely on `pytest-leaks` reference count checker only.
+        val0 = -4
+        val1 = -5
 
-        arr = np.zeros(shape, dt)
+        arr = np.full(shape, val0, dt)
 
         gc.collect()
-        before_zero = sys.getrefcount(zero)
-        before_one = sys.getrefcount(one)
+        before_val0 = sys.getrefcount(val0)
+        before_val1 = sys.getrefcount(val1)
         # Test item getting:
         part = arr[index]
-        after_zero = sys.getrefcount(zero)
-        assert after_zero - before_zero == count * items_changed
+        after_val0 = sys.getrefcount(val0)
+        assert after_val0 - before_val0 == count * items_changed
         del part
         # Test item setting:
-        arr[index] = one
+        arr[index] = val1
         gc.collect()
-        after_zero = sys.getrefcount(zero)
-        after_one = sys.getrefcount(one)
-        assert before_zero - after_zero == count * items_changed
-        assert after_one - before_one == count * items_changed
+        after_val0 = sys.getrefcount(val0)
+        after_val1 = sys.getrefcount(val1)
+        assert before_val0 - after_val0 == count * items_changed
+        assert after_val1 - before_val1 == count * items_changed
 
     @pytest.mark.parametrize(['dt', 'pat', 'count', 'singleton'],
                              iter_struct_object_dtypes())
diff --git a/numpy/core/tests/test_einsum.py b/numpy/core/tests/test_einsum.py
index 78c5e527b..172311624 100644
--- a/numpy/core/tests/test_einsum.py
+++ b/numpy/core/tests/test_einsum.py
@@ -1,5 +1,7 @@
 import itertools
 
+import pytest
+
 import numpy as np
 from numpy.testing import (
     assert_, assert_equal, assert_array_equal, assert_almost_equal,
@@ -744,6 +746,52 @@ class TestEinsum:
         np.einsum('ij,jk->ik', x, x, out=out)
         assert_array_equal(out.base, correct_base)
 
+    @pytest.mark.parametrize("dtype",
+             np.typecodes["AllFloat"] + np.typecodes["AllInteger"])
+    def test_different_paths(self, dtype):
+        # Test originally added to cover broken float16 path: gh-20305
+        # Likely most are covered elsewhere, at least partially.
+        dtype = np.dtype(dtype)
+        # Simple test, designed to excersize most specialized code paths,
+        # note the +0.5 for floats.  This makes sure we use a float value
+        # where the results must be exact.
+        arr = (np.arange(7) + 0.5).astype(dtype)
+        scalar = np.array(2, dtype=dtype)
+
+        # contig -> scalar:
+        res = np.einsum('i->', arr)
+        assert res == arr.sum()
+        # contig, contig -> contig:
+        res = np.einsum('i,i->i', arr, arr)
+        assert_array_equal(res, arr * arr)
+        # noncontig, noncontig -> contig:
+        res = np.einsum('i,i->i', arr.repeat(2)[::2], arr.repeat(2)[::2])
+        assert_array_equal(res, arr * arr)
+        # contig + contig -> scalar
+        assert np.einsum('i,i->', arr, arr) == (arr * arr).sum()
+        # contig + scalar -> contig (with out)
+        out = np.ones(7, dtype=dtype)
+        res = np.einsum('i,->i', arr, dtype.type(2), out=out)
+        assert_array_equal(res, arr * dtype.type(2))
+        # scalar + contig -> contig (with out)
+        res = np.einsum(',i->i', scalar, arr)
+        assert_array_equal(res, arr * dtype.type(2))
+        # scalar + contig -> scalar
+        res = np.einsum(',i->', scalar, arr)
+        # Use einsum to compare to not have difference due to sum round-offs:
+        assert res == np.einsum('i->', scalar * arr)
+        # contig + scalar -> scalar
+        res = np.einsum('i,->', arr, scalar)
+        # Use einsum to compare to not have difference due to sum round-offs:
+        assert res == np.einsum('i->', scalar * arr)
+        # contig + contig + contig -> scalar
+        arr = np.array([0.5, 0.5, 0.25, 4.5, 3.], dtype=dtype)
+        res = np.einsum('i,i,i->', arr, arr, arr)
+        assert_array_equal(res, (arr * arr * arr).sum())
+        # four arrays:
+        res = np.einsum('i,i,i,i->', arr, arr, arr, arr)
+        assert_array_equal(res, (arr * arr * arr * arr).sum())
+
     def test_small_boolean_arrays(self):
         # See gh-5946.
         # Use array of True embedded in False.
diff --git a/numpy/core/tests/test_getlimits.py b/numpy/core/tests/test_getlimits.py
index de7b3e769..c5148db2c 100644
--- a/numpy/core/tests/test_getlimits.py
+++ b/numpy/core/tests/test_getlimits.py
@@ -46,7 +46,7 @@ class TestFinfo:
                        [np.float16, np.float32, np.float64, np.complex64,
                         np.complex128]))
         for dt1, dt2 in dts:
-            for attr in ('bits', 'eps', 'epsneg', 'iexp', 'machar', 'machep',
+            for attr in ('bits', 'eps', 'epsneg', 'iexp', 'machep',
                          'max', 'maxexp', 'min', 'minexp', 'negep', 'nexp',
                          'nmant', 'precision', 'resolution', 'tiny',
                          'smallest_normal', 'smallest_subnormal'):
diff --git a/numpy/core/tests/test_machar.py b/numpy/core/tests/test_machar.py
index 673f309f1..3a66ec51f 100644
--- a/numpy/core/tests/test_machar.py
+++ b/numpy/core/tests/test_machar.py
@@ -3,7 +3,7 @@ Test machar. Given recent changes to hardcode type data, we might want to get
 rid of both MachAr and this test at some point.
 
 """
-from numpy.core.machar import MachAr
+from numpy.core._machar import MachAr
 import numpy.core.numerictypes as ntypes
 from numpy import errstate, array
 
diff --git a/numpy/core/tests/test_mem_policy.py b/numpy/core/tests/test_mem_policy.py
new file mode 100644
index 000000000..3dae36d5a
--- /dev/null
+++ b/numpy/core/tests/test_mem_policy.py
@@ -0,0 +1,423 @@
+import asyncio
+import gc
+import os
+import pytest
+import numpy as np
+import threading
+import warnings
+from numpy.testing import extbuild, assert_warns
+import sys
+
+
+@pytest.fixture
+def get_module(tmp_path):
+    """ Add a memory policy that returns a false pointer 64 bytes into the
+    actual allocation, and fill the prefix with some text. Then check at each
+    memory manipulation that the prefix exists, to make sure all alloc/realloc/
+    free/calloc go via the functions here.
+    """
+    if sys.platform.startswith('cygwin'):
+        pytest.skip('link fails on cygwin')
+    functions = [
+        ("get_default_policy", "METH_NOARGS", """
+             Py_INCREF(PyDataMem_DefaultHandler);
+             return PyDataMem_DefaultHandler;
+         """),
+        ("set_secret_data_policy", "METH_NOARGS", """
+             PyObject *secret_data =
+                 PyCapsule_New(&secret_data_handler, "mem_handler", NULL);
+             if (secret_data == NULL) {
+                 return NULL;
+             }
+             PyObject *old = PyDataMem_SetHandler(secret_data);
+             Py_DECREF(secret_data);
+             return old;
+         """),
+        ("set_old_policy", "METH_O", """
+             PyObject *old;
+             if (args != NULL && PyCapsule_CheckExact(args)) {
+                 old = PyDataMem_SetHandler(args);
+             }
+             else {
+                 old = PyDataMem_SetHandler(NULL);
+             }
+             return old;
+         """),
+        ("get_array", "METH_NOARGS", """
+            char *buf = (char *)malloc(20);
+            npy_intp dims[1];
+            dims[0] = 20;
+            PyArray_Descr *descr =  PyArray_DescrNewFromType(NPY_UINT8);
+            return PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, NULL,
+                                        buf, NPY_ARRAY_WRITEABLE, NULL);
+         """),
+        ("set_own", "METH_O", """
+            if (!PyArray_Check(args)) {
+                PyErr_SetString(PyExc_ValueError,
+                             "need an ndarray");
+                return NULL;
+            }
+            PyArray_ENABLEFLAGS((PyArrayObject*)args, NPY_ARRAY_OWNDATA);
+            // Maybe try this too?
+            // PyArray_BASE(PyArrayObject *)args) = NULL;
+            Py_RETURN_NONE;
+         """),
+        ("get_array_with_base", "METH_NOARGS", """
+            char *buf = (char *)malloc(20);
+            npy_intp dims[1];
+            dims[0] = 20;
+            PyArray_Descr *descr =  PyArray_DescrNewFromType(NPY_UINT8);
+            PyObject *arr = PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims,
+                                                 NULL, buf,
+                                                 NPY_ARRAY_WRITEABLE, NULL);
+            if (arr == NULL) return NULL;
+            PyObject *obj = PyCapsule_New(buf, "buf capsule",
+                                          (PyCapsule_Destructor)&warn_on_free);
+            if (obj == NULL) {
+                Py_DECREF(arr);
+                return NULL;
+            }
+            if (PyArray_SetBaseObject((PyArrayObject *)arr, obj) < 0) {
+                Py_DECREF(arr);
+                Py_DECREF(obj);
+                return NULL;
+            }
+            return arr;
+
+         """),
+    ]
+    prologue = '''
+        #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+        #include <numpy/arrayobject.h>
+        /*
+         * This struct allows the dynamic configuration of the allocator funcs
+         * of the `secret_data_allocator`. It is provided here for
+         * demonstration purposes, as a valid `ctx` use-case scenario.
+         */
+        typedef struct {
+            void *(*malloc)(size_t);
+            void *(*calloc)(size_t, size_t);
+            void *(*realloc)(void *, size_t);
+            void (*free)(void *);
+        } SecretDataAllocatorFuncs;
+
+        NPY_NO_EXPORT void *
+        shift_alloc(void *ctx, size_t sz) {
+            SecretDataAllocatorFuncs *funcs = (SecretDataAllocatorFuncs *)ctx;
+            char *real = (char *)funcs->malloc(sz + 64);
+            if (real == NULL) {
+                return NULL;
+            }
+            snprintf(real, 64, "originally allocated %ld", (unsigned long)sz);
+            return (void *)(real + 64);
+        }
+        NPY_NO_EXPORT void *
+        shift_zero(void *ctx, size_t sz, size_t cnt) {
+            SecretDataAllocatorFuncs *funcs = (SecretDataAllocatorFuncs *)ctx;
+            char *real = (char *)funcs->calloc(sz + 64, cnt);
+            if (real == NULL) {
+                return NULL;
+            }
+            snprintf(real, 64, "originally allocated %ld via zero",
+                     (unsigned long)sz);
+            return (void *)(real + 64);
+        }
+        NPY_NO_EXPORT void
+        shift_free(void *ctx, void * p, npy_uintp sz) {
+            SecretDataAllocatorFuncs *funcs = (SecretDataAllocatorFuncs *)ctx;
+            if (p == NULL) {
+                return ;
+            }
+            char *real = (char *)p - 64;
+            if (strncmp(real, "originally allocated", 20) != 0) {
+                fprintf(stdout, "uh-oh, unmatched shift_free, "
+                        "no appropriate prefix\\n");
+                /* Make C runtime crash by calling free on the wrong address */
+                funcs->free((char *)p + 10);
+                /* funcs->free(real); */
+            }
+            else {
+                npy_uintp i = (npy_uintp)atoi(real +20);
+                if (i != sz) {
+                    fprintf(stderr, "uh-oh, unmatched shift_free"
+                            "(ptr, %ld) but allocated %ld\\n", sz, i);
+                    /* This happens in some places, only print */
+                    funcs->free(real);
+                }
+                else {
+                    funcs->free(real);
+                }
+            }
+        }
+        NPY_NO_EXPORT void *
+        shift_realloc(void *ctx, void * p, npy_uintp sz) {
+            SecretDataAllocatorFuncs *funcs = (SecretDataAllocatorFuncs *)ctx;
+            if (p != NULL) {
+                char *real = (char *)p - 64;
+                if (strncmp(real, "originally allocated", 20) != 0) {
+                    fprintf(stdout, "uh-oh, unmatched shift_realloc\\n");
+                    return realloc(p, sz);
+                }
+                return (void *)((char *)funcs->realloc(real, sz + 64) + 64);
+            }
+            else {
+                char *real = (char *)funcs->realloc(p, sz + 64);
+                if (real == NULL) {
+                    return NULL;
+                }
+                snprintf(real, 64, "originally allocated "
+                         "%ld  via realloc", (unsigned long)sz);
+                return (void *)(real + 64);
+            }
+        }
+        /* As an example, we use the standard {m|c|re}alloc/free funcs. */
+        static SecretDataAllocatorFuncs secret_data_handler_ctx = {
+            malloc,
+            calloc,
+            realloc,
+            free
+        };
+        static PyDataMem_Handler secret_data_handler = {
+            "secret_data_allocator",
+            1,
+            {
+                &secret_data_handler_ctx, /* ctx */
+                shift_alloc,              /* malloc */
+                shift_zero,               /* calloc */
+                shift_realloc,            /* realloc */
+                shift_free                /* free */
+            }
+        };
+        void warn_on_free(void *capsule) {
+            PyErr_WarnEx(PyExc_UserWarning, "in warn_on_free", 1);
+            void * obj = PyCapsule_GetPointer(capsule,
+                                              PyCapsule_GetName(capsule));
+            free(obj);
+        };
+        '''
+    more_init = "import_array();"
+    try:
+        import mem_policy
+        return mem_policy
+    except ImportError:
+        pass
+    # if it does not exist, build and load it
+    return extbuild.build_and_import_extension('mem_policy',
+                                               functions,
+                                               prologue=prologue,
+                                               include_dirs=[np.get_include()],
+                                               build_dir=tmp_path,
+                                               more_init=more_init)
+
+
+def test_set_policy(get_module):
+
+    get_handler_name = np.core.multiarray.get_handler_name
+    get_handler_version = np.core.multiarray.get_handler_version
+    orig_policy_name = get_handler_name()
+
+    a = np.arange(10).reshape((2, 5))  # a doesn't own its own data
+    assert get_handler_name(a) is None
+    assert get_handler_version(a) is None
+    assert get_handler_name(a.base) == orig_policy_name
+    assert get_handler_version(a.base) == 1
+
+    orig_policy = get_module.set_secret_data_policy()
+
+    b = np.arange(10).reshape((2, 5))  # b doesn't own its own data
+    assert get_handler_name(b) is None
+    assert get_handler_version(b) is None
+    assert get_handler_name(b.base) == 'secret_data_allocator'
+    assert get_handler_version(b.base) == 1
+
+    if orig_policy_name == 'default_allocator':
+        get_module.set_old_policy(None)  # tests PyDataMem_SetHandler(NULL)
+        assert get_handler_name() == 'default_allocator'
+    else:
+        get_module.set_old_policy(orig_policy)
+        assert get_handler_name() == orig_policy_name
+
+
+def test_default_policy_singleton(get_module):
+    get_handler_name = np.core.multiarray.get_handler_name
+
+    # set the policy to default
+    orig_policy = get_module.set_old_policy(None)
+
+    assert get_handler_name() == 'default_allocator'
+
+    # re-set the policy to default
+    def_policy_1 = get_module.set_old_policy(None)
+
+    assert get_handler_name() == 'default_allocator'
+
+    # set the policy to original
+    def_policy_2 = get_module.set_old_policy(orig_policy)
+
+    # since default policy is a singleton,
+    # these should be the same object
+    assert def_policy_1 is def_policy_2 is get_module.get_default_policy()
+
+
+def test_policy_propagation(get_module):
+    # The memory policy goes hand-in-hand with flags.owndata
+
+    class MyArr(np.ndarray):
+        pass
+
+    get_handler_name = np.core.multiarray.get_handler_name
+    orig_policy_name = get_handler_name()
+    a = np.arange(10).view(MyArr).reshape((2, 5))
+    assert get_handler_name(a) is None
+    assert a.flags.owndata is False
+
+    assert get_handler_name(a.base) is None
+    assert a.base.flags.owndata is False
+
+    assert get_handler_name(a.base.base) == orig_policy_name
+    assert a.base.base.flags.owndata is True
+
+
+async def concurrent_context1(get_module, orig_policy_name, event):
+    if orig_policy_name == 'default_allocator':
+        get_module.set_secret_data_policy()
+        assert np.core.multiarray.get_handler_name() == 'secret_data_allocator'
+    else:
+        get_module.set_old_policy(None)
+        assert np.core.multiarray.get_handler_name() == 'default_allocator'
+    event.set()
+
+
+async def concurrent_context2(get_module, orig_policy_name, event):
+    await event.wait()
+    # the policy is not affected by changes in parallel contexts
+    assert np.core.multiarray.get_handler_name() == orig_policy_name
+    # change policy in the child context
+    if orig_policy_name == 'default_allocator':
+        get_module.set_secret_data_policy()
+        assert np.core.multiarray.get_handler_name() == 'secret_data_allocator'
+    else:
+        get_module.set_old_policy(None)
+        assert np.core.multiarray.get_handler_name() == 'default_allocator'
+
+
+async def async_test_context_locality(get_module):
+    orig_policy_name = np.core.multiarray.get_handler_name()
+
+    event = asyncio.Event()
+    # the child contexts inherit the parent policy
+    concurrent_task1 = asyncio.create_task(
+        concurrent_context1(get_module, orig_policy_name, event))
+    concurrent_task2 = asyncio.create_task(
+        concurrent_context2(get_module, orig_policy_name, event))
+    await concurrent_task1
+    await concurrent_task2
+
+    # the parent context is not affected by child policy changes
+    assert np.core.multiarray.get_handler_name() == orig_policy_name
+
+
+def test_context_locality(get_module):
+    if (sys.implementation.name == 'pypy'
+            and sys.pypy_version_info[:3] < (7, 3, 6)):
+        pytest.skip('no context-locality support in PyPy < 7.3.6')
+    asyncio.run(async_test_context_locality(get_module))
+
+
+def concurrent_thread1(get_module, event):
+    get_module.set_secret_data_policy()
+    assert np.core.multiarray.get_handler_name() == 'secret_data_allocator'
+    event.set()
+
+
+def concurrent_thread2(get_module, event):
+    event.wait()
+    # the policy is not affected by changes in parallel threads
+    assert np.core.multiarray.get_handler_name() == 'default_allocator'
+    # change policy in the child thread
+    get_module.set_secret_data_policy()
+
+
+def test_thread_locality(get_module):
+    orig_policy_name = np.core.multiarray.get_handler_name()
+
+    event = threading.Event()
+    # the child threads do not inherit the parent policy
+    concurrent_task1 = threading.Thread(target=concurrent_thread1,
+                                        args=(get_module, event))
+    concurrent_task2 = threading.Thread(target=concurrent_thread2,
+                                        args=(get_module, event))
+    concurrent_task1.start()
+    concurrent_task2.start()
+    concurrent_task1.join()
+    concurrent_task2.join()
+
+    # the parent thread is not affected by child policy changes
+    assert np.core.multiarray.get_handler_name() == orig_policy_name
+
+
+@pytest.mark.slow
+def test_new_policy(get_module):
+    a = np.arange(10)
+    orig_policy_name = np.core.multiarray.get_handler_name(a)
+
+    orig_policy = get_module.set_secret_data_policy()
+
+    b = np.arange(10)
+    assert np.core.multiarray.get_handler_name(b) == 'secret_data_allocator'
+
+    # test array manipulation. This is slow
+    if orig_policy_name == 'default_allocator':
+        # when the np.core.test tests recurse into this test, the
+        # policy will be set so this "if" will be false, preventing
+        # infinite recursion
+        #
+        # if needed, debug this by
+        # - running tests with -- -s (to not capture stdout/stderr
+        # - setting extra_argv=['-vv'] here
+        assert np.core.test('full', verbose=2, extra_argv=['-vv'])
+        # also try the ma tests, the pickling test is quite tricky
+        assert np.ma.test('full', verbose=2, extra_argv=['-vv'])
+
+    get_module.set_old_policy(orig_policy)
+
+    c = np.arange(10)
+    assert np.core.multiarray.get_handler_name(c) == orig_policy_name
+
+@pytest.mark.xfail(sys.implementation.name == "pypy",
+                   reason=("bad interaction between getenv and "
+                           "os.environ inside pytest"))
+@pytest.mark.parametrize("policy", ["0", "1", None])
+def test_switch_owner(get_module, policy):
+    a = get_module.get_array()
+    assert np.core.multiarray.get_handler_name(a) is None
+    get_module.set_own(a)
+    oldval = os.environ.get('NUMPY_WARN_IF_NO_MEM_POLICY', None)
+    if policy is None:
+        if 'NUMPY_WARN_IF_NO_MEM_POLICY' in os.environ:
+            os.environ.pop('NUMPY_WARN_IF_NO_MEM_POLICY')
+    else:
+        os.environ['NUMPY_WARN_IF_NO_MEM_POLICY'] = policy
+    try:
+        # The policy should be NULL, so we have to assume we can call
+        # "free".  A warning is given if the policy == "1"
+        if policy == "1":
+            with assert_warns(RuntimeWarning) as w:
+                del a
+                gc.collect()
+        else:
+            del a
+            gc.collect()
+
+    finally:
+        if oldval is None:
+            if 'NUMPY_WARN_IF_NO_MEM_POLICY' in os.environ:
+                os.environ.pop('NUMPY_WARN_IF_NO_MEM_POLICY')
+        else:
+            os.environ['NUMPY_WARN_IF_NO_MEM_POLICY'] = oldval
+
+def test_owner_is_base(get_module):
+    a = get_module.get_array_with_base()
+    with pytest.warns(UserWarning, match='warn_on_free'):
+        del a
+        gc.collect()
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 0da36bbea..23182470b 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -7814,6 +7814,216 @@ class TestNewBufferProtocol:
         _multiarray_tests.corrupt_or_fix_bufferinfo(obj)
 
 
+class TestArrayCreationCopyArgument(object):
+
+    class RaiseOnBool:
+
+        def __bool__(self):
+            raise ValueError
+
+    true_vals = [True, np._CopyMode.ALWAYS, np.True_]
+    false_vals = [False, np._CopyMode.IF_NEEDED, np.False_]
+
+    def test_scalars(self):
+        # Test both numpy and python scalars
+        for dtype in np.typecodes["All"]:
+            arr = np.zeros((), dtype=dtype)
+            scalar = arr[()]
+            pyscalar = arr.item(0)
+
+            # Test never-copy raises error:
+            assert_raises(ValueError, np.array, scalar, 
+                            copy=np._CopyMode.NEVER)
+            assert_raises(ValueError, np.array, pyscalar, 
+                            copy=np._CopyMode.NEVER)
+            assert_raises(ValueError, np.array, pyscalar,
+                            copy=self.RaiseOnBool())
+            assert_raises(ValueError, _multiarray_tests.npy_ensurenocopy,
+                            [1])
+            # Casting with a dtype (to unsigned integers) can be special:
+            with pytest.raises(ValueError):
+                np.array(pyscalar, dtype=np.int64, copy=np._CopyMode.NEVER)
+
+    def test_compatible_cast(self):
+
+        # Some types are compatible even though they are different, no
+        # copy is necessary for them. This is mostly true for some integers
+        def int_types(byteswap=False):
+            int_types = (np.typecodes["Integer"] +
+                         np.typecodes["UnsignedInteger"])
+            for int_type in int_types:
+                yield np.dtype(int_type)
+                if byteswap:
+                    yield np.dtype(int_type).newbyteorder()
+
+        for int1 in int_types():
+            for int2 in int_types(True):
+                arr = np.arange(10, dtype=int1)
+
+                for copy in self.true_vals:
+                    res = np.array(arr, copy=copy, dtype=int2)
+                    assert res is not arr and res.flags.owndata
+                    assert_array_equal(res, arr)
+
+                if int1 == int2:
+                    # Casting is not necessary, base check is sufficient here
+                    for copy in self.false_vals:
+                        res = np.array(arr, copy=copy, dtype=int2)
+                        assert res is arr or res.base is arr
+
+                    res = np.array(arr,
+                                   copy=np._CopyMode.NEVER,
+                                   dtype=int2)
+                    assert res is arr or res.base is arr
+
+                else:
+                    # Casting is necessary, assert copy works:
+                    for copy in self.false_vals:
+                        res = np.array(arr, copy=copy, dtype=int2)
+                        assert res is not arr and res.flags.owndata
+                        assert_array_equal(res, arr)
+
+                    assert_raises(ValueError, np.array,
+                                  arr, copy=np._CopyMode.NEVER,
+                                  dtype=int2)
+                    assert_raises(ValueError, np.array,
+                                  arr, copy=None,
+                                  dtype=int2)
+
+    def test_buffer_interface(self):
+
+        # Buffer interface gives direct memory access (no copy)
+        arr = np.arange(10)
+        view = memoryview(arr)
+
+        # Checking bases is a bit tricky since numpy creates another
+        # memoryview, so use may_share_memory.
+        for copy in self.true_vals:
+            res = np.array(view, copy=copy)
+            assert not np.may_share_memory(arr, res)
+        for copy in self.false_vals:
+            res = np.array(view, copy=copy)
+            assert np.may_share_memory(arr, res)
+        res = np.array(view, copy=np._CopyMode.NEVER)
+        assert np.may_share_memory(arr, res)
+
+    def test_array_interfaces(self):
+        # Array interface gives direct memory access (much like a memoryview)
+        base_arr = np.arange(10)
+
+        class ArrayLike:
+            __array_interface__ = base_arr.__array_interface__
+
+        arr = ArrayLike()
+
+        for copy, val in [(True, None), (np._CopyMode.ALWAYS, None),
+                          (False, arr), (np._CopyMode.IF_NEEDED, arr),
+                          (np._CopyMode.NEVER, arr)]:
+            res = np.array(arr, copy=copy)
+            assert res.base is val
+
+    def test___array__(self):
+        base_arr = np.arange(10)
+
+        class ArrayLike:
+            def __array__(self):
+                # __array__ should return a copy, numpy cannot know this
+                # however.
+                return base_arr
+
+        arr = ArrayLike()
+
+        for copy in self.true_vals:
+            res = np.array(arr, copy=copy)
+            assert_array_equal(res, base_arr)
+            # An additional copy is currently forced by numpy in this case,
+            # you could argue, numpy does not trust the ArrayLike. This
+            # may be open for change:
+            assert res is not base_arr
+
+        for copy in self.false_vals:
+            res = np.array(arr, copy=False)
+            assert_array_equal(res, base_arr)
+            assert res is base_arr  # numpy trusts the ArrayLike
+
+        with pytest.raises(ValueError):
+            np.array(arr, copy=np._CopyMode.NEVER)
+
+    @pytest.mark.parametrize(
+            "arr", [np.ones(()), np.arange(81).reshape((9, 9))])
+    @pytest.mark.parametrize("order1", ["C", "F", None])
+    @pytest.mark.parametrize("order2", ["C", "F", "A", "K"])
+    def test_order_mismatch(self, arr, order1, order2):
+        # The order is the main (python side) reason that can cause
+        # a never-copy to fail.
+        # Prepare C-order, F-order and non-contiguous arrays:
+        arr = arr.copy(order1)
+        if order1 == "C":
+            assert arr.flags.c_contiguous
+        elif order1 == "F":
+            assert arr.flags.f_contiguous
+        elif arr.ndim != 0:
+            # Make array non-contiguous
+            arr = arr[::2, ::2]
+            assert not arr.flags.forc
+
+        # Whether a copy is necessary depends on the order of arr:
+        if order2 == "C":
+            no_copy_necessary = arr.flags.c_contiguous
+        elif order2 == "F":
+            no_copy_necessary = arr.flags.f_contiguous
+        else:
+            # Keeporder and Anyorder are OK with non-contiguous output.
+            # This is not consistent with the `astype` behaviour which
+            # enforces contiguity for "A". It is probably historic from when
+            # "K" did not exist.
+            no_copy_necessary = True
+
+        # Test it for both the array and a memoryview
+        for view in [arr, memoryview(arr)]:
+            for copy in self.true_vals:
+                res = np.array(view, copy=copy, order=order2)
+                assert res is not arr and res.flags.owndata
+                assert_array_equal(arr, res)
+
+            if no_copy_necessary:
+                for copy in self.false_vals:
+                    res = np.array(view, copy=copy, order=order2)
+                    # res.base.obj refers to the memoryview
+                    if not IS_PYPY:
+                        assert res is arr or res.base.obj is arr
+
+                res = np.array(view, copy=np._CopyMode.NEVER,
+                               order=order2)
+                if not IS_PYPY:
+                    assert res is arr or res.base.obj is arr
+            else:
+                for copy in self.false_vals:
+                    res = np.array(arr, copy=copy, order=order2)
+                    assert_array_equal(arr, res)
+                assert_raises(ValueError, np.array,
+                              view, copy=np._CopyMode.NEVER,
+                              order=order2)
+                assert_raises(ValueError, np.array,
+                              view, copy=None,
+                              order=order2)
+
+    def test_striding_not_ok(self):
+        arr = np.array([[1, 2, 4], [3, 4, 5]])
+        assert_raises(ValueError, np.array,
+                      arr.T, copy=np._CopyMode.NEVER,
+                      order='C')
+        assert_raises(ValueError, np.array,
+                      arr.T, copy=np._CopyMode.NEVER,
+                      order='C', dtype=np.int64)
+        assert_raises(ValueError, np.array,
+                      arr, copy=np._CopyMode.NEVER,
+                      order='F')
+        assert_raises(ValueError, np.array,
+                      arr, copy=np._CopyMode.NEVER,
+                      order='F', dtype=np.int64)
+
+
 class TestArrayAttributeDeletion:
 
     def test_multiarray_writable_attributes_deletion(self):
@@ -7977,18 +8187,6 @@ def test_scalar_element_deletion():
     assert_raises(ValueError, a[0].__delitem__, 'x')
 
 
-class TestMemEventHook:
-    def test_mem_seteventhook(self):
-        # The actual tests are within the C code in
-        # multiarray/_multiarray_tests.c.src
-        _multiarray_tests.test_pydatamem_seteventhook_start()
-        # force an allocation and free of a numpy array
-        # needs to be larger then limit of small memory cacher in ctors.c
-        a = np.zeros(1000)
-        del a
-        break_cycles()
-        _multiarray_tests.test_pydatamem_seteventhook_end()
-
 class TestMapIter:
     def test_mapiter(self):
         # The actual tests are within the C code in
diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py
index fbf6da0e1..ed775cac6 100644
--- a/numpy/core/tests/test_nditer.py
+++ b/numpy/core/tests/test_nditer.py
@@ -9,7 +9,7 @@ import numpy.core._multiarray_tests as _multiarray_tests
 from numpy import array, arange, nditer, all
 from numpy.testing import (
     assert_, assert_equal, assert_array_equal, assert_raises,
-    HAS_REFCOUNT, suppress_warnings
+    HAS_REFCOUNT, suppress_warnings, break_cycles
     )
 
 
@@ -3128,6 +3128,8 @@ def test_warn_noclose():
         assert len(sup.log) == 1
 
 
+@pytest.mark.skipif(sys.version_info[:2] == (3, 9) and sys.platform == "win32",
+                    reason="Errors with Python 3.9 on Windows")
 @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
 @pytest.mark.parametrize(["in_dtype", "buf_dtype"],
         [("i", "O"), ("O", "i"),  # most simple cases
@@ -3148,6 +3150,8 @@ def test_partial_iteration_cleanup(in_dtype, buf_dtype, steps):
 
     # Note that resetting does not free references
     del it
+    break_cycles()
+    break_cycles()
     assert count == sys.getrefcount(value)
 
     # Repeat the test with `iternext`
@@ -3157,6 +3161,8 @@ def test_partial_iteration_cleanup(in_dtype, buf_dtype, steps):
         it.iternext()
 
     del it  # should ensure cleanup
+    break_cycles()
+    break_cycles()
     assert count == sys.getrefcount(value)
 
 
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index e36f76c53..ad9437911 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -16,7 +16,7 @@ from numpy.testing import (
     )
 from numpy.core._rational_tests import rational
 
-from hypothesis import assume, given, strategies as st
+from hypothesis import given, strategies as st
 from hypothesis.extra import numpy as hynp
 
 
@@ -646,7 +646,7 @@ class TestFloatExceptions:
             if np.dtype(ftype).kind == 'f':
                 # Get some extreme values for the type
                 fi = np.finfo(ftype)
-                ft_tiny = fi.machar.tiny
+                ft_tiny = fi._machar.tiny
                 ft_max = fi.max
                 ft_eps = fi.eps
                 underflow = 'underflow'
@@ -655,7 +655,7 @@ class TestFloatExceptions:
                 # 'c', complex, corresponding real dtype
                 rtype = type(ftype(0).real)
                 fi = np.finfo(rtype)
-                ft_tiny = ftype(fi.machar.tiny)
+                ft_tiny = ftype(fi._machar.tiny)
                 ft_max = ftype(fi.max)
                 ft_eps = ftype(fi.eps)
                 # The complex types raise different exceptions
@@ -932,25 +932,6 @@ class TestTypes:
         # Promote with object:
         assert_equal(promote_types('O', S+'30'), np.dtype('O'))
 
-    @pytest.mark.parametrize(["dtype1", "dtype2"],
-            [[np.dtype("V6"), np.dtype("V10")],
-             [np.dtype([("name1", "i8")]), np.dtype([("name2", "i8")])],
-             [np.dtype("i8,i8"), np.dtype("i4,i4")],
-            ])
-    def test_invalid_void_promotion(self, dtype1, dtype2):
-        # Mainly test structured void promotion, which currently allows
-        # byte-swapping, but nothing else:
-        with pytest.raises(TypeError):
-            np.promote_types(dtype1, dtype2)
-
-    @pytest.mark.parametrize(["dtype1", "dtype2"],
-            [[np.dtype("V10"), np.dtype("V10")],
-             [np.dtype([("name1", "<i8")]), np.dtype([("name1", ">i8")])],
-             [np.dtype("i8,i8"), np.dtype("i8,>i8")],
-            ])
-    def test_valid_void_promotion(self, dtype1, dtype2):
-        assert np.promote_types(dtype1, dtype2) is dtype1
-
     @pytest.mark.parametrize("dtype",
            list(np.typecodes["All"]) +
            ["i,i", "S3", "S100", "U3", "U100", rational])
@@ -1503,6 +1484,18 @@ class TestNonzero:
         a = np.array([[False], [TrueThenFalse()]])
         assert_raises(RuntimeError, np.nonzero, a)
 
+    def test_nonzero_sideffects_structured_void(self):
+        # Checks that structured void does not mutate alignment flag of
+        # original array.
+        arr = np.zeros(5, dtype="i1,i8,i8")  # `ones` may short-circuit
+        assert arr.flags.aligned  # structs are considered "aligned"
+        assert not arr["f2"].flags.aligned
+        # make sure that nonzero/count_nonzero do not flip the flag:
+        np.nonzero(arr)
+        assert arr.flags.aligned
+        np.count_nonzero(arr)
+        assert arr.flags.aligned
+
     def test_nonzero_exception_safe(self):
         # gh-13930
 
diff --git a/numpy/core/tests/test_scalar_methods.py b/numpy/core/tests/test_scalar_methods.py
index 6077c8f75..eef4c1433 100644
--- a/numpy/core/tests/test_scalar_methods.py
+++ b/numpy/core/tests/test_scalar_methods.py
@@ -183,3 +183,21 @@ def test_class_getitem_38(cls: Type[np.number]) -> None:
     match = "Type subscription requires python >= 3.9"
     with pytest.raises(TypeError, match=match):
         cls[Any]
+
+
+class TestBitCount:
+    # derived in part from the cpython test "test_bit_count"
+
+    @pytest.mark.parametrize("itype", np.sctypes['int']+np.sctypes['uint'])
+    def test_small(self, itype):
+        for a in range(max(np.iinfo(itype).min, 0), 128):
+            msg = f"Smoke test for {itype}({a}).bit_count()"
+            assert itype(a).bit_count() == bin(a).count("1"), msg
+
+    def test_bit_count(self):
+        for exp in [10, 17, 63]:
+            a = 2**exp
+            assert np.uint64(a).bit_count() == 1
+            assert np.uint64(a - 1).bit_count() == exp
+            assert np.uint64(a ^ 63).bit_count() == 7
+            assert np.uint64((a - 1) ^ 510).bit_count() == exp - 8
diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py
index becd65b11..90078a2ea 100644
--- a/numpy/core/tests/test_scalarmath.py
+++ b/numpy/core/tests/test_scalarmath.py
@@ -5,14 +5,14 @@ import itertools
 import operator
 import platform
 import pytest
-from hypothesis import given, settings, Verbosity, assume
+from hypothesis import given, settings, Verbosity
 from hypothesis.strategies import sampled_from
 
 import numpy as np
 from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_almost_equal,
     assert_array_equal, IS_PYPY, suppress_warnings, _gen_alignment_data,
-    assert_warns, assert_raises_regex,
+    assert_warns,
     )
 
 types = [np.bool_, np.byte, np.ubyte, np.short, np.ushort, np.intc, np.uintc,
diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py
index f0c60953b..12a67c44d 100644
--- a/numpy/core/tests/test_simd.py
+++ b/numpy/core/tests/test_simd.py
@@ -329,7 +329,38 @@ class _SIMD_FP(_Test_Utility):
         data_square = [x*x for x in data]
         square = self.square(vdata)
         assert square == data_square
-        
+
+    @pytest.mark.parametrize("intrin, func", [("self.ceil", math.ceil),
+    ("self.trunc", math.trunc)])
+    def test_rounding(self, intrin, func):
+        """
+        Test intrinsics:
+            npyv_ceil_##SFX
+            npyv_trunc_##SFX
+        """
+        intrin_name = intrin
+        intrin = eval(intrin)
+        pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
+        # special cases
+        round_cases = ((nan, nan), (pinf, pinf), (ninf, ninf))
+        for case, desired in round_cases:
+            data_round = [desired]*self.nlanes
+            _round = intrin(self.setall(case))
+            assert _round == pytest.approx(data_round, nan_ok=True)
+        for x in range(0, 2**20, 256**2):
+            for w in (-1.05, -1.10, -1.15, 1.05, 1.10, 1.15):
+                data = [x*w+a for a in range(self.nlanes)]
+                vdata = self.load(data)
+                data_round = [func(x) for x in data]
+                _round = intrin(vdata)
+                assert _round == data_round
+        # signed zero
+        if "ceil" in intrin_name or "trunc" in intrin_name:
+            for w in (-0.25, -0.30, -0.45):
+                _round = self._to_unsigned(intrin(self.setall(w)))
+                data_round = self._to_unsigned(self.setall(-0.0))
+                assert _round == data_round
+    
     def test_max(self):
         """
         Test intrinsics:
@@ -818,6 +849,7 @@ class _SIMD_ALL(_Test_Utility):
         if self._is_fp():
             return
 
+        int_min = self._int_min()
         def trunc_div(a, d):
             """
             Divide towards zero works with large integers > 2^53,
@@ -830,57 +862,31 @@ class _SIMD_ALL(_Test_Utility):
                 return a // d
             return (a + sign_d - sign_a) // d + 1
 
-        int_min = self._int_min() if self._is_signed() else 1
-        int_max = self._int_max()
-        rdata = (
-            0, 1, self.nlanes, int_max-self.nlanes,
-            int_min, int_min//2 + 1
-        )
-        divisors = (1, 2, 9, 13, self.nlanes, int_min, int_max, int_max//2)
-
-        for x, d in itertools.product(rdata, divisors):
-            data = self._data(x)
-            vdata = self.load(data)
-            data_divc = [trunc_div(a, d) for a in data]
-            divisor = self.divisor(d)
-            divc = self.divc(vdata, divisor)
-            assert divc == data_divc
-
-        if not self._is_signed():
-            return
-
-        safe_neg = lambda x: -x-1 if -x > int_max else -x
-        # test round division for signed integers
-        for x, d in itertools.product(rdata, divisors):
-            d_neg = safe_neg(d)
-            data = self._data(x)
-            data_neg = [safe_neg(a) for a in data]
-            vdata = self.load(data)
-            vdata_neg = self.load(data_neg)
-            divisor = self.divisor(d)
-            divisor_neg = self.divisor(d_neg)
-
-            # round towards zero
-            data_divc = [trunc_div(a, d_neg) for a in data]
-            divc = self.divc(vdata, divisor_neg)
-            assert divc == data_divc
-            data_divc = [trunc_div(a, d) for a in data_neg]
-            divc = self.divc(vdata_neg, divisor)
+        data = [1, -int_min]  # to test overflow
+        data += range(0, 2**8, 2**5)
+        data += range(0, 2**8, 2**5-1)
+        bsize = self._scalar_size()
+        if bsize > 8:
+            data += range(2**8, 2**16, 2**13)
+            data += range(2**8, 2**16, 2**13-1)
+        if bsize > 16:
+            data += range(2**16, 2**32, 2**29)
+            data += range(2**16, 2**32, 2**29-1)
+        if bsize > 32:
+            data += range(2**32, 2**64, 2**61)
+            data += range(2**32, 2**64, 2**61-1)
+        # negate
+        data += [-x for x in data]
+        for dividend, divisor in itertools.product(data, data):
+            divisor = self.setall(divisor)[0]  # cast
+            if divisor == 0:
+                continue
+            dividend = self.load(self._data(dividend))
+            data_divc = [trunc_div(a, divisor) for a in dividend]
+            divisor_parms = self.divisor(divisor)
+            divc = self.divc(dividend, divisor_parms)
             assert divc == data_divc
 
-        # test truncate sign if the dividend is zero
-        vzero = self.zero()
-        for d in (-1, -10, -100, int_min//2, int_min):
-            divisor = self.divisor(d)
-            divc = self.divc(vzero, divisor)
-            assert divc == vzero
-
-        # test overflow
-        vmin = self.setall(int_min)
-        divisor = self.divisor(-1)
-        divc = self.divc(vmin, divisor)
-        assert divc == vmin
-
     def test_arithmetic_reduce_sum(self):
         """
         Test reduce sum intrinsics:
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index 30929ce91..ef0bac957 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -1362,6 +1362,14 @@ class TestUfunc:
                            np.array([[2]*i for i in [1, 3, 6, 10]], dtype=object),
                           )
 
+    def test_object_array_accumulate_failure(self):
+        # Typical accumulation on object works as expected:
+        res = np.add.accumulate(np.array([1, 0, 2], dtype=object))
+        assert_array_equal(res, np.array([1, 1, 3], dtype=object))
+        # But errors are propagated from the inner-loop if they occur:
+        with pytest.raises(TypeError):
+            np.add.accumulate([1, None, 2])
+
     def test_object_array_reduceat_inplace(self):
         # Checks that in-place reduceats work, see also gh-7465
         arr = np.empty(4, dtype=object)
@@ -1381,6 +1389,15 @@ class TestUfunc:
         np.add.reduceat(arr, np.arange(4), out=arr, axis=-1)
         assert_array_equal(arr, out)
 
+    def test_object_array_reduceat_failure(self):
+        # Reduceat works as expected when no invalid operation occurs (None is
+        # not involved in an operation here)
+        res = np.add.reduceat(np.array([1, None, 2], dtype=object), [1, 2])
+        assert_array_equal(res, np.array([None, 2], dtype=object))
+        # But errors when None would be involved in an operation:
+        with pytest.raises(TypeError):
+            np.add.reduceat([1, None, 2], [0, 2])
+
     def test_zerosize_reduction(self):
         # Test with default dtype and object dtype
         for a in [[], np.array([], dtype=object)]:
@@ -2098,6 +2115,25 @@ class TestUfunc:
         with pytest.raises(TypeError):
             ufunc(a, a, signature=signature)
 
+    @pytest.mark.parametrize("ufunc",
+            [np.logical_and, np.logical_or, np.logical_xor])
+    def test_logical_ufuncs_support_anything(self, ufunc):
+        # The logical ufuncs support even input that can't be promoted:
+        a = np.array('1')
+        c = np.array([1., 2.])
+        assert_array_equal(ufunc(a, c), ufunc([True, True], True))
+        assert ufunc.reduce(a) == True
+
+    @pytest.mark.parametrize("ufunc",
+             [np.logical_and, np.logical_or, np.logical_xor])
+    def test_logical_ufuncs_out_cast_check(self, ufunc):
+        a = np.array('1')
+        c = np.array([1., 2.])
+        out = a.copy()
+        with pytest.raises(TypeError):
+            # It would be safe, but not equiv casting:
+            ufunc(a, c, out=out, casting="equiv")
+
     def test_reduce_noncontig_output(self):
         # Check that reduction deals with non-contiguous output arrays
         # appropriately.
@@ -2119,6 +2155,22 @@ class TestUfunc:
         assert_equal(y_base[1,:], y_base_copy[1,:])
         assert_equal(y_base[3,:], y_base_copy[3,:])
 
+    @pytest.mark.parametrize("with_cast", [True, False])
+    def test_reduceat_and_accumulate_out_shape_mismatch(self, with_cast):
+        # Should raise an error mentioning "shape" or "size"
+        arr = np.arange(5)
+        out = np.arange(3)  # definitely wrong shape
+        if with_cast:
+            # If a cast is necessary on the output, we can be sure to use
+            # the generic NpyIter (non-fast) path.
+            out = out.astype(np.float64)
+
+        with pytest.raises(ValueError, match="(shape|size)"):
+            np.add.reduceat(arr, [0, 3], out=out)
+
+        with pytest.raises(ValueError, match="(shape|size)"):
+            np.add.accumulate(arr, out=out)
+
     @pytest.mark.parametrize('out_shape',
                              [(), (1,), (3,), (1, 1), (1, 3), (4, 3)])
     @pytest.mark.parametrize('keepdims', [True, False])
@@ -2331,8 +2383,9 @@ def test_reduce_casterrors(offset):
     out = np.array(-1, dtype=np.intp)
 
     count = sys.getrefcount(value)
-    with pytest.raises(ValueError):
-        # This is an unsafe cast, but we currently always allow that:
+    with pytest.raises(ValueError, match="invalid literal"):
+        # This is an unsafe cast, but we currently always allow that.
+        # Note that the double loop is picked, but the cast fails.
         np.add.reduce(arr, dtype=np.intp, out=out)
     assert count == sys.getrefcount(value)
     # If an error occurred during casting, the operation is done at most until
@@ -2340,3 +2393,20 @@ def test_reduce_casterrors(offset):
     # if the error happened immediately.
     # This does not define behaviour, the output is invalid and thus undefined
     assert out[()] < value * offset
+
+
+@pytest.mark.parametrize("method",
+        [np.add.accumulate, np.add.reduce,
+         pytest.param(lambda x: np.add.reduceat(x, [0]), id="reduceat"),
+         pytest.param(lambda x: np.log.at(x, [2]), id="at")])
+def test_ufunc_methods_floaterrors(method):
+    # adding inf and -inf (or log(-inf) creates an invalid float and warns
+    arr = np.array([np.inf, 0, -np.inf])
+    with np.errstate(all="warn"):
+        with pytest.warns(RuntimeWarning, match="invalid value"):
+            method(arr)
+
+    arr = np.array([np.inf, 0, -np.inf])
+    with np.errstate(all="raise"):
+        with pytest.raises(FloatingPointError):
+            method(arr)
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 8ff81ea51..fc7c592f0 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -15,7 +15,7 @@ from numpy.testing import (
     assert_, assert_equal, assert_raises, assert_raises_regex,
     assert_array_equal, assert_almost_equal, assert_array_almost_equal,
     assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings,
-    _gen_alignment_data, assert_array_almost_equal_nulp, assert_warns
+    _gen_alignment_data, assert_array_almost_equal_nulp
     )
 
 def get_glibc_version():
@@ -28,9 +28,7 @@ def get_glibc_version():
 
 
 glibcver = get_glibc_version()
-glibc_newerthan_2_17 = pytest.mark.xfail(
-        glibcver != '0.0' and glibcver < '2.17',
-        reason="Older glibc versions may not raise appropriate FP exceptions")
+glibc_older_than = lambda x: (glibcver != '0.0' and glibcver < x)
 
 def on_powerpc():
     """ True if we are running on a Power PC platform."""
@@ -50,14 +48,6 @@ def bad_arcsinh():
     # The eps for float128 is 1-e33, so this is way bigger
     return abs((v1 / v2) - 1.0) > 1e-23
 
-if platform.machine() == 'aarch64' and bad_arcsinh():
-    skip_longcomplex_msg = ('Trig functions of np.longcomplex values known to be '
-                            'inaccurate on aarch64 for some compilation '
-                            'configurations, should be fixed by building on a '
-                            'platform using glibc>2.17')
-else:
-    skip_longcomplex_msg = ''
-
 
 class _FilterInvalids:
     def setup(self):
@@ -1022,9 +1012,11 @@ class TestSpecialFloats:
             yf = np.array(y, dtype=dt)
             assert_equal(np.exp(yf), xf)
 
-    # Older version of glibc may not raise the correct FP exceptions
     # See: https://github.com/numpy/numpy/issues/19192
-    @glibc_newerthan_2_17
+    @pytest.mark.xfail(
+        glibc_older_than("2.17"),
+        reason="Older glibc versions may not raise appropriate FP exceptions"
+    )
     def test_exp_exceptions(self):
         with np.errstate(over='raise'):
             assert_raises(FloatingPointError, np.exp, np.float32(100.))
@@ -1405,8 +1397,10 @@ class TestAVXFloat32Transcendental:
         M = np.int_(N/20)
         index = np.random.randint(low=0, high=N, size=M)
         x_f32 = np.float32(np.random.uniform(low=-100.,high=100.,size=N))
-        # test coverage for elements > 117435.992f for which glibc is used
-        x_f32[index] = np.float32(10E+10*np.random.rand(M))
+        if not glibc_older_than("2.17"):
+            # test coverage for elements > 117435.992f for which glibc is used
+            # this is known to be problematic on old glibc, so skip it there
+            x_f32[index] = np.float32(10E+10*np.random.rand(M))
         x_f64 = np.float64(x_f32)
         assert_array_max_ulp(np.sin(x_f32), np.float32(np.sin(x_f64)), maxulp=2)
         assert_array_max_ulp(np.cos(x_f32), np.float32(np.cos(x_f64)), maxulp=2)
@@ -3439,13 +3433,14 @@ class TestComplexFunctions:
         x_series = np.logspace(-20, -3.001, 200)
         x_basic = np.logspace(-2.999, 0, 10, endpoint=False)
 
-        if dtype is np.longcomplex:
+        if glibc_older_than("2.19") and dtype is np.longcomplex:
+            if (platform.machine() == 'aarch64' and bad_arcsinh()):
+                pytest.skip("Trig functions of np.longcomplex values known "
+                            "to be inaccurate on aarch64 for some compilation "
+                            "configurations.")
             # It's not guaranteed that the system-provided arc functions
             # are accurate down to a few epsilons. (Eg. on Linux 64-bit)
             # So, give more leeway for long complex tests here:
-            # Can use 2.1 for > Ubuntu LTS Trusty (2014), glibc = 2.19.
-            if skip_longcomplex_msg:
-                pytest.skip(skip_longcomplex_msg)
             check(x_series, 50.0*eps)
         else:
             check(x_series, 2.1*eps)
@@ -3886,3 +3881,11 @@ def test_bad_legacy_ufunc_silent_errors():
 
     with pytest.raises(RuntimeError, match=r"How unexpected :\)!"):
         ncu_tests.always_error.at(arr, [0, 1, 2], arr)
+
+
+@pytest.mark.parametrize('x1', [np.arange(3.0), [0.0, 1.0, 2.0]])
+def test_bad_legacy_gufunc_silent_errors(x1):
+    # Verify that an exception raised in a gufunc loop propagates correctly.
+    # The signature of always_error_gufunc is '(i),()->()'.
+    with pytest.raises(RuntimeError, match=r"How unexpected :\)!"):
+        ncu_tests.always_error_gufunc(x1, 0.0)
diff --git a/numpy/core/tests/test_umath_accuracy.py b/numpy/core/tests/test_umath_accuracy.py
index a703c697a..32e2dca66 100644
--- a/numpy/core/tests/test_umath_accuracy.py
+++ b/numpy/core/tests/test_umath_accuracy.py
@@ -1,5 +1,4 @@
 import numpy as np
-import platform
 import os
 from os import path
 import sys