diff options
Diffstat (limited to 'numpy/core')
271 files changed, 7635 insertions, 2366 deletions
diff --git a/numpy/core/__init__.py b/numpy/core/__init__.py index dad9293e1..332f9940e 100644 --- a/numpy/core/__init__.py +++ b/numpy/core/__init__.py @@ -9,6 +9,7 @@ are available in the main ``numpy`` namespace - use that instead. from numpy.version import version as __version__ import os +import warnings # disables OpenBLAS affinity setting of the main thread that limits # python threads or processes to one core @@ -80,8 +81,8 @@ from .memmap import * from .defchararray import chararray from . import function_base from .function_base import * -from . import machar -from .machar import * +from . import _machar +from ._machar import * from . import getlimits from .getlimits import * from . import shape_base @@ -109,7 +110,6 @@ __all__ += fromnumeric.__all__ __all__ += ['record', 'recarray', 'format_parser'] __all__ += ['chararray'] __all__ += function_base.__all__ -__all__ += machar.__all__ __all__ += getlimits.__all__ __all__ += shape_base.__all__ __all__ += einsumfunc.__all__ @@ -151,6 +151,17 @@ def _DType_reduce(DType): return _DType_reconstruct, (scalar_type,) +def __getattr__(name): + # Deprecated 2021-10-20, NumPy 1.22 + if name == "machar": + warnings.warn( + "The `np.core.machar` module is deprecated (NumPy 1.22)", + DeprecationWarning, stacklevel=2, + ) + return _machar + raise AttributeError(f"Module {__name__!r} has no attribute {name!r}") + + import copyreg copyreg.pickle(ufunc, _ufunc_reduce) diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py index 06f2a6376..c8a24db0c 100644 --- a/numpy/core/_add_newdocs.py +++ b/numpy/core/_add_newdocs.py @@ -328,7 +328,7 @@ add_newdoc('numpy.core', 'nditer', ... with it: ... for (a, b, c) in it: ... addop(a, b, out=c) - ... return it.operands[2] + ... 
return it.operands[2] Here is the same function, but following the C-style pattern: @@ -478,7 +478,7 @@ add_newdoc('numpy.core', 'nditer', ('iternext', add_newdoc('numpy.core', 'nditer', ('remove_axis', """ - remove_axis(i) + remove_axis(i, /) Removes axis `i` from the iterator. Requires that the flag "multi_index" be enabled. @@ -504,6 +504,9 @@ add_newdoc('numpy.core', 'nditer', ('reset', add_newdoc('numpy.core', 'nested_iters', """ + nested_iters(op, axes, flags=None, op_flags=None, op_dtypes=None, \ + order="K", casting="safe", buffersize=0) + Create nditers for use in nested loops Create a tuple of `nditer` objects which iterate in nested loops over @@ -796,7 +799,7 @@ add_newdoc('numpy.core.multiarray', 'array', object : array_like An array, any object exposing the array interface, an object whose __array__ method returns an array, or any (nested) sequence. - If object is a scalar, a 0-dimensional array containing object is + If object is a scalar, a 0-dimensional array containing object is returned. dtype : data-type, optional The desired data-type for the array. If not given, then the type will @@ -2201,8 +2204,8 @@ add_newdoc('numpy.core.multiarray', 'ndarray', empty : Create an array, but leave its allocated memory unchanged (i.e., it contains "garbage"). dtype : Create a data-type. - numpy.typing.NDArray : A :term:`generic <generic type>` version - of ndarray. + numpy.typing.NDArray : An ndarray alias :term:`generic <generic type>` + w.r.t. its `dtype.type <numpy.dtype.type>`. Notes ----- @@ -2798,6 +2801,39 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('__copy__', """)) +add_newdoc('numpy.core.multiarray', 'ndarray', ('__class_getitem__', + """a.__class_getitem__(item, /) + + Return a parametrized wrapper around the `~numpy.ndarray` type. + + .. versionadded:: 1.22 + + Returns + ------- + alias : types.GenericAlias + A parametrized `~numpy.ndarray` type. 
+ + Examples + -------- + >>> from typing import Any + >>> import numpy as np + + >>> np.ndarray[Any, np.dtype[Any]] + numpy.ndarray[typing.Any, numpy.dtype[typing.Any]] + + Notes + ----- + This method is only available for python 3.9 and later. + + See Also + -------- + :pep:`585` : Type hinting generics in standard collections. + numpy.typing.NDArray : An ndarray alias :term:`generic <generic type>` + w.r.t. its `dtype.type <numpy.dtype.type>`. + + """)) + + add_newdoc('numpy.core.multiarray', 'ndarray', ('__deepcopy__', """a.__deepcopy__(memo, /) -> Deep copy of array. @@ -3541,7 +3577,7 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('newbyteorder', * 'S' - swap dtype from current to opposite endian * {'<', 'little'} - little endian * {'>', 'big'} - big endian - * '=' - native order, equivalent to `sys.byteorder` + * {'=', 'native'} - native order, equivalent to `sys.byteorder` * {'|', 'I'} - ignore (no change to byte order) The default value ('S') results in swapping the current @@ -4008,6 +4044,9 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('partition', The order of all elements in the partitions is undefined. If provided with a sequence of kth it will partition all elements indexed by kth of them into their sorted position at once. + + .. deprecated:: 1.22.0 + Passing booleans as index is deprecated. axis : int, optional Axis along which to sort. Default is -1, which means sort along the last axis. @@ -4688,6 +4727,16 @@ add_newdoc('numpy.core.umath', '_add_newdoc_ufunc', and then throwing away the ufunc. """) +add_newdoc('numpy.core.multiarray', 'get_handler_name', + """ + get_handler_name(a: ndarray) -> str,None + + Return the name of the memory handler used by `a`. If not provided, return + the name of the memory handler that will be used to allocate data for the + next `ndarray` in this context. May return None if `a` does not own its + memory, in which case you can traverse ``a.base`` for a memory handler. 
+ """) + add_newdoc('numpy.core.multiarray', '_set_madvise_hugepage', """ _set_madvise_hugepage(enabled: bool) -> bool @@ -6001,7 +6050,7 @@ add_newdoc('numpy.core.multiarray', 'dtype', ('newbyteorder', * 'S' - swap dtype from current to opposite endian * {'<', 'little'} - little endian * {'>', 'big'} - big endian - * '=' - native order + * {'=', 'native'} - native order * {'|', 'I'} - ignore (no change to byte order) Returns @@ -6044,6 +6093,97 @@ add_newdoc('numpy.core.multiarray', 'dtype', ('newbyteorder', """)) +add_newdoc('numpy.core.multiarray', 'dtype', ('__class_getitem__', + """ + __class_getitem__(item, /) + + Return a parametrized wrapper around the `~numpy.dtype` type. + + .. versionadded:: 1.22 + + Returns + ------- + alias : types.GenericAlias + A parametrized `~numpy.dtype` type. + + Examples + -------- + >>> import numpy as np + + >>> np.dtype[np.int64] + numpy.dtype[numpy.int64] + + Notes + ----- + This method is only available for python 3.9 and later. + + See Also + -------- + :pep:`585` : Type hinting generics in standard collections. + + """)) + +add_newdoc('numpy.core.multiarray', 'dtype', ('__ge__', + """ + __ge__(value, /) + + Return ``self >= value``. + + Equivalent to ``np.can_cast(value, self, casting="safe")``. + + See Also + -------- + can_cast : Returns True if cast between data types can occur according to + the casting rule. + + """)) + +add_newdoc('numpy.core.multiarray', 'dtype', ('__le__', + """ + __le__(value, /) + + Return ``self <= value``. + + Equivalent to ``np.can_cast(self, value, casting="safe")``. + + See Also + -------- + can_cast : Returns True if cast between data types can occur according to + the casting rule. + + """)) + +add_newdoc('numpy.core.multiarray', 'dtype', ('__gt__', + """ + __ge__(value, /) + + Return ``self > value``. + + Equivalent to + ``self != value and np.can_cast(value, self, casting="safe")``. 
+ + See Also + -------- + can_cast : Returns True if cast between data types can occur according to + the casting rule. + + """)) + +add_newdoc('numpy.core.multiarray', 'dtype', ('__lt__', + """ + __lt__(value, /) + + Return ``self < value``. + + Equivalent to + ``self != value and np.can_cast(self, value, casting="safe")``. + + See Also + -------- + can_cast : Returns True if cast between data types can occur according to + the casting rule. + + """)) ############################################################################## # @@ -6372,7 +6512,7 @@ add_newdoc('numpy.core.numerictypes', 'generic', ('newbyteorder', * 'S' - swap dtype from current to opposite endian * {'<', 'little'} - little endian * {'>', 'big'} - big endian - * '=' - native order + * {'=', 'native'} - native order * {'|', 'I'} - ignore (no change to byte order) Parameters @@ -6465,6 +6605,36 @@ add_newdoc('numpy.core.numerictypes', 'generic', add_newdoc('numpy.core.numerictypes', 'generic', refer_to_array_attribute('view')) +add_newdoc('numpy.core.numerictypes', 'number', ('__class_getitem__', + """ + __class_getitem__(item, /) + + Return a parametrized wrapper around the `~numpy.number` type. + + .. versionadded:: 1.22 + + Returns + ------- + alias : types.GenericAlias + A parametrized `~numpy.number` type. + + Examples + -------- + >>> from typing import Any + >>> import numpy as np + + >>> np.signedinteger[Any] + numpy.signedinteger[typing.Any] + + Notes + ----- + This method is only available for python 3.9 and later. + + See Also + -------- + :pep:`585` : Type hinting generics in standard collections. 
+ + """)) ############################################################################## # diff --git a/numpy/core/_add_newdocs_scalars.py b/numpy/core/_add_newdocs_scalars.py index 8773d6c96..94859a9d5 100644 --- a/numpy/core/_add_newdocs_scalars.py +++ b/numpy/core/_add_newdocs_scalars.py @@ -290,3 +290,22 @@ for float_name in ('half', 'single', 'double', 'longdouble'): >>> np.{float_name}(3.2).is_integer() False """)) + +for int_name in ('int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32', + 'int64', 'uint64', 'int64', 'uint64', 'int64', 'uint64'): + # Add negative examples for signed cases by checking typecode + add_newdoc('numpy.core.numerictypes', int_name, ('bit_count', + f""" + {int_name}.bit_count() -> int + + Computes the number of 1-bits in the absolute value of the input. + Analogous to the builtin `int.bit_count` or ``popcount`` in C++. + + Examples + -------- + >>> np.{int_name}(127).bit_count() + 7""" + + (f""" + >>> np.{int_name}(-127).bit_count() + 7 + """ if dtype(int_name).char.islower() else ""))) diff --git a/numpy/core/_dtype.py b/numpy/core/_dtype.py index 4249071ff..c3a22b1c6 100644 --- a/numpy/core/_dtype.py +++ b/numpy/core/_dtype.py @@ -200,30 +200,37 @@ def _struct_dict_str(dtype, includealignedflag): # Build up a string to make the dictionary + if np.core.arrayprint._get_legacy_print_mode() <= 121: + colon = ":" + fieldsep = "," + else: + colon = ": " + fieldsep = ", " + # First, the names - ret = "{'names':[" - ret += ",".join(repr(name) for name in names) + ret = "{'names'%s[" % colon + ret += fieldsep.join(repr(name) for name in names) # Second, the formats - ret += "], 'formats':[" - ret += ",".join( + ret += "], 'formats'%s[" % colon + ret += fieldsep.join( _construction_repr(fld_dtype, short=True) for fld_dtype in fld_dtypes) # Third, the offsets - ret += "], 'offsets':[" - ret += ",".join("%d" % offset for offset in offsets) + ret += "], 'offsets'%s[" % colon + ret += fieldsep.join("%d" % offset for offset in offsets) # Fourth, 
the titles if any(title is not None for title in titles): - ret += "], 'titles':[" - ret += ",".join(repr(title) for title in titles) + ret += "], 'titles'%s[" % colon + ret += fieldsep.join(repr(title) for title in titles) # Fifth, the itemsize - ret += "], 'itemsize':%d" % dtype.itemsize + ret += "], 'itemsize'%s%d" % (colon, dtype.itemsize) if (includealignedflag and dtype.isalignedstruct): # Finally, the aligned flag - ret += ", 'aligned':True}" + ret += ", 'aligned'%sTrue}" % colon else: ret += "}" diff --git a/numpy/core/machar.py b/numpy/core/_machar.py index 04dad4d77..ace19a429 100644 --- a/numpy/core/machar.py +++ b/numpy/core/_machar.py @@ -1,5 +1,5 @@ """ -Machine arithmetics - determine the parameters of the +Machine arithmetic - determine the parameters of the floating-point arithmetic system Author: Pearu Peterson, September 2003 @@ -13,6 +13,7 @@ from numpy.core.overrides import set_module # Need to speed this up...especially for longfloat +# Deprecated 2021-10-20, NumPy 1.22 @set_module('numpy') class MachAr: """ diff --git a/numpy/core/_methods.py b/numpy/core/_methods.py index e475b94df..a239e2c87 100644 --- a/numpy/core/_methods.py +++ b/numpy/core/_methods.py @@ -221,8 +221,10 @@ def _var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, *, if isinstance(arrmean, mu.ndarray): arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe', subok=False) - else: + elif hasattr(arrmean, "dtype"): arrmean = arrmean.dtype.type(arrmean / rcount) + else: + arrmean = arrmean / rcount # Compute sum of squared deviations from mean # Note that x may not be inexact and that we need it to be an array, diff --git a/numpy/core/_ufunc_config.pyi b/numpy/core/_ufunc_config.pyi index 9c8cc8ab6..cd7129bcb 100644 --- a/numpy/core/_ufunc_config.pyi +++ b/numpy/core/_ufunc_config.pyi @@ -1,11 +1,10 @@ -from typing import Optional, Union, Callable, Any, Literal, Protocol, TypedDict +from typing import Optional, Union, Callable, Any, Literal, TypedDict 
+ +from numpy import _SupportsWrite _ErrKind = Literal["ignore", "warn", "raise", "call", "print", "log"] _ErrFunc = Callable[[str, int], Any] -class _SupportsWrite(Protocol): - def write(self, msg: str, /) -> Any: ... - class _ErrDict(TypedDict): divide: _ErrKind over: _ErrKind @@ -30,8 +29,8 @@ def geterr() -> _ErrDict: ... def setbufsize(size: int) -> int: ... def getbufsize() -> int: ... def seterrcall( - func: Union[None, _ErrFunc, _SupportsWrite] -) -> Union[None, _ErrFunc, _SupportsWrite]: ... -def geterrcall() -> Union[None, _ErrFunc, _SupportsWrite]: ... + func: Union[None, _ErrFunc, _SupportsWrite[str]] +) -> Union[None, _ErrFunc, _SupportsWrite[str]]: ... +def geterrcall() -> Union[None, _ErrFunc, _SupportsWrite[str]]: ... # See `numpy/__init__.pyi` for the `errstate` class diff --git a/numpy/core/arrayprint.py b/numpy/core/arrayprint.py index 2a4bef669..d7e9bf795 100644 --- a/numpy/core/arrayprint.py +++ b/numpy/core/arrayprint.py @@ -24,6 +24,7 @@ __docformat__ = 'restructuredtext' import functools import numbers +import sys try: from _thread import get_ident except ImportError: @@ -56,12 +57,17 @@ _format_options = { 'infstr': 'inf', 'sign': '-', 'formatter': None, - 'legacy': False} + # Internally stored as an int to simplify comparisons; converted from/to + # str/False on the way in/out. + 'legacy': sys.maxsize} def _make_options_dict(precision=None, threshold=None, edgeitems=None, linewidth=None, suppress=None, nanstr=None, infstr=None, sign=None, formatter=None, floatmode=None, legacy=None): - """ make a dictionary out of the non-None arguments, plus sanity checks """ + """ + Make a dictionary out of the non-None arguments, plus conversion of + *legacy* and sanity checks. 
+ """ options = {k: v for k, v in locals().items() if v is not None} @@ -76,9 +82,18 @@ def _make_options_dict(precision=None, threshold=None, edgeitems=None, if sign not in [None, '-', '+', ' ']: raise ValueError("sign option must be one of ' ', '+', or '-'") - if legacy not in [None, False, '1.13']: - warnings.warn("legacy printing option can currently only be '1.13' or " - "`False`", stacklevel=3) + if legacy == False: + options['legacy'] = sys.maxsize + elif legacy == '1.13': + options['legacy'] = 113 + elif legacy == '1.21': + options['legacy'] = 121 + elif legacy is None: + pass # OK, do nothing. + else: + warnings.warn( + "legacy printing option can currently only be '1.13', '1.21', or " + "`False`", stacklevel=3) if threshold is not None: # forbid the bad threshold arg suggested by stack overflow, gh-12351 @@ -186,11 +201,21 @@ def set_printoptions(precision=None, threshold=None, edgeitems=None, legacy : string or `False`, optional If set to the string `'1.13'` enables 1.13 legacy printing mode. This approximates numpy 1.13 print output by including a space in the sign - position of floats and different behavior for 0d arrays. If set to - `False`, disables legacy mode. Unrecognized strings will be ignored - with a warning for forward compatibility. + position of floats and different behavior for 0d arrays. This also + enables 1.21 legacy printing mode (described below). + + If set to the string `'1.21'` enables 1.21 legacy printing mode. This + approximates numpy 1.21 print output of complex structured dtypes + by not inserting spaces after commas that separate fields and after + colons. + + If set to `False`, disables legacy mode. + + Unrecognized strings will be ignored with a warning for forward + compatibility. .. versionadded:: 1.14.0 + .. 
versionchanged:: 1.22.0 See Also -------- @@ -257,11 +282,13 @@ def set_printoptions(precision=None, threshold=None, edgeitems=None, _format_options.update(opt) # set the C variable for legacy mode - if _format_options['legacy'] == '1.13': + if _format_options['legacy'] == 113: set_legacy_print_mode(113) # reset the sign option in legacy mode to avoid confusion _format_options['sign'] = '-' - elif _format_options['legacy'] is False: + elif _format_options['legacy'] == 121: + set_legacy_print_mode(121) + elif _format_options['legacy'] == sys.maxsize: set_legacy_print_mode(0) @@ -292,7 +319,16 @@ def get_printoptions(): set_printoptions, printoptions, set_string_function """ - return _format_options.copy() + opts = _format_options.copy() + opts['legacy'] = { + 113: '1.13', 121: '1.21', sys.maxsize: False, + }[opts['legacy']] + return opts + + +def _get_legacy_print_mode(): + """Return the legacy print mode as an int.""" + return _format_options['legacy'] @set_module('numpy') @@ -678,7 +714,7 @@ def array2string(a, max_line_width=None, precision=None, options = _format_options.copy() options.update(overrides) - if options['legacy'] == '1.13': + if options['legacy'] <= 113: if style is np._NoValue: style = repr @@ -690,7 +726,7 @@ def array2string(a, max_line_width=None, precision=None, " except in 1.13 'legacy' mode", DeprecationWarning, stacklevel=3) - if options['legacy'] != '1.13': + if options['legacy'] > 113: options['linewidth'] -= len(suffix) # treat as a null array if any of shape elements == 0 @@ -702,7 +738,7 @@ def array2string(a, max_line_width=None, precision=None, def _extendLine(s, line, word, line_width, next_line_prefix, legacy): needs_wrap = len(line) + len(word) > line_width - if legacy != '1.13': + if legacy > 113: # don't wrap lines if it won't help if len(line) <= len(next_line_prefix): needs_wrap = False @@ -719,7 +755,7 @@ def _extendLine_pretty(s, line, word, line_width, next_line_prefix, legacy): Extends line with nicely formatted (possibly 
multi-line) string ``word``. """ words = word.splitlines() - if len(words) == 1 or legacy == '1.13': + if len(words) == 1 or legacy <= 113: return _extendLine(s, line, word, line_width, next_line_prefix, legacy) max_word_length = max(len(word) for word in words) @@ -765,7 +801,7 @@ def _formatArray(a, format_function, line_width, next_line_prefix, # when recursing, add a space to align with the [ added, and reduce the # length of the line by 1 next_hanging_indent = hanging_indent + ' ' - if legacy == '1.13': + if legacy <= 113: next_width = curr_width else: next_width = curr_width - len(']') @@ -785,7 +821,7 @@ def _formatArray(a, format_function, line_width, next_line_prefix, # last axis (rows) - wrap elements if they would not fit on one line if axes_left == 1: # the length up until the beginning of the separator / bracket - if legacy == '1.13': + if legacy <= 113: elem_width = curr_width - len(separator.rstrip()) else: elem_width = curr_width - max(len(separator.rstrip()), len(']')) @@ -800,7 +836,7 @@ def _formatArray(a, format_function, line_width, next_line_prefix, if show_summary: s, line = _extendLine( s, line, summary_insert, elem_width, hanging_indent, legacy) - if legacy == '1.13': + if legacy <= 113: line += ", " else: line += separator @@ -811,7 +847,7 @@ def _formatArray(a, format_function, line_width, next_line_prefix, s, line, word, elem_width, hanging_indent, legacy) line += separator - if legacy == '1.13': + if legacy <= 113: # width of the separator is not considered on 1.13 elem_width = curr_width word = recurser(index + (-1,), next_hanging_indent, next_width) @@ -830,7 +866,7 @@ def _formatArray(a, format_function, line_width, next_line_prefix, s += hanging_indent + nested + line_sep if show_summary: - if legacy == '1.13': + if legacy <= 113: # trailing space, fixed nbr of newlines, and fixed separator s += hanging_indent + summary_insert + ", \n" else: @@ -875,7 +911,7 @@ class FloatingFormat: sign = '+' if sign else '-' self._legacy = legacy 
- if self._legacy == '1.13': + if self._legacy <= 113: # when not 0d, legacy does not support '-' if data.shape != () and sign == '-': sign = ' ' @@ -919,7 +955,7 @@ class FloatingFormat: self.min_digits = None elif self.exp_format: trim, unique = '.', True - if self.floatmode == 'fixed' or self._legacy == '1.13': + if self.floatmode == 'fixed' or self._legacy <= 113: trim, unique = 'k', False strs = (dragon4_scientific(x, precision=self.precision, unique=unique, trim=trim, sign=self.sign == '+') @@ -934,7 +970,7 @@ class FloatingFormat: self.unique = unique # for back-compat with np 1.13, use 2 spaces & sign and full prec - if self._legacy == '1.13': + if self._legacy <= 113: self.pad_left = 3 else: # this should be only 1 or 2. Can be calculated from sign. @@ -951,7 +987,7 @@ class FloatingFormat: sign=self.sign == '+') for x in finite_vals) int_part, frac_part = zip(*(s.split('.') for s in strs)) - if self._legacy == '1.13': + if self._legacy <= 113: self.pad_left = 1 + max(len(s.lstrip('-+')) for s in int_part) else: self.pad_left = max(len(s) for s in int_part) @@ -966,7 +1002,7 @@ class FloatingFormat: self.trim = '.' 
self.min_digits = 0 - if self._legacy != '1.13': + if self._legacy > 113: # account for sign = ' ' by adding one to pad_left if self.sign == ' ' and not any(np.signbit(finite_vals)): self.pad_left += 1 @@ -1215,7 +1251,7 @@ class ComplexFloatingFormat: sign = '+' if sign else '-' floatmode_real = floatmode_imag = floatmode - if legacy == '1.13': + if legacy <= 113: floatmode_real = 'maxprec_equal' floatmode_imag = 'maxprec' @@ -1286,7 +1322,7 @@ class DatetimeFormat(_TimelikeFormat): super().__init__(x) def __call__(self, x): - if self.legacy == '1.13': + if self.legacy <= 113: return self._format_non_nat(x) return super().__call__(x) @@ -1390,7 +1426,7 @@ def dtype_is_implied(dtype): array([1, 2, 3], dtype=int8) """ dtype = np.dtype(dtype) - if _format_options['legacy'] == '1.13' and dtype.type == bool_: + if _format_options['legacy'] <= 113 and dtype.type == bool_: return False # not just void types can be structured, and names are not part of the repr @@ -1445,7 +1481,7 @@ def _array_repr_implementation( prefix = class_name + "(" suffix = ")" if skipdtype else "," - if (_format_options['legacy'] == '1.13' and + if (_format_options['legacy'] <= 113 and arr.shape == () and not arr.dtype.names): lst = repr(arr.item()) elif arr.size > 0 or arr.shape == (0,): @@ -1466,7 +1502,7 @@ def _array_repr_implementation( # Note: This line gives the correct result even when rfind returns -1. 
last_line_len = len(arr_str) - (arr_str.rfind('\n') + 1) spacer = " " - if _format_options['legacy'] == '1.13': + if _format_options['legacy'] <= 113: if issubclass(arr.dtype.type, flexible): spacer = '\n' + ' '*len(class_name + "(") elif last_line_len + len(dtype_str) + 1 > max_line_width: @@ -1540,7 +1576,7 @@ def _array_str_implementation( a, max_line_width=None, precision=None, suppress_small=None, array2string=array2string): """Internal version of array_str() that allows overriding array2string.""" - if (_format_options['legacy'] == '1.13' and + if (_format_options['legacy'] <= 113 and a.shape == () and not a.dtype.names): return str(a.item()) diff --git a/numpy/core/arrayprint.pyi b/numpy/core/arrayprint.pyi index df22efed6..0d338206f 100644 --- a/numpy/core/arrayprint.pyi +++ b/numpy/core/arrayprint.pyi @@ -1,8 +1,8 @@ from types import TracebackType from typing import Any, Optional, Callable, Union, Type, Literal, TypedDict, SupportsIndex -# Using a private class is by no means ideal, but it is simply a consquence -# of a `contextlib.context` returning an instance of aformentioned class +# Using a private class is by no means ideal, but it is simply a consequence +# of a `contextlib.context` returning an instance of aforementioned class from contextlib import _GeneratorContextManager from numpy import ( @@ -53,7 +53,7 @@ class _FormatOptions(TypedDict): formatter: Optional[_FormatDict] sign: Literal["-", "+", " "] floatmode: _FloatMode - legacy: Literal[False, "1.13"] + legacy: Literal[False, "1.13", "1.21"] def set_printoptions( precision: Optional[SupportsIndex] = ..., @@ -67,7 +67,7 @@ def set_printoptions( sign: Optional[Literal["-", "+", " "]] = ..., floatmode: Optional[_FloatMode] = ..., *, - legacy: Optional[Literal[False, "1.13"]] = ... + legacy: Optional[Literal[False, "1.13", "1.21"]] = ... ) -> None: ... def get_printoptions() -> _FormatOptions: ... 
def array2string( @@ -87,7 +87,7 @@ def array2string( sign: Optional[Literal["-", "+", " "]] = ..., floatmode: Optional[_FloatMode] = ..., suffix: str = ..., - legacy: Optional[Literal[False, "1.13"]] = ..., + legacy: Optional[Literal[False, "1.13", "1.21"]] = ..., ) -> str: ... def format_float_scientific( x: _FloatLike_co, @@ -137,5 +137,5 @@ def printoptions( sign: Optional[Literal["-", "+", " "]] = ..., floatmode: Optional[_FloatMode] = ..., *, - legacy: Optional[Literal[False, "1.13"]] = ... + legacy: Optional[Literal[False, "1.13", "1.21"]] = ... ) -> _GeneratorContextManager[_FormatOptions]: ... diff --git a/numpy/core/code_generators/cversions.txt b/numpy/core/code_generators/cversions.txt index a02c7153a..38ee4dac2 100644 --- a/numpy/core/code_generators/cversions.txt +++ b/numpy/core/code_generators/cversions.txt @@ -56,5 +56,7 @@ # DType related API additions. # A new field was added to the end of PyArrayObject_fields. # Version 14 (NumPy 1.21) No change. -# Version 14 (NumPy 1.22) No change. 
0x0000000e = 17a0f366e55ec05e5c5c149123478452 + +# Version 15 (NumPy 1.22) Configurable memory allocations +0x0000000f = 0c420aed67010594eb81f23ddfb02a88 diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 4891e8f23..3a27a34cd 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -359,7 +359,7 @@ defdict = { docstrings.get('numpy.core.umath.fmod'), None, TD(ints), - TD(flts, f='fmod', astype={'e':'f'}), + TD(flts, f='fmod', astype={'e': 'f'}), TD(P, f='fmod'), ), 'square': @@ -390,7 +390,7 @@ defdict = { docstrings.get('numpy.core.umath.power'), None, TD(ints), - TD(inexact, f='pow', astype={'e':'f'}), + TD(inexact, f='pow', astype={'e': 'f'}), TD(O, f='npy_ObjectPower'), ), 'float_power': @@ -551,13 +551,13 @@ defdict = { Ufunc(2, 1, MinusInfinity, docstrings.get('numpy.core.umath.logaddexp'), None, - TD(flts, f="logaddexp", astype={'e':'f'}) + TD(flts, f="logaddexp", astype={'e': 'f'}) ), 'logaddexp2': Ufunc(2, 1, MinusInfinity, docstrings.get('numpy.core.umath.logaddexp2'), None, - TD(flts, f="logaddexp2", astype={'e':'f'}) + TD(flts, f="logaddexp2", astype={'e': 'f'}) ), 'bitwise_and': Ufunc(2, 1, AllOnes, @@ -605,80 +605,93 @@ defdict = { Ufunc(2, 1, None, docstrings.get('numpy.core.umath.heaviside'), None, - TD(flts, f='heaviside', astype={'e':'f'}), + TD(flts, f='heaviside', astype={'e': 'f'}), ), 'degrees': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.degrees'), None, - TD(fltsP, f='degrees', astype={'e':'f'}), + TD(fltsP, f='degrees', astype={'e': 'f'}), ), 'rad2deg': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.rad2deg'), None, - TD(fltsP, f='rad2deg', astype={'e':'f'}), + TD(fltsP, f='rad2deg', astype={'e': 'f'}), ), 'radians': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.radians'), None, - TD(fltsP, f='radians', astype={'e':'f'}), + TD(fltsP, f='radians', astype={'e': 'f'}), ), 'deg2rad': Ufunc(1, 1, None, 
docstrings.get('numpy.core.umath.deg2rad'), None, - TD(fltsP, f='deg2rad', astype={'e':'f'}), + TD(fltsP, f='deg2rad', astype={'e': 'f'}), ), 'arccos': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.arccos'), None, - TD(inexact, f='acos', astype={'e':'f'}), + TD('e', f='acos', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='acos', astype={'e': 'f'}), TD(P, f='arccos'), ), 'arccosh': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.arccosh'), None, - TD(inexact, f='acosh', astype={'e':'f'}), + TD('e', f='acosh', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='acosh', astype={'e': 'f'}), TD(P, f='arccosh'), ), 'arcsin': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.arcsin'), None, - TD(inexact, f='asin', astype={'e':'f'}), + TD('e', f='asin', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='asin', astype={'e': 'f'}), TD(P, f='arcsin'), ), 'arcsinh': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.arcsinh'), None, - TD(inexact, f='asinh', astype={'e':'f'}), + TD('e', f='asinh', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='asinh', astype={'e': 'f'}), TD(P, f='arcsinh'), ), 'arctan': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.arctan'), None, - TD(inexact, f='atan', astype={'e':'f'}), + TD('e', f='atan', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='atan', astype={'e': 'f'}), TD(P, f='arctan'), ), 'arctanh': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.arctanh'), None, - TD(inexact, f='atanh', astype={'e':'f'}), + TD('e', f='atanh', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='atanh', astype={'e': 'f'}), TD(P, f='arctanh'), ), 'cos': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.cos'), None, - TD('e', f='cos', astype={'e':'f'}), + TD('e', f='cos', astype={'e': 'f'}), TD('f', dispatch=[('loops_trigonometric', 
'f')]), + TD('d', dispatch=[('loops_umath_fp', 'd')]), TD('fdg' + cmplx, f='cos'), TD(P, f='cos'), ), @@ -686,8 +699,9 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.sin'), None, - TD('e', f='sin', astype={'e':'f'}), + TD('e', f='sin', astype={'e': 'f'}), TD('f', dispatch=[('loops_trigonometric', 'f')]), + TD('d', dispatch=[('loops_umath_fp', 'd')]), TD('fdg' + cmplx, f='sin'), TD(P, f='sin'), ), @@ -695,35 +709,43 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.tan'), None, - TD(inexact, f='tan', astype={'e':'f'}), + TD('e', f='tan', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='tan', astype={'e': 'f'}), TD(P, f='tan'), ), 'cosh': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.cosh'), None, - TD(inexact, f='cosh', astype={'e':'f'}), + TD('e', f='cosh', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='cosh', astype={'e': 'f'}), TD(P, f='cosh'), ), 'sinh': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.sinh'), None, - TD(inexact, f='sinh', astype={'e':'f'}), + TD('e', f='sinh', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='sinh', astype={'e': 'f'}), TD(P, f='sinh'), ), 'tanh': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.tanh'), None, - TD(inexact, f='tanh', astype={'e':'f'}), + TD('e', f='tanh', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='tanh', astype={'e': 'f'}), TD(P, f='tanh'), ), 'exp': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.exp'), None, - TD('e', f='exp', astype={'e':'f'}), + TD('e', f='exp', astype={'e': 'f'}), TD('fd', dispatch=[('loops_exponent_log', 'fd')]), TD('fdg' + cmplx, f='exp'), TD(P, f='exp'), @@ -732,21 +754,25 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.exp2'), None, - TD(inexact, f='exp2', astype={'e':'f'}), + TD('e', f='exp2', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + 
TD(inexact, f='exp2', astype={'e': 'f'}), TD(P, f='exp2'), ), 'expm1': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.expm1'), None, - TD(inexact, f='expm1', astype={'e':'f'}), + TD('e', f='expm1', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='expm1', astype={'e': 'f'}), TD(P, f='expm1'), ), 'log': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.log'), None, - TD('e', f='log', astype={'e':'f'}), + TD('e', f='log', astype={'e': 'f'}), TD('fd', dispatch=[('loops_exponent_log', 'fd')]), TD('fdg' + cmplx, f='log'), TD(P, f='log'), @@ -755,28 +781,34 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.log2'), None, - TD(inexact, f='log2', astype={'e':'f'}), + TD('e', f='log2', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='log2', astype={'e': 'f'}), TD(P, f='log2'), ), 'log10': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.log10'), None, - TD(inexact, f='log10', astype={'e':'f'}), + TD('e', f='log10', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='log10', astype={'e': 'f'}), TD(P, f='log10'), ), 'log1p': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.log1p'), None, - TD(inexact, f='log1p', astype={'e':'f'}), + TD('e', f='log1p', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(inexact, f='log1p', astype={'e': 'f'}), TD(P, f='log1p'), ), 'sqrt': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.sqrt'), None, - TD('e', f='sqrt', astype={'e':'f'}), + TD('e', f='sqrt', astype={'e': 'f'}), TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]), TD('fdg' + cmplx, f='sqrt'), TD(P, f='sqrt'), @@ -785,14 +817,16 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.cbrt'), None, - TD(flts, f='cbrt', astype={'e':'f'}), + TD('e', f='cbrt', astype={'e': 'f'}), + TD('fd', dispatch=[('loops_umath_fp', 'fd')]), + TD(flts, f='cbrt', astype={'e': 'f'}), TD(P, f='cbrt'), ), 'ceil': Ufunc(1, 1, None, 
docstrings.get('numpy.core.umath.ceil'), None, - TD('e', f='ceil', astype={'e':'f'}), + TD('e', f='ceil', astype={'e': 'f'}), TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD('fdg', f='ceil'), TD(O, f='npy_ObjectCeil'), @@ -801,7 +835,7 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.trunc'), None, - TD('e', f='trunc', astype={'e':'f'}), + TD('e', f='trunc', astype={'e': 'f'}), TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD('fdg', f='trunc'), TD(O, f='npy_ObjectTrunc'), @@ -810,14 +844,14 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.fabs'), None, - TD(flts, f='fabs', astype={'e':'f'}), + TD(flts, f='fabs', astype={'e': 'f'}), TD(P, f='fabs'), ), 'floor': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.floor'), None, - TD('e', f='floor', astype={'e':'f'}), + TD('e', f='floor', astype={'e': 'f'}), TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD('fdg', f='floor'), TD(O, f='npy_ObjectFloor'), @@ -826,7 +860,7 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.rint'), None, - TD('e', f='rint', astype={'e':'f'}), + TD('e', f='rint', astype={'e': 'f'}), TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]), TD('fdg' + cmplx, f='rint'), TD(P, f='rint'), @@ -835,7 +869,7 @@ defdict = { Ufunc(2, 1, None, docstrings.get('numpy.core.umath.arctan2'), None, - TD(flts, f='atan2', astype={'e':'f'}), + TD(flts, f='atan2', astype={'e': 'f'}), TD(P, f='arctan2'), ), 'remainder': @@ -858,7 +892,7 @@ defdict = { Ufunc(2, 1, Zero, docstrings.get('numpy.core.umath.hypot'), None, - TD(flts, f='hypot', astype={'e':'f'}), + TD(flts, f='hypot', astype={'e': 'f'}), TD(P, f='hypot'), ), 'isnan': diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py index fbd323368..3813c6ad7 100644 --- a/numpy/core/code_generators/numpy_api.py +++ b/numpy/core/code_generators/numpy_api.py @@ -76,9 +76,9 @@ multiarray_types_api = { # End 1.6 API } -#define NPY_NUMUSERTYPES 
(*(int *)PyArray_API[6]) -#define PyBoolArrType_Type (*(PyTypeObject *)PyArray_API[7]) -#define _PyArrayScalar_BoolValues ((PyBoolScalarObject *)PyArray_API[8]) +# define NPY_NUMUSERTYPES (*(int *)PyArray_API[6]) +# define PyBoolArrType_Type (*(PyTypeObject *)PyArray_API[7]) +# define _PyArrayScalar_BoolValues ((PyBoolScalarObject *)PyArray_API[8]) multiarray_funcs_api = { 'PyArray_GetNDArrayCVersion': (0,), @@ -350,6 +350,9 @@ multiarray_funcs_api = { 'PyArray_ResolveWritebackIfCopy': (302,), 'PyArray_SetWritebackIfCopyBase': (303,), # End 1.14 API + 'PyDataMem_SetHandler': (304,), + 'PyDataMem_GetHandler': (305,), + # End 1.21 API } ufunc_types_api = { diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py index f19946be4..8d9316f2c 100644 --- a/numpy/core/code_generators/ufunc_docstrings.py +++ b/numpy/core/code_generators/ufunc_docstrings.py @@ -201,7 +201,8 @@ add_newdoc('numpy.core.umath', 'arccos', References ---------- M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", - 10th printing, 1964, pp. 79. http://www.math.sfu.ca/~cbm/aands/ + 10th printing, 1964, pp. 79. + https://personal.math.ubc.ca/~cbm/aands/page_79.htm Examples -------- @@ -258,7 +259,8 @@ add_newdoc('numpy.core.umath', 'arccosh', References ---------- .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", - 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/ + 10th printing, 1964, pp. 86. + https://personal.math.ubc.ca/~cbm/aands/page_86.htm .. [2] Wikipedia, "Inverse hyperbolic function", https://en.wikipedia.org/wiki/Arccosh @@ -312,7 +314,7 @@ add_newdoc('numpy.core.umath', 'arcsin', ---------- Abramowitz, M. and Stegun, I. A., *Handbook of Mathematical Functions*, 10th printing, New York: Dover, 1964, pp. 79ff. 
- http://www.math.sfu.ca/~cbm/aands/ + https://personal.math.ubc.ca/~cbm/aands/page_79.htm Examples -------- @@ -360,7 +362,8 @@ add_newdoc('numpy.core.umath', 'arcsinh', References ---------- .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", - 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/ + 10th printing, 1964, pp. 86. + https://personal.math.ubc.ca/~cbm/aands/page_86.htm .. [2] Wikipedia, "Inverse hyperbolic function", https://en.wikipedia.org/wiki/Arcsinh @@ -415,7 +418,7 @@ add_newdoc('numpy.core.umath', 'arctan', ---------- Abramowitz, M. and Stegun, I. A., *Handbook of Mathematical Functions*, 10th printing, New York: Dover, 1964, pp. 79. - http://www.math.sfu.ca/~cbm/aands/ + https://personal.math.ubc.ca/~cbm/aands/page_79.htm Examples -------- @@ -560,7 +563,8 @@ add_newdoc('numpy.core.umath', 'arctanh', References ---------- .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", - 10th printing, 1964, pp. 86. http://www.math.sfu.ca/~cbm/aands/ + 10th printing, 1964, pp. 86. + https://personal.math.ubc.ca/~cbm/aands/page_86.htm .. [2] Wikipedia, "Inverse hyperbolic function", https://en.wikipedia.org/wiki/Arctanh @@ -664,7 +668,7 @@ add_newdoc('numpy.core.umath', 'bitwise_or', Examples -------- - The number 13 has the binaray representation ``00001101``. Likewise, + The number 13 has the binary representation ``00001101``. Likewise, 16 is represented by ``00010000``. The bit-wise OR of 13 and 16 is then ``000111011``, or 29: @@ -1087,9 +1091,7 @@ add_newdoc('numpy.core.umath', 'divide', Behavior on division by zero can be changed using ``seterr``. - In Python 2, when both ``x1`` and ``x2`` are of an integer type, - ``divide`` will behave like ``floor_divide``. In Python 3, it behaves - like ``true_divide``. + Behaves like ``true_divide``. Examples -------- @@ -1102,27 +1104,6 @@ add_newdoc('numpy.core.umath', 'divide', [ Inf, 4. , 2.5], [ Inf, 7. , 4. 
]]) - Note the behavior with integer types (Python 2 only): - - >>> np.divide(2, 4) - 0 - >>> np.divide(2, 4.) - 0.5 - - Division by zero always yields zero in integer arithmetic (again, - Python 2 only), and does not raise an exception or a warning: - - >>> np.divide(np.array([0, 1], dtype=int), np.array([0, 0], dtype=int)) - array([0, 0]) - - Division by zero can, however, be caught using ``seterr``: - - >>> old_err_state = np.seterr(divide='raise') - >>> np.divide(1, 0) - Traceback (most recent call last): - File "<stdin>", line 1, in <module> - FloatingPointError: divide by zero encountered in divide - >>> ignored_states = np.seterr(**old_err_state) >>> np.divide(1, 0) 0 @@ -1222,7 +1203,7 @@ add_newdoc('numpy.core.umath', 'exp', https://en.wikipedia.org/wiki/Exponential_function .. [2] M. Abramovitz and I. A. Stegun, "Handbook of Mathematical Functions with Formulas, Graphs, and Mathematical Tables," Dover, 1964, p. 69, - http://www.math.sfu.ca/~cbm/aands/page_69.htm + https://personal.math.ubc.ca/~cbm/aands/page_69.htm Examples -------- @@ -1439,7 +1420,7 @@ add_newdoc('numpy.core.umath', 'floor_divide', add_newdoc('numpy.core.umath', 'fmod', """ - Return the element-wise remainder of division. + Returns the element-wise remainder of division. This is the NumPy implementation of the C library function fmod, the remainder has the same sign as the dividend `x1`. It is equivalent to @@ -2052,7 +2033,8 @@ add_newdoc('numpy.core.umath', 'log', References ---------- .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", - 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/ + 10th printing, 1964, pp. 67. + https://personal.math.ubc.ca/~cbm/aands/page_67.htm .. [2] Wikipedia, "Logarithm". https://en.wikipedia.org/wiki/Logarithm Examples @@ -2101,7 +2083,8 @@ add_newdoc('numpy.core.umath', 'log10', References ---------- .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", - 10th printing, 1964, pp. 67. 
http://www.math.sfu.ca/~cbm/aands/ + 10th printing, 1964, pp. 67. + https://personal.math.ubc.ca/~cbm/aands/page_67.htm .. [2] Wikipedia, "Logarithm". https://en.wikipedia.org/wiki/Logarithm Examples @@ -2289,7 +2272,8 @@ add_newdoc('numpy.core.umath', 'log1p', References ---------- .. [1] M. Abramowitz and I.A. Stegun, "Handbook of Mathematical Functions", - 10th printing, 1964, pp. 67. http://www.math.sfu.ca/~cbm/aands/ + 10th printing, 1964, pp. 67. + https://personal.math.ubc.ca/~cbm/aands/page_67.htm .. [2] Wikipedia, "Logarithm". https://en.wikipedia.org/wiki/Logarithm Examples @@ -3081,8 +3065,14 @@ add_newdoc('numpy.core.umath', 'power', First array elements raised to powers from second array, element-wise. Raise each base in `x1` to the positionally-corresponding power in - `x2`. `x1` and `x2` must be broadcastable to the same shape. Note that an - integer type raised to a negative integer power will raise a ValueError. + `x2`. `x1` and `x2` must be broadcastable to the same shape. + + An integer type raised to a negative integer power will raise a + ``ValueError``. + + Negative values raised to a non-integral value will return ``nan``. + To get complex results, cast the input to complex, or specify the + ``dtype`` to be ``complex`` (see the example below). Parameters ---------- @@ -3137,6 +3127,21 @@ add_newdoc('numpy.core.umath', 'power', >>> x1 ** x2 array([ 0, 1, 8, 27, 16, 5]) + Negative values raised to a non-integral value will result in ``nan`` + (and a warning will be generated). + + >>> x3 = np.array([-1.0, -4.0]) + >>> with np.errstate(invalid='ignore'): + ... p = np.power(x3, 1.5) + ... + >>> p + array([nan, nan]) + + To get complex results, give the argument ``dtype=complex``. + + >>> np.power(x3, 1.5, dtype=complex) + array([-1.83697020e-16-1.j, -1.46957616e-15-8.j]) + """) add_newdoc('numpy.core.umath', 'float_power', @@ -3150,6 +3155,10 @@ add_newdoc('numpy.core.umath', 'float_power', inexact. 
The intent is that the function will return a usable result for negative powers and seldom overflow for positive powers. + Negative values raised to a non-integral value will return ``nan``. + To get complex results, cast the input to complex, or specify the + ``dtype`` to be ``complex`` (see the example below). + .. versionadded:: 1.12.0 Parameters @@ -3197,6 +3206,21 @@ add_newdoc('numpy.core.umath', 'float_power', array([[ 0., 1., 8., 27., 16., 5.], [ 0., 1., 8., 27., 16., 5.]]) + Negative values raised to a non-integral value will result in ``nan`` + (and a warning will be generated). + + >>> x3 = np.array([-1, -4]) + >>> with np.errstate(invalid='ignore'): + ... p = np.float_power(x3, 1.5) + ... + >>> p + array([nan, nan]) + + To get complex results, give the argument ``dtype=complex``. + + >>> np.float_power(x3, 1.5, dtype=complex) + array([-1.83697020e-16-1.j, -1.46957616e-15-8.j]) + """) add_newdoc('numpy.core.umath', 'radians', @@ -3308,7 +3332,7 @@ add_newdoc('numpy.core.umath', 'reciprocal', add_newdoc('numpy.core.umath', 'remainder', """ - Return element-wise remainder of division. + Returns the element-wise remainder of division. Computes the remainder complementary to the `floor_divide` function. It is equivalent to the Python modulus operator``x1 % x2`` and has the same sign @@ -4002,7 +4026,7 @@ add_newdoc('numpy.core.umath', 'tanh', ---------- .. [1] M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions. New York, NY: Dover, 1972, pg. 83. - http://www.math.sfu.ca/~cbm/aands/ + https://personal.math.ubc.ca/~cbm/aands/page_83.htm .. [2] Wikipedia, "Hyperbolic function", https://en.wikipedia.org/wiki/Hyperbolic_function @@ -4031,9 +4055,8 @@ add_newdoc('numpy.core.umath', 'true_divide', """ Returns a true division of the inputs, element-wise. - Instead of the Python traditional 'floor division', this returns a true - division. True division adjusts the output type to present the best - answer, regardless of input types. 
+ Unlike 'floor division', true division adjusts the output type + to present the best answer, regardless of input types. Parameters ---------- diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py index e264fa210..3521e778e 100644 --- a/numpy/core/defchararray.py +++ b/numpy/core/defchararray.py @@ -2609,6 +2609,7 @@ class chararray(ndarray): return isdecimal(self) +@set_module("numpy.char") def array(obj, itemsize=None, copy=True, unicode=None, order=None): """ Create a `chararray`. @@ -2742,6 +2743,7 @@ def array(obj, itemsize=None, copy=True, unicode=None, order=None): return val.view(chararray) +@set_module("numpy.char") def asarray(obj, itemsize=None, unicode=None, order=None): """ Convert the input to a `chararray`, copying the data only if diff --git a/numpy/core/defchararray.pyi b/numpy/core/defchararray.pyi new file mode 100644 index 000000000..28d247b05 --- /dev/null +++ b/numpy/core/defchararray.pyi @@ -0,0 +1,422 @@ +from typing import ( + Literal as L, + overload, + TypeVar, + Any, + List, +) + +from numpy import ( + chararray as chararray, + dtype, + str_, + bytes_, + int_, + bool_, + object_, + _OrderKACF, +) + +from numpy.typing import ( + NDArray, + _ArrayLikeStr_co as U_co, + _ArrayLikeBytes_co as S_co, + _ArrayLikeInt_co as i_co, + _ArrayLikeBool_co as b_co, +) + +from numpy.core.multiarray import compare_chararrays as compare_chararrays + +_SCT = TypeVar("_SCT", str_, bytes_) +_CharArray = chararray[Any, dtype[_SCT]] + +__all__: List[str] + +# Comparison +@overload +def equal(x1: U_co, x2: U_co) -> NDArray[bool_]: ... +@overload +def equal(x1: S_co, x2: S_co) -> NDArray[bool_]: ... + +@overload +def not_equal(x1: U_co, x2: U_co) -> NDArray[bool_]: ... +@overload +def not_equal(x1: S_co, x2: S_co) -> NDArray[bool_]: ... + +@overload +def greater_equal(x1: U_co, x2: U_co) -> NDArray[bool_]: ... +@overload +def greater_equal(x1: S_co, x2: S_co) -> NDArray[bool_]: ... 
+ +@overload +def less_equal(x1: U_co, x2: U_co) -> NDArray[bool_]: ... +@overload +def less_equal(x1: S_co, x2: S_co) -> NDArray[bool_]: ... + +@overload +def greater(x1: U_co, x2: U_co) -> NDArray[bool_]: ... +@overload +def greater(x1: S_co, x2: S_co) -> NDArray[bool_]: ... + +@overload +def less(x1: U_co, x2: U_co) -> NDArray[bool_]: ... +@overload +def less(x1: S_co, x2: S_co) -> NDArray[bool_]: ... + +# String operations +@overload +def add(x1: U_co, x2: U_co) -> NDArray[str_]: ... +@overload +def add(x1: S_co, x2: S_co) -> NDArray[bytes_]: ... + +@overload +def multiply(a: U_co, i: i_co) -> NDArray[str_]: ... +@overload +def multiply(a: S_co, i: i_co) -> NDArray[bytes_]: ... + +@overload +def mod(a: U_co, value: Any) -> NDArray[str_]: ... +@overload +def mod(a: S_co, value: Any) -> NDArray[bytes_]: ... + +@overload +def capitalize(a: U_co) -> NDArray[str_]: ... +@overload +def capitalize(a: S_co) -> NDArray[bytes_]: ... + +@overload +def center(a: U_co, width: i_co, fillchar: U_co = ...) -> NDArray[str_]: ... +@overload +def center(a: S_co, width: i_co, fillchar: S_co = ...) -> NDArray[bytes_]: ... + +def decode( + a: S_co, + encoding: None | str = ..., + errors: None | str = ..., +) -> NDArray[str_]: ... + +def encode( + a: U_co, + encoding: None | str = ..., + errors: None | str = ..., +) -> NDArray[bytes_]: ... + +@overload +def expandtabs(a: U_co, tabsize: i_co = ...) -> NDArray[str_]: ... +@overload +def expandtabs(a: S_co, tabsize: i_co = ...) -> NDArray[bytes_]: ... + +@overload +def join(sep: U_co, seq: U_co) -> NDArray[str_]: ... +@overload +def join(sep: S_co, seq: S_co) -> NDArray[bytes_]: ... + +@overload +def ljust(a: U_co, width: i_co, fillchar: U_co = ...) -> NDArray[str_]: ... +@overload +def ljust(a: S_co, width: i_co, fillchar: S_co = ...) -> NDArray[bytes_]: ... + +@overload +def lower(a: U_co) -> NDArray[str_]: ... +@overload +def lower(a: S_co) -> NDArray[bytes_]: ... + +@overload +def lstrip(a: U_co, chars: None | U_co = ...) 
-> NDArray[str_]: ... +@overload +def lstrip(a: S_co, chars: None | S_co = ...) -> NDArray[bytes_]: ... + +@overload +def partition(a: U_co, sep: U_co) -> NDArray[str_]: ... +@overload +def partition(a: S_co, sep: S_co) -> NDArray[bytes_]: ... + +@overload +def replace( + a: U_co, + old: U_co, + new: U_co, + count: None | i_co = ..., +) -> NDArray[str_]: ... +@overload +def replace( + a: S_co, + old: S_co, + new: S_co, + count: None | i_co = ..., +) -> NDArray[bytes_]: ... + +@overload +def rjust( + a: U_co, + width: i_co, + fillchar: U_co = ..., +) -> NDArray[str_]: ... +@overload +def rjust( + a: S_co, + width: i_co, + fillchar: S_co = ..., +) -> NDArray[bytes_]: ... + +@overload +def rpartition(a: U_co, sep: U_co) -> NDArray[str_]: ... +@overload +def rpartition(a: S_co, sep: S_co) -> NDArray[bytes_]: ... + +@overload +def rsplit( + a: U_co, + sep: None | U_co = ..., + maxsplit: None | i_co = ..., +) -> NDArray[object_]: ... +@overload +def rsplit( + a: S_co, + sep: None | S_co = ..., + maxsplit: None | i_co = ..., +) -> NDArray[object_]: ... + +@overload +def rstrip(a: U_co, chars: None | U_co = ...) -> NDArray[str_]: ... +@overload +def rstrip(a: S_co, chars: None | S_co = ...) -> NDArray[bytes_]: ... + +@overload +def split( + a: U_co, + sep: None | U_co = ..., + maxsplit: None | i_co = ..., +) -> NDArray[object_]: ... +@overload +def split( + a: S_co, + sep: None | S_co = ..., + maxsplit: None | i_co = ..., +) -> NDArray[object_]: ... + +@overload +def splitlines(a: U_co, keepends: None | b_co = ...) -> NDArray[object_]: ... +@overload +def splitlines(a: S_co, keepends: None | b_co = ...) -> NDArray[object_]: ... + +@overload +def strip(a: U_co, chars: None | U_co = ...) -> NDArray[str_]: ... +@overload +def strip(a: S_co, chars: None | S_co = ...) -> NDArray[bytes_]: ... + +@overload +def swapcase(a: U_co) -> NDArray[str_]: ... +@overload +def swapcase(a: S_co) -> NDArray[bytes_]: ... + +@overload +def title(a: U_co) -> NDArray[str_]: ... 
+@overload +def title(a: S_co) -> NDArray[bytes_]: ... + +@overload +def translate( + a: U_co, + table: U_co, + deletechars: None | U_co = ..., +) -> NDArray[str_]: ... +@overload +def translate( + a: S_co, + table: S_co, + deletechars: None | S_co = ..., +) -> NDArray[bytes_]: ... + +@overload +def upper(a: U_co) -> NDArray[str_]: ... +@overload +def upper(a: S_co) -> NDArray[bytes_]: ... + +@overload +def zfill(a: U_co, width: i_co) -> NDArray[str_]: ... +@overload +def zfill(a: S_co, width: i_co) -> NDArray[bytes_]: ... + +# String information +@overload +def count( + a: U_co, + sub: U_co, + start: i_co = ..., + end: None | i_co = ..., +) -> NDArray[int_]: ... +@overload +def count( + a: S_co, + sub: S_co, + start: i_co = ..., + end: None | i_co = ..., +) -> NDArray[int_]: ... + +@overload +def endswith( + a: U_co, + suffix: U_co, + start: i_co = ..., + end: None | i_co = ..., +) -> NDArray[bool_]: ... +@overload +def endswith( + a: S_co, + suffix: S_co, + start: i_co = ..., + end: None | i_co = ..., +) -> NDArray[bool_]: ... + +@overload +def find( + a: U_co, + sub: U_co, + start: i_co = ..., + end: None | i_co = ..., +) -> NDArray[int_]: ... +@overload +def find( + a: S_co, + sub: S_co, + start: i_co = ..., + end: None | i_co = ..., +) -> NDArray[int_]: ... + +@overload +def index( + a: U_co, + sub: U_co, + start: i_co = ..., + end: None | i_co = ..., +) -> NDArray[int_]: ... +@overload +def index( + a: S_co, + sub: S_co, + start: i_co = ..., + end: None | i_co = ..., +) -> NDArray[int_]: ... + +def isalpha(a: U_co | S_co) -> NDArray[bool_]: ... +def isalnum(a: U_co | S_co) -> NDArray[bool_]: ... +def isdecimal(a: U_co | S_co) -> NDArray[bool_]: ... +def isdigit(a: U_co | S_co) -> NDArray[bool_]: ... +def islower(a: U_co | S_co) -> NDArray[bool_]: ... +def isnumeric(a: U_co | S_co) -> NDArray[bool_]: ... +def isspace(a: U_co | S_co) -> NDArray[bool_]: ... +def istitle(a: U_co | S_co) -> NDArray[bool_]: ... +def isupper(a: U_co | S_co) -> NDArray[bool_]: ... 
+ +@overload +def rfind( + a: U_co, + sub: U_co, + start: i_co = ..., + end: None | i_co = ..., +) -> NDArray[int_]: ... +@overload +def rfind( + a: S_co, + sub: S_co, + start: i_co = ..., + end: None | i_co = ..., +) -> NDArray[int_]: ... + +@overload +def rindex( + a: U_co, + sub: U_co, + start: i_co = ..., + end: None | i_co = ..., +) -> NDArray[int_]: ... +@overload +def rindex( + a: S_co, + sub: S_co, + start: i_co = ..., + end: None | i_co = ..., +) -> NDArray[int_]: ... + +@overload +def startswith( + a: U_co, + prefix: U_co, + start: i_co = ..., + end: None | i_co = ..., +) -> NDArray[bool_]: ... +@overload +def startswith( + a: S_co, + prefix: S_co, + start: i_co = ..., + end: None | i_co = ..., +) -> NDArray[bool_]: ... + +def str_len(A: U_co | S_co) -> NDArray[int_]: ... + +# Overload 1 and 2: str- or bytes-based array-likes +# overload 3: arbitrary object with unicode=False (-> bytes_) +# overload 4: arbitrary object with unicode=True (-> str_) +@overload +def array( + obj: U_co, + itemsize: None | int = ..., + copy: bool = ..., + unicode: L[False] = ..., + order: _OrderKACF = ..., +) -> _CharArray[str_]: ... +@overload +def array( + obj: S_co, + itemsize: None | int = ..., + copy: bool = ..., + unicode: L[False] = ..., + order: _OrderKACF = ..., +) -> _CharArray[bytes_]: ... +@overload +def array( + obj: object, + itemsize: None | int = ..., + copy: bool = ..., + unicode: L[False] = ..., + order: _OrderKACF = ..., +) -> _CharArray[bytes_]: ... +@overload +def array( + obj: object, + itemsize: None | int = ..., + copy: bool = ..., + unicode: L[True] = ..., + order: _OrderKACF = ..., +) -> _CharArray[str_]: ... + +@overload +def asarray( + obj: U_co, + itemsize: None | int = ..., + unicode: L[False] = ..., + order: _OrderKACF = ..., +) -> _CharArray[str_]: ... +@overload +def asarray( + obj: S_co, + itemsize: None | int = ..., + unicode: L[False] = ..., + order: _OrderKACF = ..., +) -> _CharArray[bytes_]: ... 
+@overload +def asarray( + obj: object, + itemsize: None | int = ..., + unicode: L[False] = ..., + order: _OrderKACF = ..., +) -> _CharArray[bytes_]: ... +@overload +def asarray( + obj: object, + itemsize: None | int = ..., + unicode: L[True] = ..., + order: _OrderKACF = ..., +) -> _CharArray[str_]: ... diff --git a/numpy/core/einsumfunc.py b/numpy/core/einsumfunc.py index 18157641a..c78d3db23 100644 --- a/numpy/core/einsumfunc.py +++ b/numpy/core/einsumfunc.py @@ -987,7 +987,7 @@ def einsum_path(*operands, optimize='greedy', einsum_call=False): def _einsum_dispatcher(*operands, out=None, optimize=None, **kwargs): - # Arguably we dispatch on more arguments that we really should; see note in + # Arguably we dispatch on more arguments than we really should; see note in # _einsum_path_dispatcher for why. yield from operands yield out diff --git a/numpy/core/einsumfunc.pyi b/numpy/core/einsumfunc.pyi index 52025d502..aabb04c47 100644 --- a/numpy/core/einsumfunc.pyi +++ b/numpy/core/einsumfunc.pyi @@ -41,7 +41,7 @@ __all__: List[str] # TODO: Properly handle the `casting`-based combinatorics # TODO: We need to evaluate the content `__subscripts` in order # to identify whether or an array or scalar is returned. At a cursory -# glance this seems like something that can quite easilly be done with +# glance this seems like something that can quite easily be done with # a mypy plugin. # Something like `is_scalar = bool(__subscripts.partition("->")[-1])` @overload diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py index 5ecb1e666..3242124ac 100644 --- a/numpy/core/fromnumeric.py +++ b/numpy/core/fromnumeric.py @@ -689,6 +689,9 @@ def partition(a, kth, axis=-1, kind='introselect', order=None): it. The order of all elements in the partitions is undefined. If provided with a sequence of k-th it will partition all elements indexed by k-th of them into their sorted position at once. + + .. deprecated:: 1.22.0 + Passing booleans as index is deprecated. 
axis : int or None, optional Axis along which to sort. If None, the array is flattened before sorting. The default is -1, which sorts along the last axis. @@ -781,6 +784,9 @@ def argpartition(a, kth, axis=-1, kind='introselect', order=None): elements in the partitions is undefined. If provided with a sequence of k-th it will partition all of them into their sorted position at once. + + .. deprecated:: 1.22.0 + Passing booleans as index is deprecated. axis : int or None, optional Axis along which to sort. The default is -1 (the last axis). If None, the flattened array is used. @@ -1138,6 +1144,8 @@ def argmax(a, axis=None, out=None, *, keepdims=np._NoValue): in the result as dimensions with size one. With this option, the result will broadcast correctly against the array. + .. versionadded:: 1.22.0 + Returns ------- index_array : ndarray of ints @@ -1232,6 +1240,8 @@ def argmin(a, axis=None, out=None, *, keepdims=np._NoValue): in the result as dimensions with size one. With this option, the result will broadcast correctly against the array. + .. versionadded:: 1.22.0 + Returns ------- index_array : ndarray of ints diff --git a/numpy/core/function_base.pyi b/numpy/core/function_base.pyi index c35629aa7..68d3b3a98 100644 --- a/numpy/core/function_base.pyi +++ b/numpy/core/function_base.pyi @@ -1,4 +1,4 @@ -from typing import overload, Tuple, Union, Sequence, Any, SupportsIndex, Literal +from typing import overload, Tuple, Union, Sequence, Any, SupportsIndex, Literal, List from numpy import ndarray from numpy.typing import ArrayLike, DTypeLike, _SupportsArray, _NumberLike_co @@ -8,6 +8,9 @@ _ArrayLikeNested = Sequence[Sequence[Any]] _ArrayLikeNumber = Union[ _NumberLike_co, Sequence[_NumberLike_co], ndarray, _SupportsArray, _ArrayLikeNested ] + +__all__: List[str] + @overload def linspace( start: _ArrayLikeNumber, @@ -47,3 +50,11 @@ def geomspace( dtype: DTypeLike = ..., axis: SupportsIndex = ..., ) -> ndarray: ... 
+ +# Re-exported to `np.lib.function_base` +def add_newdoc( + place: str, + obj: str, + doc: str | Tuple[str, str] | List[Tuple[str, str]], + warn_on_python: bool = ..., +) -> None: ... diff --git a/numpy/core/getlimits.py b/numpy/core/getlimits.py index 0f7031bac..ab4a4d2be 100644 --- a/numpy/core/getlimits.py +++ b/numpy/core/getlimits.py @@ -5,13 +5,12 @@ __all__ = ['finfo', 'iinfo'] import warnings -from .machar import MachAr +from ._machar import MachAr from .overrides import set_module from . import numeric from . import numerictypes as ntypes from .numeric import array, inf, NaN from .umath import log10, exp2, nextafter, isnan -from . import umath def _fr0(a): @@ -386,6 +385,8 @@ class finfo: machar : MachAr The object which calculated these parameters and holds more detailed information. + + .. deprecated:: 1.22 machep : int The exponent that yields `eps`. max : floating point number of the appropriate type @@ -502,7 +503,7 @@ class finfo: self.eps = machar.eps.flat[0] self.nexp = machar.iexp self.nmant = machar.it - self.machar = machar + self._machar = machar self._str_tiny = machar._str_xmin.strip() self._str_max = machar._str_xmax.strip() self._str_epsneg = machar._str_epsneg.strip() @@ -552,11 +553,11 @@ class finfo: """ # This check is necessary because the value for smallest_normal is # platform dependent for longdouble types. - if isnan(self.machar.smallest_normal.flat[0]): + if isnan(self._machar.smallest_normal.flat[0]): warnings.warn( 'The value of smallest normal is undefined for double double', UserWarning, stacklevel=2) - return self.machar.smallest_normal.flat[0] + return self._machar.smallest_normal.flat[0] @property def tiny(self): @@ -575,6 +576,20 @@ class finfo: """ return self.smallest_normal + @property + def machar(self): + """The object which calculated these parameters and holds more + detailed information. + + .. 
deprecated:: 1.22 + """ + # Deprecated 2021-10-27, NumPy 1.22 + warnings.warn( + "`finfo.machar` is deprecated (NumPy 1.22)", + DeprecationWarning, stacklevel=2, + ) + return self._machar + @set_module('numpy') class iinfo: diff --git a/numpy/core/getlimits.pyi b/numpy/core/getlimits.pyi index ca22e18f7..66d062995 100644 --- a/numpy/core/getlimits.pyi +++ b/numpy/core/getlimits.pyi @@ -1,58 +1,8 @@ -from typing import Any, Generic, List, Type, TypeVar +from typing import List from numpy import ( finfo as finfo, iinfo as iinfo, - floating, - signedinteger, ) -from numpy.typing import NBitBase, NDArray - -_NBit = TypeVar("_NBit", bound=NBitBase) - __all__: List[str] - -class MachArLike(Generic[_NBit]): - def __init__( - self, - ftype: Type[floating[_NBit]], - *, - eps: floating[Any], - epsneg: floating[Any], - huge: floating[Any], - tiny: floating[Any], - ibeta: int, - smallest_subnormal: None | floating[Any] = ..., - # Expand `**kwargs` into keyword-only arguments - machep: int, - negep: int, - minexp: int, - maxexp: int, - it: int, - iexp: int, - irnd: int, - ngrd: int, - ) -> None: ... - @property - def smallest_subnormal(self) -> NDArray[floating[_NBit]]: ... 
- eps: NDArray[floating[_NBit]] - epsilon: NDArray[floating[_NBit]] - epsneg: NDArray[floating[_NBit]] - huge: NDArray[floating[_NBit]] - ibeta: signedinteger[_NBit] - iexp: int - irnd: int - it: int - machep: int - maxexp: int - minexp: int - negep: int - ngrd: int - precision: int - resolution: NDArray[floating[_NBit]] - smallest_normal: NDArray[floating[_NBit]] - tiny: NDArray[floating[_NBit]] - title: str - xmax: NDArray[floating[_NBit]] - xmin: NDArray[floating[_NBit]] diff --git a/numpy/core/include/numpy/.doxyfile b/numpy/core/include/numpy/.doxyfile new file mode 100644 index 000000000..ed2aefff7 --- /dev/null +++ b/numpy/core/include/numpy/.doxyfile @@ -0,0 +1,2 @@ +INCLUDE_PATH += @CUR_DIR +PREDEFINED += NPY_INTERNAL_BUILD diff --git a/numpy/core/include/numpy/_neighborhood_iterator_imp.h b/numpy/core/include/numpy/_neighborhood_iterator_imp.h index e8860cbc7..07e2363d0 100644 --- a/numpy/core/include/numpy/_neighborhood_iterator_imp.h +++ b/numpy/core/include/numpy/_neighborhood_iterator_imp.h @@ -1,4 +1,4 @@ -#ifndef _NPY_INCLUDE_NEIGHBORHOOD_IMP +#ifndef NUMPY_CORE_INCLUDE_NUMPY__NEIGHBORHOOD_IMP_H_ #error You should not include this header directly #endif /* diff --git a/numpy/core/include/numpy/arrayobject.h b/numpy/core/include/numpy/arrayobject.h index 4f46d6b1a..da47bb096 100644 --- a/numpy/core/include/numpy/arrayobject.h +++ b/numpy/core/include/numpy/arrayobject.h @@ -1,4 +1,5 @@ -#ifndef Py_ARRAYOBJECT_H +#ifndef NUMPY_CORE_INCLUDE_NUMPY_ARRAYOBJECT_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_ARRAYOBJECT_H_ #define Py_ARRAYOBJECT_H #include "ndarrayobject.h" @@ -8,4 +9,4 @@ #include "noprefix.h" #endif -#endif +#endif /* NUMPY_CORE_INCLUDE_NUMPY_ARRAYOBJECT_H_ */ diff --git a/numpy/core/include/numpy/arrayscalars.h b/numpy/core/include/numpy/arrayscalars.h index 14a31988f..a20a68016 100644 --- a/numpy/core/include/numpy/arrayscalars.h +++ b/numpy/core/include/numpy/arrayscalars.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ARRAYSCALARS_H_ -#define 
_NPY_ARRAYSCALARS_H_ +#ifndef NUMPY_CORE_INCLUDE_NUMPY_ARRAYSCALARS_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_ARRAYSCALARS_H_ #ifndef _MULTIARRAYMODULE typedef struct { @@ -179,4 +179,4 @@ typedef struct { #define PyArrayScalar_ASSIGN(obj, cls, val) \ PyArrayScalar_VAL(obj, cls) = val -#endif +#endif /* NUMPY_CORE_INCLUDE_NUMPY_ARRAYSCALARS_H_ */ diff --git a/numpy/core/include/numpy/experimental_dtype_api.h b/numpy/core/include/numpy/experimental_dtype_api.h new file mode 100644 index 000000000..554c7fb6c --- /dev/null +++ b/numpy/core/include/numpy/experimental_dtype_api.h @@ -0,0 +1,380 @@ +/* + * This header exports the new experimental DType API as proposed in + * NEPs 41 to 43. For background, please check these NEPs. Otherwise, + * this header also serves as documentation for the time being. + * + * Please do not hesitate to contact @seberg with questions. This is + * developed together with https://github.com/seberg/experimental_user_dtypes + * and those interested in experimenting are encouraged to contribute there. + * + * To use the functions defined in the header, call:: + * + * if (import_experimental_dtype_api(version) < 0) { + * return NULL; + * } + * + * in your module init. (A version mismatch will be reported, just update + * to the correct one, this will alert you of possible changes.) + * + * The following lists the main symbols currently exported. Please do not + * hesitate to ask for help or clarification: + * + * - PyUFunc_AddLoopFromSpec: + * + * Register a new loop for a ufunc. This uses the `PyArrayMethod_Spec` + * which must be filled in (see in-line comments). + * + * - PyUFunc_AddPromoter: + * + * Register a new promoter for a ufunc. A promoter is a function stored + * in a PyCapsule (see in-line comments). It is passed the operation and + * requested DType signatures and can mutate it to attempt a new search + * for a matching loop/promoter. + * I.e. for Numba a promoter could even add the desired loop. 
+ * + * - PyArrayInitDTypeMeta_FromSpec: + * + * Initialize a new DType. It must currently be a static Python C type + * that is declared as `PyArray_DTypeMeta` and not `PyTypeObject`. + * Further, it must subclass `np.dtype` and set its type to + * `PyArrayDTypeMeta_Type` (before calling `PyType_Read()`). + * + * - PyArray_CommonDType: + * + * Find the common-dtype ("promotion") for two DType classes. Similar + * to `np.result_type`, but works on the classes and not instances. + * + * - PyArray_PromoteDTypeSequence: + * + * Same as CommonDType, but works with an arbitrary number of DTypes. + * This function is smarter and can often return successful and unambiguous + * results when `common_dtype(common_dtype(dt1, dt2), dt3)` would + * depend on the operation order or fail. Nevertheless, DTypes should + * aim to ensure that their common-dtype implementation is associative + * and commutative! (Mainly, unsigned and signed integers are not.) + * + * For guaranteed consistent results DTypes must implement common-Dtype + * "transitively". If A promotes B and B promotes C, than A must generally + * also promote C; where "promotes" means implements the promotion. + * (There are some exceptions for abstract DTypes) + * + * WARNING + * ======= + * + * By using this header, you understand that this is a fully experimental + * exposure. Details are expected to change, and some options may have no + * effect. (Please contact @seberg if you have questions!) + * If the exposure stops working, please file a bug report with NumPy. + * Further, a DType created using this API/header should still be expected + * to be incompatible with some functionality inside and outside of NumPy. + * In this case crashes must be expected. Please report any such problems + * so that they can be fixed before final exposure. + * Furthermore, expect missing checks for programming errors which the final + * API is expected to have. 
+ * + * Symbols with a leading underscore are likely to not be included in the + * first public version, if these are central to your use-case, please let + * us know, so that we can reconsider. + * + * "Array-like" consumer API not yet under considerations + * ====================================================== + * + * The new DType API is designed in a way to make it potentially useful for + * alternative "array-like" implementations. This will require careful + * exposure of details and functions and is not part of this experimental API. + */ + +#ifndef NUMPY_CORE_INCLUDE_NUMPY_EXPERIMENTAL_DTYPE_API_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_EXPERIMENTAL_DTYPE_API_H_ + +#include <Python.h> +#include "ndarraytypes.h" + + +/* + * Just a hack so I don't forget importing as much myself, I spend way too + * much time noticing it the first time around :). + */ +static void +__not_imported(void) +{ + printf("*****\nCritical error, dtype API not imported\n*****\n"); +} +static void *__uninitialized_table[] = { + &__not_imported, &__not_imported, &__not_imported, &__not_imported, + &__not_imported, &__not_imported, &__not_imported, &__not_imported}; + + +static void **__experimental_dtype_api_table = __uninitialized_table; + + +/* + * DTypeMeta struct, the content may be made fully opaque (except the size). + * We may also move everything into a single `void *dt_slots`. + */ +typedef struct { + PyHeapTypeObject super; + PyArray_Descr *singleton; + int type_num; + PyTypeObject *scalar_type; + npy_uint64 flags; + void *dt_slots; + void *reserved[3]; +} PyArray_DTypeMeta; + + +/* + * ****************************************************** + * ArrayMethod API (Casting and UFuncs) + * ****************************************************** + */ +/* + * NOTE: Expected changes: + * * invert logic of floating point error flag + * * probably split runtime and general flags into two + * * should possibly not use an enum for typdef for more stable ABI? 
+ */ +typedef enum { + /* Flag for whether the GIL is required */ + NPY_METH_REQUIRES_PYAPI = 1 << 1, + /* + * Some functions cannot set floating point error flags, this flag + * gives us the option (not requirement) to skip floating point error + * setup/check. No function should set error flags and ignore them + * since it would interfere with chaining operations (e.g. casting). + */ + NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 2, + /* Whether the method supports unaligned access (not runtime) */ + NPY_METH_SUPPORTS_UNALIGNED = 1 << 3, + + /* All flags which can change at runtime */ + NPY_METH_RUNTIME_FLAGS = ( + NPY_METH_REQUIRES_PYAPI | + NPY_METH_NO_FLOATINGPOINT_ERRORS), +} NPY_ARRAYMETHOD_FLAGS; + + +/* + * The main object for creating a new ArrayMethod. We use the typical `slots` + * mechanism used by the Python limited API (see below for the slot defs). + */ +typedef struct { + const char *name; + int nin, nout; + NPY_CASTING casting; + NPY_ARRAYMETHOD_FLAGS flags; + PyObject **dtypes; /* array of DType class objects */ + PyType_Slot *slots; +} PyArrayMethod_Spec; + + +typedef PyObject *_ufunc_addloop_fromspec_func( + PyObject *ufunc, PyArrayMethod_Spec *spec); +/* + * The main ufunc registration function. This adds a new implementation/loop + * to a ufunc. It replaces `PyUFunc_RegisterLoopForType`. + */ +#define PyUFunc_AddLoopFromSpec \ + (*(_ufunc_addloop_fromspec_func *)(__experimental_dtype_api_table[0])) + + +/* + * Type of the C promoter function, which must be wrapped into a + * PyCapsule with name "numpy._ufunc_promoter". + */ +typedef int promoter_function(PyObject *ufunc, + PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[], + PyArray_DTypeMeta *new_op_dtypes[]); + +/* + * Function to register a promoter. + * + * @param ufunc The ufunc object to register the promoter with. + * @param DType_tuple A Python tuple containing DTypes or None matching the + * number of inputs and outputs of the ufunc. 
+ * @param promoter A PyCapsule with name "numpy._ufunc_promoter" containing + * a pointer to a `promoter_function`. + */ +typedef int _ufunc_addpromoter_func( + PyObject *ufunc, PyObject *DType_tuple, PyObject *promoter); +#define PyUFunc_AddPromoter \ + (*(_ufunc_addpromoter_func *)(__experimental_dtype_api_table[1])) + +/* + * In addition to the normal casting levels, NPY_CAST_IS_VIEW indicates + * that no cast operation is necessary at all (although a copy usually will be) + * + * NOTE: The most likely modification here is to add an additional + * `view_offset` output to resolve_descriptors. If set, it would + * indicate both that it is a view and what offset to use. This means that + * e.g. `arr.imag` could be implemented by an ArrayMethod. + */ +#define NPY_CAST_IS_VIEW _NPY_CAST_IS_VIEW + +/* + * The resolve descriptors function, must be able to handle NULL values for + * all output (but not input) `given_descrs` and fill `loop_descrs`. + * Return -1 on error or 0 if the operation is not possible without an error + * set. (This may still be in flux.) + * Otherwise must return the "casting safety", for normal functions, this is + * almost always "safe" (or even "equivalent"?). + * + * `resolve_descriptors` is optional if all output DTypes are non-parametric. + */ +#define NPY_METH_resolve_descriptors 1 +typedef NPY_CASTING (resolve_descriptors_function)( + /* "method" is currently opaque (necessary e.g. to wrap Python) */ + PyObject *method, + /* DTypes the method was created for */ + PyObject **dtypes, + /* Input descriptors (instances). Outputs may be NULL. */ + PyArray_Descr **given_descrs, + /* Exact loop descriptors to use, must not hold references on error */ + PyArray_Descr **loop_descrs); + +/* NOT public yet: Signature needs adapting as external API. */ +#define _NPY_METH_get_loop 2 + +/* + * Current public API to define fast inner-loops. You must provide a + * strided loop. 
If this is a cast between two "versions" of the same dtype + * you must also provide an unaligned strided loop. + * Other loops are useful to optimize the very common contiguous case. + * + * NOTE: As of now, NumPy will NOT use unaligned loops in ufuncs! + */ +#define NPY_METH_strided_loop 3 +#define NPY_METH_contiguous_loop 4 +#define NPY_METH_unaligned_strided_loop 5 +#define NPY_METH_unaligned_contiguous_loop 6 + + +typedef struct { + PyObject *caller; /* E.g. the original ufunc, may be NULL */ + PyObject *method; /* The method "self". Currently an opaque object */ + + /* Operand descriptors, filled in by resolve_descriptors */ + PyArray_Descr **descriptors; + /* Structure may grow (this is harmless for DType authors) */ +} PyArrayMethod_Context; + +typedef int (PyArrayMethod_StridedLoop)(PyArrayMethod_Context *context, + char *const *data, const npy_intp *dimensions, const npy_intp *strides, + NpyAuxData *transferdata); + + + +/* + * **************************** + * DTYPE API + * **************************** + */ + +#define NPY_DT_ABSTRACT 1 << 1 +#define NPY_DT_PARAMETRIC 1 << 2 + +#define NPY_DT_discover_descr_from_pyobject 1 +#define _NPY_DT_is_known_scalar_type 2 +#define NPY_DT_default_descr 3 +#define NPY_DT_common_dtype 4 +#define NPY_DT_common_instance 5 +#define NPY_DT_setitem 6 +#define NPY_DT_getitem 7 + + +// TODO: These slots probably still need some thought, and/or a way to "grow"? +typedef struct{ + PyTypeObject *typeobj; /* type of python scalar or NULL */ + int flags; /* flags, including parametric and abstract */ + /* NULL terminated cast definitions. 
Use NULL for the newly created DType */ + PyArrayMethod_Spec **casts; + PyType_Slot *slots; + /* Baseclass or NULL (will always subclass `np.dtype`) */ + PyTypeObject *baseclass; +} PyArrayDTypeMeta_Spec; + + +#define PyArrayDTypeMeta_Type \ + (*(PyTypeObject *)__experimental_dtype_api_table[2]) +typedef int __dtypemeta_fromspec( + PyArray_DTypeMeta *DType, PyArrayDTypeMeta_Spec *dtype_spec); +/* + * Finalize creation of a DTypeMeta. You must ensure that the DTypeMeta is + * a proper subclass. The DTypeMeta object has additional fields compared to + * a normal PyTypeObject! + * The only (easy) creation of a new DType is to create a static Type which + * inherits `PyArray_DescrType`, sets its type to `PyArrayDTypeMeta_Type` and + * uses `PyArray_DTypeMeta` defined above as the C-structure. + */ +#define PyArrayInitDTypeMeta_FromSpec \ + ((__dtypemeta_fromspec *)(__experimental_dtype_api_table[3])) + + +/* + * ************************************* + * WORKING WITH DTYPES + * ************************************* + */ + +typedef PyArray_DTypeMeta *__common_dtype( + PyArray_DTypeMeta *DType1, PyArray_DTypeMeta *DType2); +#define PyArray_CommonDType \ + ((__common_dtype *)(__experimental_dtype_api_table[4])) + + +typedef PyArray_DTypeMeta *__promote_dtype_sequence( + npy_intp num, PyArray_DTypeMeta *DTypes[]); +#define PyArray_PromoteDTypeSequence \ + ((__promote_dtype_sequence *)(__experimental_dtype_api_table[5])) + + +/* + * ******************************** + * Initialization + * ******************************** + * + * Import the experimental API, the version must match the one defined in + * the header to ensure changes are taken into account. NumPy will further + * runtime-check this. + * You must call this function to use the symbols defined in this file. 
+ */ +#define __EXPERIMENTAL_DTYPE_VERSION 2 + +static int +import_experimental_dtype_api(int version) +{ + if (version != __EXPERIMENTAL_DTYPE_VERSION) { + PyErr_Format(PyExc_RuntimeError, + "DType API version %d did not match header version %d. Please " + "update the import statement and check for API changes.", + version, __EXPERIMENTAL_DTYPE_VERSION); + return -1; + } + if (__experimental_dtype_api_table != __uninitialized_table) { + /* already imported. */ + return 0; + } + + PyObject *multiarray = PyImport_ImportModule("numpy.core._multiarray_umath"); + if (multiarray == NULL) { + return -1; + } + + PyObject *api = PyObject_CallMethod(multiarray, + "_get_experimental_dtype_api", "i", version); + Py_DECREF(multiarray); + if (api == NULL) { + return -1; + } + __experimental_dtype_api_table = PyCapsule_GetPointer(api, + "experimental_dtype_api_table"); + Py_DECREF(api); + + if (__experimental_dtype_api_table == NULL) { + __experimental_dtype_api_table = __uninitialized_table; + return -1; + } + return 0; +} + +#endif /* NUMPY_CORE_INCLUDE_NUMPY_EXPERIMENTAL_DTYPE_API_H_ */ diff --git a/numpy/core/include/numpy/halffloat.h b/numpy/core/include/numpy/halffloat.h index ab0d221fb..950401664 100644 --- a/numpy/core/include/numpy/halffloat.h +++ b/numpy/core/include/numpy/halffloat.h @@ -1,5 +1,5 @@ -#ifndef __NPY_HALFFLOAT_H__ -#define __NPY_HALFFLOAT_H__ +#ifndef NUMPY_CORE_INCLUDE_NUMPY_HALFFLOAT_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_HALFFLOAT_H_ #include <Python.h> #include <numpy/npy_math.h> @@ -67,4 +67,4 @@ npy_uint64 npy_halfbits_to_doublebits(npy_uint16 h); } #endif -#endif +#endif /* NUMPY_CORE_INCLUDE_NUMPY_HALFFLOAT_H_ */ diff --git a/numpy/core/include/numpy/libdivide/libdivide.h b/numpy/core/include/numpy/libdivide/libdivide.h index 81057b7b4..f4eb8039b 100644 --- a/numpy/core/include/numpy/libdivide/libdivide.h +++ b/numpy/core/include/numpy/libdivide/libdivide.h @@ -8,8 +8,8 @@ // You may use libdivide under the terms of either of these. 
// See LICENSE.txt for more details. -#ifndef LIBDIVIDE_H -#define LIBDIVIDE_H +#ifndef NUMPY_CORE_INCLUDE_NUMPY_LIBDIVIDE_LIBDIVIDE_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_LIBDIVIDE_LIBDIVIDE_H_ #define LIBDIVIDE_VERSION "3.0" #define LIBDIVIDE_VERSION_MAJOR 3 @@ -2072,8 +2072,8 @@ T& operator/=(T& n, const divider<T, ALGO>& div) { template <typename T> using branchfree_divider = divider<T, BRANCHFREE>; -} // namespace libdivide +} // namespace libdivide -#endif // __cplusplus +#endif // __cplusplus -#endif // LIBDIVIDE_H +#endif // NUMPY_CORE_INCLUDE_NUMPY_LIBDIVIDE_LIBDIVIDE_H_ diff --git a/numpy/core/include/numpy/ndarrayobject.h b/numpy/core/include/numpy/ndarrayobject.h index 5ef1f10aa..2eb951486 100644 --- a/numpy/core/include/numpy/ndarrayobject.h +++ b/numpy/core/include/numpy/ndarrayobject.h @@ -1,9 +1,9 @@ /* * DON'T INCLUDE THIS DIRECTLY. */ +#ifndef NUMPY_CORE_INCLUDE_NUMPY_NDARRAYOBJECT_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_NDARRAYOBJECT_H_ -#ifndef NPY_NDARRAYOBJECT_H -#define NPY_NDARRAYOBJECT_H #ifdef __cplusplus extern "C" { #endif @@ -265,4 +265,4 @@ PyArray_XDECREF_ERR(PyArrayObject *arr) #endif -#endif /* NPY_NDARRAYOBJECT_H */ +#endif /* NUMPY_CORE_INCLUDE_NUMPY_NDARRAYOBJECT_H_ */ diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h index 60f16094c..a1d1c01dc 100644 --- a/numpy/core/include/numpy/ndarraytypes.h +++ b/numpy/core/include/numpy/ndarraytypes.h @@ -1,5 +1,5 @@ -#ifndef NDARRAYTYPES_H -#define NDARRAYTYPES_H +#ifndef NUMPY_CORE_INCLUDE_NUMPY_NDARRAYTYPES_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_NDARRAYTYPES_H_ #include "npy_common.h" #include "npy_endian.h" @@ -355,12 +355,10 @@ struct NpyAuxData_tag { #define NPY_ERR(str) fprintf(stderr, #str); fflush(stderr); #define NPY_ERR2(str) fprintf(stderr, str); fflush(stderr); - /* - * Macros to define how array, and dimension/strides data is - * allocated. 
- */ - - /* Data buffer - PyDataMem_NEW/FREE/RENEW are in multiarraymodule.c */ +/* +* Macros to define how array, and dimension/strides data is +* allocated. These should be made private +*/ #define NPY_USE_PYMEM 1 @@ -673,6 +671,24 @@ typedef struct _arr_descr { } PyArray_ArrayDescr; /* + * Memory handler structure for array data. + */ +/* The declaration of free differs from PyMemAllocatorEx */ +typedef struct { + void *ctx; + void* (*malloc) (void *ctx, size_t size); + void* (*calloc) (void *ctx, size_t nelem, size_t elsize); + void* (*realloc) (void *ctx, void *ptr, size_t new_size); + void (*free) (void *ctx, void *ptr, size_t size); +} PyDataMemAllocator; + +typedef struct { + char name[128]; /* multiple of 64 to keep the struct aligned */ + PyDataMemAllocator allocator; +} PyDataMem_Handler; + + +/* * The main array object structure. * * It has been recommended to use the inline functions defined below @@ -722,6 +738,10 @@ typedef struct tagPyArrayObject_fields { /* For weak references */ PyObject *weakreflist; void *_buffer_info; /* private buffer info, tagged to allow warning */ + /* + * For malloc/calloc/realloc/free per object + */ + PyObject *mem_handler; } PyArrayObject_fields; /* @@ -1472,9 +1492,11 @@ PyArrayNeighborhoodIter_Next2D(PyArrayNeighborhoodIterObject* iter); * Include inline implementations - functions defined there are not * considered public API */ -#define _NPY_INCLUDE_NEIGHBORHOOD_IMP +#define NUMPY_CORE_INCLUDE_NUMPY__NEIGHBORHOOD_IMP_H_ #include "_neighborhood_iterator_imp.h" -#undef _NPY_INCLUDE_NEIGHBORHOOD_IMP +#undef NUMPY_CORE_INCLUDE_NUMPY__NEIGHBORHOOD_IMP_H_ + + /* The default array type */ #define NPY_DEFAULT_TYPE NPY_DOUBLE @@ -1665,6 +1687,12 @@ PyArray_CLEARFLAGS(PyArrayObject *arr, int flags) ((PyArrayObject_fields *)arr)->flags &= ~flags; } +static NPY_INLINE NPY_RETURNS_BORROWED_REF PyObject * +PyArray_HANDLER(PyArrayObject *arr) +{ + return ((PyArrayObject_fields *)arr)->mem_handler; +} + #define 
PyTypeNum_ISBOOL(type) ((type) == NPY_BOOL) #define PyTypeNum_ISUNSIGNED(type) (((type) == NPY_UBYTE) || \ @@ -1864,32 +1892,14 @@ typedef void (PyDataMem_EventHookFunc)(void *inp, void *outp, size_t size, */ #if defined(NPY_INTERNAL_BUILD) && NPY_INTERNAL_BUILD /* - * The Structures defined in this block are considered private API and - * may change without warning! + * The Structures defined in this block are currently considered + * private API and may change without warning! + * Part of this (at least the size) is exepcted to be public API without + * further modifications. */ /* TODO: Make this definition public in the API, as soon as its settled */ NPY_NO_EXPORT extern PyTypeObject PyArrayDTypeMeta_Type; - typedef struct PyArray_DTypeMeta_tag PyArray_DTypeMeta; - - typedef PyArray_Descr *(discover_descr_from_pyobject_function)( - PyArray_DTypeMeta *cls, PyObject *obj); - - /* - * Before making this public, we should decide whether it should pass - * the type, or allow looking at the object. A possible use-case: - * `np.array(np.array([0]), dtype=np.ndarray)` - * Could consider arrays that are not `dtype=ndarray` "scalars". - */ - typedef int (is_known_scalar_type_function)( - PyArray_DTypeMeta *cls, PyTypeObject *obj); - - typedef PyArray_Descr *(default_descr_function)(PyArray_DTypeMeta *cls); - typedef PyArray_DTypeMeta *(common_dtype_function)( - PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtyep2); - typedef PyArray_Descr *(common_instance_function)( - PyArray_Descr *dtype1, PyArray_Descr *dtyep2); - /* * While NumPy DTypes would not need to be heap types the plan is to * make DTypes available in Python at which point they will be heap types. @@ -1900,7 +1910,7 @@ typedef void (PyDataMem_EventHookFunc)(void *inp, void *outp, size_t size, * it is a fairly complex construct which may be better to allow * refactoring of. 
*/ - struct PyArray_DTypeMeta_tag { + typedef struct { PyHeapTypeObject super; /* @@ -1928,7 +1938,7 @@ typedef void (PyDataMem_EventHookFunc)(void *inp, void *outp, size_t size, */ void *dt_slots; void *reserved[3]; - }; + } PyArray_DTypeMeta; #endif /* NPY_INTERNAL_BUILD */ @@ -1959,4 +1969,4 @@ typedef void (PyDataMem_EventHookFunc)(void *inp, void *outp, size_t size, */ #undef NPY_DEPRECATED_INCLUDES -#endif /* NPY_ARRAYTYPES_H */ +#endif /* NUMPY_CORE_INCLUDE_NUMPY_NDARRAYTYPES_H_ */ diff --git a/numpy/core/include/numpy/noprefix.h b/numpy/core/include/numpy/noprefix.h index 041f30192..2c0ce1420 100644 --- a/numpy/core/include/numpy/noprefix.h +++ b/numpy/core/include/numpy/noprefix.h @@ -1,5 +1,5 @@ -#ifndef NPY_NOPREFIX_H -#define NPY_NOPREFIX_H +#ifndef NUMPY_CORE_INCLUDE_NUMPY_NOPREFIX_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_NOPREFIX_H_ /* * You can directly include noprefix.h as a backward @@ -209,4 +209,4 @@ #define MAX_ELSIZE NPY_MAX_ELSIZE #endif -#endif +#endif /* NUMPY_CORE_INCLUDE_NUMPY_NOPREFIX_H_ */ diff --git a/numpy/core/include/numpy/npy_1_7_deprecated_api.h b/numpy/core/include/numpy/npy_1_7_deprecated_api.h index a4f90e019..4fd4015a9 100644 --- a/numpy/core/include/numpy/npy_1_7_deprecated_api.h +++ b/numpy/core/include/numpy/npy_1_7_deprecated_api.h @@ -1,10 +1,10 @@ -#ifndef _NPY_1_7_DEPRECATED_API_H -#define _NPY_1_7_DEPRECATED_API_H - #ifndef NPY_DEPRECATED_INCLUDES #error "Should never include npy_*_*_deprecated_api directly." 
#endif +#ifndef NUMPY_CORE_INCLUDE_NUMPY_NPY_1_7_DEPRECATED_API_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_NPY_1_7_DEPRECATED_API_H_ + /* Emit a warning if the user did not specifically request the old API */ #ifndef NPY_NO_DEPRECATED_API #if defined(_WIN32) @@ -122,4 +122,4 @@ */ #include "old_defines.h" -#endif +#endif /* NUMPY_CORE_INCLUDE_NUMPY_NPY_1_7_DEPRECATED_API_H_ */ diff --git a/numpy/core/include/numpy/npy_3kcompat.h b/numpy/core/include/numpy/npy_3kcompat.h index 551ec6be8..22c103e93 100644 --- a/numpy/core/include/numpy/npy_3kcompat.h +++ b/numpy/core/include/numpy/npy_3kcompat.h @@ -7,8 +7,8 @@ * strong backwards compatibility guarantees at the moment. */ -#ifndef _NPY_3KCOMPAT_H_ -#define _NPY_3KCOMPAT_H_ +#ifndef NUMPY_CORE_INCLUDE_NUMPY_NPY_3KCOMPAT_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_NPY_3KCOMPAT_H_ #include <Python.h> #include <stdio.h> @@ -592,4 +592,4 @@ NpyCapsule_Check(PyObject *ptr) #endif -#endif /* _NPY_3KCOMPAT_H_ */ +#endif /* NUMPY_CORE_INCLUDE_NUMPY_NPY_3KCOMPAT_H_ */ diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h index d5f329b66..57cc592b9 100644 --- a/numpy/core/include/numpy/npy_common.h +++ b/numpy/core/include/numpy/npy_common.h @@ -1,5 +1,5 @@ -#ifndef _NPY_COMMON_H_ -#define _NPY_COMMON_H_ +#ifndef NUMPY_CORE_INCLUDE_NUMPY_NPY_COMMON_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_NPY_COMMON_H_ /* need Python.h for npy_intp, npy_uintp */ #include <Python.h> @@ -359,12 +359,11 @@ typedef unsigned char npy_bool; #if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE - typedef double npy_longdouble; #define NPY_LONGDOUBLE_FMT "g" #else - typedef long double npy_longdouble; #define NPY_LONGDOUBLE_FMT "Lg" #endif +typedef long double npy_longdouble; #ifndef Py_USING_UNICODE #error Must use Python with unicode enabled. 
@@ -1107,4 +1106,4 @@ typedef npy_int64 npy_datetime; /* End of typedefs for numarray style bit-width names */ -#endif +#endif /* NUMPY_CORE_INCLUDE_NUMPY_NPY_COMMON_H_ */ diff --git a/numpy/core/include/numpy/npy_cpu.h b/numpy/core/include/numpy/npy_cpu.h index e975b0105..78d229e7d 100644 --- a/numpy/core/include/numpy/npy_cpu.h +++ b/numpy/core/include/numpy/npy_cpu.h @@ -21,8 +21,8 @@ * NPY_CPU_LOONGARCH * NPY_CPU_WASM */ -#ifndef _NPY_CPUARCH_H_ -#define _NPY_CPUARCH_H_ +#ifndef NUMPY_CORE_INCLUDE_NUMPY_NPY_CPU_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_NPY_CPU_H_ #include "numpyconfig.h" @@ -114,7 +114,7 @@ information about your platform (OS, CPU and compiler) #endif -/* +/* * Except for the following architectures, memory access is limited to the natural * alignment of data types otherwise it may lead to bus error or performance regression. * For more details about unaligned access, see https://www.kernel.org/doc/Documentation/unaligned-memory-access.txt. @@ -126,4 +126,4 @@ #define NPY_ALIGNMENT_REQUIRED 1 #endif -#endif +#endif /* NUMPY_CORE_INCLUDE_NUMPY_NPY_CPU_H_ */ diff --git a/numpy/core/include/numpy/npy_endian.h b/numpy/core/include/numpy/npy_endian.h index 620595bec..5e58a7f52 100644 --- a/numpy/core/include/numpy/npy_endian.h +++ b/numpy/core/include/numpy/npy_endian.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ENDIAN_H_ -#define _NPY_ENDIAN_H_ +#ifndef NUMPY_CORE_INCLUDE_NUMPY_NPY_ENDIAN_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_NPY_ENDIAN_H_ /* * NPY_BYTE_ORDER is set to the same value as BYTE_ORDER set by glibc in @@ -52,6 +52,7 @@ || defined(NPY_CPU_LOONGARCH) \ || defined(NPY_CPU_WASM) #define NPY_BYTE_ORDER NPY_LITTLE_ENDIAN + #elif defined(NPY_CPU_PPC) \ || defined(NPY_CPU_SPARC) \ || defined(NPY_CPU_S390) \ @@ -66,9 +67,11 @@ || defined(NPY_CPU_M68K) \ || defined(NPY_CPU_ARCEB) #define NPY_BYTE_ORDER NPY_BIG_ENDIAN + #else #error Unknown CPU: can not set endianness #endif -#endif #endif + +#endif /* NUMPY_CORE_INCLUDE_NUMPY_NPY_ENDIAN_H_ */ diff --git 
a/numpy/core/include/numpy/npy_interrupt.h b/numpy/core/include/numpy/npy_interrupt.h index bcb539326..69a0374dd 100644 --- a/numpy/core/include/numpy/npy_interrupt.h +++ b/numpy/core/include/numpy/npy_interrupt.h @@ -14,8 +14,8 @@ * https://github.com/python/cpython/pull/20599). */ -#ifndef NPY_INTERRUPT_H -#define NPY_INTERRUPT_H +#ifndef NUMPY_CORE_INCLUDE_NUMPY_NPY_INTERRUPT_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_NPY_INTERRUPT_H_ #ifndef NPY_NO_SIGNAL @@ -46,11 +46,11 @@ PyOS_setsig(SIGINT, _npy_sig_save); \ } -#else /* NPY_NO_SIGNAL */ +#else /* NPY_NO_SIGNAL */ #define NPY_SIGINT_ON #define NPY_SIGINT_OFF -#endif /* HAVE_SIGSETJMP */ +#endif /* HAVE_SIGSETJMP */ -#endif /* NPY_INTERRUPT_H */ +#endif /* NUMPY_CORE_INCLUDE_NUMPY_NPY_INTERRUPT_H_ */ diff --git a/numpy/core/include/numpy/npy_math.h b/numpy/core/include/numpy/npy_math.h index e9a6a30d2..bead0dc14 100644 --- a/numpy/core/include/numpy/npy_math.h +++ b/numpy/core/include/numpy/npy_math.h @@ -1,5 +1,5 @@ -#ifndef __NPY_MATH_C99_H_ -#define __NPY_MATH_C99_H_ +#ifndef NUMPY_CORE_INCLUDE_NUMPY_NPY_MATH_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_NPY_MATH_H_ #ifdef __cplusplus extern "C" { @@ -150,6 +150,17 @@ NPY_INPLACE npy_long npy_lshiftl(npy_long a, npy_long b); NPY_INPLACE npy_longlong npy_rshiftll(npy_longlong a, npy_longlong b); NPY_INPLACE npy_longlong npy_lshiftll(npy_longlong a, npy_longlong b); +NPY_INPLACE uint8_t npy_popcountuhh(npy_ubyte a); +NPY_INPLACE uint8_t npy_popcountuh(npy_ushort a); +NPY_INPLACE uint8_t npy_popcountu(npy_uint a); +NPY_INPLACE uint8_t npy_popcountul(npy_ulong a); +NPY_INPLACE uint8_t npy_popcountull(npy_ulonglong a); +NPY_INPLACE uint8_t npy_popcounthh(npy_byte a); +NPY_INPLACE uint8_t npy_popcounth(npy_short a); +NPY_INPLACE uint8_t npy_popcount(npy_int a); +NPY_INPLACE uint8_t npy_popcountl(npy_long a); +NPY_INPLACE uint8_t npy_popcountll(npy_longlong a); + /* * C99 double math funcs */ @@ -585,4 +596,4 @@ void npy_set_floatstatus_invalid(void); #include 
"npy_math_internal.h" #endif -#endif +#endif /* NUMPY_CORE_INCLUDE_NUMPY_NPY_MATH_H_ */ diff --git a/numpy/core/include/numpy/npy_no_deprecated_api.h b/numpy/core/include/numpy/npy_no_deprecated_api.h index 6183dc278..39658c0bd 100644 --- a/numpy/core/include/numpy/npy_no_deprecated_api.h +++ b/numpy/core/include/numpy/npy_no_deprecated_api.h @@ -9,11 +9,12 @@ #ifndef NPY_NO_DEPRECATED_API /* put this check here since there may be multiple includes in C extensions. */ -#if defined(NDARRAYTYPES_H) || defined(_NPY_DEPRECATED_API_H) || \ - defined(OLD_DEFINES_H) +#if defined(NUMPY_CORE_INCLUDE_NUMPY_NDARRAYTYPES_H_) || \ + defined(NUMPY_CORE_INCLUDE_NUMPY_NPY_DEPRECATED_API_H) || \ + defined(NUMPY_CORE_INCLUDE_NUMPY_OLD_DEFINES_H_) #error "npy_no_deprecated_api.h" must be first among numpy includes. #else #define NPY_NO_DEPRECATED_API NPY_API_VERSION #endif -#endif +#endif /* NPY_NO_DEPRECATED_API */ diff --git a/numpy/core/include/numpy/npy_os.h b/numpy/core/include/numpy/npy_os.h index 9228c3916..efa0e4012 100644 --- a/numpy/core/include/numpy/npy_os.h +++ b/numpy/core/include/numpy/npy_os.h @@ -1,5 +1,5 @@ -#ifndef _NPY_OS_H_ -#define _NPY_OS_H_ +#ifndef NUMPY_CORE_INCLUDE_NUMPY_NPY_OS_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_NPY_OS_H_ #if defined(linux) || defined(__linux) || defined(__linux__) #define NPY_OS_LINUX @@ -27,4 +27,4 @@ #define NPY_OS_UNKNOWN #endif -#endif +#endif /* NUMPY_CORE_INCLUDE_NUMPY_NPY_OS_H_ */ diff --git a/numpy/core/include/numpy/numpyconfig.h b/numpy/core/include/numpy/numpyconfig.h index 726f1dfac..1c3686769 100644 --- a/numpy/core/include/numpy/numpyconfig.h +++ b/numpy/core/include/numpy/numpyconfig.h @@ -1,5 +1,5 @@ -#ifndef _NPY_NUMPYCONFIG_H_ -#define _NPY_NUMPYCONFIG_H_ +#ifndef NUMPY_CORE_INCLUDE_NUMPY_NPY_NUMPYCONFIG_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_NPY_NUMPYCONFIG_H_ #include "_numpyconfig.h" @@ -19,6 +19,19 @@ #define NPY_SIZEOF_LONG 4 #define NPY_SIZEOF_PY_INTPTR_T 4 #endif + + #undef NPY_SIZEOF_LONGDOUBLE + #undef 
NPY_SIZEOF_COMPLEX_LONGDOUBLE + + #ifdef __x86_64 + #define NPY_SIZEOF_LONGDOUBLE 16 + #define NPY_SIZEOF_COMPLEX_LONGDOUBLE 32 + #elif defined(__arm64__) + #define NPY_SIZEOF_LONGDOUBLE 8 + #define NPY_SIZEOF_COMPLEX_LONGDOUBLE 16 + #else + #error "unknown architecture" + #endif #endif /** @@ -45,4 +58,4 @@ #define NPY_1_21_API_VERSION 0x0000000e #define NPY_1_22_API_VERSION 0x0000000e -#endif +#endif /* NUMPY_CORE_INCLUDE_NUMPY_NPY_NUMPYCONFIG_H_ */ diff --git a/numpy/core/include/numpy/old_defines.h b/numpy/core/include/numpy/old_defines.h index abf81595a..b3fa67751 100644 --- a/numpy/core/include/numpy/old_defines.h +++ b/numpy/core/include/numpy/old_defines.h @@ -1,6 +1,6 @@ /* This header is deprecated as of NumPy 1.7 */ -#ifndef OLD_DEFINES_H -#define OLD_DEFINES_H +#ifndef NUMPY_CORE_INCLUDE_NUMPY_OLD_DEFINES_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_OLD_DEFINES_H_ #if defined(NPY_NO_DEPRECATED_API) && NPY_NO_DEPRECATED_API >= NPY_1_7_API_VERSION #error The header "old_defines.h" is deprecated as of NumPy 1.7. @@ -184,4 +184,4 @@ #define PyArray_UCS4 npy_ucs4 -#endif +#endif /* NUMPY_CORE_INCLUDE_NUMPY_OLD_DEFINES_H_ */ diff --git a/numpy/core/include/numpy/oldnumeric.h b/numpy/core/include/numpy/oldnumeric.h index 38530faf0..6604e8d17 100644 --- a/numpy/core/include/numpy/oldnumeric.h +++ b/numpy/core/include/numpy/oldnumeric.h @@ -1,3 +1,8 @@ +#ifndef NUMPY_CORE_INCLUDE_NUMPY_OLDNUMERIC_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_OLDNUMERIC_H_ + +/* FIXME -- this file can be deleted? 
*/ + #include "arrayobject.h" #ifndef PYPY_VERSION @@ -23,3 +28,5 @@ #undef import_array #define import_array() { if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); } } + +#endif /* NUMPY_CORE_INCLUDE_NUMPY_OLDNUMERIC_H_ */ diff --git a/numpy/core/include/numpy/random/bitgen.h b/numpy/core/include/numpy/random/bitgen.h index 83c2858dd..162dd5c57 100644 --- a/numpy/core/include/numpy/random/bitgen.h +++ b/numpy/core/include/numpy/random/bitgen.h @@ -1,5 +1,5 @@ -#ifndef _RANDOM_BITGEN_H -#define _RANDOM_BITGEN_H +#ifndef NUMPY_CORE_INCLUDE_NUMPY_RANDOM_BITGEN_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_RANDOM_BITGEN_H_ #pragma once #include <stddef.h> @@ -17,4 +17,4 @@ typedef struct bitgen { } bitgen_t; -#endif +#endif /* NUMPY_CORE_INCLUDE_NUMPY_RANDOM_BITGEN_H_ */ diff --git a/numpy/core/include/numpy/random/distributions.h b/numpy/core/include/numpy/random/distributions.h index 554198174..dacf77829 100644 --- a/numpy/core/include/numpy/random/distributions.h +++ b/numpy/core/include/numpy/random/distributions.h @@ -1,11 +1,11 @@ -#ifndef _RANDOMDGEN__DISTRIBUTIONS_H_ -#define _RANDOMDGEN__DISTRIBUTIONS_H_ +#ifndef NUMPY_CORE_INCLUDE_NUMPY_RANDOM_DISTRIBUTIONS_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_RANDOM_DISTRIBUTIONS_H_ #ifdef __cplusplus extern "C" { #endif -#include "Python.h" +#include <Python.h> #include "numpy/npy_common.h" #include <stddef.h> #include <stdbool.h> @@ -206,4 +206,4 @@ static NPY_INLINE double next_double(bitgen_t *bitgen_state) { } #endif -#endif +#endif /* NUMPY_CORE_INCLUDE_NUMPY_RANDOM_DISTRIBUTIONS_H_ */ diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h index fd7307703..3f184bd45 100644 --- a/numpy/core/include/numpy/ufuncobject.h +++ b/numpy/core/include/numpy/ufuncobject.h @@ -1,5 +1,5 @@ -#ifndef Py_UFUNCOBJECT_H -#define Py_UFUNCOBJECT_H +#ifndef NUMPY_CORE_INCLUDE_NUMPY_UFUNCOBJECT_H_ +#define 
NUMPY_CORE_INCLUDE_NUMPY_UFUNCOBJECT_H_ #include <numpy/npy_math.h> #include <numpy/npy_common.h> @@ -349,8 +349,8 @@ typedef struct _loop1d_info { #endif #endif - #ifdef __cplusplus } #endif -#endif /* !Py_UFUNCOBJECT_H */ + +#endif /* NUMPY_CORE_INCLUDE_NUMPY_UFUNCOBJECT_H_ */ diff --git a/numpy/core/include/numpy/utils.h b/numpy/core/include/numpy/utils.h index e251a5201..e2b57f9e5 100644 --- a/numpy/core/include/numpy/utils.h +++ b/numpy/core/include/numpy/utils.h @@ -1,5 +1,5 @@ -#ifndef __NUMPY_UTILS_HEADER__ -#define __NUMPY_UTILS_HEADER__ +#ifndef NUMPY_CORE_INCLUDE_NUMPY_UTILS_H_ +#define NUMPY_CORE_INCLUDE_NUMPY_UTILS_H_ #ifndef __COMP_NPY_UNUSED #if defined(__GNUC__) @@ -34,4 +34,4 @@ #define NPY_CAT_(a, b) NPY_CAT__(a, b) #define NPY_CAT(a, b) NPY_CAT_(a, b) -#endif +#endif /* NUMPY_CORE_INCLUDE_NUMPY_UTILS_H_ */ diff --git a/numpy/core/memmap.pyi b/numpy/core/memmap.pyi new file mode 100644 index 000000000..ba595bf1e --- /dev/null +++ b/numpy/core/memmap.pyi @@ -0,0 +1,5 @@ +from typing import List + +from numpy import memmap as memmap + +__all__: List[str] diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py index 154df6f4d..351cd3a1b 100644 --- a/numpy/core/multiarray.py +++ b/numpy/core/multiarray.py @@ -31,8 +31,8 @@ __all__ = [ 'count_nonzero', 'c_einsum', 'datetime_as_string', 'datetime_data', 'dot', 'dragon4_positional', 'dragon4_scientific', 'dtype', 'empty', 'empty_like', 'error', 'flagsobj', 'flatiter', 'format_longfloat', - 'frombuffer', 'fromfile', 'fromiter', 'fromstring', 'inner', - 'interp', 'interp_complex', 'is_busday', 'lexsort', + 'frombuffer', 'fromfile', 'fromiter', 'fromstring', 'get_handler_name', + 'inner', 'interp', 'interp_complex', 'is_busday', 'lexsort', 'matmul', 'may_share_memory', 'min_scalar_type', 'ndarray', 'nditer', 'nested_iters', 'normalize_axis_index', 'packbits', 'promote_types', 'putmask', 'ravel_multi_index', 'result_type', 'scalar', diff --git a/numpy/core/multiarray.pyi b/numpy/core/multiarray.pyi 
index 501e55634..a9f68e181 100644 --- a/numpy/core/multiarray.pyi +++ b/numpy/core/multiarray.pyi @@ -30,7 +30,6 @@ from numpy import ( nditer as nditer, # The rest - nditer, ufunc, str_, bool_, @@ -51,7 +50,9 @@ from numpy import ( _ModeKind, _SupportsBuffer, _IOProtocol, - _CopyMode + _CopyMode, + _NDIterFlagsKind, + _NDIterOpFlagsKind, ) from numpy.typing import ( @@ -66,7 +67,7 @@ from numpy.typing import ( NDArray, ArrayLike, _SupportsArray, - _NestedSequence, + _FiniteNestedSequence, _ArrayLikeBool_co, _ArrayLikeUInt_co, _ArrayLikeInt_co, @@ -92,7 +93,7 @@ _DTypeLike = Union[ Type[_SCT], _SupportsDType[dtype[_SCT]], ] -_ArrayLike = _NestedSequence[_SupportsArray[dtype[_SCT]]] +_ArrayLike = _FiniteNestedSequence[_SupportsArray[dtype[_SCT]]] # Valid time units _UnitKind = L[ @@ -1013,3 +1014,14 @@ class flagsobj: def owndata(self) -> bool: ... def __getitem__(self, key: _GetItemKeys) -> bool: ... def __setitem__(self, key: _SetItemKeys, value: bool) -> None: ... + +def nested_iters( + op: ArrayLike | Sequence[ArrayLike], + axes: Sequence[Sequence[SupportsIndex]], + flags: None | Sequence[_NDIterFlagsKind] = ..., + op_flags: None | Sequence[Sequence[_NDIterOpFlagsKind]] = ..., + op_dtypes: DTypeLike | Sequence[DTypeLike] = ..., + order: _OrderKACF = ..., + casting: _CastingKind = ..., + buffersize: SupportsIndex = ..., +) -> Tuple[nditer, ...]: ... 
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py index d8a0cf9a6..1654e8364 100644 --- a/numpy/core/numeric.py +++ b/numpy/core/numeric.py @@ -1184,7 +1184,7 @@ def roll(a, shift, axis=None): >>> np.roll(x, -2) array([2, 3, 4, 5, 6, 7, 8, 9, 0, 1]) - >>> x2 = np.reshape(x, (2,5)) + >>> x2 = np.reshape(x, (2, 5)) >>> x2 array([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]) @@ -1206,6 +1206,12 @@ def roll(a, shift, axis=None): >>> np.roll(x2, -1, axis=1) array([[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]) + >>> np.roll(x2, (1, 1), axis=(1, 0)) + array([[9, 5, 6, 7, 8], + [4, 0, 1, 2, 3]]) + >>> np.roll(x2, (2, 1), axis=(1, 0)) + array([[8, 9, 5, 6, 7], + [3, 4, 0, 1, 2]]) """ a = asanyarray(a) diff --git a/numpy/core/numeric.pyi b/numpy/core/numeric.pyi index 54ab4b7c8..d7ec30351 100644 --- a/numpy/core/numeric.pyi +++ b/numpy/core/numeric.pyi @@ -1,6 +1,5 @@ from typing import ( Any, - Optional, Union, Sequence, Tuple, @@ -8,18 +7,64 @@ from typing import ( List, overload, TypeVar, - Iterable, Literal, + Type, + SupportsAbs, + SupportsIndex, + NoReturn, ) +from typing_extensions import TypeGuard -from numpy import ndarray, generic, dtype, bool_, signedinteger, _OrderKACF, _OrderCF -from numpy.typing import ArrayLike, DTypeLike, _ShapeLike +from numpy import ( + ComplexWarning as ComplexWarning, + dtype, + generic, + unsignedinteger, + signedinteger, + floating, + complexfloating, + bool_, + int_, + intp, + float64, + timedelta64, + object_, + _OrderKACF, + _OrderCF, +) + +from numpy.typing import ( + ArrayLike, + NDArray, + DTypeLike, + _ShapeLike, + _SupportsDType, + _FiniteNestedSequence, + _SupportsArray, + _ScalarLike_co, + _ArrayLikeBool_co, + _ArrayLikeUInt_co, + _ArrayLikeInt_co, + _ArrayLikeFloat_co, + _ArrayLikeComplex_co, + _ArrayLikeTD64_co, + _ArrayLikeObject_co, +) _T = TypeVar("_T") -_ArrayType = TypeVar("_ArrayType", bound=ndarray) +_SCT = TypeVar("_SCT", bound=generic) +_ArrayType = TypeVar("_ArrayType", bound=NDArray[Any]) +_DTypeLike = Union[ + dtype[_SCT], + 
Type[_SCT], + _SupportsDType[dtype[_SCT]], +] +_ArrayLike = _FiniteNestedSequence[_SupportsArray[dtype[_SCT]]] _CorrelateMode = Literal["valid", "same", "full"] +__all__: List[str] + @overload def zeros_like( a: _ArrayType, @@ -30,20 +75,61 @@ def zeros_like( ) -> _ArrayType: ... @overload def zeros_like( - a: ArrayLike, - dtype: DTypeLike = ..., + a: _ArrayLike[_SCT], + dtype: None = ..., order: _OrderKACF = ..., subok: bool = ..., - shape: Optional[_ShapeLike] = ..., -) -> ndarray: ... + shape: None | _ShapeLike = ..., +) -> NDArray[_SCT]: ... +@overload +def zeros_like( + a: object, + dtype: None = ..., + order: _OrderKACF = ..., + subok: bool = ..., + shape: None | _ShapeLike= ..., +) -> NDArray[Any]: ... +@overload +def zeros_like( + a: Any, + dtype: _DTypeLike[_SCT], + order: _OrderKACF = ..., + subok: bool = ..., + shape: None | _ShapeLike= ..., +) -> NDArray[_SCT]: ... +@overload +def zeros_like( + a: Any, + dtype: DTypeLike, + order: _OrderKACF = ..., + subok: bool = ..., + shape: None | _ShapeLike= ..., +) -> NDArray[Any]: ... +@overload def ones( shape: _ShapeLike, - dtype: DTypeLike = ..., + dtype: None = ..., + order: _OrderCF = ..., + *, + like: ArrayLike = ..., +) -> NDArray[float64]: ... +@overload +def ones( + shape: _ShapeLike, + dtype: _DTypeLike[_SCT], order: _OrderCF = ..., *, like: ArrayLike = ..., -) -> ndarray: ... +) -> NDArray[_SCT]: ... +@overload +def ones( + shape: _ShapeLike, + dtype: DTypeLike, + order: _OrderCF = ..., + *, + like: ArrayLike = ..., +) -> NDArray[Any]: ... @overload def ones_like( @@ -55,21 +141,64 @@ def ones_like( ) -> _ArrayType: ... @overload def ones_like( - a: ArrayLike, - dtype: DTypeLike = ..., + a: _ArrayLike[_SCT], + dtype: None = ..., order: _OrderKACF = ..., subok: bool = ..., - shape: Optional[_ShapeLike] = ..., -) -> ndarray: ... + shape: None | _ShapeLike = ..., +) -> NDArray[_SCT]: ... 
+@overload +def ones_like( + a: object, + dtype: None = ..., + order: _OrderKACF = ..., + subok: bool = ..., + shape: None | _ShapeLike= ..., +) -> NDArray[Any]: ... +@overload +def ones_like( + a: Any, + dtype: _DTypeLike[_SCT], + order: _OrderKACF = ..., + subok: bool = ..., + shape: None | _ShapeLike= ..., +) -> NDArray[_SCT]: ... +@overload +def ones_like( + a: Any, + dtype: DTypeLike, + order: _OrderKACF = ..., + subok: bool = ..., + shape: None | _ShapeLike= ..., +) -> NDArray[Any]: ... +@overload def full( shape: _ShapeLike, fill_value: Any, - dtype: DTypeLike = ..., + dtype: None = ..., + order: _OrderCF = ..., + *, + like: ArrayLike = ..., +) -> NDArray[Any]: ... +@overload +def full( + shape: _ShapeLike, + fill_value: Any, + dtype: _DTypeLike[_SCT], order: _OrderCF = ..., *, like: ArrayLike = ..., -) -> ndarray: ... +) -> NDArray[_SCT]: ... +@overload +def full( + shape: _ShapeLike, + fill_value: Any, + dtype: DTypeLike, + order: _OrderCF = ..., + *, + like: ArrayLike = ..., +) -> NDArray[Any]: ... @overload def full_like( @@ -82,13 +211,40 @@ def full_like( ) -> _ArrayType: ... @overload def full_like( - a: ArrayLike, + a: _ArrayLike[_SCT], fill_value: Any, - dtype: DTypeLike = ..., + dtype: None = ..., + order: _OrderKACF = ..., + subok: bool = ..., + shape: None | _ShapeLike = ..., +) -> NDArray[_SCT]: ... +@overload +def full_like( + a: object, + fill_value: Any, + dtype: None = ..., + order: _OrderKACF = ..., + subok: bool = ..., + shape: None | _ShapeLike= ..., +) -> NDArray[Any]: ... +@overload +def full_like( + a: Any, + fill_value: Any, + dtype: _DTypeLike[_SCT], + order: _OrderKACF = ..., + subok: bool = ..., + shape: None | _ShapeLike= ..., +) -> NDArray[_SCT]: ... +@overload +def full_like( + a: Any, + fill_value: Any, + dtype: DTypeLike, order: _OrderKACF = ..., subok: bool = ..., - shape: Optional[_ShapeLike] = ..., -) -> ndarray: ... + shape: None | _ShapeLike= ..., +) -> NDArray[Any]: ... 
@overload def count_nonzero( @@ -105,78 +261,306 @@ def count_nonzero( keepdims: bool = ..., ) -> Any: ... # TODO: np.intp or ndarray[np.intp] -def isfortran(a: Union[ndarray, generic]) -> bool: ... +def isfortran(a: NDArray[Any] | generic) -> bool: ... -def argwhere(a: ArrayLike) -> ndarray: ... +def argwhere(a: ArrayLike) -> NDArray[intp]: ... -def flatnonzero(a: ArrayLike) -> ndarray: ... +def flatnonzero(a: ArrayLike) -> NDArray[intp]: ... +@overload def correlate( - a: ArrayLike, - v: ArrayLike, + a: _ArrayLikeBool_co, + v: _ArrayLikeBool_co, + mode: _CorrelateMode = ..., +) -> NDArray[bool_]: ... +@overload +def correlate( + a: _ArrayLikeUInt_co, + v: _ArrayLikeUInt_co, + mode: _CorrelateMode = ..., +) -> NDArray[unsignedinteger[Any]]: ... +@overload +def correlate( + a: _ArrayLikeInt_co, + v: _ArrayLikeInt_co, mode: _CorrelateMode = ..., -) -> ndarray: ... +) -> NDArray[signedinteger[Any]]: ... +@overload +def correlate( + a: _ArrayLikeFloat_co, + v: _ArrayLikeFloat_co, + mode: _CorrelateMode = ..., +) -> NDArray[floating[Any]]: ... +@overload +def correlate( + a: _ArrayLikeComplex_co, + v: _ArrayLikeComplex_co, + mode: _CorrelateMode = ..., +) -> NDArray[complexfloating[Any, Any]]: ... +@overload +def correlate( + a: _ArrayLikeTD64_co, + v: _ArrayLikeTD64_co, + mode: _CorrelateMode = ..., +) -> NDArray[timedelta64]: ... +@overload +def correlate( + a: _ArrayLikeObject_co, + v: _ArrayLikeObject_co, + mode: _CorrelateMode = ..., +) -> NDArray[object_]: ... +@overload def convolve( - a: ArrayLike, - v: ArrayLike, + a: _ArrayLikeBool_co, + v: _ArrayLikeBool_co, mode: _CorrelateMode = ..., -) -> ndarray: ... +) -> NDArray[bool_]: ... +@overload +def convolve( + a: _ArrayLikeUInt_co, + v: _ArrayLikeUInt_co, + mode: _CorrelateMode = ..., +) -> NDArray[unsignedinteger[Any]]: ... +@overload +def convolve( + a: _ArrayLikeInt_co, + v: _ArrayLikeInt_co, + mode: _CorrelateMode = ..., +) -> NDArray[signedinteger[Any]]: ... 
+@overload +def convolve( + a: _ArrayLikeFloat_co, + v: _ArrayLikeFloat_co, + mode: _CorrelateMode = ..., +) -> NDArray[floating[Any]]: ... +@overload +def convolve( + a: _ArrayLikeComplex_co, + v: _ArrayLikeComplex_co, + mode: _CorrelateMode = ..., +) -> NDArray[complexfloating[Any, Any]]: ... +@overload +def convolve( + a: _ArrayLikeTD64_co, + v: _ArrayLikeTD64_co, + mode: _CorrelateMode = ..., +) -> NDArray[timedelta64]: ... +@overload +def convolve( + a: _ArrayLikeObject_co, + v: _ArrayLikeObject_co, + mode: _CorrelateMode = ..., +) -> NDArray[object_]: ... @overload def outer( - a: ArrayLike, - b: ArrayLike, + a: _ArrayLikeBool_co, + b: _ArrayLikeBool_co, out: None = ..., -) -> ndarray: ... +) -> NDArray[bool_]: ... @overload def outer( - a: ArrayLike, - b: ArrayLike, - out: _ArrayType = ..., + a: _ArrayLikeUInt_co, + b: _ArrayLikeUInt_co, + out: None = ..., +) -> NDArray[unsignedinteger[Any]]: ... +@overload +def outer( + a: _ArrayLikeInt_co, + b: _ArrayLikeInt_co, + out: None = ..., +) -> NDArray[signedinteger[Any]]: ... +@overload +def outer( + a: _ArrayLikeFloat_co, + b: _ArrayLikeFloat_co, + out: None = ..., +) -> NDArray[floating[Any]]: ... +@overload +def outer( + a: _ArrayLikeComplex_co, + b: _ArrayLikeComplex_co, + out: None = ..., +) -> NDArray[complexfloating[Any, Any]]: ... +@overload +def outer( + a: _ArrayLikeTD64_co, + b: _ArrayLikeTD64_co, + out: None = ..., +) -> NDArray[timedelta64]: ... +@overload +def outer( + a: _ArrayLikeObject_co, + b: _ArrayLikeObject_co, + out: None = ..., +) -> NDArray[object_]: ... +@overload +def outer( + a: _ArrayLikeComplex_co | _ArrayLikeTD64_co | _ArrayLikeObject_co, + b: _ArrayLikeComplex_co | _ArrayLikeTD64_co | _ArrayLikeObject_co, + out: _ArrayType, ) -> _ArrayType: ... +@overload def tensordot( - a: ArrayLike, - b: ArrayLike, - axes: Union[int, Tuple[_ShapeLike, _ShapeLike]] = ..., -) -> ndarray: ... 
+ a: _ArrayLikeBool_co, + b: _ArrayLikeBool_co, + axes: int | Tuple[_ShapeLike, _ShapeLike] = ..., +) -> NDArray[bool_]: ... +@overload +def tensordot( + a: _ArrayLikeUInt_co, + b: _ArrayLikeUInt_co, + axes: int | Tuple[_ShapeLike, _ShapeLike] = ..., +) -> NDArray[unsignedinteger[Any]]: ... +@overload +def tensordot( + a: _ArrayLikeInt_co, + b: _ArrayLikeInt_co, + axes: int | Tuple[_ShapeLike, _ShapeLike] = ..., +) -> NDArray[signedinteger[Any]]: ... +@overload +def tensordot( + a: _ArrayLikeFloat_co, + b: _ArrayLikeFloat_co, + axes: int | Tuple[_ShapeLike, _ShapeLike] = ..., +) -> NDArray[floating[Any]]: ... +@overload +def tensordot( + a: _ArrayLikeComplex_co, + b: _ArrayLikeComplex_co, + axes: int | Tuple[_ShapeLike, _ShapeLike] = ..., +) -> NDArray[complexfloating[Any, Any]]: ... +@overload +def tensordot( + a: _ArrayLikeTD64_co, + b: _ArrayLikeTD64_co, + axes: int | Tuple[_ShapeLike, _ShapeLike] = ..., +) -> NDArray[timedelta64]: ... +@overload +def tensordot( + a: _ArrayLikeObject_co, + b: _ArrayLikeObject_co, + axes: int | Tuple[_ShapeLike, _ShapeLike] = ..., +) -> NDArray[object_]: ... +@overload +def roll( + a: _ArrayLike[_SCT], + shift: _ShapeLike, + axis: None | _ShapeLike = ..., +) -> NDArray[_SCT]: ... +@overload def roll( a: ArrayLike, shift: _ShapeLike, - axis: Optional[_ShapeLike] = ..., -) -> ndarray: ... + axis: None | _ShapeLike = ..., +) -> NDArray[Any]: ... -def rollaxis(a: ndarray, axis: int, start: int = ...) -> ndarray: ... +def rollaxis( + a: NDArray[_SCT], + axis: int, + start: int = ..., +) -> NDArray[_SCT]: ... def moveaxis( - a: ndarray, + a: NDArray[_SCT], source: _ShapeLike, destination: _ShapeLike, -) -> ndarray: ... +) -> NDArray[_SCT]: ... +@overload def cross( - a: ArrayLike, - b: ArrayLike, + a: _ArrayLikeBool_co, + b: _ArrayLikeBool_co, axisa: int = ..., axisb: int = ..., axisc: int = ..., - axis: Optional[int] = ..., -) -> ndarray: ... + axis: None | int = ..., +) -> NoReturn: ... 
+@overload +def cross( + a: _ArrayLikeUInt_co, + b: _ArrayLikeUInt_co, + axisa: int = ..., + axisb: int = ..., + axisc: int = ..., + axis: None | int = ..., +) -> NDArray[unsignedinteger[Any]]: ... +@overload +def cross( + a: _ArrayLikeInt_co, + b: _ArrayLikeInt_co, + axisa: int = ..., + axisb: int = ..., + axisc: int = ..., + axis: None | int = ..., +) -> NDArray[signedinteger[Any]]: ... +@overload +def cross( + a: _ArrayLikeFloat_co, + b: _ArrayLikeFloat_co, + axisa: int = ..., + axisb: int = ..., + axisc: int = ..., + axis: None | int = ..., +) -> NDArray[floating[Any]]: ... +@overload +def cross( + a: _ArrayLikeComplex_co, + b: _ArrayLikeComplex_co, + axisa: int = ..., + axisb: int = ..., + axisc: int = ..., + axis: None | int = ..., +) -> NDArray[complexfloating[Any, Any]]: ... +@overload +def cross( + a: _ArrayLikeObject_co, + b: _ArrayLikeObject_co, + axisa: int = ..., + axisb: int = ..., + axisc: int = ..., + axis: None | int = ..., +) -> NDArray[object_]: ... @overload def indices( dimensions: Sequence[int], - dtype: DTypeLike = ..., + dtype: Type[int] = ..., sparse: Literal[False] = ..., -) -> ndarray: ... +) -> NDArray[int_]: ... @overload def indices( dimensions: Sequence[int], - dtype: DTypeLike = ..., + dtype: Type[int] = ..., sparse: Literal[True] = ..., -) -> Tuple[ndarray, ...]: ... +) -> Tuple[NDArray[int_], ...]: ... +@overload +def indices( + dimensions: Sequence[int], + dtype: _DTypeLike[_SCT], + sparse: Literal[False] = ..., +) -> NDArray[_SCT]: ... +@overload +def indices( + dimensions: Sequence[int], + dtype: _DTypeLike[_SCT], + sparse: Literal[True], +) -> Tuple[NDArray[_SCT], ...]: ... +@overload +def indices( + dimensions: Sequence[int], + dtype: DTypeLike, + sparse: Literal[False] = ..., +) -> NDArray[Any]: ... +@overload +def indices( + dimensions: Sequence[int], + dtype: DTypeLike, + sparse: Literal[True], +) -> Tuple[NDArray[Any], ...]: ... 
def fromfunction( function: Callable[..., _T], @@ -187,18 +571,39 @@ def fromfunction( **kwargs: Any, ) -> _T: ... -def isscalar(element: Any) -> bool: ... +def isscalar(element: object) -> TypeGuard[ + generic | bool | int | float | complex | str | bytes | memoryview +]: ... -def binary_repr(num: int, width: Optional[int] = ...) -> str: ... +def binary_repr(num: int, width: None | int = ...) -> str: ... -def base_repr(number: int, base: int = ..., padding: int = ...) -> str: ... +def base_repr( + number: SupportsAbs[float], + base: float = ..., + padding: SupportsIndex = ..., +) -> str: ... +@overload def identity( n: int, - dtype: DTypeLike = ..., + dtype: None = ..., + *, + like: ArrayLike = ..., +) -> NDArray[float64]: ... +@overload +def identity( + n: int, + dtype: _DTypeLike[_SCT], + *, + like: ArrayLike = ..., +) -> NDArray[_SCT]: ... +@overload +def identity( + n: int, + dtype: DTypeLike, *, like: ArrayLike = ..., -) -> ndarray: ... +) -> NDArray[Any]: ... def allclose( a: ArrayLike, @@ -208,13 +613,22 @@ def allclose( equal_nan: bool = ..., ) -> bool: ... +@overload +def isclose( + a: _ScalarLike_co, + b: _ScalarLike_co, + rtol: float = ..., + atol: float = ..., + equal_nan: bool = ..., +) -> bool_: ... +@overload def isclose( a: ArrayLike, b: ArrayLike, rtol: float = ..., atol: float = ..., equal_nan: bool = ..., -) -> Any: ... +) -> NDArray[bool_]: ... def array_equal(a1: ArrayLike, a2: ArrayLike, equal_nan: bool = ...) -> bool: ... 
diff --git a/numpy/core/numerictypes.py b/numpy/core/numerictypes.py index 12f424fd4..8e5de852b 100644 --- a/numpy/core/numerictypes.py +++ b/numpy/core/numerictypes.py @@ -80,12 +80,10 @@ Exported symbols include: """ import numbers -import warnings from numpy.core.multiarray import ( - typeinfo, ndarray, array, empty, dtype, datetime_data, - datetime_as_string, busday_offset, busday_count, is_busday, - busdaycalendar + ndarray, array, dtype, datetime_data, datetime_as_string, + busday_offset, busday_count, is_busday, busdaycalendar ) from numpy.core.overrides import set_module diff --git a/numpy/core/overrides.py b/numpy/core/overrides.py index e1fdd06f2..840cf38c9 100644 --- a/numpy/core/overrides.py +++ b/numpy/core/overrides.py @@ -2,7 +2,6 @@ import collections import functools import os -import textwrap from numpy.core._multiarray_umath import ( add_docstring, implement_array_function, _get_implementing_args) diff --git a/numpy/core/records.py b/numpy/core/records.py index fd5f1ab39..c014bc97c 100644 --- a/numpy/core/records.py +++ b/numpy/core/records.py @@ -41,7 +41,7 @@ from . import numeric as sb from . 
import numerictypes as nt from numpy.compat import os_fspath from numpy.core.overrides import set_module -from .arrayprint import get_printoptions +from .arrayprint import _get_legacy_print_mode # All of the functions allow formats to be a dtype __all__ = [ @@ -68,7 +68,7 @@ _byteorderconv = {'b':'>', 'i':'|'} # formats regular expression -# allows multidimension spec with a tuple syntax in front +# allows multidimensional spec with a tuple syntax in front # of the letter code '(2,3)f4' and ' ( 2 , 3 ) f4 ' # are equally allowed @@ -230,12 +230,12 @@ class record(nt.void): __module__ = 'numpy' def __repr__(self): - if get_printoptions()['legacy'] == '1.13': + if _get_legacy_print_mode() <= 113: return self.__str__() return super().__repr__() def __str__(self): - if get_printoptions()['legacy'] == '1.13': + if _get_legacy_print_mode() <= 113: return str(self.item()) return super().__str__() @@ -551,7 +551,7 @@ class recarray(ndarray): lst = "[], shape=%s" % (repr(self.shape),) lf = '\n'+' '*len(prefix) - if get_printoptions()['legacy'] == '1.13': + if _get_legacy_print_mode() <= 113: lf = ' ' + lf # trailing space return fmt % (lst, lf, repr_dtype) @@ -585,6 +585,7 @@ def _deprecate_shape_0_as_None(shape): return shape +@set_module("numpy.rec") def fromarrays(arrayList, dtype=None, shape=None, formats=None, names=None, titles=None, aligned=False, byteorder=None): """Create a record array from a (flat) list of arrays @@ -678,6 +679,8 @@ def fromarrays(arrayList, dtype=None, shape=None, formats=None, return _array + +@set_module("numpy.rec") def fromrecords(recList, dtype=None, shape=None, formats=None, names=None, titles=None, aligned=False, byteorder=None): """Create a recarray from a list of records in text form. 
@@ -762,6 +765,7 @@ def fromrecords(recList, dtype=None, shape=None, formats=None, names=None, return res +@set_module("numpy.rec") def fromstring(datastring, dtype=None, shape=None, offset=0, formats=None, names=None, titles=None, aligned=False, byteorder=None): r"""Create a record array from binary data @@ -844,6 +848,8 @@ def get_remaining_size(fd): finally: fd.seek(pos, 0) + +@set_module("numpy.rec") def fromfile(fd, dtype=None, shape=None, offset=0, formats=None, names=None, titles=None, aligned=False, byteorder=None): """Create an array from binary file data @@ -943,6 +949,8 @@ def fromfile(fd, dtype=None, shape=None, offset=0, formats=None, return _array + +@set_module("numpy.rec") def array(obj, dtype=None, shape=None, offset=0, strides=None, formats=None, names=None, titles=None, aligned=False, byteorder=None, copy=True): """ diff --git a/numpy/core/records.pyi b/numpy/core/records.pyi new file mode 100644 index 000000000..fda118276 --- /dev/null +++ b/numpy/core/records.pyi @@ -0,0 +1,183 @@ +import os +from typing import ( + List, + Sequence, + Any, + TypeVar, + Iterable, + overload, + Tuple, + Protocol, +) + +from numpy import ( + format_parser as format_parser, + record as record, + recarray as recarray, + dtype, + generic, + void, + _ByteOrder, + _SupportsBuffer, +) + +from numpy.typing import ( + ArrayLike, + DTypeLike, + NDArray, + _ShapeLike, + _ArrayLikeVoid_co, + _NestedSequence, +) + +_SCT = TypeVar("_SCT", bound=generic) + +_RecArray = recarray[Any, dtype[_SCT]] + +class _SupportsReadInto(Protocol): + def seek(self, offset: int, whence: int, /) -> object: ... + def tell(self, /) -> int: ... + def readinto(self, buffer: memoryview, /) -> int: ... + +__all__: List[str] + +@overload +def fromarrays( + arrayList: Iterable[ArrayLike], + dtype: DTypeLike = ..., + shape: None | _ShapeLike = ..., + formats: None = ..., + names: None = ..., + titles: None = ..., + aligned: bool = ..., + byteorder: None = ..., +) -> _RecArray[Any]: ... 
+@overload +def fromarrays( + arrayList: Iterable[ArrayLike], + dtype: None = ..., + shape: None | _ShapeLike = ..., + *, + formats: DTypeLike, + names: None | str | Sequence[str] = ..., + titles: None | str | Sequence[str] = ..., + aligned: bool = ..., + byteorder: None | _ByteOrder = ..., +) -> _RecArray[record]: ... + +@overload +def fromrecords( + recList: _ArrayLikeVoid_co | Tuple[Any, ...] | _NestedSequence[Tuple[Any, ...]], + dtype: DTypeLike = ..., + shape: None | _ShapeLike = ..., + formats: None = ..., + names: None = ..., + titles: None = ..., + aligned: bool = ..., + byteorder: None = ..., +) -> _RecArray[record]: ... +@overload +def fromrecords( + recList: _ArrayLikeVoid_co | Tuple[Any, ...] | _NestedSequence[Tuple[Any, ...]], + dtype: None = ..., + shape: None | _ShapeLike = ..., + *, + formats: DTypeLike, + names: None | str | Sequence[str] = ..., + titles: None | str | Sequence[str] = ..., + aligned: bool = ..., + byteorder: None | _ByteOrder = ..., +) -> _RecArray[record]: ... + +@overload +def fromstring( + datastring: _SupportsBuffer, + dtype: DTypeLike, + shape: None | _ShapeLike = ..., + offset: int = ..., + formats: None = ..., + names: None = ..., + titles: None = ..., + aligned: bool = ..., + byteorder: None = ..., +) -> _RecArray[record]: ... +@overload +def fromstring( + datastring: _SupportsBuffer, + dtype: None = ..., + shape: None | _ShapeLike = ..., + offset: int = ..., + *, + formats: DTypeLike, + names: None | str | Sequence[str] = ..., + titles: None | str | Sequence[str] = ..., + aligned: bool = ..., + byteorder: None | _ByteOrder = ..., +) -> _RecArray[record]: ... + +@overload +def fromfile( + fd: str | bytes | os.PathLike[str] | os.PathLike[bytes] | _SupportsReadInto, + dtype: DTypeLike, + shape: None | _ShapeLike = ..., + offset: int = ..., + formats: None = ..., + names: None = ..., + titles: None = ..., + aligned: bool = ..., + byteorder: None = ..., +) -> _RecArray[Any]: ... 
+@overload +def fromfile( + fd: str | bytes | os.PathLike[str] | os.PathLike[bytes] | _SupportsReadInto, + dtype: None = ..., + shape: None | _ShapeLike = ..., + offset: int = ..., + *, + formats: DTypeLike, + names: None | str | Sequence[str] = ..., + titles: None | str | Sequence[str] = ..., + aligned: bool = ..., + byteorder: None | _ByteOrder = ..., +) -> _RecArray[record]: ... + +@overload +def array( + obj: _SCT | NDArray[_SCT], + dtype: None = ..., + shape: None | _ShapeLike = ..., + offset: int = ..., + formats: None = ..., + names: None = ..., + titles: None = ..., + aligned: bool = ..., + byteorder: None = ..., + copy: bool = ..., +) -> _RecArray[_SCT]: ... +@overload +def array( + obj: ArrayLike, + dtype: DTypeLike, + shape: None | _ShapeLike = ..., + offset: int = ..., + formats: None = ..., + names: None = ..., + titles: None = ..., + aligned: bool = ..., + byteorder: None = ..., + copy: bool = ..., +) -> _RecArray[Any]: ... +@overload +def array( + obj: ArrayLike, + dtype: None = ..., + shape: None | _ShapeLike = ..., + offset: int = ..., + *, + formats: DTypeLike, + names: None | str | Sequence[str] = ..., + titles: None | str | Sequence[str] = ..., + aligned: bool = ..., + byteorder: None | _ByteOrder = ..., + copy: bool = ..., +) -> _RecArray[record]: ... 
diff --git a/numpy/core/setup.py b/numpy/core/setup.py index ba7d83787..3e1ed4c9b 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -5,6 +5,7 @@ import copy import warnings import platform import textwrap +import glob from os.path import join from numpy.distutils import log @@ -63,6 +64,20 @@ class CallOnceOnly: out = copy.deepcopy(pickle.loads(self._check_complex)) return out +def can_link_svml(): + """SVML library is supported only on x86_64 architecture and currently + only on linux + """ + machine = platform.machine() + system = platform.system() + return "x86_64" in machine and system == "Linux" + +def check_svml_submodule(svmlpath): + if not os.path.exists(svmlpath + "/README.md"): + raise RuntimeError("Missing `SVML` submodule! Run `git submodule " + "update --init` to fix this.") + return True + def pythonlib_dir(): """return path where libpython* is.""" if sys.platform == 'win32': @@ -455,6 +470,9 @@ def configuration(parent_package='',top_path=None): # Inline check inline = config_cmd.check_inline() + if can_link_svml(): + moredefs.append(('NPY_CAN_LINK_SVML', 1)) + # Use relaxed stride checking if NPY_RELAXED_STRIDES_CHECKING: moredefs.append(('NPY_RELAXED_STRIDES_CHECKING', 1)) @@ -496,7 +514,7 @@ def configuration(parent_package='',top_path=None): # add the guard to make sure config.h is never included directly, # but always through npy_config.h target_f.write(textwrap.dedent(""" - #ifndef _NPY_NPY_CONFIG_H_ + #ifndef NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_ #error config.h should never be included directly, include npy_config.h instead #endif """)) @@ -678,16 +696,24 @@ def configuration(parent_package='',top_path=None): join('src', 'npymath', 'halffloat.c') ] - # Must be true for CRT compilers but not MinGW/cygwin. See gh-9977. 
- # Intel and Clang also don't seem happy with /GL - is_msvc = (platform.platform().startswith('Windows') and - platform.python_compiler().startswith('MS')) + def gl_if_msvc(build_cmd): + """ Add flag if we are using MSVC compiler + + We can't see this in our scope, because we have not initialized the + distutils build command, so use this deferred calculation to run when + we are building the library. + """ + if build_cmd.compiler.compiler_type == 'msvc': + # explicitly disable whole-program optimization + return ['/GL-'] + return [] + config.add_installed_library('npymath', sources=npymath_sources + [get_mathlib_info], install_dir='lib', build_info={ 'include_dirs' : [], # empty list required for creating npy_math_internal.h - 'extra_compiler_args' : (['/GL-'] if is_msvc else []), + 'extra_compiler_args': [gl_if_msvc], }) config.add_npy_pkg_config("npymath.ini.in", "lib/npy-pkg-config", subst_dict) @@ -727,6 +753,7 @@ def configuration(parent_package='',top_path=None): join('src', 'common', 'npy_import.h'), join('src', 'common', 'npy_hashtable.h'), join('src', 'common', 'npy_longdouble.h'), + join('src', 'common', 'npy_svml.h'), join('src', 'common', 'templ_common.h.src'), join('src', 'common', 'ucsnarrow.h'), join('src', 'common', 'ufunc_override.h'), @@ -791,6 +818,7 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'dragon4.h'), join('src', 'multiarray', 'einsum_debug.h'), join('src', 'multiarray', 'einsum_sumprod.h'), + join('src', 'multiarray', 'experimental_public_dtype_api.h'), join('src', 'multiarray', 'getset.h'), join('src', 'multiarray', 'hashdescr.h'), join('src', 'multiarray', 'iterators.h'), @@ -858,6 +886,7 @@ def configuration(parent_package='',top_path=None): join('src', 'multiarray', 'dtype_transfer.c'), join('src', 'multiarray', 'einsum.c.src'), join('src', 'multiarray', 'einsum_sumprod.c.src'), + join('src', 'multiarray', 'experimental_public_dtype_api.c'), join('src', 'multiarray', 'flagsobject.c'), join('src', 
'multiarray', 'getset.c'), join('src', 'multiarray', 'hashdescr.c'), @@ -888,7 +917,7 @@ def configuration(parent_package='',top_path=None): join('src', 'npysort', 'mergesort.c.src'), join('src', 'npysort', 'timsort.c.src'), join('src', 'npysort', 'heapsort.c.src'), - join('src', 'npysort', 'radixsort.c.src'), + join('src', 'npysort', 'radixsort.cpp'), join('src', 'common', 'npy_partition.h.src'), join('src', 'npysort', 'selection.c.src'), join('src', 'common', 'npy_binsearch.h.src'), @@ -923,11 +952,12 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), + join('src', 'umath', 'loops_umath_fp.dispatch.c.src'), join('src', 'umath', 'loops_exponent_log.dispatch.c.src'), join('src', 'umath', 'matmul.h.src'), join('src', 'umath', 'matmul.c.src'), - join('src', 'umath', 'clip.h.src'), - join('src', 'umath', 'clip.c.src'), + join('src', 'umath', 'clip.h'), + join('src', 'umath', 'clip.cpp'), join('src', 'umath', 'dispatching.c'), join('src', 'umath', 'legacy_array_method.c'), join('src', 'umath', 'ufunc_object.c'), @@ -951,7 +981,15 @@ def configuration(parent_package='',top_path=None): join(codegen_dir, 'generate_ufunc_api.py'), ] + svml_path = join('numpy', 'core', 'src', 'umath', 'svml') + svml_objs = [] + if can_link_svml() and check_svml_submodule(svml_path): + svml_objs = glob.glob(svml_path + '/**/*.s', recursive=True) + config.add_extension('_multiarray_umath', + # Forcing C language even though we have C++ sources. + # It forces the C linker and don't link C++ runtime. 
+ language = 'c', sources=multiarray_src + umath_src + common_src + [generate_config_h, @@ -965,7 +1003,12 @@ def configuration(parent_package='',top_path=None): depends=deps + multiarray_deps + umath_deps + common_deps, libraries=['npymath'], - extra_info=extra_info) + extra_objects=svml_objs, + extra_info=extra_info, + extra_cxx_compile_args=['-std=c++11', + '-D__STDC_VERSION__=0', + '-fno-exceptions', + '-fno-rtti']) ####################################################################### # umath_tests module # diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py index 85c8f16d1..70e8fc897 100644 --- a/numpy/core/setup_common.py +++ b/numpy/core/setup_common.py @@ -43,8 +43,8 @@ C_ABI_VERSION = 0x01000009 # 0x0000000d - 1.19.x # 0x0000000e - 1.20.x # 0x0000000e - 1.21.x -# 0x0000000e - 1.22.x -C_API_VERSION = 0x0000000e +# 0x0000000f - 1.22.x +C_API_VERSION = 0x0000000f class MismatchCAPIWarning(Warning): pass diff --git a/numpy/core/shape_base.pyi b/numpy/core/shape_base.pyi index d7914697d..159ad2781 100644 --- a/numpy/core/shape_base.pyi +++ b/numpy/core/shape_base.pyi @@ -1,12 +1,12 @@ from typing import TypeVar, overload, List, Sequence, Any, SupportsIndex from numpy import generic, dtype -from numpy.typing import ArrayLike, NDArray, _NestedSequence, _SupportsArray +from numpy.typing import ArrayLike, NDArray, _FiniteNestedSequence, _SupportsArray _SCT = TypeVar("_SCT", bound=generic) _ArrayType = TypeVar("_ArrayType", bound=NDArray[Any]) -_ArrayLike = _NestedSequence[_SupportsArray[dtype[_SCT]]] +_ArrayLike = _FiniteNestedSequence[_SupportsArray[dtype[_SCT]]] __all__: List[str] diff --git a/numpy/core/src/_simd/_simd_inc.h.src b/numpy/core/src/_simd/_simd_inc.h.src index 9858fc0dc..fbdf982c2 100644 --- a/numpy/core/src/_simd/_simd_inc.h.src +++ b/numpy/core/src/_simd/_simd_inc.h.src @@ -113,7 +113,7 @@ typedef struct int is_scalar:1; // returns '1' if the type represent a vector int is_vector:1; - // returns the len of multi-vector if the 
type reprsent x2 or x3 vector + // returns the len of multi-vector if the type represent x2 or x3 vector // otherwise returns 0, e.g. returns 2 if data type is simd_data_vu8x2 int is_vectorx; // returns the equivalent scalar data type e.g. simd_data_vu8 -> simd_data_u8 diff --git a/numpy/core/src/common/.doxyfile b/numpy/core/src/common/.doxyfile new file mode 100644 index 000000000..462cbbcfa --- /dev/null +++ b/numpy/core/src/common/.doxyfile @@ -0,0 +1 @@ +INCLUDE_PATH += @CUR_DIR diff --git a/numpy/core/src/common/array_assign.c b/numpy/core/src/common/array_assign.c index c55f6bdb4..b7495fc09 100644 --- a/numpy/core/src/common/array_assign.c +++ b/numpy/core/src/common/array_assign.c @@ -7,12 +7,12 @@ * * See LICENSE.txt for the license. */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include <numpy/ndarraytypes.h> #include "npy_config.h" #include "npy_pycompat.h" diff --git a/numpy/core/src/common/array_assign.h b/numpy/core/src/common/array_assign.h index f5d884dd9..8a28ed1d3 100644 --- a/numpy/core/src/common/array_assign.h +++ b/numpy/core/src/common/array_assign.h @@ -1,5 +1,5 @@ -#ifndef _NPY_PRIVATE__ARRAY_ASSIGN_H_ -#define _NPY_PRIVATE__ARRAY_ASSIGN_H_ +#ifndef NUMPY_CORE_SRC_COMMON_ARRAY_ASSIGN_H_ +#define NUMPY_CORE_SRC_COMMON_ARRAY_ASSIGN_H_ /* * An array assignment function for copying arrays, treating the @@ -115,4 +115,4 @@ NPY_NO_EXPORT int arrays_overlap(PyArrayObject *arr1, PyArrayObject *arr2); -#endif +#endif /* NUMPY_CORE_SRC_COMMON_ARRAY_ASSIGN_H_ */ diff --git a/numpy/core/src/common/binop_override.h b/numpy/core/src/common/binop_override.h index c5e7ab808..61bc05ef3 100644 --- a/numpy/core/src/common/binop_override.h +++ b/numpy/core/src/common/binop_override.h @@ -1,5 +1,5 @@ -#ifndef __BINOP_OVERRIDE_H -#define __BINOP_OVERRIDE_H +#ifndef NUMPY_CORE_SRC_COMMON_BINOP_OVERRIDE_H_ +#define 
NUMPY_CORE_SRC_COMMON_BINOP_OVERRIDE_H_ #include <string.h> #include <Python.h> @@ -212,4 +212,4 @@ binop_should_defer(PyObject *self, PyObject *other, int inplace) } \ } while (0) -#endif +#endif /* NUMPY_CORE_SRC_COMMON_BINOP_OVERRIDE_H_ */ diff --git a/numpy/core/src/common/cblasfuncs.c b/numpy/core/src/common/cblasfuncs.c index e78587de0..714636782 100644 --- a/numpy/core/src/common/cblasfuncs.c +++ b/numpy/core/src/common/cblasfuncs.c @@ -2,17 +2,19 @@ * This module provides a BLAS optimized matrix multiply, * inner product and dot for numpy arrays */ - #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define PY_SSIZE_T_CLEAN #include <Python.h> -#include <assert.h> -#include <numpy/arrayobject.h> + +#include "numpy/arrayobject.h" #include "npy_cblas.h" #include "arraytypes.h" #include "common.h" +#include <assert.h> + static const double oneD[2] = {1.0, 0.0}, zeroD[2] = {0.0, 0.0}; static const float oneF[2] = {1.0, 0.0}, zeroF[2] = {0.0, 0.0}; diff --git a/numpy/core/src/common/cblasfuncs.h b/numpy/core/src/common/cblasfuncs.h index 66ce4ca5b..71c533f36 100644 --- a/numpy/core/src/common/cblasfuncs.h +++ b/numpy/core/src/common/cblasfuncs.h @@ -1,7 +1,7 @@ -#ifndef _NPY_CBLASFUNCS_H_ -#define _NPY_CBLASFUNCS_H_ +#ifndef NUMPY_CORE_SRC_COMMON_CBLASFUNCS_H_ +#define NUMPY_CORE_SRC_COMMON_CBLASFUNCS_H_ NPY_NO_EXPORT PyObject * cblas_matrixproduct(int, PyArrayObject *, PyArrayObject *, PyArrayObject *); -#endif +#endif /* NUMPY_CORE_SRC_COMMON_CBLASFUNCS_H_ */ diff --git a/numpy/core/src/common/get_attr_string.h b/numpy/core/src/common/get_attr_string.h index 8b7cf1c5b..3b23b2e66 100644 --- a/numpy/core/src/common/get_attr_string.h +++ b/numpy/core/src/common/get_attr_string.h @@ -1,5 +1,5 @@ -#ifndef __GET_ATTR_STRING_H -#define __GET_ATTR_STRING_H +#ifndef NUMPY_CORE_SRC_COMMON_GET_ATTR_STRING_H_ +#define NUMPY_CORE_SRC_COMMON_GET_ATTR_STRING_H_ static NPY_INLINE npy_bool _is_basic_python_type(PyTypeObject *tp) @@ -113,4 +113,4 @@ 
PyArray_LookupSpecial_OnInstance(PyObject *obj, char const *name) return maybe_get_attr(obj, name); } -#endif +#endif /* NUMPY_CORE_SRC_COMMON_GET_ATTR_STRING_H_ */ diff --git a/numpy/core/src/common/lowlevel_strided_loops.h b/numpy/core/src/common/lowlevel_strided_loops.h index 3df054b40..ad86c0489 100644 --- a/numpy/core/src/common/lowlevel_strided_loops.h +++ b/numpy/core/src/common/lowlevel_strided_loops.h @@ -1,8 +1,8 @@ -#ifndef __LOWLEVEL_STRIDED_LOOPS_H -#define __LOWLEVEL_STRIDED_LOOPS_H +#ifndef NUMPY_CORE_SRC_COMMON_LOWLEVEL_STRIDED_LOOPS_H_ +#define NUMPY_CORE_SRC_COMMON_LOWLEVEL_STRIDED_LOOPS_H_ #include "common.h" -#include <npy_config.h> -#include <array_method.h> +#include "npy_config.h" +#include "array_method.h" #include "dtype_transfer.h" #include "mem_overlap.h" @@ -770,4 +770,4 @@ PyArray_EQUIVALENTLY_ITERABLE_OVERLAP_OK(PyArrayObject *arr1, PyArrayObject *arr stride2 = PyArray_TRIVIAL_PAIR_ITERATION_STRIDE(size2, arr2); \ } -#endif +#endif /* NUMPY_CORE_SRC_COMMON_LOWLEVEL_STRIDED_LOOPS_H_ */ diff --git a/numpy/core/src/common/mem_overlap.c b/numpy/core/src/common/mem_overlap.c index 9da33bfc1..2632e1413 100644 --- a/numpy/core/src/common/mem_overlap.c +++ b/numpy/core/src/common/mem_overlap.c @@ -181,9 +181,11 @@ All rights reserved. Licensed under 3-clause BSD license, see LICENSE.txt. 
*/ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#define PY_SSIZE_T_CLEAN #include <Python.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION #include "numpy/ndarraytypes.h" #include "mem_overlap.h" #include "npy_extint128.h" diff --git a/numpy/core/src/common/mem_overlap.h b/numpy/core/src/common/mem_overlap.h index 8044f1663..3aa4f798b 100644 --- a/numpy/core/src/common/mem_overlap.h +++ b/numpy/core/src/common/mem_overlap.h @@ -1,5 +1,5 @@ -#ifndef MEM_OVERLAP_H_ -#define MEM_OVERLAP_H_ +#ifndef NUMPY_CORE_SRC_COMMON_MEM_OVERLAP_H_ +#define NUMPY_CORE_SRC_COMMON_MEM_OVERLAP_H_ #include "npy_config.h" #include "numpy/ndarraytypes.h" @@ -46,5 +46,4 @@ offset_bounds_from_strides(const int itemsize, const int nd, const npy_intp *dims, const npy_intp *strides, npy_intp *lower_offset, npy_intp *upper_offset); -#endif - +#endif /* NUMPY_CORE_SRC_COMMON_MEM_OVERLAP_H_ */ diff --git a/numpy/core/src/common/npy_argparse.c b/numpy/core/src/common/npy_argparse.c index 8460a38e6..76123c1ed 100644 --- a/numpy/core/src/common/npy_argparse.c +++ b/numpy/core/src/common/npy_argparse.c @@ -1,8 +1,9 @@ -#include "Python.h" - #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define PY_SSIZE_T_CLEAN +#include <Python.h> + #include "numpy/ndarraytypes.h" #include "npy_argparse.h" #include "npy_pycompat.h" diff --git a/numpy/core/src/common/npy_argparse.h b/numpy/core/src/common/npy_argparse.h index 5da535c91..f4122103d 100644 --- a/numpy/core/src/common/npy_argparse.h +++ b/numpy/core/src/common/npy_argparse.h @@ -1,7 +1,7 @@ -#ifndef _NPY_ARGPARSE_H -#define _NPY_ARGPARSE_H +#ifndef NUMPY_CORE_SRC_COMMON_NPY_ARGPARSE_H +#define NUMPY_CORE_SRC_COMMON_NPY_ARGPARSE_H -#include "Python.h" +#include <Python.h> #include "numpy/ndarraytypes.h" /* @@ -93,4 +93,4 @@ _npy_parse_arguments(const char *funcname, _npy_parse_arguments(funcname, &__argparse_cache, \ args, len_args, kwnames, __VA_ARGS__) -#endif /* _NPY_ARGPARSE_H */ +#endif /* 
NUMPY_CORE_SRC_COMMON_NPY_ARGPARSE_H */ diff --git a/numpy/core/src/common/npy_cblas.h b/numpy/core/src/common/npy_cblas.h index 072993ec2..30fec1a65 100644 --- a/numpy/core/src/common/npy_cblas.h +++ b/numpy/core/src/common/npy_cblas.h @@ -3,8 +3,8 @@ * because not all providers of cblas provide cblas.h. For instance, MKL provides * mkl_cblas.h and also typedefs the CBLAS_XXX enums. */ -#ifndef _NPY_CBLAS_H_ -#define _NPY_CBLAS_H_ +#ifndef NUMPY_CORE_SRC_COMMON_NPY_CBLAS_H_ +#define NUMPY_CORE_SRC_COMMON_NPY_CBLAS_H_ #include <stddef.h> @@ -98,4 +98,4 @@ blas_stride(npy_intp stride, unsigned itemsize) } #endif -#endif +#endif /* NUMPY_CORE_SRC_COMMON_NPY_CBLAS_H_ */ diff --git a/numpy/core/src/common/npy_cblas_base.h b/numpy/core/src/common/npy_cblas_base.h index 792b6f09e..12dfb2e78 100644 --- a/numpy/core/src/common/npy_cblas_base.h +++ b/numpy/core/src/common/npy_cblas_base.h @@ -9,6 +9,9 @@ * Prototypes for level 1 BLAS functions (complex are recast as routines) * =========================================================================== */ +#ifndef NUMPY_CORE_SRC_COMMON_NPY_CBLAS_BASE_H_ +#define NUMPY_CORE_SRC_COMMON_NPY_CBLAS_BASE_H_ + float BLASNAME(cblas_sdsdot)(const BLASINT N, const float alpha, const float *X, const BLASINT incX, const float *Y, const BLASINT incY); double BLASNAME(cblas_dsdot)(const BLASINT N, const float *X, const BLASINT incX, const float *Y, @@ -555,3 +558,5 @@ void BLASNAME(cblas_zher2k)(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO void *C, const BLASINT ldc); void BLASNAME(cblas_xerbla)(BLASINT p, const char *rout, const char *form, ...); + +#endif /* NUMPY_CORE_SRC_COMMON_NPY_CBLAS_BASE_H_ */ diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h index c6de0cd30..fd0f1855c 100644 --- a/numpy/core/src/common/npy_config.h +++ b/numpy/core/src/common/npy_config.h @@ -1,5 +1,5 @@ -#ifndef _NPY_NPY_CONFIG_H_ -#define _NPY_NPY_CONFIG_H_ +#ifndef NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_ +#define 
NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_ #include "config.h" #include "npy_cpu_features.h" @@ -167,9 +167,9 @@ #undef HAVE_CACOSHF #undef HAVE_CACOSHL -#endif /* __GLIBC_PREREQ(2, 18) */ -#endif /* defined(__GLIBC_PREREQ) */ +#endif /* __GLIBC_PREREQ(2, 18) */ +#endif /* defined(__GLIBC_PREREQ) */ -#endif /* defined(HAVE_FEATURES_H) */ +#endif /* defined(HAVE_FEATURES_H) */ -#endif +#endif /* NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_ */ diff --git a/numpy/core/src/common/npy_cpu_dispatch.h b/numpy/core/src/common/npy_cpu_dispatch.h index 09e00badf..e814cd425 100644 --- a/numpy/core/src/common/npy_cpu_dispatch.h +++ b/numpy/core/src/common/npy_cpu_dispatch.h @@ -1,5 +1,5 @@ -#ifndef NPY_CPU_DISPATCH_H_ -#define NPY_CPU_DISPATCH_H_ +#ifndef NUMPY_CORE_SRC_COMMON_NPY_CPU_DISPATCH_H_ +#define NUMPY_CORE_SRC_COMMON_NPY_CPU_DISPATCH_H_ /** * This file is part of the NumPy CPU dispatcher. Please have a look at doc/reference/simd-optimizations.html * To get a better understanding of the mechanism behind it. @@ -196,7 +196,7 @@ * Example: * Assume we have a dispatch-able source exporting the following function: * - * @targets baseline avx2 avx512_skx // configration statements + * @targets baseline avx2 avx512_skx // configuration statements * * void NPY_CPU_DISPATCH_CURFX(dispatch_me)(const int *src, int *dst) * { @@ -262,4 +262,4 @@ #define NPY_CPU_DISPATCH_CALL_ALL_BASE_CB_(LEFT, ...) 
\ ( LEFT __VA_ARGS__ ) -#endif // NPY_CPU_DISPATCH_H_ +#endif // NUMPY_CORE_SRC_COMMON_NPY_CPU_DISPATCH_H_ diff --git a/numpy/core/src/common/npy_cpu_features.c.src b/numpy/core/src/common/npy_cpu_features.c.src index 1e0f4a571..a2383c45f 100644 --- a/numpy/core/src/common/npy_cpu_features.c.src +++ b/numpy/core/src/common/npy_cpu_features.c.src @@ -230,7 +230,7 @@ npy__cpu_try_disable_env(void) notsupp_cur[flen] = ' '; notsupp_cur += flen + 1; goto next; } - // Finaly we can disable it + // Finally we can disable it npy__cpu_have[feature_id] = 0; next: feature = strtok(NULL, delim); diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h index 28dd00032..ce1fc822a 100644 --- a/numpy/core/src/common/npy_cpu_features.h +++ b/numpy/core/src/common/npy_cpu_features.h @@ -1,5 +1,5 @@ -#ifndef _NPY_CPU_FEATURES_H_ -#define _NPY_CPU_FEATURES_H_ +#ifndef NUMPY_CORE_SRC_COMMON_NPY_CPU_FEATURES_H_ +#define NUMPY_CORE_SRC_COMMON_NPY_CPU_FEATURES_H_ #include <Python.h> // for PyObject #include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN @@ -168,4 +168,4 @@ npy_cpu_dispatch_list(void); } #endif -#endif // _NPY_CPU_FEATURES_H_ +#endif // NUMPY_CORE_SRC_COMMON_NPY_CPU_FEATURES_H_ diff --git a/numpy/core/src/common/npy_cpuinfo_parser.h b/numpy/core/src/common/npy_cpuinfo_parser.h index f4540f6ab..364873a23 100644 --- a/numpy/core/src/common/npy_cpuinfo_parser.h +++ b/numpy/core/src/common/npy_cpuinfo_parser.h @@ -25,8 +25,8 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ -#ifndef __NPY_CPUINFO_PARSER_H__ -#define __NPY_CPUINFO_PARSER_H__ +#ifndef NUMPY_CORE_SRC_COMMON_NPY_CPUINFO_PARSER_H_ +#define NUMPY_CORE_SRC_COMMON_NPY_CPUINFO_PARSER_H_ #include <errno.h> #include <stdio.h> #include <fcntl.h> @@ -123,7 +123,7 @@ read_file(const char* pathname, char* buffer, size_t buffsize) } /* - * Extract the content of a the first occurence of a given field in + * Extract the content of a the first occurrence of a given field in * the content of /proc/cpuinfo and return it as a heap-allocated * string that must be freed by the caller. * @@ -138,7 +138,7 @@ extract_cpuinfo_field(const char* buffer, int buflen, const char* field) int len; const char *p, *q; - /* Look for first field occurence, and ensures it starts the line. */ + /* Look for first field occurrence, and ensures it starts the line. */ p = buffer; for (;;) { p = memmem(p, bufend-p, field, fieldlen); @@ -259,4 +259,4 @@ get_feature_from_proc_cpuinfo(unsigned long *hwcap, unsigned long *hwcap2) { *hwcap2 |= has_list_item(cpuFeatures, "crc32") ? 
NPY__HWCAP2_CRC32 : 0; return 1; } -#endif +#endif /* NUMPY_CORE_SRC_COMMON_NPY_CPUINFO_PARSER_H_ */ diff --git a/numpy/core/src/common/npy_ctypes.h b/numpy/core/src/common/npy_ctypes.h index c0cc4f1a1..05761cad3 100644 --- a/numpy/core/src/common/npy_ctypes.h +++ b/numpy/core/src/common/npy_ctypes.h @@ -1,5 +1,5 @@ -#ifndef NPY_CTYPES_H -#define NPY_CTYPES_H +#ifndef NUMPY_CORE_SRC_COMMON_NPY_CTYPES_H_ +#define NUMPY_CORE_SRC_COMMON_NPY_CTYPES_H_ #include <Python.h> @@ -47,4 +47,4 @@ fail: return 0; } -#endif +#endif /* NUMPY_CORE_SRC_COMMON_NPY_CTYPES_H_ */ diff --git a/numpy/core/src/common/npy_extint128.h b/numpy/core/src/common/npy_extint128.h index a887ff317..d563c2ac8 100644 --- a/numpy/core/src/common/npy_extint128.h +++ b/numpy/core/src/common/npy_extint128.h @@ -1,5 +1,5 @@ -#ifndef NPY_EXTINT128_H_ -#define NPY_EXTINT128_H_ +#ifndef NUMPY_CORE_SRC_COMMON_NPY_EXTINT128_H_ +#define NUMPY_CORE_SRC_COMMON_NPY_EXTINT128_H_ typedef struct { @@ -314,4 +314,4 @@ ceildiv_128_64(npy_extint128_t a, npy_int64 b) return result; } -#endif +#endif /* NUMPY_CORE_SRC_COMMON_NPY_EXTINT128_H_ */ diff --git a/numpy/core/src/common/npy_fpmath.h b/numpy/core/src/common/npy_fpmath.h index dbb3fb23d..27e9ea3f4 100644 --- a/numpy/core/src/common/npy_fpmath.h +++ b/numpy/core/src/common/npy_fpmath.h @@ -1,5 +1,5 @@ -#ifndef _NPY_NPY_FPMATH_H_ -#define _NPY_NPY_FPMATH_H_ +#ifndef NUMPY_CORE_SRC_COMMON_NPY_NPY_FPMATH_H_ +#define NUMPY_CORE_SRC_COMMON_NPY_NPY_FPMATH_H_ #include "npy_config.h" @@ -27,4 +27,4 @@ #define HAVE_LDOUBLE_DOUBLE_DOUBLE_BE #endif -#endif +#endif /* NUMPY_CORE_SRC_COMMON_NPY_NPY_FPMATH_H_ */ diff --git a/numpy/core/src/common/npy_hashtable.h b/numpy/core/src/common/npy_hashtable.h index 5f11d2c1d..a0bf81967 100644 --- a/numpy/core/src/common/npy_hashtable.h +++ b/numpy/core/src/common/npy_hashtable.h @@ -1,5 +1,5 @@ -#ifndef _NPY_NPY_HASHTABLE_H -#define _NPY_NPY_HASHTABLE_H +#ifndef NUMPY_CORE_SRC_COMMON_NPY_NPY_HASHTABLE_H_ +#define 
NUMPY_CORE_SRC_COMMON_NPY_NPY_HASHTABLE_H_ #include <Python.h> @@ -29,4 +29,4 @@ PyArrayIdentityHash_New(int key_len); NPY_NO_EXPORT void PyArrayIdentityHash_Dealloc(PyArrayIdentityHash *tb); -#endif /* _NPY_NPY_HASHTABLE_H */ +#endif /* NUMPY_CORE_SRC_COMMON_NPY_NPY_HASHTABLE_H_ */ diff --git a/numpy/core/src/common/npy_import.h b/numpy/core/src/common/npy_import.h index f485514d1..f36b6924a 100644 --- a/numpy/core/src/common/npy_import.h +++ b/numpy/core/src/common/npy_import.h @@ -1,5 +1,5 @@ -#ifndef NPY_IMPORT_H -#define NPY_IMPORT_H +#ifndef NUMPY_CORE_SRC_COMMON_NPY_IMPORT_H_ +#define NUMPY_CORE_SRC_COMMON_NPY_IMPORT_H_ #include <Python.h> @@ -29,4 +29,4 @@ npy_cache_import(const char *module, const char *attr, PyObject **cache) } } -#endif +#endif /* NUMPY_CORE_SRC_COMMON_NPY_IMPORT_H_ */ diff --git a/numpy/core/src/common/npy_longdouble.c b/numpy/core/src/common/npy_longdouble.c index 260e02a64..38dfd325c 100644 --- a/numpy/core/src/common/npy_longdouble.c +++ b/numpy/core/src/common/npy_longdouble.c @@ -1,8 +1,9 @@ -#include <Python.h> - #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define PY_SSIZE_T_CLEAN +#include <Python.h> + #include "numpy/ndarraytypes.h" #include "numpy/npy_math.h" #include "npy_pycompat.h" diff --git a/numpy/core/src/common/npy_longdouble.h b/numpy/core/src/common/npy_longdouble.h index 01db06de7..cf8b37bc9 100644 --- a/numpy/core/src/common/npy_longdouble.h +++ b/numpy/core/src/common/npy_longdouble.h @@ -1,5 +1,5 @@ -#ifndef __NPY_LONGDOUBLE_H -#define __NPY_LONGDOUBLE_H +#ifndef NUMPY_CORE_SRC_COMMON_NPY_LONGDOUBLE_H_ +#define NUMPY_CORE_SRC_COMMON_NPY_LONGDOUBLE_H_ #include "npy_config.h" #include "numpy/ndarraytypes.h" @@ -24,4 +24,4 @@ npy_longdouble_to_PyLong(npy_longdouble ldval); NPY_VISIBILITY_HIDDEN npy_longdouble npy_longdouble_from_PyLong(PyObject *long_obj); -#endif +#endif /* NUMPY_CORE_SRC_COMMON_NPY_LONGDOUBLE_H_ */ diff --git a/numpy/core/src/common/npy_pycompat.h 
b/numpy/core/src/common/npy_pycompat.h index 9e94a9710..6641cd591 100644 --- a/numpy/core/src/common/npy_pycompat.h +++ b/numpy/core/src/common/npy_pycompat.h @@ -1,5 +1,5 @@ -#ifndef _NPY_PYCOMPAT_H_ -#define _NPY_PYCOMPAT_H_ +#ifndef NUMPY_CORE_SRC_COMMON_NPY_PYCOMPAT_H_ +#define NUMPY_CORE_SRC_COMMON_NPY_PYCOMPAT_H_ #include "numpy/npy_3kcompat.h" @@ -19,4 +19,4 @@ Npy_HashDouble(PyObject *NPY_UNUSED(identity), double val) #endif -#endif /* _NPY_COMPAT_H_ */ +#endif /* NUMPY_CORE_SRC_COMMON_NPY_PYCOMPAT_H_ */ diff --git a/numpy/core/src/common/npy_sort.h.src b/numpy/core/src/common/npy_sort.h.src index ddbde0c9b..b4a1e9b0c 100644 --- a/numpy/core/src/common/npy_sort.h.src +++ b/numpy/core/src/common/npy_sort.h.src @@ -49,9 +49,14 @@ NPY_NO_EXPORT int atimsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void * * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong, * longlong, ulonglong# */ - +#ifdef __cplusplus +extern "C" { +#endif NPY_NO_EXPORT int radixsort_@suff@(void *vec, npy_intp cnt, void *null); NPY_NO_EXPORT int aradixsort_@suff@(void *vec, npy_intp *ind, npy_intp cnt, void *null); +#ifdef __cplusplus +} +#endif /**end repeat**/ diff --git a/numpy/core/src/common/npy_svml.h b/numpy/core/src/common/npy_svml.h new file mode 100644 index 000000000..4292f7090 --- /dev/null +++ b/numpy/core/src/common/npy_svml.h @@ -0,0 +1,41 @@ +#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) +extern __m512 __svml_exp2f16(__m512 x); +extern __m512 __svml_log2f16(__m512 x); +extern __m512 __svml_log10f16(__m512 x); +extern __m512 __svml_expm1f16(__m512 x); +extern __m512 __svml_log1pf16(__m512 x); +extern __m512 __svml_cbrtf16(__m512 x); +extern __m512 __svml_sinf16(__m512 x); +extern __m512 __svml_cosf16(__m512 x); +extern __m512 __svml_tanf16(__m512 x); +extern __m512 __svml_asinf16(__m512 x); +extern __m512 __svml_acosf16(__m512 x); +extern __m512 __svml_atanf16(__m512 x); +extern __m512 __svml_atan2f16(__m512 x); +extern __m512 
__svml_sinhf16(__m512 x); +extern __m512 __svml_coshf16(__m512 x); +extern __m512 __svml_tanhf16(__m512 x); +extern __m512 __svml_asinhf16(__m512 x); +extern __m512 __svml_acoshf16(__m512 x); +extern __m512 __svml_atanhf16(__m512 x); + +extern __m512d __svml_exp28(__m512d x); +extern __m512d __svml_log28(__m512d x); +extern __m512d __svml_log108(__m512d x); +extern __m512d __svml_expm18(__m512d x); +extern __m512d __svml_log1p8(__m512d x); +extern __m512d __svml_cbrt8(__m512d x); +extern __m512d __svml_sin8(__m512d x); +extern __m512d __svml_cos8(__m512d x); +extern __m512d __svml_tan8(__m512d x); +extern __m512d __svml_asin8(__m512d x); +extern __m512d __svml_acos8(__m512d x); +extern __m512d __svml_atan8(__m512d x); +extern __m512d __svml_atan28(__m512d x); +extern __m512d __svml_sinh8(__m512d x); +extern __m512d __svml_cosh8(__m512d x); +extern __m512d __svml_tanh8(__m512d x); +extern __m512d __svml_asinh8(__m512d x); +extern __m512d __svml_acosh8(__m512d x); +extern __m512d __svml_atanh8(__m512d x); +#endif diff --git a/numpy/core/src/common/numpy_tag.h b/numpy/core/src/common/numpy_tag.h new file mode 100644 index 000000000..dc8d5286b --- /dev/null +++ b/numpy/core/src/common/numpy_tag.h @@ -0,0 +1,78 @@ +#ifndef _NPY_COMMON_TAG_H_ +#define _NPY_COMMON_TAG_H_ + +namespace npy { + +struct integral_tag { +}; +struct floating_point_tag { +}; +struct complex_tag { +}; +struct date_tag { +}; + +struct bool_tag : integral_tag { + using type = npy_bool; +}; +struct byte_tag : integral_tag { + using type = npy_byte; +}; +struct ubyte_tag : integral_tag { + using type = npy_ubyte; +}; +struct short_tag : integral_tag { + using type = npy_short; +}; +struct ushort_tag : integral_tag { + using type = npy_ushort; +}; +struct int_tag : integral_tag { + using type = npy_int; +}; +struct uint_tag : integral_tag { + using type = npy_uint; +}; +struct long_tag : integral_tag { + using type = npy_long; +}; +struct ulong_tag : integral_tag { + using type = npy_ulong; +}; +struct 
longlong_tag : integral_tag { + using type = npy_longlong; +}; +struct ulonglong_tag : integral_tag { + using type = npy_ulonglong; +}; +struct half_tag { + using type = npy_half; +}; +struct float_tag : floating_point_tag { + using type = npy_float; +}; +struct double_tag : floating_point_tag { + using type = npy_double; +}; +struct longdouble_tag : floating_point_tag { + using type = npy_longdouble; +}; +struct cfloat_tag : complex_tag { + using type = npy_cfloat; +}; +struct cdouble_tag : complex_tag { + using type = npy_cdouble; +}; +struct clongdouble_tag : complex_tag { + using type = npy_clongdouble; +}; +struct datetime_tag : date_tag { + using type = npy_datetime; +}; +struct timedelta_tag : date_tag { + using type = npy_timedelta; +}; + +} // namespace npy + +#endif diff --git a/numpy/core/src/common/numpyos.c b/numpy/core/src/common/numpyos.c index 42a71777b..4551a06a2 100644 --- a/numpy/core/src/common/numpyos.c +++ b/numpy/core/src/common/numpyos.c @@ -1,11 +1,9 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include <locale.h> -#include <stdio.h> - -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/npy_math.h" @@ -13,14 +11,13 @@ #include "npy_pycompat.h" +#include <locale.h> +#include <stdio.h> + #ifdef HAVE_STRTOLD_L #include <stdlib.h> #ifdef HAVE_XLOCALE_H - /* - * the defines from xlocale.h are included in locale.h on some systems; - * see gh-8367 - */ - #include <xlocale.h> +#include <xlocale.h> // xlocale was removed in glibc 2.26, see gh-8367 #endif #endif diff --git a/numpy/core/src/common/numpyos.h b/numpy/core/src/common/numpyos.h index 4deed8400..ce49cbea7 100644 --- a/numpy/core/src/common/numpyos.h +++ b/numpy/core/src/common/numpyos.h @@ -1,5 +1,5 @@ -#ifndef _NPY_NUMPYOS_H_ -#define _NPY_NUMPYOS_H_ +#ifndef NUMPY_CORE_SRC_COMMON_NPY_NUMPYOS_H_ +#define NUMPY_CORE_SRC_COMMON_NPY_NUMPYOS_H_ 
NPY_NO_EXPORT char* NumPyOS_ascii_formatd(char *buffer, size_t buf_size, @@ -38,4 +38,5 @@ NumPyOS_strtoll(const char *str, char **endptr, int base); /* Convert a string to an int in an arbitrary base */ NPY_NO_EXPORT npy_ulonglong NumPyOS_strtoull(const char *str, char **endptr, int base); -#endif + +#endif /* NUMPY_CORE_SRC_COMMON_NPY_NUMPYOS_H_ */ diff --git a/numpy/core/src/common/python_xerbla.c b/numpy/core/src/common/python_xerbla.c index fe2f718b2..37a41408b 100644 --- a/numpy/core/src/common/python_xerbla.c +++ b/numpy/core/src/common/python_xerbla.c @@ -1,4 +1,6 @@ -#include "Python.h" +#define PY_SSIZE_T_CLEAN +#include <Python.h> + #include "numpy/npy_common.h" #include "npy_cblas.h" diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h index e27bf15fe..5891a270a 100644 --- a/numpy/core/src/common/simd/avx2/memory.h +++ b/numpy/core/src/common/simd/avx2/memory.h @@ -87,7 +87,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) #if 0 // slower NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) { - const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride); + const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride); return _mm256_i64gather_epi64((const void*)ptr, idx, 8); } NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) @@ -170,9 +170,9 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) { assert(nlane > 0); - const __m256i vfill = _mm256_set1_epi64x(fill); - const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3); - __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane); + const __m256i vfill = npyv_setall_s64(fill); + const __m256i steps = npyv_set_s64(0, 1, 2, 3); + __m256i vnlane = npyv_setall_s64(nlane > 4 ? 
4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask); return _mm256_blendv_epi8(vfill, payload, mask); @@ -181,8 +181,8 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { assert(nlane > 0); - const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3); - __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane); + const __m256i steps = npyv_set_s64(0, 1, 2, 3); + __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); return _mm256_maskload_epi64((const void*)ptr, mask); } @@ -211,10 +211,10 @@ NPY_FINLINE npyv_s64 npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill) { assert(nlane > 0); - const __m256i vfill = _mm256_set1_epi64x(fill); - const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride); - const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3); - __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane); + const __m256i vfill = npyv_setall_s64(fill); + const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride); + const __m256i steps = npyv_set_s64(0, 1, 2, 3); + __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8); } @@ -238,8 +238,8 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) { assert(nlane > 0); - const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3); - __m256i vnlane = _mm256_set1_epi64x(nlane > 8 ? 8 : (int)nlane); + const __m256i steps = npyv_set_s64(0, 1, 2, 3); + __m256i vnlane = npyv_setall_s64(nlane > 8 ? 
8 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); _mm256_maskstore_epi64((void*)ptr, mask, a); } diff --git a/numpy/core/src/common/simd/avx2/misc.h b/numpy/core/src/common/simd/avx2/misc.h index e96696dc9..5e91e91b3 100644 --- a/numpy/core/src/common/simd/avx2/misc.h +++ b/numpy/core/src/common/simd/avx2/misc.h @@ -24,11 +24,27 @@ #define npyv_setall_s16(VAL) _mm256_set1_epi16((short)VAL) #define npyv_setall_u32(VAL) _mm256_set1_epi32((int)VAL) #define npyv_setall_s32(VAL) _mm256_set1_epi32(VAL) -#define npyv_setall_u64(VAL) _mm256_set1_epi64x(VAL) -#define npyv_setall_s64(VAL) _mm256_set1_epi64x(VAL) #define npyv_setall_f32(VAL) _mm256_set1_ps(VAL) #define npyv_setall_f64(VAL) _mm256_set1_pd(VAL) +NPY_FINLINE __m256i npyv__setr_epi64(npy_int64, npy_int64, npy_int64, npy_int64); +NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a) +{ + npy_int64 ai = (npy_int64)a; +#if defined(_MSC_VER) && defined(_M_IX86) + return npyv__setr_epi64(ai, ai, ai, ai); +#else + return _mm256_set1_epi64x(ai); +#endif +} +NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a) +{ +#if defined(_MSC_VER) && defined(_M_IX86) + return npyv__setr_epi64(a, a, a, a); +#else + return _mm256_set1_epi64x(a); +#endif +} /* * vector with specific values set to each lane and * set a specific value to all remained lanes @@ -59,7 +75,14 @@ NPY_FINLINE __m256i npyv__setr_epi32(int i0, int i1, int i2, int i3, int i4, int } NPY_FINLINE __m256i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3) { +#if defined(_MSC_VER) && defined(_M_IX86) + return _mm256_setr_epi32( + (int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32), + (int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32) + ); +#else return _mm256_setr_epi64x(i0, i1, i2, i3); +#endif } NPY_FINLINE __m256 npyv__setr_ps(float i0, float i1, float i2, float i3, float i4, float i5, diff --git a/numpy/core/src/common/simd/avx512/math.h b/numpy/core/src/common/simd/avx512/math.h index 0141396d0..0949b2b06 100644 --- 
a/numpy/core/src/common/simd/avx512/math.h +++ b/numpy/core/src/common/simd/avx512/math.h @@ -35,7 +35,7 @@ NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a) return _mm512_range_pd(a, a, 8); #else return npyv_and_f64( - a, _mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffffLL)) + a, _mm512_castsi512_pd(npyv_setall_s64(0x7fffffffffffffffLL)) ); #endif } diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h index bffd6e907..47095bf72 100644 --- a/numpy/core/src/common/simd/avx512/memory.h +++ b/numpy/core/src/common/simd/avx512/memory.h @@ -110,7 +110,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) //// 64 NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) { - const __m512i idx = _mm512_setr_epi64( + const __m512i idx = npyv_set_s64( 0*stride, 1*stride, 2*stride, 3*stride, 4*stride, 5*stride, 6*stride, 7*stride ); @@ -140,7 +140,7 @@ NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a) //// 64 NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) { - const __m512i idx = _mm512_setr_epi64( + const __m512i idx = npyv_set_s64( 0*stride, 1*stride, 2*stride, 3*stride, 4*stride, 5*stride, 6*stride, 7*stride ); @@ -173,7 +173,7 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) { assert(nlane > 0); - const __m512i vfill = _mm512_set1_epi64(fill); + const __m512i vfill = npyv_setall_s64(fill); const __mmask8 mask = nlane > 31 ? 
-1 : (1 << nlane) - 1; return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); } @@ -210,11 +210,11 @@ NPY_FINLINE npyv_s64 npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill) { assert(nlane > 0); - const __m512i idx = _mm512_setr_epi64( + const __m512i idx = npyv_set_s64( 0*stride, 1*stride, 2*stride, 3*stride, 4*stride, 5*stride, 6*stride, 7*stride ); - const __m512i vfill = _mm512_set1_epi64(fill); + const __m512i vfill = npyv_setall_s64(fill); const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1; return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8); } @@ -258,7 +258,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) { assert(nlane > 0); - const __m512i idx = _mm512_setr_epi64( + const __m512i idx = npyv_set_s64( 0*stride, 1*stride, 2*stride, 3*stride, 4*stride, 5*stride, 6*stride, 7*stride ); diff --git a/numpy/core/src/common/simd/avx512/misc.h b/numpy/core/src/common/simd/avx512/misc.h index 4b6729b05..c3039ecfe 100644 --- a/numpy/core/src/common/simd/avx512/misc.h +++ b/numpy/core/src/common/simd/avx512/misc.h @@ -24,11 +24,30 @@ #define npyv_setall_s16(VAL) _mm512_set1_epi16((short)VAL) #define npyv_setall_u32(VAL) _mm512_set1_epi32((int)VAL) #define npyv_setall_s32(VAL) _mm512_set1_epi32(VAL) -#define npyv_setall_u64(VAL) _mm512_set1_epi64(VAL) -#define npyv_setall_s64(VAL) _mm512_set1_epi64(VAL) #define npyv_setall_f32(VAL) _mm512_set1_ps(VAL) #define npyv_setall_f64(VAL) _mm512_set1_pd(VAL) +NPY_FINLINE __m512i npyv__setr_epi64( + npy_int64, npy_int64, npy_int64, npy_int64, + npy_int64, npy_int64, npy_int64, npy_int64 +); +NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a) +{ + npy_int64 ai = (npy_int64)a; +#if defined(_MSC_VER) && defined(_M_IX86) + return npyv__setr_epi64(ai, ai, ai, ai, ai, ai, ai, ai); +#else + return _mm512_set1_epi64(ai); 
+#endif +} +NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a) +{ +#if defined(_MSC_VER) && defined(_M_IX86) + return npyv__setr_epi64(a, a, a, a, a, a, a, a); +#else + return _mm512_set1_epi64(a); +#endif +} /** * vector with specific values set to each lane and * set a specific value to all remained lanes @@ -76,7 +95,16 @@ NPY_FINLINE __m512i npyv__setr_epi32( NPY_FINLINE __m512i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3, npy_int64 i4, npy_int64 i5, npy_int64 i6, npy_int64 i7) { +#if defined(_MSC_VER) && defined(_M_IX86) + return _mm512_setr_epi32( + (int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32), + (int)i2, (int)(i2 >> 32), (int)i3, (int)(i3 >> 32), + (int)i4, (int)(i4 >> 32), (int)i5, (int)(i5 >> 32), + (int)i6, (int)(i6 >> 32), (int)i7, (int)(i7 >> 32) + ); +#else return _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7); +#endif } NPY_FINLINE __m512 npyv__setr_ps( diff --git a/numpy/core/src/common/simd/emulate_maskop.h b/numpy/core/src/common/simd/emulate_maskop.h index 7e7446bc5..41e397c2d 100644 --- a/numpy/core/src/common/simd/emulate_maskop.h +++ b/numpy/core/src/common/simd/emulate_maskop.h @@ -1,5 +1,5 @@ /** - * This header is used internaly by all current supported SIMD extention, + * This header is used internally by all current supported SIMD extensions, * execpt for AVX512. 
*/ #ifndef NPY_SIMD diff --git a/numpy/core/src/common/simd/intdiv.h b/numpy/core/src/common/simd/intdiv.h index f6ea9abf2..a7a461721 100644 --- a/numpy/core/src/common/simd/intdiv.h +++ b/numpy/core/src/common/simd/intdiv.h @@ -39,7 +39,7 @@ * for (; len >= vstep; src += vstep, dst += vstep, len -= vstep) { * npyv_s32 a = npyv_load_s32(*src); // load s32 vector from memory * a = npyv_divc_s32(a, divisor); // divide all elements by x - * npyv_store_s32(dst, a); // store s32 vector into memroy + * npyv_store_s32(dst, a); // store s32 vector into memory * } * ** NOTES: @@ -162,11 +162,12 @@ NPY_FINLINE npy_uint64 npyv__divh128_u64(npy_uint64 high, npy_uint64 divisor) npy_uint32 divisor_hi = divisor >> 32; npy_uint32 divisor_lo = divisor & 0xFFFFFFFF; // compute high quotient digit - npy_uint32 quotient_hi = (npy_uint32)(high / divisor_hi); + npy_uint64 quotient_hi = high / divisor_hi; npy_uint64 remainder = high - divisor_hi * quotient_hi; npy_uint64 base32 = 1ULL << 32; while (quotient_hi >= base32 || quotient_hi*divisor_lo > base32*remainder) { - remainder += --divisor_hi; + --quotient_hi; + remainder += divisor_hi; if (remainder >= base32) { break; } @@ -200,7 +201,7 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d) default: l = npyv__bitscan_revnz_u32(d - 1) + 1; // ceil(log2(d)) l2 = (npy_uint8)(1 << l); // 2^l, overflow to 0 if l = 8 - m = ((l2 - d) << 8) / d + 1; // multiplier + m = ((npy_uint16)((l2 - d) << 8)) / d + 1; // multiplier sh1 = 1; sh2 = l - 1; // shift counts } npyv_u8x3 divisor; diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index ced82d1de..19ea6f22f 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -31,7 +31,7 @@ NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) const npyv_f32 zero = vdupq_n_f32(0.0f); const npyv_u32 pinf = vdupq_n_u32(0x7f800000); npyv_u32 is_zero = vceqq_f32(a, zero), is_inf = vceqq_u32(vreinterpretq_u32_f32(a), pinf); - // 
guard agianst floating-point division-by-zero error + // guard against floating-point division-by-zero error npyv_f32 guard_byz = vbslq_f32(is_zero, vreinterpretq_f32_u32(pinf), a); // estimate to (1/√a) npyv_f32 rsqrte = vrsqrteq_f32(guard_byz); diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index a3e2b95de..08b2a7d00 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -27,6 +27,25 @@ typedef npy_int64 npyv_lanetype_s64; typedef float npyv_lanetype_f32; typedef double npyv_lanetype_f64; +#if defined(_MSC_VER) && defined(_M_IX86) +/* + * Avoid using any of the following intrinsics with MSVC 32-bit, + * even if they are apparently work on newer versions. + * They had bad impact on the generated instructions, + * sometimes the compiler deal with them without the respect + * of 32-bit mode which lead to crush due to execute 64-bit + * instructions and other times generate bad emulated instructions. + */ + #undef _mm512_set1_epi64 + #undef _mm256_set1_epi64x + #undef _mm_set1_epi64x + #undef _mm512_setr_epi64x + #undef _mm256_setr_epi64x + #undef _mm_setr_epi64x + #undef _mm512_set_epi64x + #undef _mm256_set_epi64x + #undef _mm_set_epi64x +#endif #if defined(NPY_HAVE_AVX512F) && !defined(NPY_SIMD_FORCE_256) && !defined(NPY_SIMD_FORCE_128) #include "avx512/avx512.h" #elif defined(NPY_HAVE_AVX2) && !defined(NPY_SIMD_FORCE_128) diff --git a/numpy/core/src/common/simd/sse/misc.h b/numpy/core/src/common/simd/sse/misc.h index 1099c491d..7d13fbf55 100644 --- a/numpy/core/src/common/simd/sse/misc.h +++ b/numpy/core/src/common/simd/sse/misc.h @@ -24,11 +24,28 @@ #define npyv_setall_s16(VAL) _mm_set1_epi16((short)(VAL)) #define npyv_setall_u32(VAL) _mm_set1_epi32((int)(VAL)) #define npyv_setall_s32(VAL) _mm_set1_epi32((int)(VAL)) -#define npyv_setall_u64(VAL) _mm_set1_epi64x((npy_int64)(VAL)) -#define npyv_setall_s64(VAL) _mm_set1_epi64x((npy_int64)(VAL)) #define npyv_setall_f32 _mm_set1_ps #define 
npyv_setall_f64 _mm_set1_pd +NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1); + +NPY_FINLINE npyv_u64 npyv_setall_u64(npy_uint64 a) +{ +#if defined(_MSC_VER) && defined(_M_IX86) + return npyv__setr_epi64((npy_int64)a, (npy_int64)a); +#else + return _mm_set1_epi64x((npy_int64)a); +#endif +} +NPY_FINLINE npyv_s64 npyv_setall_s64(npy_int64 a) +{ +#if defined(_MSC_VER) && defined(_M_IX86) + return npyv__setr_epi64(a, a); +#else + return _mm_set1_epi64x((npy_int64)a); +#endif +} + /** * vector with specific values set to each lane and * set a specific value to all remained lanes @@ -53,7 +70,11 @@ NPY_FINLINE __m128i npyv__setr_epi32(int i0, int i1, int i2, int i3) } NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1) { +#if defined(_MSC_VER) && defined(_M_IX86) + return _mm_setr_epi32((int)i0, (int)(i0 >> 32), (int)i1, (int)(i1 >> 32)); +#else return _mm_set_epi64x(i1, i0); +#endif } NPY_FINLINE __m128 npyv__setr_ps(float i0, float i1, float i2, float i3) { diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vsx/operators.h index 23c5d0dbe..d34057ff3 100644 --- a/numpy/core/src/common/simd/vsx/operators.h +++ b/numpy/core/src/common/simd/vsx/operators.h @@ -103,7 +103,7 @@ NPYV_IMPL_VSX_BIN_B64(or) NPYV_IMPL_VSX_BIN_B64(xor) // NOT -// note: we implement npyv_not_b*(boolen types) for internal use*/ +// note: we implement npyv_not_b*(boolean types) for internal use*/ #define NPYV_IMPL_VSX_NOT_INT(VEC_LEN) \ NPY_FINLINE npyv_u##VEC_LEN npyv_not_u##VEC_LEN(npyv_u##VEC_LEN a) \ { return vec_nor(a, a); } \ diff --git a/numpy/core/src/common/ucsnarrow.c b/numpy/core/src/common/ucsnarrow.c index 3ef5d6878..4bea4beee 100644 --- a/numpy/core/src/common/ucsnarrow.c +++ b/numpy/core/src/common/ucsnarrow.c @@ -1,12 +1,9 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> -#include <locale.h> -#include <stdio.h> - -#define _MULTIARRAYMODULE #include 
"numpy/arrayobject.h" #include "numpy/npy_math.h" diff --git a/numpy/core/src/common/ucsnarrow.h b/numpy/core/src/common/ucsnarrow.h index c811e1f2c..6fe157199 100644 --- a/numpy/core/src/common/ucsnarrow.h +++ b/numpy/core/src/common/ucsnarrow.h @@ -1,7 +1,7 @@ -#ifndef _NPY_UCSNARROW_H_ -#define _NPY_UCSNARROW_H_ +#ifndef NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_ +#define NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_ NPY_NO_EXPORT PyUnicodeObject * PyUnicode_FromUCS4(char *src, Py_ssize_t size, int swap, int align); -#endif +#endif /* NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_ */ diff --git a/numpy/core/src/common/ufunc_override.h b/numpy/core/src/common/ufunc_override.h index bf86865c9..5da95fb29 100644 --- a/numpy/core/src/common/ufunc_override.h +++ b/numpy/core/src/common/ufunc_override.h @@ -1,5 +1,5 @@ -#ifndef __UFUNC_OVERRIDE_H -#define __UFUNC_OVERRIDE_H +#ifndef NUMPY_CORE_SRC_COMMON_UFUNC_OVERRIDE_H_ +#define NUMPY_CORE_SRC_COMMON_UFUNC_OVERRIDE_H_ #include "npy_config.h" @@ -34,4 +34,5 @@ PyUFunc_HasOverride(PyObject *obj); */ NPY_NO_EXPORT int PyUFuncOverride_GetOutObjects(PyObject *kwds, PyObject **out_kwd_obj, PyObject ***out_objs); -#endif + +#endif /* NUMPY_CORE_SRC_COMMON_UFUNC_OVERRIDE_H_ */ diff --git a/numpy/core/src/common/umathmodule.h b/numpy/core/src/common/umathmodule.h index 5c718a841..6d4169ad5 100644 --- a/numpy/core/src/common/umathmodule.h +++ b/numpy/core/src/common/umathmodule.h @@ -1,3 +1,6 @@ +#ifndef NUMPY_CORE_SRC_COMMON_UMATHMODULE_H_ +#define NUMPY_CORE_SRC_COMMON_UMATHMODULE_H_ + #include "__umath_generated.c" #include "__ufunc_api.c" @@ -8,4 +11,4 @@ PyObject * add_newdoc_ufunc(PyObject *NPY_UNUSED(dummy), PyObject *args); PyObject * ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUSED(kwds)); int initumath(PyObject *m); - +#endif /* NUMPY_CORE_SRC_COMMON_UMATHMODULE_H_ */ diff --git a/numpy/core/src/dummymodule.c b/numpy/core/src/dummymodule.c index e26875736..7284ffd68 100644 --- 
a/numpy/core/src/dummymodule.c +++ b/numpy/core/src/dummymodule.c @@ -4,12 +4,13 @@ * This is a dummy module whose purpose is to get distutils to generate the * configuration files before the libraries are made. */ - #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define NO_IMPORT_ARRAY +#define PY_SSIZE_T_CLEAN #include <Python.h> -#include <npy_pycompat.h> + +#include "npy_pycompat.h" static struct PyMethodDef methods[] = { {NULL, NULL, 0, NULL} diff --git a/numpy/core/src/multiarray/_datetime.h b/numpy/core/src/multiarray/_datetime.h index c0d2f1967..2ebeb1dff 100644 --- a/numpy/core/src/multiarray/_datetime.h +++ b/numpy/core/src/multiarray/_datetime.h @@ -1,5 +1,5 @@ -#ifndef _NPY_PRIVATE__DATETIME_H_ -#define _NPY_PRIVATE__DATETIME_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY__DATETIME_H_ +#define NUMPY_CORE_SRC_MULTIARRAY__DATETIME_H_ extern NPY_NO_EXPORT char const *_datetime_strings[NPY_DATETIME_NUMUNITS]; extern NPY_NO_EXPORT int _days_per_month_table[2][12]; @@ -376,4 +376,4 @@ find_object_datetime_type(PyObject *obj, int type_num); NPY_NO_EXPORT int PyArray_InitializeDatetimeCasts(void); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY__DATETIME_H_ */ diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src index f4764b371..e945d0771 100644 --- a/numpy/core/src/multiarray/_multiarray_tests.c.src +++ b/numpy/core/src/multiarray/_multiarray_tests.c.src @@ -1,8 +1,8 @@ /* -*-c-*- */ #define PY_SSIZE_T_CLEAN +#include <Python.h> #define NPY_NO_DEPRECATED_API NPY_API_VERSION -#include <Python.h> #define _NPY_NO_DEPRECATIONS /* for NPY_CHAR */ #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" diff --git a/numpy/core/src/multiarray/abstractdtypes.c b/numpy/core/src/multiarray/abstractdtypes.c index 3fa354ddc..cc1d7fad8 100644 --- a/numpy/core/src/multiarray/abstractdtypes.c +++ b/numpy/core/src/multiarray/abstractdtypes.c @@ -1,10 +1,10 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define 
_MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> - -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/ndarraytypes.h" #include "numpy/arrayobject.h" @@ -157,7 +157,7 @@ int_common_dtype(PyArray_DTypeMeta *NPY_UNUSED(cls), PyArray_DTypeMeta *other) } else if (PyTypeNum_ISNUMBER(other->type_num) || other->type_num == NPY_TIMEDELTA) { - /* All other numeric types (ant timdelta) are preserved: */ + /* All other numeric types (ant timedelta) are preserved: */ Py_INCREF(other); return other; } diff --git a/numpy/core/src/multiarray/abstractdtypes.h b/numpy/core/src/multiarray/abstractdtypes.h index a6c526717..42c192cac 100644 --- a/numpy/core/src/multiarray/abstractdtypes.h +++ b/numpy/core/src/multiarray/abstractdtypes.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ABSTRACTDTYPES_H -#define _NPY_ABSTRACTDTYPES_H +#ifndef NUMPY_CORE_SRC_MULTIARRAY_ABSTRACTDTYPES_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_ABSTRACTDTYPES_H_ #include "dtypemeta.h" @@ -16,4 +16,4 @@ NPY_NO_EXPORT extern PyArray_DTypeMeta PyArray_PyComplexAbstractDType; NPY_NO_EXPORT int initialize_and_map_pytypes_to_dtypes(void); -#endif /*_NPY_ABSTRACTDTYPES_H */ +#endif /* NUMPY_CORE_SRC_MULTIARRAY_ABSTRACTDTYPES_H_ */ diff --git a/numpy/core/src/multiarray/alloc.c b/numpy/core/src/multiarray/alloc.c index e74056736..e4756264d 100644 --- a/numpy/core/src/multiarray/alloc.c +++ b/numpy/core/src/multiarray/alloc.c @@ -1,20 +1,18 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" - +#include <structmember.h> #include <pymem.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include <numpy/ndarraytypes.h> +#include "numpy/ndarraytypes.h" #include "numpy/arrayobject.h" -#include <numpy/npy_common.h> +#include "numpy/npy_common.h" #include "npy_config.h" #include "alloc.h" - #include <assert.h> - 
#ifdef NPY_OS_LINUX #include <sys/mman.h> #ifndef MADV_HUGEPAGE @@ -135,9 +133,10 @@ npy_alloc_cache(npy_uintp sz) /* zero initialized data, sz is number of bytes to allocate */ NPY_NO_EXPORT void * -npy_alloc_cache_zero(npy_uintp sz) +npy_alloc_cache_zero(size_t nmemb, size_t size) { void * p; + size_t sz = nmemb * size; NPY_BEGIN_THREADS_DEF; if (sz < NBUCKETS) { p = _npy_alloc_cache(sz, 1, NBUCKETS, datacache, &PyDataMem_NEW); @@ -147,7 +146,7 @@ npy_alloc_cache_zero(npy_uintp sz) return p; } NPY_BEGIN_THREADS; - p = PyDataMem_NEW_ZEROED(sz, 1); + p = PyDataMem_NEW_ZEROED(nmemb, size); NPY_END_THREADS; return p; } @@ -189,8 +188,8 @@ npy_free_cache_dim(void * p, npy_uintp sz) /* malloc/free/realloc hook */ -NPY_NO_EXPORT PyDataMem_EventHookFunc *_PyDataMem_eventhook; -NPY_NO_EXPORT void *_PyDataMem_eventhook_user_data; +NPY_NO_EXPORT PyDataMem_EventHookFunc *_PyDataMem_eventhook = NULL; +NPY_NO_EXPORT void *_PyDataMem_eventhook_user_data = NULL; /*NUMPY_API * Sets the allocation event hook for numpy array data. @@ -256,21 +255,21 @@ PyDataMem_NEW(size_t size) * Allocates zeroed memory for array data. */ NPY_NO_EXPORT void * -PyDataMem_NEW_ZEROED(size_t size, size_t elsize) +PyDataMem_NEW_ZEROED(size_t nmemb, size_t size) { void *result; - result = calloc(size, elsize); + result = calloc(nmemb, size); if (_PyDataMem_eventhook != NULL) { NPY_ALLOW_C_API_DEF NPY_ALLOW_C_API if (_PyDataMem_eventhook != NULL) { - (*_PyDataMem_eventhook)(NULL, result, size * elsize, + (*_PyDataMem_eventhook)(NULL, result, nmemb * size, _PyDataMem_eventhook_user_data); } NPY_DISABLE_C_API } - PyTraceMalloc_Track(NPY_TRACE_DOMAIN, (npy_uintp)result, size); + PyTraceMalloc_Track(NPY_TRACE_DOMAIN, (npy_uintp)result, nmemb * size); return result; } @@ -318,3 +317,325 @@ PyDataMem_RENEW(void *ptr, size_t size) } return result; } + +// The default data mem allocator malloc routine does not make use of a ctx. 
+// It should be called only through PyDataMem_UserNEW +// since itself does not handle eventhook and tracemalloc logic. +static NPY_INLINE void * +default_malloc(void *NPY_UNUSED(ctx), size_t size) +{ + return _npy_alloc_cache(size, 1, NBUCKETS, datacache, &malloc); +} + +// The default data mem allocator calloc routine does not make use of a ctx. +// It should be called only through PyDataMem_UserNEW_ZEROED +// since itself does not handle eventhook and tracemalloc logic. +static NPY_INLINE void * +default_calloc(void *NPY_UNUSED(ctx), size_t nelem, size_t elsize) +{ + void * p; + size_t sz = nelem * elsize; + NPY_BEGIN_THREADS_DEF; + if (sz < NBUCKETS) { + p = _npy_alloc_cache(sz, 1, NBUCKETS, datacache, &malloc); + if (p) { + memset(p, 0, sz); + } + return p; + } + NPY_BEGIN_THREADS; + p = calloc(nelem, elsize); + NPY_END_THREADS; + return p; +} + +// The default data mem allocator realloc routine does not make use of a ctx. +// It should be called only through PyDataMem_UserRENEW +// since itself does not handle eventhook and tracemalloc logic. +static NPY_INLINE void * +default_realloc(void *NPY_UNUSED(ctx), void *ptr, size_t new_size) +{ + return realloc(ptr, new_size); +} + +// The default data mem allocator free routine does not make use of a ctx. +// It should be called only through PyDataMem_UserFREE +// since itself does not handle eventhook and tracemalloc logic. 
+static NPY_INLINE void +default_free(void *NPY_UNUSED(ctx), void *ptr, size_t size) +{ + _npy_free_cache(ptr, size, NBUCKETS, datacache, &free); +} + +/* Memory handler global default */ +PyDataMem_Handler default_handler = { + "default_allocator", + { + NULL, /* ctx */ + default_malloc, /* malloc */ + default_calloc, /* calloc */ + default_realloc, /* realloc */ + default_free /* free */ + } +}; + +#if (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x07030600) +PyObject *current_handler; +#endif + +int uo_index=0; /* user_override index */ + +/* Wrappers for the default or any user-assigned PyDataMem_Handler */ + +NPY_NO_EXPORT void * +PyDataMem_UserNEW(size_t size, PyObject *mem_handler) +{ + void *result; + PyDataMem_Handler *handler = (PyDataMem_Handler *) PyCapsule_GetPointer(mem_handler, "mem_handler"); + if (handler == NULL) { + return NULL; + } + + assert(size != 0); + result = handler->allocator.malloc(handler->allocator.ctx, size); + if (_PyDataMem_eventhook != NULL) { + NPY_ALLOW_C_API_DEF + NPY_ALLOW_C_API + if (_PyDataMem_eventhook != NULL) { + (*_PyDataMem_eventhook)(NULL, result, size, + _PyDataMem_eventhook_user_data); + } + NPY_DISABLE_C_API + } + PyTraceMalloc_Track(NPY_TRACE_DOMAIN, (npy_uintp)result, size); + return result; +} + +NPY_NO_EXPORT void * +PyDataMem_UserNEW_ZEROED(size_t nmemb, size_t size, PyObject *mem_handler) +{ + void *result; + PyDataMem_Handler *handler = (PyDataMem_Handler *) PyCapsule_GetPointer(mem_handler, "mem_handler"); + if (handler == NULL) { + return NULL; + } + result = handler->allocator.calloc(handler->allocator.ctx, nmemb, size); + if (_PyDataMem_eventhook != NULL) { + NPY_ALLOW_C_API_DEF + NPY_ALLOW_C_API + if (_PyDataMem_eventhook != NULL) { + (*_PyDataMem_eventhook)(NULL, result, nmemb * size, + _PyDataMem_eventhook_user_data); + } + NPY_DISABLE_C_API + } + PyTraceMalloc_Track(NPY_TRACE_DOMAIN, (npy_uintp)result, nmemb * size); + return result; +} + +/* Similar to array_dealloc in arrayobject.c */ +static 
NPY_INLINE void +WARN_IN_FREE(PyObject* warning, const char * msg) { + if (PyErr_WarnEx(warning, msg, 1) < 0) { + PyObject * s; + + s = PyUnicode_FromString("PyDataMem_UserFREE"); + if (s) { + PyErr_WriteUnraisable(s); + Py_DECREF(s); + } + else { + PyErr_WriteUnraisable(Py_None); + } + } +} + + + +NPY_NO_EXPORT void +PyDataMem_UserFREE(void *ptr, size_t size, PyObject *mem_handler) +{ + PyDataMem_Handler *handler = (PyDataMem_Handler *) PyCapsule_GetPointer(mem_handler, "mem_handler"); + if (handler == NULL) { + WARN_IN_FREE(PyExc_RuntimeWarning, + "Could not get pointer to 'mem_handler' from PyCapsule"); + PyErr_Clear(); + return; + } + PyTraceMalloc_Untrack(NPY_TRACE_DOMAIN, (npy_uintp)ptr); + handler->allocator.free(handler->allocator.ctx, ptr, size); + if (_PyDataMem_eventhook != NULL) { + NPY_ALLOW_C_API_DEF + NPY_ALLOW_C_API + if (_PyDataMem_eventhook != NULL) { + (*_PyDataMem_eventhook)(ptr, NULL, 0, + _PyDataMem_eventhook_user_data); + } + NPY_DISABLE_C_API + } +} + +NPY_NO_EXPORT void * +PyDataMem_UserRENEW(void *ptr, size_t size, PyObject *mem_handler) +{ + void *result; + PyDataMem_Handler *handler = (PyDataMem_Handler *) PyCapsule_GetPointer(mem_handler, "mem_handler"); + if (handler == NULL) { + return NULL; + } + + assert(size != 0); + result = handler->allocator.realloc(handler->allocator.ctx, ptr, size); + if (result != ptr) { + PyTraceMalloc_Untrack(NPY_TRACE_DOMAIN, (npy_uintp)ptr); + } + PyTraceMalloc_Track(NPY_TRACE_DOMAIN, (npy_uintp)result, size); + if (_PyDataMem_eventhook != NULL) { + NPY_ALLOW_C_API_DEF + NPY_ALLOW_C_API + if (_PyDataMem_eventhook != NULL) { + (*_PyDataMem_eventhook)(ptr, result, size, + _PyDataMem_eventhook_user_data); + } + NPY_DISABLE_C_API + } + return result; +} + +/*NUMPY_API + * Set a new allocation policy. If the input value is NULL, will reset + * the policy to the default. Return the previous policy, or + * return NULL if an error has occurred. 
We wrap the user-provided + * functions so they will still call the python and numpy + * memory management callback hooks. + */ +NPY_NO_EXPORT PyObject * +PyDataMem_SetHandler(PyObject *handler) +{ + PyObject *old_handler; +#if (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x07030600) + PyObject *token; + if (PyContextVar_Get(current_handler, NULL, &old_handler)) { + return NULL; + } + if (handler == NULL) { + handler = PyCapsule_New(&default_handler, "mem_handler", NULL); + if (handler == NULL) { + return NULL; + } + } + else { + Py_INCREF(handler); + } + token = PyContextVar_Set(current_handler, handler); + Py_DECREF(handler); + if (token == NULL) { + Py_DECREF(old_handler); + return NULL; + } + Py_DECREF(token); + return old_handler; +#else + PyObject *p; + p = PyThreadState_GetDict(); + if (p == NULL) { + return NULL; + } + old_handler = PyDict_GetItemString(p, "current_allocator"); + if (old_handler == NULL) { + old_handler = PyCapsule_New(&default_handler, "mem_handler", NULL); + if (old_handler == NULL) { + return NULL; + } + } + else { + Py_INCREF(old_handler); + } + if (handler == NULL) { + handler = PyCapsule_New(&default_handler, "mem_handler", NULL); + if (handler == NULL) { + Py_DECREF(old_handler); + return NULL; + } + } + else { + Py_INCREF(handler); + } + const int error = PyDict_SetItemString(p, "current_allocator", handler); + Py_DECREF(handler); + if (error) { + Py_DECREF(old_handler); + return NULL; + } + return old_handler; +#endif +} + +/*NUMPY_API + * Return the policy that will be used to allocate data + * for the next PyArrayObject. On failure, return NULL. 
+ */ +NPY_NO_EXPORT PyObject * +PyDataMem_GetHandler() +{ + PyObject *handler; +#if (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x07030600) + if (PyContextVar_Get(current_handler, NULL, &handler)) { + return NULL; + } + return handler; +#else + PyObject *p = PyThreadState_GetDict(); + if (p == NULL) { + return NULL; + } + handler = PyDict_GetItemString(p, "current_allocator"); + if (handler == NULL) { + handler = PyCapsule_New(&default_handler, "mem_handler", NULL); + if (handler == NULL) { + return NULL; + } + } + else { + Py_INCREF(handler); + } + return handler; +#endif +} + +NPY_NO_EXPORT PyObject * +get_handler_name(PyObject *NPY_UNUSED(self), PyObject *args) +{ + PyObject *arr=NULL; + if (!PyArg_ParseTuple(args, "|O:get_handler_name", &arr)) { + return NULL; + } + if (arr != NULL && !PyArray_Check(arr)) { + PyErr_SetString(PyExc_ValueError, "if supplied, argument must be an ndarray"); + return NULL; + } + PyObject *mem_handler; + PyDataMem_Handler *handler; + PyObject *name; + if (arr != NULL) { + mem_handler = PyArray_HANDLER((PyArrayObject *) arr); + if (mem_handler == NULL) { + Py_RETURN_NONE; + } + Py_INCREF(mem_handler); + } + else { + mem_handler = PyDataMem_GetHandler(); + if (mem_handler == NULL) { + return NULL; + } + } + handler = (PyDataMem_Handler *) PyCapsule_GetPointer(mem_handler, "mem_handler"); + if (handler == NULL) { + Py_DECREF(mem_handler); + return NULL; + } + name = PyUnicode_FromString(handler->name); + Py_DECREF(mem_handler); + return name; +} diff --git a/numpy/core/src/multiarray/alloc.h b/numpy/core/src/multiarray/alloc.h index 15e31ebb5..4f7df1f84 100644 --- a/numpy/core/src/multiarray/alloc.h +++ b/numpy/core/src/multiarray/alloc.h @@ -1,8 +1,9 @@ -#ifndef _NPY_ARRAY_ALLOC_H_ -#define _NPY_ARRAY_ALLOC_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_ALLOC_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_ALLOC_H_ + #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE -#include <numpy/ndarraytypes.h> +#include 
"numpy/ndarraytypes.h" #define NPY_TRACE_DOMAIN 389047 @@ -10,13 +11,16 @@ NPY_NO_EXPORT PyObject * _set_madvise_hugepage(PyObject *NPY_UNUSED(self), PyObject *enabled_obj); NPY_NO_EXPORT void * -npy_alloc_cache(npy_uintp sz); +PyDataMem_UserNEW(npy_uintp sz, PyObject *mem_handler); NPY_NO_EXPORT void * -npy_alloc_cache_zero(npy_uintp sz); +PyDataMem_UserNEW_ZEROED(size_t nmemb, size_t size, PyObject *mem_handler); NPY_NO_EXPORT void -npy_free_cache(void * p, npy_uintp sd); +PyDataMem_UserFREE(void * p, npy_uintp sd, PyObject *mem_handler); + +NPY_NO_EXPORT void * +PyDataMem_UserRENEW(void *ptr, size_t size, PyObject *mem_handler); NPY_NO_EXPORT void * npy_alloc_cache_dim(npy_uintp sz); @@ -36,4 +40,12 @@ npy_free_cache_dim_array(PyArrayObject * arr) npy_free_cache_dim(PyArray_DIMS(arr), PyArray_NDIM(arr)); } +#if (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x07030600) +extern PyObject *current_handler; /* PyContextVar/PyCapsule */ +extern PyDataMem_Handler default_handler; #endif + +NPY_NO_EXPORT PyObject * +get_handler_name(PyObject *NPY_UNUSED(self), PyObject *obj); + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_ALLOC_H_ */ diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c index 665dadfbf..020a7f29a 100644 --- a/numpy/core/src/multiarray/array_assign_array.c +++ b/numpy/core/src/multiarray/array_assign_array.c @@ -6,13 +6,13 @@ * * See LICENSE.txt for the license. 
*/ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include <numpy/ndarraytypes.h> +#include "numpy/ndarraytypes.h" #include "npy_config.h" #include "npy_pycompat.h" diff --git a/numpy/core/src/multiarray/array_assign_scalar.c b/numpy/core/src/multiarray/array_assign_scalar.c index 6cd5f4ad9..4ffef7ecc 100644 --- a/numpy/core/src/multiarray/array_assign_scalar.c +++ b/numpy/core/src/multiarray/array_assign_scalar.c @@ -6,12 +6,12 @@ * * See LICENSE.txt for the license. */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include <numpy/ndarraytypes.h> #include "npy_config.h" diff --git a/numpy/core/src/multiarray/array_coercion.c b/numpy/core/src/multiarray/array_coercion.c index 713bf7505..b335b64a0 100644 --- a/numpy/core/src/multiarray/array_coercion.c +++ b/numpy/core/src/multiarray/array_coercion.c @@ -1,8 +1,9 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _UMATHMODULE #define _MULTIARRAYMODULE +#define _UMATHMODULE -#include "Python.h" +#define PY_SSIZE_T_CLEAN +#include <Python.h> #include "numpy/npy_3kcompat.h" @@ -135,7 +136,7 @@ _prime_global_pytype_to_type_dict(void) * * This assumes that the DType class is guaranteed to hold on the * python type (this assumption is guaranteed). - * This functionality supercedes ``_typenum_fromtypeobj``. + * This functionality supersedes ``_typenum_fromtypeobj``. * * @param DType DType to map the python type to * @param pytype Python type to map from @@ -1400,7 +1401,7 @@ PyArray_DiscoverDTypeAndShape( * These should be largely deprecated, and represent only the DType class * for most `dtype` parameters. 
* - * TODO: This function should eventually recieve a deprecation warning and + * TODO: This function should eventually receive a deprecation warning and * be removed. * * @param descr diff --git a/numpy/core/src/multiarray/array_coercion.h b/numpy/core/src/multiarray/array_coercion.h index c5ccad225..db0e479fe 100644 --- a/numpy/core/src/multiarray/array_coercion.h +++ b/numpy/core/src/multiarray/array_coercion.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ARRAY_COERCION_H -#define _NPY_ARRAY_COERCION_H +#ifndef NUMPY_CORE_SRC_MULTIARRAY_ARRAY_COERCION_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_ARRAY_COERCION_H_ /* @@ -54,4 +54,4 @@ npy_unlink_coercion_cache(coercion_cache_obj *current); NPY_NO_EXPORT int PyArray_AssignFromCache(PyArrayObject *self, coercion_cache_obj *cache); -#endif /* _NPY_ARRAY_COERCION_H */ +#endif /* NUMPY_CORE_SRC_MULTIARRAY_ARRAY_COERCION_H_ */ diff --git a/numpy/core/src/multiarray/array_method.c b/numpy/core/src/multiarray/array_method.c index 44ba8c733..d93dac506 100644 --- a/numpy/core/src/multiarray/array_method.c +++ b/numpy/core/src/multiarray/array_method.c @@ -26,10 +26,9 @@ * It is then sufficient for a ufunc (or other owner) to only hold a * weak reference to the input DTypes. 
*/ - - #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE + #include <npy_pycompat.h> #include "arrayobject.h" #include "array_method.h" @@ -59,16 +58,10 @@ default_resolve_descriptors( { int nin = method->nin; int nout = method->nout; - int all_defined = 1; for (int i = 0; i < nin + nout; i++) { PyArray_DTypeMeta *dtype = dtypes[i]; - if (dtype == NULL) { - output_descrs[i] = NULL; - all_defined = 0; - continue; - } - if (NPY_DTYPE(input_descrs[i]) == dtype) { + if (input_descrs[i] != NULL) { output_descrs[i] = ensure_dtype_nbo(input_descrs[i]); } else { @@ -78,41 +71,11 @@ default_resolve_descriptors( goto fail; } } - if (all_defined) { - return method->casting; - } - - if (NPY_UNLIKELY(nin == 0 || dtypes[0] == NULL)) { - /* Registration should reject this, so this would be indicates a bug */ - PyErr_SetString(PyExc_RuntimeError, - "Invalid use of default resolver without inputs or with " - "input or output DType incorrectly missing."); - goto fail; - } - /* We find the common dtype of all inputs, and use it for the unknowns */ - PyArray_DTypeMeta *common_dtype = dtypes[0]; - assert(common_dtype != NULL); - for (int i = 1; i < nin; i++) { - Py_SETREF(common_dtype, PyArray_CommonDType(common_dtype, dtypes[i])); - if (common_dtype == NULL) { - goto fail; - } - } - for (int i = nin; i < nin + nout; i++) { - if (output_descrs[i] != NULL) { - continue; - } - if (NPY_DTYPE(input_descrs[i]) == common_dtype) { - output_descrs[i] = ensure_dtype_nbo(input_descrs[i]); - } - else { - output_descrs[i] = NPY_DT_CALL_default_descr(common_dtype); - } - if (NPY_UNLIKELY(output_descrs[i] == NULL)) { - goto fail; - } - } - + /* + * If we relax the requirement for specifying all `dtypes` (e.g. allow + * abstract ones or unspecified outputs). We can use the common-dtype + * operation to provide a default here. 
+ */ return method->casting; fail: @@ -220,9 +183,18 @@ validate_spec(PyArrayMethod_Spec *spec) } for (int i = 0; i < nargs; i++) { - if (spec->dtypes[i] == NULL && i < spec->nin) { + /* + * Note that we could allow for output dtypes to not be specified + * (the array-method would have to make sure to support this). + * We could even allow for some dtypes to be abstract. + * For now, assume that this is better handled in a promotion step. + * One problem with providing all DTypes is the definite need to + * hold references. We probably, eventually, have to implement + * traversal and trust the GC to deal with it. + */ + if (spec->dtypes[i] == NULL) { PyErr_Format(PyExc_TypeError, - "ArrayMethod must have well defined input DTypes. " + "ArrayMethod must provide all input and output DTypes. " "(method: %s)", spec->name); return -1; } @@ -232,10 +204,10 @@ validate_spec(PyArrayMethod_Spec *spec) "(method: %s)", spec->dtypes[i], spec->name); return -1; } - if (NPY_DT_is_abstract(spec->dtypes[i]) && i < spec->nin) { + if (NPY_DT_is_abstract(spec->dtypes[i])) { PyErr_Format(PyExc_TypeError, - "abstract DType %S are currently not allowed for inputs." - "(method: %s defined at %s)", spec->dtypes[i], spec->name); + "abstract DType %S are currently not supported." + "(method: %s)", spec->dtypes[i], spec->name); return -1; } } @@ -324,7 +296,7 @@ fill_arraymethod_from_slots( PyErr_Format(PyExc_TypeError, "Must specify output DTypes or use custom " "`resolve_descriptors` when there are no inputs. " - "(method: %s defined at %s)", spec->name); + "(method: %s)", spec->name); return -1; } } @@ -371,6 +343,26 @@ fill_arraymethod_from_slots( } +/* + * Public version of `PyArrayMethod_FromSpec_int` (see below). + * + * TODO: Error paths will probably need to be improved before a release into + * the non-experimental public API. 
+ */ +NPY_NO_EXPORT PyObject * +PyArrayMethod_FromSpec(PyArrayMethod_Spec *spec) +{ + for (int i = 0; i < spec->nin + spec->nout; i++) { + if (!PyObject_TypeCheck(spec->dtypes[i], &PyArrayDTypeMeta_Type)) { + PyErr_SetString(PyExc_RuntimeError, + "ArrayMethod spec contained a non DType."); + return NULL; + } + } + return (PyObject *)PyArrayMethod_FromSpec_int(spec, 0); +} + + /** * Create a new ArrayMethod (internal version). * @@ -467,7 +459,6 @@ NPY_NO_EXPORT PyTypeObject PyArrayMethod_Type = { }; - static PyObject * boundarraymethod_repr(PyBoundArrayMethodObject *self) { @@ -477,9 +468,11 @@ boundarraymethod_repr(PyBoundArrayMethodObject *self) if (dtypes == NULL) { return NULL; } - return PyUnicode_FromFormat( - "<np._BoundArrayMethod `%s` for dtypes %S>", - self->method->name, dtypes); + PyObject *repr = PyUnicode_FromFormat( + "<np._BoundArrayMethod `%s` for dtypes %S>", + self->method->name, dtypes); + Py_DECREF(dtypes); + return repr; } @@ -683,7 +676,7 @@ boundarraymethod__simple_strided_call( "All arrays must have the same length."); return NULL; } - if (i >= nout) { + if (i >= nin) { if (PyArray_FailUnlessWriteable( arrays[i], "_simple_strided_call() output") < 0) { return NULL; @@ -787,6 +780,13 @@ _masked_stridedloop_data_free(NpyAuxData *auxdata) * This function wraps a regular unmasked strided-loop as a * masked strided-loop, only calling the function for elements * where the mask is True. + * + * TODO: Reductions also use this code to implement masked reductions. + * Before consolidating them, reductions had a special case for + * broadcasts: when the mask stride was 0 the code does not check all + * elements as `npy_memchr` currently does. + * It may be worthwhile to add such an optimization again if broadcasted + * masks are common enough. 
*/ static int generic_masked_strided_loop(PyArrayMethod_Context *context, @@ -806,7 +806,7 @@ generic_masked_strided_loop(PyArrayMethod_Context *context, npy_intp N = dimensions[0]; /* Process the data as runs of unmasked values */ do { - ssize_t subloopsize; + Py_ssize_t subloopsize; /* Skip masked values */ mask = npy_memchr(mask, 0, mask_stride, N, &subloopsize, 1); diff --git a/numpy/core/src/multiarray/array_method.h b/numpy/core/src/multiarray/array_method.h index fc2304889..7b7372bd0 100644 --- a/numpy/core/src/multiarray/array_method.h +++ b/numpy/core/src/multiarray/array_method.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ARRAY_METHOD_H -#define _NPY_ARRAY_METHOD_H +#ifndef NUMPY_CORE_SRC_MULTIARRAY_ARRAY_METHOD_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_ARRAY_METHOD_H_ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE @@ -21,6 +21,17 @@ typedef enum { NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 2, /* Whether the method supports unaligned access (not runtime) */ NPY_METH_SUPPORTS_UNALIGNED = 1 << 3, + /* + * Private flag for now for *logic* functions. The logical functions + * `logical_or` and `logical_and` can always cast the inputs to booleans + * "safely" (because that is how the cast to bool is defined). + * @seberg: I am not sure this is the best way to handle this, so its + * private for now (also it is very limited anyway). + * There is one "exception". NA aware dtypes cannot cast to bool + * (hopefully), so the `??->?` loop should error even with this flag. + * But a second NA fallback loop will be necessary. + */ + _NPY_METH_FORCE_CAST_INPUTS = 1 << 17, /* All flags which can change at runtime */ NPY_METH_RUNTIME_FLAGS = ( @@ -170,6 +181,11 @@ PyArrayMethod_GetMaskedStridedLoop( NPY_ARRAYMETHOD_FLAGS *flags); + +NPY_NO_EXPORT PyObject * +PyArrayMethod_FromSpec(PyArrayMethod_Spec *spec); + + /* * TODO: This function is the internal version, and its error paths may * need better tests when a public version is exposed. 
@@ -177,4 +193,4 @@ PyArrayMethod_GetMaskedStridedLoop( NPY_NO_EXPORT PyBoundArrayMethodObject * PyArrayMethod_FromSpec_int(PyArrayMethod_Spec *spec, int private); -#endif /*_NPY_ARRAY_METHOD_H*/ +#endif /* NUMPY_CORE_SRC_MULTIARRAY_ARRAY_METHOD_H_ */ diff --git a/numpy/core/src/multiarray/arrayfunction_override.h b/numpy/core/src/multiarray/arrayfunction_override.h index fdf0dfcaf..09f7ee548 100644 --- a/numpy/core/src/multiarray/arrayfunction_override.h +++ b/numpy/core/src/multiarray/arrayfunction_override.h @@ -1,5 +1,5 @@ -#ifndef _NPY_PRIVATE__ARRAYFUNCTION_OVERRIDE_H -#define _NPY_PRIVATE__ARRAYFUNCTION_OVERRIDE_H +#ifndef NUMPY_CORE_SRC_MULTIARRAY_ARRAYFUNCTION_OVERRIDE_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_ARRAYFUNCTION_OVERRIDE_H_ NPY_NO_EXPORT PyObject * array_implement_array_function( @@ -19,4 +19,4 @@ NPY_NO_EXPORT PyObject * array_function_method_impl(PyObject *func, PyObject *types, PyObject *args, PyObject *kwargs); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_ARRAYFUNCTION_OVERRIDE_H_ */ diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c index 55ba5601b..c8aaced4e 100644 --- a/numpy/core/src/multiarray/arrayobject.c +++ b/numpy/core/src/multiarray/arrayobject.c @@ -20,13 +20,13 @@ maintainer email: oliphant.travis@ieee.org Space Science Telescope Institute (J. 
Todd Miller, Perry Greenfield, Rick White) */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -/*#include <stdio.h>*/ -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" @@ -493,7 +493,28 @@ array_dealloc(PyArrayObject *self) if (PyDataType_FLAGCHK(fa->descr, NPY_ITEM_REFCOUNT)) { PyArray_XDECREF(self); } - npy_free_cache(fa->data, PyArray_NBYTES(self)); + /* + * Allocation will never be 0, see comment in ctors.c + * line 820 + */ + size_t nbytes = PyArray_NBYTES(self); + if (nbytes == 0) { + nbytes = fa->descr->elsize ? fa->descr->elsize : 1; + } + if (fa->mem_handler == NULL) { + char *env = getenv("NUMPY_WARN_IF_NO_MEM_POLICY"); + if ((env != NULL) && (strncmp(env, "1", 1) == 0)) { + char const * msg = "Trying to dealloc data, but a memory policy " + "is not set. If you take ownership of the data, you must " + "set a base owning the data (e.g. a PyCapsule)."; + WARN_IN_DEALLOC(PyExc_RuntimeWarning, msg); + } + // Guess at malloc/free ??? 
+ free(fa->data); + } else { + PyDataMem_UserFREE(fa->data, nbytes, fa->mem_handler); + Py_DECREF(fa->mem_handler); + } } /* must match allocation in PyArray_NewFromDescr */ @@ -858,7 +879,7 @@ _uni_release(char *ptr, int nc) relfunc(aptr, N1); \ return -1; \ } \ - val = compfunc(aptr, bptr, N1, N2); \ + val = compfunc(aptr, bptr, N1, N2); \ *dptr = (val CMP 0); \ PyArray_ITER_NEXT(iself); \ PyArray_ITER_NEXT(iother); \ @@ -870,7 +891,7 @@ _uni_release(char *ptr, int nc) #define _reg_loop(CMP) { \ while(size--) { \ - val = compfunc((void *)iself->dataptr, \ + val = compfunc((void *)iself->dataptr, \ (void *)iother->dataptr, \ N1, N2); \ *dptr = (val CMP 0); \ @@ -1705,22 +1726,6 @@ array_iter(PyArrayObject *arr) return PySeqIter_New((PyObject *)arr); } -static PyObject * -array_alloc(PyTypeObject *type, Py_ssize_t NPY_UNUSED(nitems)) -{ - /* nitems will always be 0 */ - PyObject *obj = PyObject_Malloc(type->tp_basicsize); - PyObject_Init(obj, type); - return obj; -} - -static void -array_free(PyObject * v) -{ - /* avoid same deallocator as PyBaseObject, see gentype_free */ - PyObject_Free(v); -} - NPY_NO_EXPORT PyTypeObject PyArray_Type = { PyVarObject_HEAD_INIT(NULL, 0) @@ -1741,7 +1746,5 @@ NPY_NO_EXPORT PyTypeObject PyArray_Type = { .tp_iter = (getiterfunc)array_iter, .tp_methods = array_methods, .tp_getset = array_getsetlist, - .tp_alloc = (allocfunc)array_alloc, .tp_new = (newfunc)array_new, - .tp_free = (freefunc)array_free, }; diff --git a/numpy/core/src/multiarray/arrayobject.h b/numpy/core/src/multiarray/arrayobject.h index 9b74944ff..fb9b0bd81 100644 --- a/numpy/core/src/multiarray/arrayobject.h +++ b/numpy/core/src/multiarray/arrayobject.h @@ -1,10 +1,10 @@ -#ifndef _NPY_INTERNAL_ARRAYOBJECT_H_ -#define _NPY_INTERNAL_ARRAYOBJECT_H_ - #ifndef _MULTIARRAYMODULE #error You should not include this #endif +#ifndef NUMPY_CORE_SRC_MULTIARRAY_ARRAYOBJECT_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_ARRAYOBJECT_H_ + NPY_NO_EXPORT PyObject * 
_strings_richcompare(PyArrayObject *self, PyArrayObject *other, int cmp_op, int rstrip); @@ -26,4 +26,4 @@ array_might_be_written(PyArrayObject *obj); */ static const int NPY_ARRAY_WARN_ON_WRITE = (1 << 31); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_ARRAYOBJECT_H_ */ diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index b3ea7544d..71808cc48 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -1,7 +1,7 @@ /* -*- c -*- */ #define PY_SSIZE_T_CLEAN -#include "Python.h" -#include "structmember.h" +#include <Python.h> +#include <structmember.h> #include <limits.h> #include <assert.h> @@ -62,7 +62,7 @@ get_dummy_stack_array(PyArrayObject *orig) PyArrayObject_fields new_fields; new_fields.flags = PyArray_FLAGS(orig); /* Set to NULL so the dummy object can be distinguished from the real one */ - Py_TYPE(&new_fields) = NULL; + Py_SET_TYPE(&new_fields, NULL); new_fields.base = (PyObject *)orig; return new_fields; } @@ -2759,10 +2759,10 @@ VOID_nonzero (char *ip, PyArrayObject *ap) dummy_fields.descr = new; if ((new->alignment > 1) && !__ALIGNED(ip + offset, new->alignment)) { - PyArray_CLEARFLAGS(ap, NPY_ARRAY_ALIGNED); + PyArray_CLEARFLAGS(dummy_arr, NPY_ARRAY_ALIGNED); } else { - PyArray_ENABLEFLAGS(ap, NPY_ARRAY_ALIGNED); + PyArray_ENABLEFLAGS(dummy_arr, NPY_ARRAY_ALIGNED); } if (new->f->nonzero(ip+offset, dummy_arr)) { nonz = NPY_TRUE; @@ -3093,6 +3093,10 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap) if (!PyArray_HASFIELDS(ap)) { return STRING_compare(ip1, ip2, ap); } + PyObject *mem_handler = PyDataMem_GetHandler(); + if (mem_handler == NULL) { + goto finish; + } descr = PyArray_DESCR(ap); /* * Compare on the first-field. 
If equal, then @@ -3107,15 +3111,19 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap) if (_unpack_field(tup, &new, &offset) < 0) { goto finish; } - /* descr is the only field checked by compare or copyswap */ + /* Set the fields needed by compare or copyswap */ dummy_struct.descr = new; + swap = PyArray_ISBYTESWAPPED(dummy); nip1 = ip1 + offset; nip2 = ip2 + offset; if (swap || new->alignment > 1) { if (swap || !npy_is_aligned(nip1, new->alignment)) { - /* create buffer and copy */ - nip1 = npy_alloc_cache(new->elsize); + /* + * create temporary buffer and copy, + * always use the current handler for internal allocations + */ + nip1 = PyDataMem_UserNEW(new->elsize, mem_handler); if (nip1 == NULL) { goto finish; } @@ -3124,11 +3132,15 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap) new->f->copyswap(nip1, NULL, swap, dummy); } if (swap || !npy_is_aligned(nip2, new->alignment)) { - /* create buffer and copy */ - nip2 = npy_alloc_cache(new->elsize); + /* + * create temporary buffer and copy, + * always use the current handler for internal allocations + */ + nip2 = PyDataMem_UserNEW(new->elsize, mem_handler); if (nip2 == NULL) { if (nip1 != ip1 + offset) { - npy_free_cache(nip1, new->elsize); + /* destroy temporary buffer */ + PyDataMem_UserFREE(nip1, new->elsize, mem_handler); } goto finish; } @@ -3140,10 +3152,12 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap) res = new->f->compare(nip1, nip2, dummy); if (swap || new->alignment > 1) { if (nip1 != ip1 + offset) { - npy_free_cache(nip1, new->elsize); + /* destroy temporary buffer */ + PyDataMem_UserFREE(nip1, new->elsize, mem_handler); } if (nip2 != ip2 + offset) { - npy_free_cache(nip2, new->elsize); + /* destroy temporary buffer */ + PyDataMem_UserFREE(nip2, new->elsize, mem_handler); } } if (res != 0) { @@ -3152,6 +3166,7 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap) } finish: + Py_XDECREF(mem_handler); return res; } diff --git a/numpy/core/src/multiarray/arraytypes.h 
b/numpy/core/src/multiarray/arraytypes.h index a9469aef7..b3a13b297 100644 --- a/numpy/core/src/multiarray/arraytypes.h +++ b/numpy/core/src/multiarray/arraytypes.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ARRAYTYPES_H_ -#define _NPY_ARRAYTYPES_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ #include "common.h" @@ -28,4 +28,4 @@ small_correlate(const char * d_, npy_intp dstride, npy_intp nk, enum NPY_TYPES ktype, char * out_, npy_intp ostride); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ */ diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c index 5458c81cc..d10122c4f 100644 --- a/numpy/core/src/multiarray/buffer.c +++ b/numpy/core/src/multiarray/buffer.c @@ -1,9 +1,10 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" diff --git a/numpy/core/src/multiarray/calculation.c b/numpy/core/src/multiarray/calculation.c index 21e52c32b..327f685d4 100644 --- a/numpy/core/src/multiarray/calculation.c +++ b/numpy/core/src/multiarray/calculation.c @@ -1,9 +1,10 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "lowlevel_strided_loops.h" diff --git a/numpy/core/src/multiarray/calculation.h b/numpy/core/src/multiarray/calculation.h index 49105a138..6a9c3c7c9 100644 --- a/numpy/core/src/multiarray/calculation.h +++ b/numpy/core/src/multiarray/calculation.h @@ -1,5 +1,5 @@ -#ifndef _NPY_CALCULATION_H_ -#define _NPY_CALCULATION_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_CALCULATION_H_ +#define 
NUMPY_CORE_SRC_MULTIARRAY_CALCULATION_H_ NPY_NO_EXPORT PyObject* PyArray_ArgMax(PyArrayObject* self, int axis, PyArrayObject *out); @@ -67,4 +67,4 @@ PyArray_All(PyArrayObject* self, int axis, PyArrayObject* out); NPY_NO_EXPORT PyObject* PyArray_Any(PyArrayObject* self, int axis, PyArrayObject* out); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_CALCULATION_H_ */ diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c index 1fd9ab1a3..82d34193d 100644 --- a/numpy/core/src/multiarray/common.c +++ b/numpy/core/src/multiarray/common.c @@ -1,8 +1,9 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "npy_config.h" diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h index 203decaa0..b3526c4c1 100644 --- a/numpy/core/src/multiarray/common.h +++ b/numpy/core/src/multiarray/common.h @@ -1,10 +1,11 @@ -#ifndef _NPY_PRIVATE_COMMON_H_ -#define _NPY_PRIVATE_COMMON_H_ -#include "structmember.h" -#include <numpy/npy_common.h> -#include <numpy/ndarraytypes.h> -#include <limits.h> +#ifndef NUMPY_CORE_SRC_MULTIARRAY_COMMON_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_COMMON_H_ + +#include <structmember.h> +#include "numpy/npy_common.h" +#include "numpy/ndarraytypes.h" #include "npy_import.h" +#include <limits.h> #define error_converting(x) (((x) == -1) && PyErr_Occurred()) @@ -343,5 +344,4 @@ new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2, PyArrayObject* out, */ #define NPY_ITER_REDUCTION_AXIS(axis) (axis + (1 << (NPY_BITSOF_INT - 2))) -#endif - +#endif /* NUMPY_CORE_SRC_MULTIARRAY_COMMON_H_ */ diff --git a/numpy/core/src/multiarray/common_dtype.c b/numpy/core/src/multiarray/common_dtype.c index 659580c98..ca80b1ed7 100644 --- a/numpy/core/src/multiarray/common_dtype.c +++ b/numpy/core/src/multiarray/common_dtype.c @@ -1,9 +1,10 @@ 
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include <numpy/npy_common.h> +#include "numpy/npy_common.h" #include "numpy/arrayobject.h" #include "common_dtype.h" diff --git a/numpy/core/src/multiarray/common_dtype.h b/numpy/core/src/multiarray/common_dtype.h index b3666531a..13d38ddf8 100644 --- a/numpy/core/src/multiarray/common_dtype.h +++ b/numpy/core/src/multiarray/common_dtype.h @@ -1,5 +1,5 @@ -#ifndef _NPY_COMMON_DTYPE_H_ -#define _NPY_COMMON_DTYPE_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_COMMON_DTYPE_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_COMMON_DTYPE_H_ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE @@ -14,4 +14,4 @@ NPY_NO_EXPORT PyArray_DTypeMeta * PyArray_PromoteDTypeSequence( npy_intp length, PyArray_DTypeMeta **dtypes_in); -#endif /* _NPY_COMMON_DTYPE_H_ */ +#endif /* NUMPY_CORE_SRC_MULTIARRAY_COMMON_DTYPE_H_ */ diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c index de793f87c..9910fffe6 100644 --- a/numpy/core/src/multiarray/compiled_base.c +++ b/numpy/core/src/multiarray/compiled_base.c @@ -1,9 +1,10 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + +#define PY_SSIZE_T_CLEAN #include <Python.h> #include <structmember.h> -#include <string.h> -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/npy_3kcompat.h" #include "numpy/npy_math.h" @@ -15,6 +16,8 @@ #include "common.h" #include "simd/simd.h" +#include <string.h> + typedef enum { PACK_ORDER_LITTLE = 0, PACK_ORDER_BIG @@ -1425,9 +1428,26 @@ arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args) PyCFunctionObject *new = (PyCFunctionObject *)obj; _ADDDOC(new->m_ml->ml_doc, new->m_ml->ml_name); } - else if (Py_TYPE(obj) == &PyType_Type) { + else if (PyObject_TypeCheck(obj, &PyType_Type)) { + /* + * We add it to both `tp_doc` 
and `__doc__` here. Note that in theory + * `tp_doc` extracts the signature line, but we currently do not use + * it. It may make sense to only add it as `__doc__` and + * `__text_signature__` to the dict in the future. + * The dictionary path is only necessary for heaptypes (currently not + * used) and metaclasses. + * If `__doc__` as stored in `tp_dict` is None, we assume this was + * filled in by `PyType_Ready()` and should also be replaced. + */ PyTypeObject *new = (PyTypeObject *)obj; _ADDDOC(new->tp_doc, new->tp_name); + if (new->tp_dict != NULL && PyDict_CheckExact(new->tp_dict) && + PyDict_GetItemString(new->tp_dict, "__doc__") == Py_None) { + /* Warning: Modifying `tp_dict` is not generally safe! */ + if (PyDict_SetItemString(new->tp_dict, "__doc__", str) < 0) { + return NULL; + } + } } else if (Py_TYPE(obj) == &PyMemberDescr_Type) { PyMemberDescrObject *new = (PyMemberDescrObject *)obj; diff --git a/numpy/core/src/multiarray/compiled_base.h b/numpy/core/src/multiarray/compiled_base.h index 082139910..d3bc08cb2 100644 --- a/numpy/core/src/multiarray/compiled_base.h +++ b/numpy/core/src/multiarray/compiled_base.h @@ -1,6 +1,7 @@ -#ifndef _NPY_PRIVATE__COMPILED_BASE_H_ -#define _NPY_PRIVATE__COMPILED_BASE_H_ -#include <numpy/ndarraytypes.h> +#ifndef NUMPY_CORE_SRC_MULTIARRAY_COMPILED_BASE_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_COMPILED_BASE_H_ + +#include "numpy/ndarraytypes.h" NPY_NO_EXPORT PyObject * arr_insert(PyObject *, PyObject *, PyObject *); @@ -23,4 +24,4 @@ io_pack(PyObject *, PyObject *, PyObject *); NPY_NO_EXPORT PyObject * io_unpack(PyObject *, PyObject *, PyObject *); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_COMPILED_BASE_H_ */ diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c index 15fe4bde2..59e3b4922 100644 --- a/numpy/core/src/multiarray/conversion_utils.c +++ b/numpy/core/src/multiarray/conversion_utils.c @@ -1,9 +1,10 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define 
_MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" diff --git a/numpy/core/src/multiarray/conversion_utils.h b/numpy/core/src/multiarray/conversion_utils.h index 35525b7fe..4662c6a8b 100644 --- a/numpy/core/src/multiarray/conversion_utils.h +++ b/numpy/core/src/multiarray/conversion_utils.h @@ -1,7 +1,7 @@ -#ifndef _NPY_PRIVATE_CONVERSION_UTILS_H_ -#define _NPY_PRIVATE_CONVERSION_UTILS_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_CONVERSION_UTILS_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_CONVERSION_UTILS_H_ -#include <numpy/ndarraytypes.h> +#include "numpy/ndarraytypes.h" NPY_NO_EXPORT int PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq); @@ -85,4 +85,4 @@ PyArray_ConvertMultiAxis(PyObject *axis_in, int ndim, npy_bool *out_axis_flags); */ extern NPY_NO_EXPORT int evil_global_disable_warn_O4O8_flag; -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_CONVERSION_UTILS_H_ */ diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c index 2ad8d6d0e..2f68db07c 100644 --- a/numpy/core/src/multiarray/convert.c +++ b/numpy/core/src/multiarray/convert.c @@ -1,11 +1,12 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#include <npy_config.h> +#include "npy_config.h" -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" #include "npy_pycompat.h" diff --git a/numpy/core/src/multiarray/convert.h b/numpy/core/src/multiarray/convert.h index 96df19711..d64d9be3f 100644 --- a/numpy/core/src/multiarray/convert.h +++ b/numpy/core/src/multiarray/convert.h @@ -1,8 +1,8 @@ -#ifndef _NPY_ARRAYOBJECT_CONVERT_H_ -#define _NPY_ARRAYOBJECT_CONVERT_H_ +#ifndef 
NUMPY_CORE_SRC_MULTIARRAY_CONVERT_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_CONVERT_H_ NPY_NO_EXPORT int PyArray_AssignZero(PyArrayObject *dst, PyArrayObject *wheremask); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_CONVERT_H_ */ diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c index 45b03a6f3..3135d6989 100644 --- a/numpy/core/src/multiarray/convert_datatype.c +++ b/numpy/core/src/multiarray/convert_datatype.c @@ -1,9 +1,10 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" @@ -1547,6 +1548,40 @@ should_use_min_scalar(npy_intp narrs, PyArrayObject **arr, } +/* + * Utility function used only in PyArray_ResultType for value-based logic. + * See that function for the meaning and contents of the parameters. + */ +static PyArray_Descr * +get_descr_from_cast_or_value( + npy_intp i, + PyArrayObject *arrs[], + npy_intp ndtypes, + PyArray_Descr *descriptor, + PyArray_DTypeMeta *common_dtype) +{ + PyArray_Descr *curr; + if (NPY_LIKELY(i < ndtypes || + !(PyArray_FLAGS(arrs[i-ndtypes]) & _NPY_ARRAY_WAS_PYSCALAR))) { + curr = PyArray_CastDescrToDType(descriptor, common_dtype); + } + else { + /* + * Unlike `PyArray_CastToDTypeAndPromoteDescriptors`, deal with + * plain Python values "graciously". This recovers the original + * value the long route, but it should almost never happen... 
+ */ + PyObject *tmp = PyArray_GETITEM(arrs[i-ndtypes], + PyArray_BYTES(arrs[i-ndtypes])); + if (tmp == NULL) { + return NULL; + } + curr = NPY_DT_CALL_discover_descr_from_pyobject(common_dtype, tmp); + Py_DECREF(tmp); + } + return curr; +} + /*NUMPY_API * * Produces the result type of a bunch of inputs, using the same rules @@ -1648,7 +1683,7 @@ PyArray_ResultType( } Py_INCREF(all_DTypes[i_all]); /* - * Leave the decriptor empty, if we need it, we will have to go + * Leave the descriptor empty, if we need it, we will have to go * to more extreme lengths unfortunately. */ all_descriptors[i_all] = NULL; @@ -1683,28 +1718,15 @@ PyArray_ResultType( result = NPY_DT_CALL_default_descr(common_dtype); } else { - result = PyArray_CastDescrToDType(all_descriptors[0], common_dtype); + result = get_descr_from_cast_or_value( + 0, arrs, ndtypes, all_descriptors[0], common_dtype); + if (result == NULL) { + goto error; + } for (npy_intp i = 1; i < ndtypes+narrs; i++) { - PyArray_Descr *curr; - if (NPY_LIKELY(i < ndtypes || - !(PyArray_FLAGS(arrs[i-ndtypes]) & _NPY_ARRAY_WAS_PYSCALAR))) { - curr = PyArray_CastDescrToDType(all_descriptors[i], common_dtype); - } - else { - /* - * Unlike `PyArray_CastToDTypeAndPromoteDescriptors` deal with - * plain Python values "graciously". This recovers the original - * value the long route, but it should almost never happen... - */ - PyObject *tmp = PyArray_GETITEM( - arrs[i-ndtypes], PyArray_BYTES(arrs[i-ndtypes])); - if (tmp == NULL) { - goto error; - } - curr = NPY_DT_CALL_discover_descr_from_pyobject(common_dtype, tmp); - Py_DECREF(tmp); - } + PyArray_Descr *curr = get_descr_from_cast_or_value( + i, arrs, ndtypes, all_descriptors[i], common_dtype); if (curr == NULL) { goto error; } @@ -2097,7 +2119,7 @@ PyArray_ObjectType(PyObject *op, int minimum_type) * This function is only used in one place within NumPy and should * generally be avoided. It is provided mainly for backward compatibility. 
* - * The user of the function has to free the returned array. + * The user of the function has to free the returned array with PyDataMem_FREE. */ NPY_NO_EXPORT PyArrayObject ** PyArray_ConvertToCommonType(PyObject *op, int *retn) @@ -2242,7 +2264,7 @@ PyArray_AddCastingImplementation(PyBoundArrayMethodObject *meth) * Add a new casting implementation using a PyArrayMethod_Spec. * * @param spec - * @param private If private, allow slots not publically exposed. + * @param private If private, allow slots not publicly exposed. * @return 0 on success -1 on failure */ NPY_NO_EXPORT int diff --git a/numpy/core/src/multiarray/convert_datatype.h b/numpy/core/src/multiarray/convert_datatype.h index 22b3859d2..5e0682f22 100644 --- a/numpy/core/src/multiarray/convert_datatype.h +++ b/numpy/core/src/multiarray/convert_datatype.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ARRAY_CONVERT_DATATYPE_H_ -#define _NPY_ARRAY_CONVERT_DATATYPE_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_CONVERT_DATATYPE_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_CONVERT_DATATYPE_H_ #include "array_method.h" @@ -78,9 +78,9 @@ PyArray_CheckCastSafety(NPY_CASTING casting, NPY_NO_EXPORT NPY_CASTING legacy_same_dtype_resolve_descriptors( PyArrayMethodObject *self, - PyArray_DTypeMeta **dtypes, - PyArray_Descr **given_descrs, - PyArray_Descr **loop_descrs); + PyArray_DTypeMeta *dtypes[2], + PyArray_Descr *given_descrs[2], + PyArray_Descr *loop_descrs[2]); NPY_NO_EXPORT int legacy_cast_get_strided_loop( @@ -92,11 +92,11 @@ legacy_cast_get_strided_loop( NPY_NO_EXPORT NPY_CASTING simple_cast_resolve_descriptors( PyArrayMethodObject *self, - PyArray_DTypeMeta **dtypes, - PyArray_Descr **input_descrs, - PyArray_Descr **loop_descrs); + PyArray_DTypeMeta *dtypes[2], + PyArray_Descr *input_descrs[2], + PyArray_Descr *loop_descrs[2]); NPY_NO_EXPORT int PyArray_InitializeCasts(void); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_CONVERT_DATATYPE_H_ */ diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index 
deab7d2a1..27fd3a057 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -1,9 +1,10 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" @@ -725,6 +726,7 @@ PyArray_NewFromDescr_int( fa->nd = nd; fa->dimensions = NULL; fa->data = NULL; + fa->mem_handler = NULL; if (data == NULL) { fa->flags = NPY_ARRAY_DEFAULT; @@ -804,12 +806,19 @@ PyArray_NewFromDescr_int( fa->flags |= NPY_ARRAY_C_CONTIGUOUS|NPY_ARRAY_F_CONTIGUOUS; } + if (data == NULL) { + /* Store the handler in case the default is modified */ + fa->mem_handler = PyDataMem_GetHandler(); + if (fa->mem_handler == NULL) { + goto fail; + } /* * Allocate something even for zero-space arrays * e.g. shape=(0,) -- otherwise buffer exposure * (a.data) doesn't work as it should. * Could probably just allocate a few bytes here. -- Chuck + * Note: always sync this with calls to PyDataMem_UserFREE */ if (nbytes == 0) { nbytes = descr->elsize ? descr->elsize : 1; @@ -819,21 +828,23 @@ PyArray_NewFromDescr_int( * which could also be sub-fields of a VOID array */ if (zeroed || PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT)) { - data = npy_alloc_cache_zero(nbytes); + data = PyDataMem_UserNEW_ZEROED(nbytes, 1, fa->mem_handler); } else { - data = npy_alloc_cache(nbytes); + data = PyDataMem_UserNEW(nbytes, fa->mem_handler); } if (data == NULL) { raise_memory_error(fa->nd, fa->dimensions, descr); goto fail; } + fa->flags |= NPY_ARRAY_OWNDATA; } else { + /* The handlers should never be called in this case */ + fa->mem_handler = NULL; /* - * If data is passed in, this object won't own it by default. - * Caller must arrange for this to be reset if truly desired + * If data is passed in, this object won't own it. 
*/ fa->flags &= ~NPY_ARRAY_OWNDATA; } @@ -901,6 +912,7 @@ PyArray_NewFromDescr_int( return (PyObject *)fa; fail: + Py_XDECREF(fa->mem_handler); Py_DECREF(fa); return NULL; } @@ -1019,6 +1031,17 @@ PyArray_NewLikeArrayWithShape(PyArrayObject *prototype, NPY_ORDER order, /* Build the new strides */ stride = dtype->elsize; + if (stride == 0 && PyDataType_ISSTRING(dtype)) { + /* Special case for dtype=str or dtype=bytes. */ + if (dtype->type_num == NPY_STRING) { + /* dtype is bytes */ + stride = 1; + } + else { + /* dtype is str (type_num is NPY_UNICODE) */ + stride = 4; + } + } for (idim = ndim-1; idim >= 0; --idim) { npy_intp i_perm = strideperm[idim].perm; strides[i_perm] = stride; @@ -2724,7 +2747,7 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order) /* If we exhausted the dst block, refresh it */ if (dst_count == count) { res = dst_iternext(dst_iter); - if (!res) { + if (res == 0) { break; } dst_count = *dst_countptr; @@ -2738,7 +2761,7 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order) /* If we exhausted the src block, refresh it */ if (src_count == count) { res = src_iternext(src_iter); - if (!res) { + if (res == 0) { break; } src_count = *src_countptr; @@ -2755,10 +2778,6 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order) NPY_cast_info_xfree(&cast_info); NpyIter_Deallocate(dst_iter); NpyIter_Deallocate(src_iter); - if (res > 0) { - /* The iteration stopped successfully, do not report an error */ - return 0; - } return res; } @@ -3420,7 +3439,9 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, char const *sep, size_t *nre dptr += dtype->elsize; if (num < 0 && thisbuf == size) { totalbytes += bytes; - tmp = PyDataMem_RENEW(PyArray_DATA(r), totalbytes); + /* The handler is always valid */ + tmp = PyDataMem_UserRENEW(PyArray_DATA(r), totalbytes, + PyArray_HANDLER(r)); if (tmp == NULL) { err = 1; break; @@ -3442,7 +3463,9 @@ array_from_text(PyArray_Descr *dtype, npy_intp num, 
char const *sep, size_t *nre const size_t nsize = PyArray_MAX(*nread,1)*dtype->elsize; if (nsize != 0) { - tmp = PyDataMem_RENEW(PyArray_DATA(r), nsize); + /* The handler is always valid */ + tmp = PyDataMem_UserRENEW(PyArray_DATA(r), nsize, + PyArray_HANDLER(r)); if (tmp == NULL) { err = 1; } @@ -3547,7 +3570,9 @@ PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, npy_intp num, char *sep) const size_t nsize = PyArray_MAX(nread,1) * dtype->elsize; char *tmp; - if ((tmp = PyDataMem_RENEW(PyArray_DATA(ret), nsize)) == NULL) { + /* The handler is always valid */ + if((tmp = PyDataMem_UserRENEW(PyArray_DATA(ret), nsize, + PyArray_HANDLER(ret))) == NULL) { Py_DECREF(dtype); Py_DECREF(ret); return PyErr_NoMemory(); @@ -3831,7 +3856,9 @@ PyArray_FromIter(PyObject *obj, PyArray_Descr *dtype, npy_intp count) */ elcount = (i >> 1) + (i < 4 ? 4 : 2) + i; if (!npy_mul_with_overflow_intp(&nbytes, elcount, elsize)) { - new_data = PyDataMem_RENEW(PyArray_DATA(ret), nbytes); + /* The handler is always valid */ + new_data = PyDataMem_UserRENEW(PyArray_DATA(ret), nbytes, + PyArray_HANDLER(ret)); } else { new_data = NULL; @@ -3869,10 +3896,12 @@ PyArray_FromIter(PyObject *obj, PyArray_Descr *dtype, npy_intp count) * (assuming realloc is reasonably good about reusing space...) */ if (i == 0 || elsize == 0) { - /* The size cannot be zero for PyDataMem_RENEW. */ + /* The size cannot be zero for realloc. 
*/ goto done; } - new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * elsize); + /* The handler is always valid */ + new_data = PyDataMem_UserRENEW(PyArray_DATA(ret), i * elsize, + PyArray_HANDLER(ret)); if (new_data == NULL) { PyErr_SetString(PyExc_MemoryError, "cannot allocate array memory"); diff --git a/numpy/core/src/multiarray/ctors.h b/numpy/core/src/multiarray/ctors.h index 8db1412c7..e59e86e8b 100644 --- a/numpy/core/src/multiarray/ctors.h +++ b/numpy/core/src/multiarray/ctors.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ARRAY_CTORS_H_ -#define _NPY_ARRAY_CTORS_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_CTORS_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_CTORS_H_ NPY_NO_EXPORT PyObject * PyArray_NewFromDescr( @@ -102,4 +102,4 @@ NPY_NO_EXPORT PyArrayObject * PyArray_SubclassWrap(PyArrayObject *arr_of_subclass, PyArrayObject *towrap); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_CTORS_H_ */ diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c index 182eb12f9..e0064c017 100644 --- a/numpy/core/src/multiarray/datetime.c +++ b/numpy/core/src/multiarray/datetime.c @@ -6,16 +6,14 @@ * * See LICENSE.txt for the license. */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> -#include <datetime.h> -#include <time.h> - -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include <numpy/arrayobject.h> +#include "numpy/arrayobject.h" +#include "numpyos.h" #include "npy_config.h" #include "npy_pycompat.h" @@ -30,7 +28,11 @@ #include "usertypes.h" #include "dtype_transfer.h" -#include <lowlevel_strided_loops.h> +#include "lowlevel_strided_loops.h" + +#include <datetime.h> +#include <time.h> + /* * Computes the python `ret, d = divmod(d, unit)`. @@ -426,7 +428,7 @@ PyArray_DatetimeStructToDatetime( } /*NUMPY_API - * Create a timdelta value from a filled timedelta struct and resolution unit. + * Create a timedelta value from a filled timedelta struct and resolution unit. 
* * TO BE REMOVED - NOT USED INTERNALLY. */ @@ -722,12 +724,21 @@ parse_datetime_extended_unit_from_string(char const *str, Py_ssize_t len, { char const *substr = str, *substrend = NULL; int den = 1; + npy_longlong true_meta_val; /* First comes an optional integer multiplier */ out_meta->num = (int)strtol_const(substr, &substrend, 10); if (substr == substrend) { out_meta->num = 1; } + else { + // check for 32-bit integer overflow + char *endptr = NULL; + true_meta_val = NumPyOS_strtoll(substr, &endptr, 10); + if (true_meta_val > INT_MAX || true_meta_val < 0) { + goto bad_input; + } + } substr = substrend; /* Next comes the unit itself, followed by either '/' or the string end */ @@ -1159,7 +1170,7 @@ get_datetime_conversion_factor(PyArray_DatetimeMetaData *src_meta, } /* If something overflowed, make both num and denom 0 */ - if (denom == 0 || num == 0) { + if (num == 0) { PyErr_Format(PyExc_OverflowError, "Integer overflow while computing the conversion " "factor between NumPy datetime units %s and %s", @@ -3775,7 +3786,17 @@ time_to_time_resolve_descriptors( meta2 = get_datetime_metadata_from_dtype(loop_descrs[1]); assert(meta2 != NULL); - if (meta1->base == meta2->base && meta1->num == meta2->num) { + if ((meta1->base == meta2->base && meta1->num == meta2->num) || + // handle some common metric prefix conversions + // 1000 fold conversions + ((meta2->base >= 7) && (meta1->base - meta2->base == 1) + && ((meta1->num / meta2->num) == 1000)) || + // 10^6 fold conversions + ((meta2->base >= 7) && (meta1->base - meta2->base == 2) + && ((meta1->num / meta2->num) == 1000000)) || + // 10^9 fold conversions + ((meta2->base >= 7) && (meta1->base - meta2->base == 3) + && ((meta1->num / meta2->num) == 1000000000))) { if (byteorder_may_allow_view) { return NPY_NO_CASTING | byteorder_may_allow_view; } diff --git a/numpy/core/src/multiarray/datetime_busday.c b/numpy/core/src/multiarray/datetime_busday.c index f0564146d..d3e9e1451 100644 --- 
a/numpy/core/src/multiarray/datetime_busday.c +++ b/numpy/core/src/multiarray/datetime_busday.c @@ -6,12 +6,12 @@ * * See LICENSE.txt for the license. */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include <numpy/arrayobject.h> #include "npy_config.h" diff --git a/numpy/core/src/multiarray/datetime_busday.h b/numpy/core/src/multiarray/datetime_busday.h index 483151122..b53a25010 100644 --- a/numpy/core/src/multiarray/datetime_busday.h +++ b/numpy/core/src/multiarray/datetime_busday.h @@ -1,5 +1,5 @@ -#ifndef _NPY_PRIVATE__DATETIME_BUSDAY_H_ -#define _NPY_PRIVATE__DATETIME_BUSDAY_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_DATETIME_BUSDAY_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_DATETIME_BUSDAY_H_ /* * This is the 'busday_offset' function exposed for calling @@ -25,4 +25,4 @@ NPY_NO_EXPORT PyObject * array_is_busday(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_DATETIME_BUSDAY_H_ */ diff --git a/numpy/core/src/multiarray/datetime_busdaycal.c b/numpy/core/src/multiarray/datetime_busdaycal.c index e3e729d3c..880efe934 100644 --- a/numpy/core/src/multiarray/datetime_busdaycal.c +++ b/numpy/core/src/multiarray/datetime_busdaycal.c @@ -7,19 +7,19 @@ * * See LICENSE.txt for the license. 
*/ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include <numpy/arrayobject.h> +#include "numpy/arrayobject.h" +#include "numpy/arrayscalars.h" #include "npy_config.h" #include "npy_pycompat.h" #include "common.h" -#include "numpy/arrayscalars.h" #include "lowlevel_strided_loops.h" #include "_datetime.h" #include "datetime_busday.h" diff --git a/numpy/core/src/multiarray/datetime_busdaycal.h b/numpy/core/src/multiarray/datetime_busdaycal.h index 02903e3d2..20efebe0a 100644 --- a/numpy/core/src/multiarray/datetime_busdaycal.h +++ b/numpy/core/src/multiarray/datetime_busdaycal.h @@ -1,5 +1,5 @@ -#ifndef _NPY_PRIVATE__DATETIME_BUSDAYDEF_H_ -#define _NPY_PRIVATE__DATETIME_BUSDAYDEF_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_DATETIME_BUSDAYCAL_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_DATETIME_BUSDAYCAL_H_ /* * A list of holidays, which should be sorted, not contain any @@ -59,4 +59,4 @@ PyArray_HolidaysConverter(PyObject *dates_in, npy_holidayslist *holidays); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_DATETIME_BUSDAYCAL_H_ */ diff --git a/numpy/core/src/multiarray/datetime_strings.c b/numpy/core/src/multiarray/datetime_strings.c index 360868568..5080647cb 100644 --- a/numpy/core/src/multiarray/datetime_strings.c +++ b/numpy/core/src/multiarray/datetime_strings.c @@ -6,15 +6,14 @@ * * See LICENSE.txt for the license. */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> -#include <time.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include <numpy/arrayobject.h> +#include "numpy/arrayobject.h" #include "npy_config.h" #include "npy_pycompat.h" @@ -24,6 +23,8 @@ #include "_datetime.h" #include "datetime_strings.h" +#include <time.h> + /* * Platform-specific time_t typedef. 
Some platforms use 32 bit, some use 64 bit * and we just use the default with the exception of mingw, where we must use diff --git a/numpy/core/src/multiarray/datetime_strings.h b/numpy/core/src/multiarray/datetime_strings.h index 148369595..ca35d29c8 100644 --- a/numpy/core/src/multiarray/datetime_strings.h +++ b/numpy/core/src/multiarray/datetime_strings.h @@ -1,5 +1,5 @@ -#ifndef _NPY_PRIVATE__DATETIME_STRINGS_H_ -#define _NPY_PRIVATE__DATETIME_STRINGS_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_DATETIME_STRINGS_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_DATETIME_STRINGS_H_ /* * Parses (almost) standard ISO 8601 date strings. The differences are: @@ -81,4 +81,4 @@ NPY_NO_EXPORT PyObject * array_datetime_as_string(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_DATETIME_STRINGS_H_ */ diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c index 90453e38f..6a09f92ac 100644 --- a/numpy/core/src/multiarray/descriptor.c +++ b/numpy/core/src/multiarray/descriptor.c @@ -1,11 +1,11 @@ /* Array Descr Object */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" @@ -257,7 +257,7 @@ static PyArray_Descr * _convert_from_tuple(PyObject *obj, int align) { if (PyTuple_GET_SIZE(obj) != 2) { - PyErr_Format(PyExc_TypeError, + PyErr_Format(PyExc_TypeError, "Tuple must have size 2, but has size %zd", PyTuple_GET_SIZE(obj)); return NULL; @@ -449,8 +449,8 @@ _convert_from_array_descr(PyObject *obj, int align) for (int i = 0; i < n; i++) { PyObject *item = PyList_GET_ITEM(obj, i); if (!PyTuple_Check(item) || (PyTuple_GET_SIZE(item) < 2)) { - PyErr_Format(PyExc_TypeError, - "Field elements must be 2- or 3-tuples, got '%R'", + 
PyErr_Format(PyExc_TypeError, + "Field elements must be 2- or 3-tuples, got '%R'", item); goto fail; } @@ -461,7 +461,7 @@ _convert_from_array_descr(PyObject *obj, int align) } else if (PyTuple_Check(name)) { if (PyTuple_GET_SIZE(name) != 2) { - PyErr_Format(PyExc_TypeError, + PyErr_Format(PyExc_TypeError, "If a tuple, the first element of a field tuple must have " "two elements, not %zd", PyTuple_GET_SIZE(name)); @@ -475,7 +475,7 @@ _convert_from_array_descr(PyObject *obj, int align) } } else { - PyErr_SetString(PyExc_TypeError, + PyErr_SetString(PyExc_TypeError, "First element of field tuple is " "neither a tuple nor str"); goto fail; @@ -2304,6 +2304,33 @@ arraydescr_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds) { if (subtype != &PyArrayDescr_Type) { + if (Py_TYPE(subtype) == &PyArrayDTypeMeta_Type && + !(PyType_GetFlags(Py_TYPE(subtype)) & Py_TPFLAGS_HEAPTYPE) && + (NPY_DT_SLOTS((PyArray_DTypeMeta *)subtype)) != NULL) { + /* + * Appears to be a properly initialized user DType. Allocate + * it and initialize the main part as best we can. + * TODO: This should probably be a user function, and enforce + * things like the `elsize` being correctly set. + * TODO: This is EXPERIMENTAL API! + */ + PyArray_DTypeMeta *DType = (PyArray_DTypeMeta *)subtype; + PyArray_Descr *descr = (PyArray_Descr *)subtype->tp_alloc(subtype, 0); + if (descr == 0) { + PyErr_NoMemory(); + return NULL; + } + PyObject_Init((PyObject *)descr, subtype); + descr->f = &NPY_DT_SLOTS(DType)->f; + Py_XINCREF(DType->scalar_type); + descr->typeobj = DType->scalar_type; + descr->type_num = DType->type_num; + descr->flags = NPY_USE_GETITEM|NPY_USE_SETITEM; + descr->byteorder = '|'; /* If DType uses it, let it override */ + descr->elsize = -1; /* Initialize to invalid value */ + descr->hash = -1; + return (PyObject *)descr; + } /* The DTypeMeta class should prevent this from happening. 
*/ PyErr_Format(PyExc_SystemError, "'%S' must not inherit np.dtype.__new__().", subtype); @@ -3101,6 +3128,30 @@ arraydescr_newbyteorder(PyArray_Descr *self, PyObject *args) return (PyObject *)PyArray_DescrNewByteorder(self, endian); } +static PyObject * +arraydescr_class_getitem(PyObject *cls, PyObject *args) +{ + PyObject *generic_alias; + +#ifdef Py_GENERICALIASOBJECT_H + Py_ssize_t args_len; + + args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1; + if (args_len != 1) { + return PyErr_Format(PyExc_TypeError, + "Too %s arguments for %s", + args_len > 1 ? "many" : "few", + ((PyTypeObject *)cls)->tp_name); + } + generic_alias = Py_GenericAlias(cls, args); +#else + PyErr_SetString(PyExc_TypeError, + "Type subscription requires python >= 3.9"); + generic_alias = NULL; +#endif + return generic_alias; +} + static PyMethodDef arraydescr_methods[] = { /* for pickling */ {"__reduce__", @@ -3112,6 +3163,10 @@ static PyMethodDef arraydescr_methods[] = { {"newbyteorder", (PyCFunction)arraydescr_newbyteorder, METH_VARARGS, NULL}, + /* for typing; requires python >= 3.9 */ + {"__class_getitem__", + (PyCFunction)arraydescr_class_getitem, + METH_CLASS | METH_O, NULL}, {NULL, NULL, 0, NULL} /* sentinel */ }; diff --git a/numpy/core/src/multiarray/descriptor.h b/numpy/core/src/multiarray/descriptor.h index e1316acbd..f832958da 100644 --- a/numpy/core/src/multiarray/descriptor.h +++ b/numpy/core/src/multiarray/descriptor.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ARRAYDESCR_H_ -#define _NPY_ARRAYDESCR_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_DESCRIPTOR_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_DESCRIPTOR_H_ NPY_NO_EXPORT PyObject *arraydescr_protocol_typestr_get( PyArray_Descr *, void *); @@ -30,4 +30,4 @@ arraydescr_field_subset_view(PyArray_Descr *self, PyObject *ind); extern NPY_NO_EXPORT char const *_datetime_strings[]; -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_DESCRIPTOR_H_ */ diff --git a/numpy/core/src/multiarray/dragon4.c b/numpy/core/src/multiarray/dragon4.c index 
1d8c27570..ce0293615 100644 --- a/numpy/core/src/multiarray/dragon4.c +++ b/numpy/core/src/multiarray/dragon4.c @@ -2206,7 +2206,7 @@ Dragon4_PrintFloat_IEEE_binary16( Dragon4_Scratch *scratch, npy_half *value, Dragon4_Options *opt) { char *buffer = scratch->repr; - npy_uint32 bufferSize = sizeof(scratch->repr); + const npy_uint32 bufferSize = sizeof(scratch->repr); BigInt *bigints = scratch->bigints; npy_uint16 val = *value; @@ -2218,15 +2218,6 @@ Dragon4_PrintFloat_IEEE_binary16( npy_bool hasUnequalMargins; char signbit = '\0'; - if (bufferSize == 0) { - return 0; - } - - if (bufferSize == 1) { - buffer[0] = '\0'; - return 0; - } - /* deconstruct the floating point value */ floatMantissa = val & bitmask_u32(10); floatExponent = (val >> 10) & bitmask_u32(5); @@ -2303,7 +2294,7 @@ Dragon4_PrintFloat_IEEE_binary32( Dragon4_Options *opt) { char *buffer = scratch->repr; - npy_uint32 bufferSize = sizeof(scratch->repr); + const npy_uint32 bufferSize = sizeof(scratch->repr); BigInt *bigints = scratch->bigints; union @@ -2319,15 +2310,6 @@ Dragon4_PrintFloat_IEEE_binary32( npy_bool hasUnequalMargins; char signbit = '\0'; - if (bufferSize == 0) { - return 0; - } - - if (bufferSize == 1) { - buffer[0] = '\0'; - return 0; - } - /* deconstruct the floating point value */ floatUnion.floatingPoint = *value; floatMantissa = floatUnion.integer & bitmask_u32(23); @@ -2404,7 +2386,7 @@ Dragon4_PrintFloat_IEEE_binary64( Dragon4_Scratch *scratch, npy_float64 *value, Dragon4_Options *opt) { char *buffer = scratch->repr; - npy_uint32 bufferSize = sizeof(scratch->repr); + const npy_uint32 bufferSize = sizeof(scratch->repr); BigInt *bigints = scratch->bigints; union @@ -2421,14 +2403,6 @@ Dragon4_PrintFloat_IEEE_binary64( npy_bool hasUnequalMargins; char signbit = '\0'; - if (bufferSize == 0) { - return 0; - } - - if (bufferSize == 1) { - buffer[0] = '\0'; - return 0; - } /* deconstruct the floating point value */ floatUnion.floatingPoint = *value; @@ -2527,7 +2501,7 @@ 
Dragon4_PrintFloat_Intel_extended( Dragon4_Scratch *scratch, FloatVal128 value, Dragon4_Options *opt) { char *buffer = scratch->repr; - npy_uint32 bufferSize = sizeof(scratch->repr); + const npy_uint32 bufferSize = sizeof(scratch->repr); BigInt *bigints = scratch->bigints; npy_uint32 floatExponent, floatSign; @@ -2539,15 +2513,6 @@ Dragon4_PrintFloat_Intel_extended( npy_bool hasUnequalMargins; char signbit = '\0'; - if (bufferSize == 0) { - return 0; - } - - if (bufferSize == 1) { - buffer[0] = '\0'; - return 0; - } - /* deconstruct the floating point value (we ignore the intbit) */ floatMantissa = value.lo & bitmask_u64(63); floatExponent = value.hi & bitmask_u32(15); @@ -2748,7 +2713,7 @@ Dragon4_PrintFloat_IEEE_binary128( Dragon4_Scratch *scratch, FloatVal128 val128, Dragon4_Options *opt) { char *buffer = scratch->repr; - npy_uint32 bufferSize = sizeof(scratch->repr); + const npy_uint32 bufferSize = sizeof(scratch->repr); BigInt *bigints = scratch->bigints; npy_uint32 floatExponent, floatSign; @@ -2759,15 +2724,6 @@ Dragon4_PrintFloat_IEEE_binary128( npy_bool hasUnequalMargins; char signbit = '\0'; - if (bufferSize == 0) { - return 0; - } - - if (bufferSize == 1) { - buffer[0] = '\0'; - return 0; - } - mantissa_hi = val128.hi & bitmask_u64(48); mantissa_lo = val128.lo; floatExponent = (val128.hi >> 48) & bitmask_u32(15); @@ -2917,7 +2873,7 @@ Dragon4_PrintFloat_IBM_double_double( Dragon4_Scratch *scratch, npy_float128 *value, Dragon4_Options *opt) { char *buffer = scratch->repr; - npy_uint32 bufferSize = sizeof(scratch->repr); + const npy_uint32 bufferSize = sizeof(scratch->repr); BigInt *bigints = scratch->bigints; FloatVal128 val128; @@ -2934,15 +2890,6 @@ Dragon4_PrintFloat_IBM_double_double( npy_bool hasUnequalMargins; char signbit = '\0'; - if (bufferSize == 0) { - return 0; - } - - if (bufferSize == 1) { - buffer[0] = '\0'; - return 0; - } - /* The high part always comes before the low part, regardless of the * endianness of the system. 
*/ buf128.floatingPoint = *value; diff --git a/numpy/core/src/multiarray/dragon4.h b/numpy/core/src/multiarray/dragon4.h index 4b76bf9e5..e3325bfa2 100644 --- a/numpy/core/src/multiarray/dragon4.h +++ b/numpy/core/src/multiarray/dragon4.h @@ -29,12 +29,11 @@ * Ryan Juckett's original code was under the Zlib license; he gave numpy * permission to include it under the MIT license instead. */ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_DRAGON4_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_DRAGON4_H_ -#ifndef _NPY_DRAGON4_H_ -#define _NPY_DRAGON4_H_ - -#include "Python.h" -#include "structmember.h" +#include <Python.h> +#include <structmember.h> #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE #include "numpy/arrayobject.h" @@ -136,5 +135,4 @@ Dragon4_Scientific(PyObject *obj, DigitMode digit_mode, int precision, int min_digits, int sign, TrimMode trim, int pad_left, int exp_digits); -#endif - +#endif /* NUMPY_CORE_SRC_MULTIARRAY_DRAGON4_H_ */ diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c index 50db627ea..8fb44c4f6 100644 --- a/numpy/core/src/multiarray/dtype_transfer.c +++ b/numpy/core/src/multiarray/dtype_transfer.c @@ -7,16 +7,16 @@ * The University of British Columbia * * See LICENSE.txt for the license. 
- + * */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN -#include "Python.h" -#include "structmember.h" +#include <Python.h> +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include <numpy/arrayobject.h> +#include "numpy/arrayobject.h" #include "lowlevel_strided_loops.h" #include "npy_pycompat.h" diff --git a/numpy/core/src/multiarray/dtype_transfer.h b/numpy/core/src/multiarray/dtype_transfer.h index e29ac40b8..c7e0a029f 100644 --- a/numpy/core/src/multiarray/dtype_transfer.h +++ b/numpy/core/src/multiarray/dtype_transfer.h @@ -1,5 +1,5 @@ -#ifndef _NPY_DTYPE_TRANSFER_H -#define _NPY_DTYPE_TRANSFER_H +#ifndef NUMPY_CORE_SRC_MULTIARRAY_DTYPE_TRANSFER_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_DTYPE_TRANSFER_H_ #include "array_method.h" @@ -202,4 +202,4 @@ get_wrapped_legacy_cast_function(int aligned, int *out_needs_api, int allow_wrapped); -#endif /* _NPY_DTYPE_TRANSFER_H */ +#endif /* NUMPY_CORE_SRC_MULTIARRAY_DTYPE_TRANSFER_H_ */ diff --git a/numpy/core/src/multiarray/dtypemeta.c b/numpy/core/src/multiarray/dtypemeta.c index 597468c50..cd489d5e7 100644 --- a/numpy/core/src/multiarray/dtypemeta.c +++ b/numpy/core/src/multiarray/dtypemeta.c @@ -1,12 +1,11 @@ /* Array Descr Object */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" -#include "assert.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include <numpy/ndarraytypes.h> #include <numpy/arrayscalars.h> #include "npy_pycompat.h" @@ -19,6 +18,7 @@ #include "convert_datatype.h" #include "usertypes.h" +#include <assert.h> static void dtypemeta_dealloc(PyArray_DTypeMeta *self) { @@ -101,7 +101,7 @@ static PyObject * legacy_dtype_default_new(PyArray_DTypeMeta *self, PyObject *args, PyObject *kwargs) { - /* TODO: This should allow endianess and possibly metadata */ + 
/* TODO: This should allow endianness and possibly metadata */ if (NPY_DT_is_parametric(self)) { /* reject parametric ones since we would need to get unit, etc. info */ PyErr_Format(PyExc_TypeError, @@ -290,7 +290,7 @@ void_common_instance(PyArray_Descr *descr1, PyArray_Descr *descr2) return descr1; } -static int +NPY_NO_EXPORT int python_builtins_are_known_scalar_types( PyArray_DTypeMeta *NPY_UNUSED(cls), PyTypeObject *pytype) { diff --git a/numpy/core/src/multiarray/dtypemeta.h b/numpy/core/src/multiarray/dtypemeta.h index 200111ac2..2a61fe39d 100644 --- a/numpy/core/src/multiarray/dtypemeta.h +++ b/numpy/core/src/multiarray/dtypemeta.h @@ -1,5 +1,5 @@ -#ifndef _NPY_DTYPEMETA_H -#define _NPY_DTYPEMETA_H +#ifndef NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_ /* DType flags, currently private, since we may just expose functions */ @@ -8,6 +8,35 @@ #define NPY_DT_PARAMETRIC 1 << 2 +typedef PyArray_Descr *(discover_descr_from_pyobject_function)( + PyArray_DTypeMeta *cls, PyObject *obj); + +/* + * Before making this public, we should decide whether it should pass + * the type, or allow looking at the object. A possible use-case: + * `np.array(np.array([0]), dtype=np.ndarray)` + * Could consider arrays that are not `dtype=ndarray` "scalars". + */ +typedef int (is_known_scalar_type_function)( + PyArray_DTypeMeta *cls, PyTypeObject *obj); + +typedef PyArray_Descr *(default_descr_function)(PyArray_DTypeMeta *cls); +typedef PyArray_DTypeMeta *(common_dtype_function)( + PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2); +typedef PyArray_Descr *(common_instance_function)( + PyArray_Descr *dtype1, PyArray_Descr *dtype2); + +/* + * TODO: These two functions are currently only used for experimental DType + * API support. Their relation should be "reversed": NumPy should + * always use them internally. + * There are open points about "casting safety" though, e.g. setting + * elements is currently always unsafe. 
+ */ +typedef int(setitemfunction)(PyArray_Descr *, PyObject *, char *); +typedef PyObject *(getitemfunction)(PyArray_Descr *, char *); + + typedef struct { /* DType methods, these could be moved into its own struct */ discover_descr_from_pyobject_function *discover_descr_from_pyobject; @@ -16,6 +45,12 @@ typedef struct { common_dtype_function *common_dtype; common_instance_function *common_instance; /* + * Currently only used for experimental user DTypes. + * Typing as `void *` until NumPy itself uses these (directly). + */ + setitemfunction *setitem; + getitemfunction *getitem; + /* * The casting implementation (ArrayMethod) to convert between two * instances of this DType, stored explicitly for fast access: */ @@ -39,9 +74,9 @@ typedef struct { #define NPY_DTYPE(descr) ((PyArray_DTypeMeta *)Py_TYPE(descr)) #define NPY_DT_SLOTS(dtype) ((NPY_DType_Slots *)(dtype)->dt_slots) -#define NPY_DT_is_legacy(dtype) ((dtype)->flags & NPY_DT_LEGACY) -#define NPY_DT_is_abstract(dtype) ((dtype)->flags & NPY_DT_ABSTRACT) -#define NPY_DT_is_parametric(dtype) ((dtype)->flags & NPY_DT_PARAMETRIC) +#define NPY_DT_is_legacy(dtype) (((dtype)->flags & NPY_DT_LEGACY) != 0) +#define NPY_DT_is_abstract(dtype) (((dtype)->flags & NPY_DT_ABSTRACT) != 0) +#define NPY_DT_is_parametric(dtype) (((dtype)->flags & NPY_DT_PARAMETRIC) != 0) /* * Macros for convenient classmethod calls, since these require @@ -58,7 +93,10 @@ typedef struct { NPY_DT_SLOTS(dtype)->default_descr(dtype) #define NPY_DT_CALL_common_dtype(dtype, other) \ NPY_DT_SLOTS(dtype)->common_dtype(dtype, other) - +#define NPY_DT_CALL_getitem(descr, data_ptr) \ + NPY_DT_SLOTS(NPY_DTYPE(descr))->getitem(descr, data_ptr) +#define NPY_DT_CALL_setitem(descr, value, data_ptr) \ + NPY_DT_SLOTS(NPY_DTYPE(descr))->setitem(descr, value, data_ptr) /* * This function will hopefully be phased out or replaced, but was convenient @@ -78,6 +116,10 @@ PyArray_DTypeFromTypeNum(int typenum) NPY_NO_EXPORT int +python_builtins_are_known_scalar_types( + 
PyArray_DTypeMeta *cls, PyTypeObject *pytype); + +NPY_NO_EXPORT int dtypemeta_wrap_legacy_descriptor(PyArray_Descr *dtypem); -#endif /*_NPY_DTYPEMETA_H */ +#endif /* NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_ */ diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index 85806fab3..cd1a58982 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -9,8 +9,8 @@ */ #define PY_SSIZE_T_CLEAN -#include "Python.h" -#include "structmember.h" +#include <Python.h> +#include <structmember.h> #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE diff --git a/numpy/core/src/multiarray/einsum_debug.h b/numpy/core/src/multiarray/einsum_debug.h index 9aa81fcbd..964964743 100644 --- a/numpy/core/src/multiarray/einsum_debug.h +++ b/numpy/core/src/multiarray/einsum_debug.h @@ -6,8 +6,8 @@ * * See LICENSE.txt for the license. */ -#ifndef _NPY_MULTIARRAY_EINSUM_DEBUG_H -#define _NPY_MULTIARRAY_EINSUM_DEBUG_H +#ifndef NUMPY_CORE_SRC_MULTIARRAY_EINSUM_DEBUG_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_EINSUM_DEBUG_H_ /********** PRINTF DEBUG TRACING **************/ #define NPY_EINSUM_DBG_TRACING 0 @@ -25,4 +25,4 @@ #define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) #endif -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_EINSUM_DEBUG_H_ */ diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index 333b8e188..29ceabd71 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -80,7 +80,7 @@ static NPY_GCC_OPT_3 @temptype@ @name@_sum_of_arr(@type@ *data, npy_intp count) /* Use aligned instructions if possible */ const int is_aligned = EINSUM_IS_ALIGNED(data); const int vstep = npyv_nlanes_@sfx@; - npyv_@sfx@ vaccum = npyv_zero_@sfx@(); + npyv_@sfx@ v_accum = npyv_zero_@sfx@(); const npy_intp vstepx4 = vstep * 4; /**begin repeat1 @@ -98,15 +98,15 @@ static NPY_GCC_OPT_3 @temptype@ 
@name@_sum_of_arr(@type@ *data, npy_intp count) npyv_@sfx@ a01 = npyv_add_@sfx@(a0, a1); npyv_@sfx@ a23 = npyv_add_@sfx@(a2, a3); npyv_@sfx@ a0123 = npyv_add_@sfx@(a01, a23); - vaccum = npyv_add_@sfx@(a0123, vaccum); + v_accum = npyv_add_@sfx@(a0123, v_accum); } } /**end repeat1**/ for (; count > 0; count -= vstep, data += vstep) { npyv_@sfx@ a = npyv_load_tillz_@sfx@(data, count); - vaccum = npyv_add_@sfx@(a, vaccum); + v_accum = npyv_add_@sfx@(a, v_accum); } - accum = npyv_sum_@sfx@(vaccum); + accum = npyv_sum_@sfx@(v_accum); npyv_cleanup(); #else #ifndef NPY_DISABLE_OPTIMIZATION @@ -485,7 +485,7 @@ static NPY_GCC_OPT_3 void /* Use aligned instructions if possible */ const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1); const int vstep = npyv_nlanes_@sfx@; - npyv_@sfx@ vaccum = npyv_zero_@sfx@(); + npyv_@sfx@ v_accum = npyv_zero_@sfx@(); /**begin repeat2 * #cond = if(is_aligned), else# @@ -501,19 +501,19 @@ static NPY_GCC_OPT_3 void npyv_@sfx@ a@i@ = npyv_@ld@_@sfx@(data0 + vstep * @i@); npyv_@sfx@ b@i@ = npyv_@ld@_@sfx@(data1 + vstep * @i@); /**end repeat3**/ - npyv_@sfx@ ab3 = npyv_muladd_@sfx@(a3, b3, vaccum); + npyv_@sfx@ ab3 = npyv_muladd_@sfx@(a3, b3, v_accum); npyv_@sfx@ ab2 = npyv_muladd_@sfx@(a2, b2, ab3); npyv_@sfx@ ab1 = npyv_muladd_@sfx@(a1, b1, ab2); - vaccum = npyv_muladd_@sfx@(a0, b0, ab1); + v_accum = npyv_muladd_@sfx@(a0, b0, ab1); } } /**end repeat2**/ for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) { npyv_@sfx@ a = npyv_load_tillz_@sfx@(data0, count); npyv_@sfx@ b = npyv_load_tillz_@sfx@(data1, count); - vaccum = npyv_muladd_@sfx@(a, b, vaccum); + v_accum = npyv_muladd_@sfx@(a, b, v_accum); } - accum = npyv_sum_@sfx@(vaccum); + accum = npyv_sum_@sfx@(v_accum); npyv_cleanup(); #else #ifndef NPY_DISABLE_OPTIMIZATION diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h index c6cf18ec6..29ddaea14 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.h +++ 
b/numpy/core/src/multiarray/einsum_sumprod.h @@ -1,5 +1,5 @@ -#ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H -#define _NPY_MULTIARRAY_EINSUM_SUMPROD_H +#ifndef NUMPY_CORE_SRC_MULTIARRAY_EINSUM_SUMPROD_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_EINSUM_SUMPROD_H_ #include <numpy/npy_common.h> @@ -9,4 +9,4 @@ NPY_VISIBILITY_HIDDEN sum_of_products_fn get_sum_of_products_function(int nop, int type_num, npy_intp itemsize, npy_intp const *fixed_strides); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_EINSUM_SUMPROD_H_ */ diff --git a/numpy/core/src/multiarray/experimental_public_dtype_api.c b/numpy/core/src/multiarray/experimental_public_dtype_api.c new file mode 100644 index 000000000..ef5030471 --- /dev/null +++ b/numpy/core/src/multiarray/experimental_public_dtype_api.c @@ -0,0 +1,392 @@ +#include <Python.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#include <numpy/npy_common.h> +#include "numpy/arrayobject.h" +#include "numpy/ufuncobject.h" +#include "common.h" + +#include "experimental_public_dtype_api.h" +#include "array_method.h" +#include "dtypemeta.h" +#include "array_coercion.h" +#include "convert_datatype.h" +#include "common_dtype.h" + + +#define EXPERIMENTAL_DTYPE_API_VERSION 2 + + +typedef struct{ + PyTypeObject *typeobj; /* type of python scalar or NULL */ + int flags; /* flags, including parametric and abstract */ + /* NULL terminated cast definitions. Use NULL for the newly created DType */ + PyArrayMethod_Spec **casts; + PyType_Slot *slots; +} PyArrayDTypeMeta_Spec; + + + +static PyArray_DTypeMeta * +dtype_does_not_promote( + PyArray_DTypeMeta *NPY_UNUSED(self), PyArray_DTypeMeta *NPY_UNUSED(other)) +{ + /* `other` is guaranteed not to be `self`, so we don't have to do much... 
*/ + Py_INCREF(Py_NotImplemented); + return (PyArray_DTypeMeta *)Py_NotImplemented; +} + + +static PyArray_Descr * +discover_as_default(PyArray_DTypeMeta *cls, PyObject *NPY_UNUSED(obj)) +{ + return NPY_DT_CALL_default_descr(cls); +} + + +static PyArray_Descr * +use_new_as_default(PyArray_DTypeMeta *self) +{ + PyObject *res = PyObject_CallObject((PyObject *)self, NULL); + if (res == NULL) { + return NULL; + } + /* + * Lets not trust that the DType is implemented correctly + * TODO: Should probably do an exact type-check (at least unless this is + * an abstract DType). + */ + if (!PyArray_DescrCheck(res)) { + PyErr_Format(PyExc_RuntimeError, + "Instantiating %S did not return a dtype instance, this is " + "invalid (especially without a custom `default_descr()`).", + self); + Py_DECREF(res); + return NULL; + } + PyArray_Descr *descr = (PyArray_Descr *)res; + /* + * Should probably do some more sanity checks here on the descriptor + * to ensure the user is not being naughty. But in the end, we have + * only limited control anyway. + */ + return descr; +} + + +static int +legacy_setitem_using_DType(PyObject *obj, void *data, void *arr) +{ + if (arr == NULL) { + PyErr_SetString(PyExc_RuntimeError, + "Using legacy SETITEM with NULL array object is only " + "supported for basic NumPy DTypes."); + return -1; + } + setitemfunction *setitem; + setitem = NPY_DT_SLOTS(NPY_DTYPE(PyArray_DESCR(arr)))->setitem; + return setitem(PyArray_DESCR(arr), obj, data); +} + + +static PyObject * +legacy_getitem_using_DType(void *data, void *arr) +{ + if (arr == NULL) { + PyErr_SetString(PyExc_RuntimeError, + "Using legacy SETITEM with NULL array object is only " + "supported for basic NumPy DTypes."); + return NULL; + } + getitemfunction *getitem; + getitem = NPY_DT_SLOTS(NPY_DTYPE(PyArray_DESCR(arr)))->getitem; + return getitem(PyArray_DESCR(arr), data); +} + + +/* + * The descr->f structure used user-DTypes. 
Some functions may be filled + * from the user in the future and more could get defaults for compatibility. + */ +PyArray_ArrFuncs default_funcs = { + .setitem = &legacy_setitem_using_DType, + .getitem = &legacy_getitem_using_DType +}; + + +/* other slots are in order, so keep only last around: */ +#define NUM_DTYPE_SLOTS 7 + + +int +PyArrayInitDTypeMeta_FromSpec( + PyArray_DTypeMeta *DType, PyArrayDTypeMeta_Spec *spec) +{ + if (!PyObject_TypeCheck(DType, &PyArrayDTypeMeta_Type)) { + PyErr_SetString(PyExc_RuntimeError, + "Passed in DType must be a valid (initialized) DTypeMeta " + "instance!"); + return -1; + } + + if (spec->typeobj == NULL || !PyType_Check(spec->typeobj)) { + PyErr_SetString(PyExc_TypeError, + "Not giving a type object is currently not supported, but " + "is expected to be supported eventually. This would mean " + "that e.g. indexing a NumPy array will return a 0-D array " + "and not a scalar."); + return -1; + } + + if (DType->dt_slots != NULL) { + PyErr_Format(PyExc_RuntimeError, + "DType %R appears already registered?", DType); + return -1; + } + + /* Check and handle flags: */ + if (spec->flags & ~(NPY_DT_PARAMETRIC|NPY_DT_ABSTRACT)) { + PyErr_SetString(PyExc_RuntimeError, + "invalid DType flags specified, only parametric and abstract " + "are valid flags for user DTypes."); + return -1; + } + + DType->flags = spec->flags; + DType->dt_slots = PyMem_Calloc(1, sizeof(NPY_DType_Slots)); + if (DType->dt_slots == NULL) { + return -1; + } + + /* Set default values (where applicable) */ + NPY_DT_SLOTS(DType)->discover_descr_from_pyobject = &discover_as_default; + NPY_DT_SLOTS(DType)->is_known_scalar_type = ( + &python_builtins_are_known_scalar_types); + NPY_DT_SLOTS(DType)->default_descr = use_new_as_default; + NPY_DT_SLOTS(DType)->common_dtype = dtype_does_not_promote; + /* May need a default for non-parametric? 
*/ + NPY_DT_SLOTS(DType)->common_instance = NULL; + NPY_DT_SLOTS(DType)->setitem = NULL; + NPY_DT_SLOTS(DType)->getitem = NULL; + + PyType_Slot *spec_slot = spec->slots; + while (1) { + int slot = spec_slot->slot; + void *pfunc = spec_slot->pfunc; + spec_slot++; + if (slot == 0) { + break; + } + if (slot > NUM_DTYPE_SLOTS || slot < 0) { + PyErr_Format(PyExc_RuntimeError, + "Invalid slot with value %d passed in.", slot); + return -1; + } + /* + * It is up to the user to get this right, and slots are sorted + * exactly like they are stored right now: + */ + void **current = (void **)(&( + NPY_DT_SLOTS(DType)->discover_descr_from_pyobject)); + current += slot - 1; + *current = pfunc; + } + if (NPY_DT_SLOTS(DType)->setitem == NULL + || NPY_DT_SLOTS(DType)->getitem == NULL) { + PyErr_SetString(PyExc_RuntimeError, + "A DType must provide a getitem/setitem (there may be an " + "exception here in the future if no scalar type is provided)"); + return -1; + } + + /* + * Now that the spec is read we can check that all required functions were + * defined by the user. + */ + if (spec->flags & NPY_DT_PARAMETRIC) { + if (NPY_DT_SLOTS(DType)->common_instance == NULL || + NPY_DT_SLOTS(DType)->discover_descr_from_pyobject + == &discover_as_default) { + PyErr_SetString(PyExc_RuntimeError, + "Parametric DType must define a common-instance and " + "descriptor discovery function!"); + return -1; + } + } + NPY_DT_SLOTS(DType)->f = default_funcs; + /* invalid type num. Ideally, we get away with it! */ + DType->type_num = -1; + + /* + * Handle the scalar type mapping. 
+ */ + Py_INCREF(spec->typeobj); + DType->scalar_type = spec->typeobj; + if (PyType_GetFlags(spec->typeobj) & Py_TPFLAGS_HEAPTYPE) { + if (PyObject_SetAttrString((PyObject *)DType->scalar_type, + "__associated_array_dtype__", (PyObject *)DType) < 0) { + Py_DECREF(DType); + return -1; + } + } + if (_PyArray_MapPyTypeToDType(DType, DType->scalar_type, 0) < 0) { + Py_DECREF(DType); + return -1; + } + + /* Ensure cast dict is defined (not sure we have to do it here) */ + NPY_DT_SLOTS(DType)->castingimpls = PyDict_New(); + if (NPY_DT_SLOTS(DType)->castingimpls == NULL) { + return -1; + } + /* + * And now, register all the casts that are currently defined! + */ + PyArrayMethod_Spec **next_meth_spec = spec->casts; + while (1) { + PyArrayMethod_Spec *meth_spec = *next_meth_spec; + next_meth_spec++; + if (meth_spec == NULL) { + break; + } + /* + * The user doesn't know the name of DType yet, so we have to fill it + * in for them! + */ + for (int i=0; i < meth_spec->nin + meth_spec->nout; i++) { + if (meth_spec->dtypes[i] == NULL) { + meth_spec->dtypes[i] = DType; + } + } + /* Register the cast! */ + int res = PyArray_AddCastingImplementation_FromSpec(meth_spec, 0); + + /* Also clean up again, so nobody can get bad ideas... */ + for (int i=0; i < meth_spec->nin + meth_spec->nout; i++) { + if (meth_spec->dtypes[i] == DType) { + meth_spec->dtypes[i] = NULL; + } + } + + if (res < 0) { + return -1; + } + } + + if (NPY_DT_SLOTS(DType)->within_dtype_castingimpl == NULL) { + /* + * We expect this for now. We should have a default for DType that + * only supports simple copy (and possibly byte-order assuming that + * they swap the full itemsize). + */ + PyErr_SetString(PyExc_RuntimeError, + "DType must provide a function to cast (or just copy) between " + "its own instances!"); + return -1; + } + + /* And finally, we have to register all the casts! 
*/ + return 0; +} + + +/* Function is defined in umath/dispatching.c (same/one compilation unit) */ +NPY_NO_EXPORT int +PyUFunc_AddLoop(PyUFuncObject *ufunc, PyObject *info, int ignore_duplicate); + +static int +PyUFunc_AddLoopFromSpec(PyObject *ufunc, PyArrayMethod_Spec *spec) +{ + if (!PyObject_TypeCheck(ufunc, &PyUFunc_Type)) { + PyErr_SetString(PyExc_TypeError, + "ufunc object passed is not a ufunc!"); + return -1; + } + PyBoundArrayMethodObject *bmeth = + (PyBoundArrayMethodObject *)PyArrayMethod_FromSpec(spec); + if (bmeth == NULL) { + return -1; + } + int nargs = bmeth->method->nin + bmeth->method->nout; + PyObject *dtypes = PyArray_TupleFromItems( + nargs, (PyObject **)bmeth->dtypes, 1); + if (dtypes == NULL) { + return -1; + } + PyObject *info = PyTuple_Pack(2, dtypes, bmeth->method); + Py_DECREF(bmeth); + Py_DECREF(dtypes); + if (info == NULL) { + return -1; + } + return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0); +} + + +static int +PyUFunc_AddPromoter( + PyObject *ufunc, PyObject *DType_tuple, PyObject *promoter) +{ + if (!PyObject_TypeCheck(ufunc, &PyUFunc_Type)) { + PyErr_SetString(PyExc_TypeError, + "ufunc object passed is not a ufunc!"); + return -1; + } + if (!PyCapsule_CheckExact(promoter)) { + PyErr_SetString(PyExc_TypeError, + "promoter must (currently) be a PyCapsule."); + return -1; + } + if (PyCapsule_GetPointer(promoter, "numpy._ufunc_promoter") == NULL) { + return -1; + } + PyObject *info = PyTuple_Pack(2, DType_tuple, promoter); + if (info == NULL) { + return -1; + } + return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0); +} + + +NPY_NO_EXPORT PyObject * +_get_experimental_dtype_api(PyObject *NPY_UNUSED(mod), PyObject *arg) +{ + static void *experimental_api_table[] = { + &PyUFunc_AddLoopFromSpec, + &PyUFunc_AddPromoter, + &PyArrayDTypeMeta_Type, + &PyArrayInitDTypeMeta_FromSpec, + &PyArray_CommonDType, + &PyArray_PromoteDTypeSequence, + NULL, + }; + + char *env = getenv("NUMPY_EXPERIMENTAL_DTYPE_API"); + if (env == NULL || 
strcmp(env, "1") != 0) { + PyErr_Format(PyExc_RuntimeError, + "The new DType API is currently in an exploratory phase and " + "should NOT be used for production code. " + "Expect modifications and crashes! " + "To experiment with the new API you must set " + "`NUMPY_EXPERIMENTAL_DTYPE_API=1` as an environment variable."); + return NULL; + } + + long version = PyLong_AsLong(arg); + if (error_converting(version)) { + return NULL; + } + if (version != EXPERIMENTAL_DTYPE_API_VERSION) { + PyErr_Format(PyExc_RuntimeError, + "Experimental DType API version %d requested, but NumPy " + "is exporting version %d. Recompile your DType and/or upgrade " + "NumPy to match.", + version, EXPERIMENTAL_DTYPE_API_VERSION); + return NULL; + } + + return PyCapsule_New(&experimental_api_table, + "experimental_dtype_api_table", NULL); +} diff --git a/numpy/core/src/multiarray/experimental_public_dtype_api.h b/numpy/core/src/multiarray/experimental_public_dtype_api.h new file mode 100644 index 000000000..270cb82bf --- /dev/null +++ b/numpy/core/src/multiarray/experimental_public_dtype_api.h @@ -0,0 +1,18 @@ +/* + * This file exports the experimental dtype API as exposed via the + * `numpy/core/include/numpy/experimental_dtype_api.h` + * header file. + * + * This file is a stub, all important definitions are in the code file. + * + * NOTE: This file is considered in-flux, exploratory and transitional. 
+ */ + +#ifndef NUMPY_CORE_SRC_MULTIARRAY_EXPERIMENTAL_PUBLIC_DTYPE_API_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_EXPERIMENTAL_PUBLIC_DTYPE_API_H_ + +NPY_NO_EXPORT PyObject * +_get_experimental_dtype_api(PyObject *mod, PyObject *arg); + + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_EXPERIMENTAL_PUBLIC_DTYPE_API_H_ */ diff --git a/numpy/core/src/multiarray/flagsobject.c b/numpy/core/src/multiarray/flagsobject.c index fe73c18ee..3b1b4f406 100644 --- a/numpy/core/src/multiarray/flagsobject.c +++ b/numpy/core/src/multiarray/flagsobject.c @@ -1,11 +1,11 @@ /* Array Flags Object */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "arrayobject.h" #include "numpy/arrayscalars.h" diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c index de2a8c14e..e81ca2947 100644 --- a/numpy/core/src/multiarray/getset.c +++ b/numpy/core/src/multiarray/getset.c @@ -1,11 +1,11 @@ /* Array Descr Object */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "npy_config.h" @@ -384,7 +384,23 @@ array_data_set(PyArrayObject *self, PyObject *op, void *NPY_UNUSED(ignored)) } if (PyArray_FLAGS(self) & NPY_ARRAY_OWNDATA) { PyArray_XDECREF(self); - PyDataMem_FREE(PyArray_DATA(self)); + size_t nbytes = PyArray_NBYTES(self); + /* + * Allocation will never be 0, see comment in ctors.c + * line 820 + */ + if (nbytes == 0) { + PyArray_Descr *dtype = PyArray_DESCR(self); + nbytes = dtype->elsize ? 
dtype->elsize : 1; + } + PyObject *handler = PyArray_HANDLER(self); + if (handler == NULL) { + /* This can happen if someone arbitrarily sets NPY_ARRAY_OWNDATA */ + PyErr_SetString(PyExc_RuntimeError, + "no memory handler found but OWNDATA flag set"); + return -1; + } + PyDataMem_UserFREE(PyArray_DATA(self), nbytes, handler); } if (PyArray_BASE(self)) { if ((PyArray_FLAGS(self) & NPY_ARRAY_WRITEBACKIFCOPY) || diff --git a/numpy/core/src/multiarray/getset.h b/numpy/core/src/multiarray/getset.h index 4f1209de5..a95c98020 100644 --- a/numpy/core/src/multiarray/getset.h +++ b/numpy/core/src/multiarray/getset.h @@ -1,6 +1,6 @@ -#ifndef _NPY_ARRAY_GETSET_H_ -#define _NPY_ARRAY_GETSET_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_GETSET_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_GETSET_H_ extern NPY_NO_EXPORT PyGetSetDef array_getsetlist[]; -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_GETSET_H_ */ diff --git a/numpy/core/src/multiarray/hashdescr.c b/numpy/core/src/multiarray/hashdescr.c index e9a99cc8f..a3c9e986b 100644 --- a/numpy/core/src/multiarray/hashdescr.c +++ b/numpy/core/src/multiarray/hashdescr.c @@ -1,7 +1,9 @@ -#define PY_SSIZE_T_CLEAN -#include <Python.h> #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE + +#define PY_SSIZE_T_CLEAN +#include <Python.h> + #include <numpy/arrayobject.h> #include "npy_config.h" diff --git a/numpy/core/src/multiarray/hashdescr.h b/numpy/core/src/multiarray/hashdescr.h index 8d577e7b0..97375b4af 100644 --- a/numpy/core/src/multiarray/hashdescr.h +++ b/numpy/core/src/multiarray/hashdescr.h @@ -1,7 +1,7 @@ -#ifndef _NPY_HASHDESCR_H_ -#define _NPY_HASHDESCR_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_HASHDESCR_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_HASHDESCR_H_ NPY_NO_EXPORT npy_hash_t PyArray_DescrHash(PyObject* odescr); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_HASHDESCR_H_ */ diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index 2b8ea9e79..086b674c8 100644 --- 
a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -1,10 +1,10 @@ -#define PY_SSIZE_T_CLEAN -#include <Python.h> -#include "structmember.h" - #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#include <structmember.h> + #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" @@ -776,6 +776,7 @@ PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis) return NULL; } + /*NUMPY_API */ NPY_NO_EXPORT PyObject * @@ -907,7 +908,7 @@ PyArray_Choose(PyArrayObject *ip, PyObject *op, PyArrayObject *out, Py_XDECREF(mps[i]); } Py_DECREF(ap); - npy_free_cache(mps, n * sizeof(mps[0])); + PyDataMem_FREE(mps); if (out != NULL && out != obj) { Py_INCREF(out); PyArray_ResolveWritebackIfCopy(obj); @@ -922,7 +923,7 @@ PyArray_Choose(PyArrayObject *ip, PyObject *op, PyArrayObject *out, Py_XDECREF(mps[i]); } Py_XDECREF(ap); - npy_free_cache(mps, n * sizeof(mps[0])); + PyDataMem_FREE(mps); PyArray_DiscardWritebackIfCopy(obj); Py_XDECREF(obj); return NULL; @@ -962,14 +963,19 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort, return 0; } + PyObject *mem_handler = PyDataMem_GetHandler(); + if (mem_handler == NULL) { + return -1; + } it = (PyArrayIterObject *)PyArray_IterAllButAxis((PyObject *)op, &axis); if (it == NULL) { + Py_DECREF(mem_handler); return -1; } size = it->size; if (needcopy) { - buffer = npy_alloc_cache(N * elsize); + buffer = PyDataMem_UserNEW(N * elsize, mem_handler); if (buffer == NULL) { ret = -1; goto fail; @@ -1053,12 +1059,14 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort, fail: NPY_END_THREADS_DESCR(PyArray_DESCR(op)); - npy_free_cache(buffer, N * elsize); + /* cleanup internal buffer */ + PyDataMem_UserFREE(buffer, N * elsize, mem_handler); if (ret < 0 && !PyErr_Occurred()) { /* Out of memory during sorting or buffer creation */ PyErr_NoMemory(); } Py_DECREF(it); + Py_DECREF(mem_handler); return ret; } 
@@ -1090,11 +1098,16 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort, NPY_BEGIN_THREADS_DEF; + PyObject *mem_handler = PyDataMem_GetHandler(); + if (mem_handler == NULL) { + return NULL; + } rop = (PyArrayObject *)PyArray_NewFromDescr( Py_TYPE(op), PyArray_DescrFromType(NPY_INTP), PyArray_NDIM(op), PyArray_DIMS(op), NULL, NULL, 0, (PyObject *)op); if (rop == NULL) { + Py_DECREF(mem_handler); return NULL; } rstride = PyArray_STRIDE(rop, axis); @@ -1102,6 +1115,7 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort, /* Check if there is any argsorting to do */ if (N <= 1 || PyArray_SIZE(op) == 0) { + Py_DECREF(mem_handler); memset(PyArray_DATA(rop), 0, PyArray_NBYTES(rop)); return (PyObject *)rop; } @@ -1115,7 +1129,7 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort, size = it->size; if (needcopy) { - valbuffer = npy_alloc_cache(N * elsize); + valbuffer = PyDataMem_UserNEW(N * elsize, mem_handler); if (valbuffer == NULL) { ret = -1; goto fail; @@ -1123,7 +1137,8 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort, } if (needidxbuffer) { - idxbuffer = (npy_intp *)npy_alloc_cache(N * sizeof(npy_intp)); + idxbuffer = (npy_intp *)PyDataMem_UserNEW(N * sizeof(npy_intp), + mem_handler); if (idxbuffer == NULL) { ret = -1; goto fail; @@ -1212,8 +1227,9 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort, fail: NPY_END_THREADS_DESCR(PyArray_DESCR(op)); - npy_free_cache(valbuffer, N * elsize); - npy_free_cache(idxbuffer, N * sizeof(npy_intp)); + /* cleanup internal buffers */ + PyDataMem_UserFREE(valbuffer, N * elsize, mem_handler); + PyDataMem_UserFREE(idxbuffer, N * sizeof(npy_intp), mem_handler); if (ret < 0) { if (!PyErr_Occurred()) { /* Out of memory during sorting or buffer creation */ @@ -1224,6 +1240,7 @@ fail: } Py_XDECREF(it); Py_XDECREF(rit); + Py_DECREF(mem_handler); return (PyObject *)rop; } @@ -1292,7 +1309,15 @@ 
partition_prep_kth_array(PyArrayObject * ktharray, npy_intp * kth; npy_intp nkth, i; - if (!PyArray_CanCastSafely(PyArray_TYPE(ktharray), NPY_INTP)) { + if (PyArray_ISBOOL(ktharray)) { + /* 2021-09-29, NumPy 1.22 */ + if (DEPRECATE( + "Passing booleans as partition index is deprecated" + " (warning added in NumPy 1.22)") < 0) { + return NULL; + } + } + else if (!PyArray_ISINTEGER(ktharray)) { PyErr_Format(PyExc_TypeError, "Partition index must be integer"); return NULL; } @@ -2390,19 +2415,14 @@ PyArray_CountNonzero(PyArrayObject *self) npy_intp *strideptr, *innersizeptr; NPY_BEGIN_THREADS_DEF; - // Special low-overhead version specific to the boolean/int types dtype = PyArray_DESCR(self); - switch(dtype->kind) { - case 'u': - case 'i': - case 'b': - if (dtype->elsize > 8) { - break; - } - return count_nonzero_int( - PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self), - PyArray_STRIDES(self), dtype->elsize - ); + /* Special low-overhead version specific to the boolean/int types */ + if (PyArray_ISALIGNED(self) && ( + PyDataType_ISBOOL(dtype) || PyDataType_ISINTEGER(dtype))) { + return count_nonzero_int( + PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self), + PyArray_STRIDES(self), dtype->elsize + ); } nonzero = PyArray_DESCR(self)->f->nonzero; diff --git a/numpy/core/src/multiarray/item_selection.h b/numpy/core/src/multiarray/item_selection.h index c1c8b5567..40d9eb298 100644 --- a/numpy/core/src/multiarray/item_selection.h +++ b/numpy/core/src/multiarray/item_selection.h @@ -1,5 +1,5 @@ -#ifndef _NPY_PRIVATE__ITEM_SELECTION_H_ -#define _NPY_PRIVATE__ITEM_SELECTION_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_ITEM_SELECTION_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_ITEM_SELECTION_H_ /* * Counts the number of True values in a raw boolean array. 
This @@ -27,4 +27,4 @@ NPY_NO_EXPORT int PyArray_MultiIndexSetItem(PyArrayObject *self, const npy_intp *multi_index, PyObject *obj); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_ITEM_SELECTION_H_ */ diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c index f724837ce..f959162fd 100644 --- a/numpy/core/src/multiarray/iterators.c +++ b/numpy/core/src/multiarray/iterators.c @@ -1,9 +1,10 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" @@ -1123,6 +1124,35 @@ NPY_NO_EXPORT PyTypeObject PyArrayIter_Type = { /** END of Array Iterator **/ + +static int +set_shape_mismatch_exception(PyArrayMultiIterObject *mit, int i1, int i2) +{ + PyObject *shape1, *shape2, *msg; + + shape1 = PyObject_GetAttrString((PyObject *) mit->iters[i1]->ao, "shape"); + if (shape1 == NULL) { + return -1; + } + shape2 = PyObject_GetAttrString((PyObject *) mit->iters[i2]->ao, "shape"); + if (shape2 == NULL) { + Py_DECREF(shape1); + return -1; + } + msg = PyUnicode_FromFormat("shape mismatch: objects cannot be broadcast " + "to a single shape. Mismatch is between arg %d " + "with shape %S and arg %d with shape %S.", + i1, shape1, i2, shape2); + Py_DECREF(shape1); + Py_DECREF(shape2); + if (msg == NULL) { + return -1; + } + PyErr_SetObject(PyExc_ValueError, msg); + Py_DECREF(msg); + return 0; +} + /* Adjust dimensionality and strides for index object iterators --- i.e. broadcast */ @@ -1131,6 +1161,7 @@ NPY_NO_EXPORT int PyArray_Broadcast(PyArrayMultiIterObject *mit) { int i, nd, k, j; + int src_iter = -1; /* Initializing avoids a compiler warning. 
*/ npy_intp tmp; PyArrayIterObject *it; @@ -1154,12 +1185,10 @@ PyArray_Broadcast(PyArrayMultiIterObject *mit) } if (mit->dimensions[i] == 1) { mit->dimensions[i] = tmp; + src_iter = j; } else if (mit->dimensions[i] != tmp) { - PyErr_SetString(PyExc_ValueError, - "shape mismatch: objects" \ - " cannot be broadcast" \ - " to a single shape"); + set_shape_mismatch_exception(mit, src_iter, j); return -1; } } diff --git a/numpy/core/src/multiarray/iterators.h b/numpy/core/src/multiarray/iterators.h index d942f45b8..883615cc9 100644 --- a/numpy/core/src/multiarray/iterators.h +++ b/numpy/core/src/multiarray/iterators.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ARRAYITERATORS_H_ -#define _NPY_ARRAYITERATORS_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_ITERATORS_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_ITERATORS_H_ NPY_NO_EXPORT PyObject *iter_subscript(PyArrayIterObject *, PyObject *); @@ -10,4 +10,4 @@ iter_ass_subscript(PyArrayIterObject *, PyObject *, PyObject *); NPY_NO_EXPORT void PyArray_RawIterBaseInit(PyArrayIterObject *it, PyArrayObject *ao); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_ITERATORS_H_ */ diff --git a/numpy/core/src/multiarray/legacy_dtype_implementation.c b/numpy/core/src/multiarray/legacy_dtype_implementation.c index 9b4946da3..72a52d7a8 100644 --- a/numpy/core/src/multiarray/legacy_dtype_implementation.c +++ b/numpy/core/src/multiarray/legacy_dtype_implementation.c @@ -6,9 +6,9 @@ * until such a time where legay user dtypes are deprecated and removed * entirely. 
*/ - #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE + #include "numpy/arrayobject.h" #include "scalartypes.h" #include "_datetime.h" diff --git a/numpy/core/src/multiarray/legacy_dtype_implementation.h b/numpy/core/src/multiarray/legacy_dtype_implementation.h index b36eb019a..04f455cde 100644 --- a/numpy/core/src/multiarray/legacy_dtype_implementation.h +++ b/numpy/core/src/multiarray/legacy_dtype_implementation.h @@ -1,8 +1,8 @@ -#ifndef _NPY_LEGACY_DTYPE_IMPLEMENTATION_H -#define _NPY_LEGACY_DTYPE_IMPLEMENTATION_H +#ifndef NUMPY_CORE_SRC_MULTIARRAY_LEGACY_DTYPE_IMPLEMENTATION_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_LEGACY_DTYPE_IMPLEMENTATION_H_ NPY_NO_EXPORT npy_bool PyArray_LegacyCanCastTypeTo(PyArray_Descr *from, PyArray_Descr *to, NPY_CASTING casting); -#endif /*_NPY_LEGACY_DTYPE_IMPLEMENTATION_H*/ +#endif /* NUMPY_CORE_SRC_MULTIARRAY_LEGACY_DTYPE_IMPLEMENTATION_H_ */ diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src index e38873746..e313d2447 100644 --- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src +++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src @@ -9,7 +9,7 @@ */ #define PY_SSIZE_T_CLEAN -#include "Python.h" +#include <Python.h> #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE @@ -1849,7 +1849,7 @@ mapiter_@name@(PyArrayMapIterObject *mit) return -1; } #else - /* The operand order is reveresed here */ + /* The operand order is reversed here */ char *args[2] = {subspace_ptrs[1], subspace_ptrs[0]}; npy_intp strides[2] = {subspace_strides[1], subspace_strides[0]}; if (NPY_UNLIKELY(cast_info.func(&cast_info.context, diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c index 41311b03f..014a863d5 100644 --- a/numpy/core/src/multiarray/mapping.c +++ b/numpy/core/src/multiarray/mapping.c @@ -1,10 +1,10 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define 
PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -/*#include <stdio.h>*/ -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "arrayobject.h" diff --git a/numpy/core/src/multiarray/mapping.h b/numpy/core/src/multiarray/mapping.h index 4e22f79df..e929b8b3f 100644 --- a/numpy/core/src/multiarray/mapping.h +++ b/numpy/core/src/multiarray/mapping.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ARRAYMAPPING_H_ -#define _NPY_ARRAYMAPPING_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_MAPPING_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_MAPPING_H_ extern NPY_NO_EXPORT PyMappingMethods array_as_mapping; @@ -70,4 +70,4 @@ PyArray_MapIterNew(npy_index_info *indices , int index_num, int index_type, npy_uint32 subspace_iter_flags, npy_uint32 subspace_flags, npy_uint32 extra_op_flags, PyArrayObject *extra_op, PyArray_Descr *extra_op_dtype); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_MAPPING_H_ */ diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c index ffa735b38..bb0006e32 100644 --- a/numpy/core/src/multiarray/methods.c +++ b/numpy/core/src/multiarray/methods.c @@ -1,14 +1,14 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN -#include <stdarg.h> #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" -#include "arrayobject.h" #include "numpy/arrayscalars.h" +#include "arrayobject.h" #include "arrayfunction_override.h" #include "npy_argparse.h" #include "npy_config.h" @@ -30,6 +30,8 @@ #include "methods.h" #include "alloc.h" +#include <stdarg.h> + /* NpyArg_ParseKeywords * @@ -1983,6 +1985,16 @@ array_setstate(PyArrayObject *self, PyObject *args) return NULL; } + /* + * Reassigning fa->descr messes with the reallocation strategy, + * since fa could be a 0-d or scalar, and then + * 
PyDataMem_UserFREE will be confused + */ + size_t n_tofree = PyArray_NBYTES(self); + if (n_tofree == 0) { + PyArray_Descr *dtype = PyArray_DESCR(self); + n_tofree = dtype->elsize ? dtype->elsize : 1; + } Py_XDECREF(PyArray_DESCR(self)); fa->descr = typecode; Py_INCREF(typecode); @@ -2049,7 +2061,18 @@ array_setstate(PyArrayObject *self, PyObject *args) } if ((PyArray_FLAGS(self) & NPY_ARRAY_OWNDATA)) { - PyDataMem_FREE(PyArray_DATA(self)); + /* + * Allocation will never be 0, see comment in ctors.c + * line 820 + */ + PyObject *handler = PyArray_HANDLER(self); + if (handler == NULL) { + /* This can happen if someone arbitrarily sets NPY_ARRAY_OWNDATA */ + PyErr_SetString(PyExc_RuntimeError, + "no memory handler found but OWNDATA flag set"); + return NULL; + } + PyDataMem_UserFREE(PyArray_DATA(self), n_tofree, handler); PyArray_CLEARFLAGS(self, NPY_ARRAY_OWNDATA); } Py_XDECREF(PyArray_BASE(self)); @@ -2085,7 +2108,6 @@ array_setstate(PyArrayObject *self, PyObject *args) if (!PyDataType_FLAGCHK(typecode, NPY_LIST_PICKLE)) { int swap = PyArray_ISBYTESWAPPED(self); - fa->data = datastr; /* Bytes should always be considered immutable, but we just grab the * pointer if they are large, to save memory. 
*/ if (!IsAligned(self) || swap || (len <= 1000)) { @@ -2094,8 +2116,16 @@ array_setstate(PyArrayObject *self, PyObject *args) Py_DECREF(rawdata); Py_RETURN_NONE; } - fa->data = PyDataMem_NEW(num); + /* Store the handler in case the default is modified */ + Py_XDECREF(fa->mem_handler); + fa->mem_handler = PyDataMem_GetHandler(); + if (fa->mem_handler == NULL) { + Py_DECREF(rawdata); + return NULL; + } + fa->data = PyDataMem_UserNEW(num, PyArray_HANDLER(self)); if (PyArray_DATA(self) == NULL) { + Py_DECREF(fa->mem_handler); Py_DECREF(rawdata); return PyErr_NoMemory(); } @@ -2131,7 +2161,12 @@ array_setstate(PyArrayObject *self, PyObject *args) Py_DECREF(rawdata); } else { + /* The handlers should never be called in this case */ + Py_XDECREF(fa->mem_handler); + fa->mem_handler = NULL; + fa->data = datastr; if (PyArray_SetBaseObject(self, rawdata) < 0) { + Py_DECREF(rawdata); return NULL; } } @@ -2142,8 +2177,15 @@ array_setstate(PyArrayObject *self, PyObject *args) if (num == 0 || elsize == 0) { Py_RETURN_NONE; } - fa->data = PyDataMem_NEW(num); + /* Store the functions in case the default handler is modified */ + Py_XDECREF(fa->mem_handler); + fa->mem_handler = PyDataMem_GetHandler(); + if (fa->mem_handler == NULL) { + return NULL; + } + fa->data = PyDataMem_UserNEW(num, PyArray_HANDLER(self)); if (PyArray_DATA(self) == NULL) { + Py_DECREF(fa->mem_handler); return PyErr_NoMemory(); } if (PyDataType_FLAGCHK(PyArray_DESCR(self), NPY_NEEDS_INIT)) { @@ -2152,6 +2194,7 @@ array_setstate(PyArrayObject *self, PyObject *args) PyArray_ENABLEFLAGS(self, NPY_ARRAY_OWNDATA); fa->base = NULL; if (_setlist_pkl(self, rawdata) < 0) { + Py_DECREF(fa->mem_handler); return NULL; } } @@ -2707,6 +2750,30 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args)) return c; } +static PyObject * +array_class_getitem(PyObject *cls, PyObject *args) +{ + PyObject *generic_alias; + +#ifdef Py_GENERICALIASOBJECT_H + Py_ssize_t args_len; + + args_len = PyTuple_Check(args) ? 
PyTuple_Size(args) : 1; + if (args_len != 2) { + return PyErr_Format(PyExc_TypeError, + "Too %s arguments for %s", + args_len > 2 ? "many" : "few", + ((PyTypeObject *)cls)->tp_name); + } + generic_alias = Py_GenericAlias(cls, args); +#else + PyErr_SetString(PyExc_TypeError, + "Type subscription requires python >= 3.9"); + generic_alias = NULL; +#endif + return generic_alias; +} + NPY_NO_EXPORT PyMethodDef array_methods[] = { /* for subtypes */ @@ -2764,6 +2831,11 @@ NPY_NO_EXPORT PyMethodDef array_methods[] = { (PyCFunction) array_format, METH_VARARGS, NULL}, + /* for typing; requires python >= 3.9 */ + {"__class_getitem__", + (PyCFunction)array_class_getitem, + METH_CLASS | METH_O, NULL}, + /* Original and Extended methods added 2005 */ {"all", (PyCFunction)array_all, diff --git a/numpy/core/src/multiarray/methods.h b/numpy/core/src/multiarray/methods.h index c0de23c35..bcada0fea 100644 --- a/numpy/core/src/multiarray/methods.h +++ b/numpy/core/src/multiarray/methods.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ARRAY_METHODS_H_ -#define _NPY_ARRAY_METHODS_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_METHODS_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_METHODS_H_ #include "npy_import.h" @@ -31,4 +31,4 @@ NpyPath_PathlikeToFspath(PyObject *file) return PyObject_CallFunctionObjArgs(os_fspath, file, NULL); } -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_METHODS_H_ */ diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index d33c7060b..c00f14045 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -11,16 +11,14 @@ oliphant@ee.byu.edu Brigham Young University */ - -/* $Id: multiarraymodule.c,v 1.36 2005/09/14 00:14:00 teoliphant Exp $ */ - -#define PY_SSIZE_T_CLEAN -#include "Python.h" -#include "structmember.h" - #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _UMATHMODULE #define _MULTIARRAYMODULE + +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#include <structmember.h> + 
#include <numpy/npy_common.h> #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" @@ -70,6 +68,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; #include "typeinfo.h" #include "get_attr_string.h" +#include "experimental_public_dtype_api.h" /* _get_experimental_dtype_api */ /* ***************************************************************************** @@ -84,11 +83,12 @@ NPY_NO_EXPORT int set_matmul_flags(PyObject *d); /* in ufunc_object.c */ /* * global variable to determine if legacy printing is enabled, accessible from - * C. For simplicity the mode is encoded as an integer where '0' means no - * legacy mode, and '113' means 1.13 legacy mode. We can upgrade this if we - * have more complex requirements in the future. + * C. For simplicity the mode is encoded as an integer where INT_MAX means no + * legacy mode, and '113'/'121' means 1.13/1.21 legacy mode; and 0 maps to + * INT_MAX. We can upgrade this if we have more complex requirements in the + * future. */ -int npy_legacy_print_mode = 0; +int npy_legacy_print_mode = INT_MAX; static PyObject * set_legacy_print_mode(PyObject *NPY_UNUSED(self), PyObject *args) @@ -96,6 +96,9 @@ set_legacy_print_mode(PyObject *NPY_UNUSED(self), PyObject *args) if (!PyArg_ParseTuple(args, "i", &npy_legacy_print_mode)) { return NULL; } + if (!npy_legacy_print_mode) { + npy_legacy_print_mode = INT_MAX; + } Py_RETURN_NONE; } @@ -4431,7 +4434,9 @@ static struct PyMethodDef array_module_methods[] = { {"_discover_array_parameters", (PyCFunction)_discover_array_parameters, METH_VARARGS | METH_KEYWORDS, NULL}, {"_get_castingimpl", (PyCFunction)_get_castingimpl, - METH_VARARGS | METH_KEYWORDS, NULL}, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_get_experimental_dtype_api", (PyCFunction)_get_experimental_dtype_api, + METH_O, NULL}, /* from umath */ {"frompyfunc", (PyCFunction) ufunc_frompyfunc, @@ -4442,6 +4447,9 @@ static struct PyMethodDef array_module_methods[] = { {"geterrobj", (PyCFunction) ufunc_geterr, METH_VARARGS, NULL}, + 
{"get_handler_name", + (PyCFunction) get_handler_name, + METH_VARARGS, NULL}, {"_add_newdoc_ufunc", (PyCFunction)add_newdoc_ufunc, METH_VARARGS, NULL}, {"_get_sfloat_dtype", @@ -4919,6 +4927,20 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) { if (initumath(m) != 0) { goto err; } +#if (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x07030600) + /* + * Initialize the context-local PyDataMem_Handler capsule. + */ + c_api = PyCapsule_New(&default_handler, "mem_handler", NULL); + if (c_api == NULL) { + goto err; + } + current_handler = PyContextVar_New("current_allocator", c_api); + Py_DECREF(c_api); + if (current_handler == NULL) { + goto err; + } +#endif return m; err: diff --git a/numpy/core/src/multiarray/multiarraymodule.h b/numpy/core/src/multiarray/multiarraymodule.h index 4cdb6ef72..640940d2a 100644 --- a/numpy/core/src/multiarray/multiarraymodule.h +++ b/numpy/core/src/multiarray/multiarraymodule.h @@ -1,5 +1,5 @@ -#ifndef _NPY_MULTIARRAY_H_ -#define _NPY_MULTIARRAY_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_MULTIARRAYMODULE_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_MULTIARRAYMODULE_H_ NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_array_wrap; NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_array_finalize; @@ -9,4 +9,4 @@ NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_axis2; NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_like; NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_numpy; -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_MULTIARRAYMODULE_H_ */ diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c index a1ca5bff5..860c8c1f6 100644 --- a/numpy/core/src/multiarray/nditer_api.c +++ b/numpy/core/src/multiarray/nditer_api.c @@ -11,8 +11,9 @@ */ #define NPY_NO_DEPRECATED_API NPY_API_VERSION -/* Indicate that this .c file is allowed to include the header */ +/* Allow this .c file to include nditer_impl.h */ #define NPY_ITERATOR_IMPLEMENTATION_CODE + #include "nditer_impl.h" #include "templ_common.h" #include 
"ctors.h" @@ -115,7 +116,7 @@ NpyIter_RemoveAxis(NpyIter *iter, int axis) --p; } } - else if (p <= 0) { + else { if (p < -1-axis) { ++p; } @@ -2129,7 +2130,7 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs) /* * Try to do make the outersize as big as possible. This allows * it to shrink when processing the last bit of the outer reduce loop, - * then grow again at the beginnning of the next outer reduce loop. + * then grow again at the beginning of the next outer reduce loop. */ NBF_REDUCE_OUTERSIZE(bufferdata) = (NAD_SHAPE(reduce_outeraxisdata)- NAD_INDEX(reduce_outeraxisdata)); @@ -2803,9 +2804,9 @@ npyiter_checkreducesize(NpyIter *iter, npy_intp count, if (coord != 0) { /* * In this case, it is only safe to reuse the buffer if the amount - * of data copied is not more then the current axes, as is the + * of data copied is not more than the current axes, as is the * case when reuse_reduce_loops was active already. - * It should be in principle OK when the idim loop returns immidiatly. + * It should be in principle OK when the idim loop returns immediately. */ NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_REUSE_REDUCE_LOOPS; } diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c index 98d4f5a75..bf32e1f6b 100644 --- a/numpy/core/src/multiarray/nditer_constr.c +++ b/numpy/core/src/multiarray/nditer_constr.c @@ -11,10 +11,10 @@ */ #define NPY_NO_DEPRECATED_API NPY_API_VERSION -/* Indicate that this .c file is allowed to include the header */ +/* Allow this .c file to include nditer_impl.h */ #define NPY_ITERATOR_IMPLEMENTATION_CODE -#include "nditer_impl.h" +#include "nditer_impl.h" #include "arrayobject.h" #include "array_coercion.h" #include "templ_common.h" @@ -1405,7 +1405,7 @@ check_mask_for_writemasked_reduction(NpyIter *iter, int iop) /* * Check whether a reduction is OK based on the flags and the operand being * readwrite. This path is deprecated, since usually only specific axes - * should be reduced. 
If axes are specified explicitely, the flag is + * should be reduced. If axes are specified explicitly, the flag is * unnecessary. */ static int diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h index a5a9177e5..2a82b7e54 100644 --- a/numpy/core/src/multiarray/nditer_impl.h +++ b/numpy/core/src/multiarray/nditer_impl.h @@ -4,20 +4,21 @@ * should use the exposed iterator API. */ #ifndef NPY_ITERATOR_IMPLEMENTATION_CODE -#error "This header is intended for use ONLY by iterator implementation code." +#error This header is intended for use ONLY by iterator implementation code. #endif -#ifndef _NPY_PRIVATE__NDITER_IMPL_H_ -#define _NPY_PRIVATE__NDITER_IMPL_H_ - -#define PY_SSIZE_T_CLEAN -#include "Python.h" -#include "structmember.h" +#ifndef NUMPY_CORE_SRC_MULTIARRAY_NDITER_IMPL_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_NDITER_IMPL_H_ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE -#include <numpy/arrayobject.h> -#include <npy_pycompat.h> + +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#include <structmember.h> + +#include "numpy/arrayobject.h" +#include "npy_pycompat.h" #include "convert_datatype.h" #include "lowlevel_strided_loops.h" @@ -288,7 +289,7 @@ struct NpyIter_AxisData_tag { 1 + \ /* intp stride[nop+1] AND char* ptr[nop+1] */ \ 2*((nop)+1) \ - )*NPY_SIZEOF_INTP ) + )*(size_t)NPY_SIZEOF_INTP) /* * Macro to advance an AXISDATA pointer by a specified count. @@ -355,4 +356,4 @@ npyiter_copy_to_buffers(NpyIter *iter, char **prev_dataptrs); NPY_NO_EXPORT void npyiter_clear_buffers(NpyIter *iter); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_NDITER_IMPL_H_ */ diff --git a/numpy/core/src/multiarray/nditer_pywrap.c b/numpy/core/src/multiarray/nditer_pywrap.c index 8acc7f87f..8e072d5f4 100644 --- a/numpy/core/src/multiarray/nditer_pywrap.c +++ b/numpy/core/src/multiarray/nditer_pywrap.c @@ -6,13 +6,14 @@ * * See LICENSE.txt for the license. 
*/ -#define PY_SSIZE_T_CLEAN -#include "Python.h" -#include "structmember.h" - #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE -#include <numpy/arrayobject.h> + +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#include <structmember.h> + +#include "numpy/arrayobject.h" #include "npy_config.h" #include "npy_pycompat.h" #include "alloc.h" diff --git a/numpy/core/src/multiarray/nditer_pywrap.h b/numpy/core/src/multiarray/nditer_pywrap.h index 49eb5d89d..d2fcafebd 100644 --- a/numpy/core/src/multiarray/nditer_pywrap.h +++ b/numpy/core/src/multiarray/nditer_pywrap.h @@ -1,8 +1,8 @@ -#ifndef __NDITER_PYWRAP_H -#define __NDITER_PYWRAP_H +#ifndef NUMPY_CORE_SRC_MULTIARRAY_NDITER_PYWRAP_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_NDITER_PYWRAP_H_ NPY_NO_EXPORT PyObject * NpyIter_NestedIters(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_NDITER_PYWRAP_H_ */ diff --git a/numpy/core/src/multiarray/nditer_templ.c.src b/numpy/core/src/multiarray/nditer_templ.c.src index 05ce6ae75..3f91a482b 100644 --- a/numpy/core/src/multiarray/nditer_templ.c.src +++ b/numpy/core/src/multiarray/nditer_templ.c.src @@ -132,7 +132,7 @@ npyiter_iternext_itflags@tag_itflags@_dims@tag_ndim@_iters@tag_nop@( /* Reset the 1st and 2nd indices to 0 */ NAD_INDEX(axisdata0) = 0; NAD_INDEX(axisdata1) = 0; - /* Reset the 1st and 2nd pointers to the value of the 3nd */ + /* Reset the 1st and 2nd pointers to the value of the 3rd */ for (istrides = 0; istrides < nstrides; ++istrides) { NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides]; NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides]; diff --git a/numpy/core/src/multiarray/npy_buffer.h b/numpy/core/src/multiarray/npy_buffer.h index d10f1a020..62e08573c 100644 --- a/numpy/core/src/multiarray/npy_buffer.h +++ b/numpy/core/src/multiarray/npy_buffer.h @@ -1,5 +1,5 @@ -#ifndef _NPY_PRIVATE_BUFFER_H_ -#define _NPY_PRIVATE_BUFFER_H_ +#ifndef 
NUMPY_CORE_SRC_MULTIARRAY_NPY_BUFFER_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_NPY_BUFFER_H_ extern NPY_NO_EXPORT PyBufferProcs array_as_buffer; @@ -12,4 +12,4 @@ _descriptor_from_pep3118_format(char const *s); NPY_NO_EXPORT int void_getbuffer(PyObject *obj, Py_buffer *view, int flags); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_NPY_BUFFER_H_ */ diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c index 9ed7cde47..292ef55a6 100644 --- a/numpy/core/src/multiarray/number.c +++ b/numpy/core/src/multiarray/number.c @@ -1,10 +1,10 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -/*#include <stdio.h>*/ -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "npy_config.h" diff --git a/numpy/core/src/multiarray/number.h b/numpy/core/src/multiarray/number.h index 4f426f964..054840305 100644 --- a/numpy/core/src/multiarray/number.h +++ b/numpy/core/src/multiarray/number.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ARRAY_NUMBER_H_ -#define _NPY_ARRAY_NUMBER_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_NUMBER_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_NUMBER_H_ typedef struct { PyObject *add; @@ -69,4 +69,4 @@ NPY_NO_EXPORT PyObject * PyArray_GenericAccumulateFunction(PyArrayObject *m1, PyObject *op, int axis, int rtype, PyArrayObject *out); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_NUMBER_H_ */ diff --git a/numpy/core/src/multiarray/refcount.c b/numpy/core/src/multiarray/refcount.c index 41dd059b0..a1c310700 100644 --- a/numpy/core/src/multiarray/refcount.c +++ b/numpy/core/src/multiarray/refcount.c @@ -2,13 +2,13 @@ * This module corresponds to the `Special functions for NPY_OBJECT` * section in the numpy reference for C-API. 
*/ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" #include "iterators.h" diff --git a/numpy/core/src/multiarray/refcount.h b/numpy/core/src/multiarray/refcount.h index 761d53dd0..959eef5ba 100644 --- a/numpy/core/src/multiarray/refcount.h +++ b/numpy/core/src/multiarray/refcount.h @@ -1,5 +1,5 @@ -#ifndef _NPY_PRIVATE_REFCOUNT_H_ -#define _NPY_PRIVATE_REFCOUNT_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_REFCOUNT_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_REFCOUNT_H_ NPY_NO_EXPORT void PyArray_Item_INCREF(char *data, PyArray_Descr *descr); @@ -16,4 +16,4 @@ PyArray_XDECREF(PyArrayObject *mp); NPY_NO_EXPORT void PyArray_FillObjectArray(PyArrayObject *arr, PyObject *obj); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_REFCOUNT_H_ */ diff --git a/numpy/core/src/multiarray/scalarapi.c b/numpy/core/src/multiarray/scalarapi.c index 0e93cbbe9..e409e9874 100644 --- a/numpy/core/src/multiarray/scalarapi.c +++ b/numpy/core/src/multiarray/scalarapi.c @@ -1,9 +1,10 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src index 740ec8cc2..bbbc5bfa2 100644 --- a/numpy/core/src/multiarray/scalartypes.c.src +++ b/numpy/core/src/multiarray/scalartypes.c.src @@ -1,7 +1,7 @@ /* -*- c -*- */ #define PY_SSIZE_T_CLEAN -#include "Python.h" -#include "structmember.h" +#include <Python.h> +#include <structmember.h> #define NPY_NO_DEPRECATED_API NPY_API_VERSION #ifndef _MULTIARRAYMODULE @@ 
-34,6 +34,16 @@ #include "binop_override.h" +/* + * used for allocating a single scalar, so use the default numpy + * memory allocators instead of the (maybe) user overrides + */ +NPY_NO_EXPORT void * +npy_alloc_cache_zero(size_t nmemb, size_t size); + +NPY_NO_EXPORT void +npy_free_cache(void * p, npy_uintp sz); + NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[] = { {PyObject_HEAD_INIT(&PyBoolArrType_Type) 0}, {PyObject_HEAD_INIT(&PyBoolArrType_Type) 1}, @@ -209,6 +219,27 @@ gentype_multiply(PyObject *m1, PyObject *m2) } /**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, + * npy_long, npy_ulong, npy_longlong, npy_ulonglong# + * #c = hh, uhh, h, uh,, u, l, ul, ll, ull# + * #Name = Byte, UByte, Short, UShort, Int, UInt, + * Long, ULong, LongLong, ULongLong# + * #convert = Long*8, LongLong*2# + */ +static PyObject * +@type@_bit_count(PyObject *self) +{ + @type@ scalar = PyArrayScalar_VAL(self, @Name@); + uint8_t count = npy_popcount@c@(scalar); + PyObject *result = PyLong_From@convert@(count); + + return result; +} +/**end repeat**/ + +/**begin repeat * * #name = positive, negative, absolute, invert, int, float# */ @@ -866,7 +897,7 @@ static PyObject * { npy_@name@ absval; - if (npy_legacy_print_mode == 113) { + if (npy_legacy_print_mode <= 113) { return legacy_@name@_format@kind@(val); } @@ -892,7 +923,7 @@ c@name@type_@kind@(PyObject *self) npy_c@name@ val = PyArrayScalar_VAL(self, C@Name@); TrimMode trim = TrimMode_DptZeros; - if (npy_legacy_print_mode == 113) { + if (npy_legacy_print_mode <= 113) { return legacy_c@name@_format@kind@(val); } @@ -957,7 +988,7 @@ halftype_@kind@(PyObject *self) float floatval = npy_half_to_float(val); float absval; - if (npy_legacy_print_mode == 113) { + if (npy_legacy_print_mode <= 113) { return legacy_float_format@kind@(floatval); } @@ -1321,7 +1352,7 @@ gentype_imag_get(PyObject *self, void 
*NPY_UNUSED(ignored)) int elsize; typecode = PyArray_DescrFromScalar(self); elsize = typecode->elsize; - temp = npy_alloc_cache_zero(elsize); + temp = npy_alloc_cache_zero(1, elsize); ret = PyArray_Scalar(temp, typecode, NULL); npy_free_cache(temp, elsize); } @@ -1805,6 +1836,59 @@ gentype_setflags(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args), Py_RETURN_NONE; } +static PyObject * +numbertype_class_getitem_abc(PyObject *cls, PyObject *args) +{ + PyObject *generic_alias; + +#ifdef Py_GENERICALIASOBJECT_H + Py_ssize_t args_len; + int args_len_expected; + + /* complexfloating should take 2 parameters, all others take 1 */ + if (PyType_IsSubtype((PyTypeObject *)cls, + &PyComplexFloatingArrType_Type)) { + args_len_expected = 2; + } + else { + args_len_expected = 1; + } + + args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1; + if (args_len != args_len_expected) { + return PyErr_Format(PyExc_TypeError, + "Too %s arguments for %s", + args_len > args_len_expected ? "many" : "few", + ((PyTypeObject *)cls)->tp_name); + } + generic_alias = Py_GenericAlias(cls, args); +#else + PyErr_SetString(PyExc_TypeError, + "Type subscription requires python >= 3.9"); + generic_alias = NULL; +#endif + return generic_alias; +} + +/* + * Use for concrete np.number subclasses, making them act as if they + * were subtyped from e.g. np.signedinteger[object], thus lacking any + * free subscription parameters. Requires python >= 3.9. 
+ */ +static PyObject * +numbertype_class_getitem(PyObject *cls, PyObject *args) +{ +#ifdef Py_GENERICALIASOBJECT_H + PyErr_Format(PyExc_TypeError, + "There are no type variables left in %s", + ((PyTypeObject *)cls)->tp_name); +#else + PyErr_SetString(PyExc_TypeError, + "Type subscription requires python >= 3.9"); +#endif + return NULL; +} + /* * casting complex numbers (that don't inherit from Python complex) * to Python complex @@ -2188,6 +2272,14 @@ static PyGetSetDef inttype_getsets[] = { {NULL, NULL, NULL, NULL, NULL} }; +static PyMethodDef numbertype_methods[] = { + /* for typing; requires python >= 3.9 */ + {"__class_getitem__", + (PyCFunction)numbertype_class_getitem_abc, + METH_CLASS | METH_O, NULL}, + {NULL, NULL, 0, NULL} /* sentinel */ +}; + /**begin repeat * #name = cfloat,clongdouble# */ @@ -2195,6 +2287,10 @@ static PyMethodDef @name@type_methods[] = { {"__complex__", (PyCFunction)@name@_complex, METH_VARARGS | METH_KEYWORDS, NULL}, + /* for typing; requires python >= 3.9 */ + {"__class_getitem__", + (PyCFunction)numbertype_class_getitem, + METH_CLASS | METH_O, NULL}, {NULL, NULL, 0, NULL} }; /**end repeat**/ @@ -2232,10 +2328,43 @@ static PyMethodDef @name@type_methods[] = { {"is_integer", (PyCFunction)@name@_is_integer, METH_NOARGS, NULL}, + /* for typing; requires python >= 3.9 */ + {"__class_getitem__", + (PyCFunction)numbertype_class_getitem, + METH_CLASS | METH_O, NULL}, + {NULL, NULL, 0, NULL} +}; +/**end repeat**/ + +/**begin repeat + * #name = timedelta, cdouble# + */ +static PyMethodDef @name@type_methods[] = { + /* for typing; requires python >= 3.9 */ + {"__class_getitem__", + (PyCFunction)numbertype_class_getitem, + METH_CLASS | METH_O, NULL}, {NULL, NULL, 0, NULL} }; /**end repeat**/ +/**begin repeat + * #name = byte, ubyte, short, ushort, int, uint, + * long, ulong, longlong, ulonglong# + */ +static PyMethodDef @name@type_methods[] = { + /* for typing; requires python >= 3.9 */ + {"__class_getitem__", + 
(PyCFunction)numbertype_class_getitem, + METH_CLASS | METH_O, NULL}, + {"bit_count", + (PyCFunction)npy_@name@_bit_count, + METH_NOARGS, NULL}, + {NULL, NULL, 0, NULL} /* sentinel */ +}; +/**end repeat**/ + + /************* As_mapping functions for void array scalar ************/ static Py_ssize_t @@ -3069,7 +3198,10 @@ void_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) (int) NPY_MAX_INT); return NULL; } - destptr = npy_alloc_cache_zero(memu); + if (memu == 0) { + memu = 1; + } + destptr = npy_alloc_cache_zero(memu, 1); if (destptr == NULL) { return PyErr_NoMemory(); } @@ -3951,6 +4083,8 @@ initialize_numeric_types(void) PyIntegerArrType_Type.tp_getset = inttype_getsets; + PyNumberArrType_Type.tp_methods = numbertype_methods; + /**begin repeat * #NAME= Number, Integer, SignedInteger, UnsignedInteger, Inexact, * Floating, ComplexFloating, Flexible, Character# @@ -4008,6 +4142,17 @@ initialize_numeric_types(void) /**end repeat**/ /**begin repeat + * #name = byte, short, int, long, longlong, + * ubyte, ushort, uint, ulong, ulonglong# + * #Name = Byte, Short, Int, Long, LongLong, + * UByte, UShort, UInt, ULong, ULongLong# + */ + + Py@Name@ArrType_Type.tp_methods = @name@type_methods; + + /**end repeat**/ + + /**begin repeat * #name = half, float, double, longdouble# * #Name = Half, Float, Double, LongDouble# */ @@ -4016,6 +4161,17 @@ initialize_numeric_types(void) /**end repeat**/ + /**begin repeat + * #name = byte, short, int, long, longlong, ubyte, ushort, + * uint, ulong, ulonglong, timedelta, cdouble# + * #Name = Byte, Short, Int, Long, LongLong, UByte, UShort, + * UInt, ULong, ULongLong, Timedelta, CDouble# + */ + + Py@Name@ArrType_Type.tp_methods = @name@type_methods; + + /**end repeat**/ + /* We won't be inheriting from Python Int type. 
*/ PyIntArrType_Type.tp_hash = int_arrtype_hash; diff --git a/numpy/core/src/multiarray/scalartypes.h b/numpy/core/src/multiarray/scalartypes.h index 861f2c943..95a2f66c6 100644 --- a/numpy/core/src/multiarray/scalartypes.h +++ b/numpy/core/src/multiarray/scalartypes.h @@ -1,5 +1,5 @@ -#ifndef _NPY_SCALARTYPES_H_ -#define _NPY_SCALARTYPES_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_SCALARTYPES_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_SCALARTYPES_H_ /* Internal look-up tables */ extern NPY_NO_EXPORT unsigned char @@ -31,4 +31,4 @@ _typenum_fromtypeobj(PyObject *type, int user); NPY_NO_EXPORT void * scalar_value(PyObject *scalar, PyArray_Descr *descr); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_SCALARTYPES_H_ */ diff --git a/numpy/core/src/multiarray/sequence.c b/numpy/core/src/multiarray/sequence.c index 1c74f1719..8db0690a1 100644 --- a/numpy/core/src/multiarray/sequence.c +++ b/numpy/core/src/multiarray/sequence.c @@ -1,9 +1,10 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" diff --git a/numpy/core/src/multiarray/sequence.h b/numpy/core/src/multiarray/sequence.h index b28c50d97..aff6aeb7e 100644 --- a/numpy/core/src/multiarray/sequence.h +++ b/numpy/core/src/multiarray/sequence.h @@ -1,6 +1,6 @@ -#ifndef _NPY_ARRAY_SEQUENCE_H_ -#define _NPY_ARRAY_SEQUENCE_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_SEQUENCE_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_SEQUENCE_H_ extern NPY_NO_EXPORT PySequenceMethods array_as_sequence; -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_SEQUENCE_H_ */ diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c index 02c349759..162abd6a4 100644 --- a/numpy/core/src/multiarray/shape.c +++ b/numpy/core/src/multiarray/shape.c @@ -1,9 +1,10 @@ +#define NPY_NO_DEPRECATED_API 
NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" @@ -120,8 +121,16 @@ PyArray_Resize(PyArrayObject *self, PyArray_Dims *newshape, int refcheck, } /* Reallocate space if needed - allocating 0 is forbidden */ - new_data = PyDataMem_RENEW( - PyArray_DATA(self), newnbytes == 0 ? elsize : newnbytes); + PyObject *handler = PyArray_HANDLER(self); + if (handler == NULL) { + /* This can happen if someone arbitrarily sets NPY_ARRAY_OWNDATA */ + PyErr_SetString(PyExc_RuntimeError, + "no memory handler found but OWNDATA flag set"); + return NULL; + } + new_data = PyDataMem_UserRENEW(PyArray_DATA(self), + newnbytes == 0 ? elsize : newnbytes, + handler); if (new_data == NULL) { PyErr_SetString(PyExc_MemoryError, "cannot allocate memory for array"); diff --git a/numpy/core/src/multiarray/shape.h b/numpy/core/src/multiarray/shape.h index 875b5430f..bef386ed1 100644 --- a/numpy/core/src/multiarray/shape.h +++ b/numpy/core/src/multiarray/shape.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ARRAY_SHAPE_H_ -#define _NPY_ARRAY_SHAPE_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_SHAPE_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_SHAPE_H_ /* * Creates a sorted stride perm matching the KEEPORDER behavior @@ -21,4 +21,4 @@ PyArray_CreateMultiSortedStridePerm(int narrays, PyArrayObject **arrays, NPY_NO_EXPORT PyObject * PyArray_SqueezeSelected(PyArrayObject *self, npy_bool *axis_flags); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_SHAPE_H_ */ diff --git a/numpy/core/src/multiarray/strfuncs.c b/numpy/core/src/multiarray/strfuncs.c index d9d9b7c0a..ba457f4f4 100644 --- a/numpy/core/src/multiarray/strfuncs.c +++ b/numpy/core/src/multiarray/strfuncs.c @@ -1,8 +1,10 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define PY_SSIZE_T_CLEAN #include <Python.h> -#include 
<numpy/arrayobject.h> + +#include "numpy/arrayobject.h" #include "npy_pycompat.h" #include "npy_import.h" #include "strfuncs.h" diff --git a/numpy/core/src/multiarray/strfuncs.h b/numpy/core/src/multiarray/strfuncs.h index 5dd661a20..134b56ed3 100644 --- a/numpy/core/src/multiarray/strfuncs.h +++ b/numpy/core/src/multiarray/strfuncs.h @@ -1,5 +1,5 @@ -#ifndef _NPY_ARRAY_STRFUNCS_H_ -#define _NPY_ARRAY_STRFUNCS_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_STRFUNCS_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_STRFUNCS_H_ NPY_NO_EXPORT void PyArray_SetStringFunction(PyObject *op, int repr); @@ -13,4 +13,4 @@ array_str(PyArrayObject *self); NPY_NO_EXPORT PyObject * array_format(PyArrayObject *self, PyObject *args); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_STRFUNCS_H_ */ diff --git a/numpy/core/src/multiarray/temp_elide.c b/numpy/core/src/multiarray/temp_elide.c index 2b4621744..f615aa336 100644 --- a/numpy/core/src/multiarray/temp_elide.c +++ b/numpy/core/src/multiarray/temp_elide.c @@ -1,8 +1,9 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "npy_config.h" #include "numpy/arrayobject.h" diff --git a/numpy/core/src/multiarray/temp_elide.h b/numpy/core/src/multiarray/temp_elide.h index 206bb0253..a1fec98d5 100644 --- a/numpy/core/src/multiarray/temp_elide.h +++ b/numpy/core/src/multiarray/temp_elide.h @@ -1,5 +1,6 @@ -#ifndef _NPY_ARRAY_TEMP_AVOID_H_ -#define _NPY_ARRAY_TEMP_AVOID_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEMP_ELIDE_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEMP_ELIDE_H_ + #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE #include <numpy/ndarraytypes.h> @@ -12,4 +13,4 @@ try_binary_elide(PyObject * m1, PyObject * m2, PyObject * (inplace_op)(PyArrayObject * m1, PyObject * m2), PyObject ** res, int commutative); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEMP_ELIDE_H_ */ diff --git 
a/numpy/core/src/multiarray/typeinfo.c b/numpy/core/src/multiarray/typeinfo.c index b0563b3c0..8cf6bc1e0 100644 --- a/numpy/core/src/multiarray/typeinfo.c +++ b/numpy/core/src/multiarray/typeinfo.c @@ -3,6 +3,10 @@ * Unfortunately, we need two different types to cover the cases where min/max * do and do not appear in the tuple. */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + +#include "npy_pycompat.h" #include "typeinfo.h" #if (defined(PYPY_VERSION_NUM) && (PYPY_VERSION_NUM <= 0x07030000)) @@ -10,9 +14,6 @@ #include <structseq.h> #endif -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE -#include "npy_pycompat.h" static PyTypeObject PyArray_typeinfoType; diff --git a/numpy/core/src/multiarray/typeinfo.h b/numpy/core/src/multiarray/typeinfo.h index 28afa4120..af4637fc9 100644 --- a/numpy/core/src/multiarray/typeinfo.h +++ b/numpy/core/src/multiarray/typeinfo.h @@ -1,5 +1,5 @@ -#ifndef _NPY_PRIVATE_TYPEINFO_H_ -#define _NPY_PRIVATE_TYPEINFO_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TYPEINFO_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TYPEINFO_H_ #define PY_SSIZE_T_CLEAN #include <Python.h> @@ -18,4 +18,4 @@ PyArray_typeinforanged( char typechar, int typenum, int nbits, int align, PyObject *max, PyObject *min, PyTypeObject *type_obj); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TYPEINFO_H_ */ diff --git a/numpy/core/src/multiarray/usertypes.c b/numpy/core/src/multiarray/usertypes.c index 5602304e9..a338d712d 100644 --- a/numpy/core/src/multiarray/usertypes.c +++ b/numpy/core/src/multiarray/usertypes.c @@ -20,13 +20,13 @@ maintainer email: oliphant.travis@ieee.org Space Science Telescope Institute (J. 
Todd Miller, Perry Greenfield, Rick White) */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE + #define PY_SSIZE_T_CLEAN #include <Python.h> -#include "structmember.h" +#include <structmember.h> -/*#include <stdio.h>*/ -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" @@ -268,6 +268,56 @@ PyArray_RegisterDataType(PyArray_Descr *descr) return typenum; } + +/* + * Checks that there is no cast already cached using the new casting-impl + * mechanism. + * In that case, we do not clear out the cache (but otherwise silently + * continue). Users should not modify casts after they have been used, + * but this may also happen accidentally during setup (and may never have + * mattered). See https://github.com/numpy/numpy/issues/20009 + */ +static int _warn_if_cast_exists_already( + PyArray_Descr *descr, int totype, char *funcname) +{ + PyArray_DTypeMeta *to_DType = PyArray_DTypeFromTypeNum(totype); + if (to_DType == NULL) { + return -1; + } + PyObject *cast_impl = PyDict_GetItemWithError( + NPY_DT_SLOTS(NPY_DTYPE(descr))->castingimpls, (PyObject *)to_DType); + Py_DECREF(to_DType); + if (cast_impl == NULL) { + if (PyErr_Occurred()) { + return -1; + } + } + else { + char *extra_msg; + if (cast_impl == Py_None) { + extra_msg = "the cast will continue to be considered impossible."; + } + else { + extra_msg = "the previous definition will continue to be used."; + } + Py_DECREF(cast_impl); + PyArray_Descr *to_descr = PyArray_DescrFromType(totype); + int ret = PyErr_WarnFormat(PyExc_RuntimeWarning, 1, + "A cast from %R to %R was registered/modified using `%s` " + "after the cast had been used. " + "This registration will have (mostly) no effect: %s\n" + "The most likely fix is to ensure that casts are the first " + "thing initialized after dtype registration. 
" + "Please contact the NumPy developers with any questions!", + descr, to_descr, funcname, extra_msg); + Py_DECREF(to_descr); + if (ret < 0) { + return -1; + } + } + return 0; +} + /*NUMPY_API Register Casting Function Replaces any function currently stored. @@ -279,14 +329,19 @@ PyArray_RegisterCastFunc(PyArray_Descr *descr, int totype, PyObject *cobj, *key; int ret; - if (totype < NPY_NTYPES_ABI_COMPATIBLE) { - descr->f->cast[totype] = castfunc; - return 0; - } if (totype >= NPY_NTYPES && !PyTypeNum_ISUSERDEF(totype)) { PyErr_SetString(PyExc_TypeError, "invalid type number."); return -1; } + if (_warn_if_cast_exists_already( + descr, totype, "PyArray_RegisterCastFunc") < 0) { + return -1; + } + + if (totype < NPY_NTYPES_ABI_COMPATIBLE) { + descr->f->cast[totype] = castfunc; + return 0; + } if (descr->f->castdict == NULL) { descr->f->castdict = PyDict_New(); if (descr->f->castdict == NULL) { @@ -328,6 +383,10 @@ PyArray_RegisterCanCast(PyArray_Descr *descr, int totype, "RegisterCanCast must be user-defined."); return -1; } + if (_warn_if_cast_exists_already( + descr, totype, "PyArray_RegisterCanCast") < 0) { + return -1; + } if (scalar == NPY_NOSCALAR) { /* diff --git a/numpy/core/src/multiarray/usertypes.h b/numpy/core/src/multiarray/usertypes.h index 8b2fc80e6..6768e2c42 100644 --- a/numpy/core/src/multiarray/usertypes.h +++ b/numpy/core/src/multiarray/usertypes.h @@ -1,5 +1,5 @@ -#ifndef _NPY_PRIVATE_USERTYPES_H_ -#define _NPY_PRIVATE_USERTYPES_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_USERTYPES_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_USERTYPES_H_ #include "array_method.h" @@ -27,4 +27,4 @@ NPY_NO_EXPORT int PyArray_AddLegacyWrapping_CastingImpl( PyArray_DTypeMeta *from, PyArray_DTypeMeta *to, NPY_CASTING casting); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_USERTYPES_H_ */ diff --git a/numpy/core/src/multiarray/vdot.c b/numpy/core/src/multiarray/vdot.c index 9b5d19522..ff08ed2d4 100644 --- a/numpy/core/src/multiarray/vdot.c +++ b/numpy/core/src/multiarray/vdot.c 
@@ -1,7 +1,9 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE +#define PY_SSIZE_T_CLEAN #include <Python.h> + #include "common.h" #include "vdot.h" #include "npy_cblas.h" diff --git a/numpy/core/src/multiarray/vdot.h b/numpy/core/src/multiarray/vdot.h index 0f60ca6d1..f6da5ddea 100644 --- a/numpy/core/src/multiarray/vdot.h +++ b/numpy/core/src/multiarray/vdot.h @@ -1,5 +1,5 @@ -#ifndef _NPY_VDOT_H_ -#define _NPY_VDOT_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_VDOT_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_VDOT_H_ #include "common.h" @@ -15,4 +15,4 @@ CLONGDOUBLE_vdot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *); NPY_NO_EXPORT void OBJECT_vdot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_VDOT_H_ */ diff --git a/numpy/core/src/npymath/halffloat.c b/numpy/core/src/npymath/halffloat.c index cbaa11e43..51948c736 100644 --- a/numpy/core/src/npymath/halffloat.c +++ b/numpy/core/src/npymath/halffloat.c @@ -1,4 +1,5 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION + #include "numpy/halffloat.h" /* diff --git a/numpy/core/src/npymath/npy_math_internal.h.src b/numpy/core/src/npymath/npy_math_internal.h.src index 1e46a2303..dd2424db8 100644 --- a/numpy/core/src/npymath/npy_math_internal.h.src +++ b/numpy/core/src/npymath/npy_math_internal.h.src @@ -55,6 +55,29 @@ */ #include "npy_math_private.h" +/* Magic binary numbers used by bit_count + * For type T, the magic numbers are computed as follows: + * Magic[0]: 01 01 01 01 01 01... = (T)~(T)0/3 + * Magic[1]: 0011 0011 0011... = (T)~(T)0/15 * 3 + * Magic[2]: 00001111 00001111... = (T)~(T)0/255 * 15 + * Magic[3]: 00000001 00000001... 
= (T)~(T)0/255 + * + * Counting bits set, in parallel + * Based on: http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + * + * Generic Algorithm for type T: + * a = a - ((a >> 1) & (T)~(T)0/3); + * a = (a & (T)~(T)0/15*3) + ((a >> 2) & (T)~(T)0/15*3); + * a = (a + (a >> 4)) & (T)~(T)0/255*15; + * c = (T)(a * ((T)~(T)0/255)) >> (sizeof(T) - 1) * CHAR_BIT; +*/ + +static const npy_uint8 MAGIC8[] = {0x55u, 0x33u, 0x0Fu, 0x01u}; +static const npy_uint16 MAGIC16[] = {0x5555u, 0x3333u, 0x0F0Fu, 0x0101u}; +static const npy_uint32 MAGIC32[] = {0x55555555ul, 0x33333333ul, 0x0F0F0F0Ful, 0x01010101ul}; +static const npy_uint64 MAGIC64[] = {0x5555555555555555ull, 0x3333333333333333ull, 0x0F0F0F0F0F0F0F0Full, 0x0101010101010101ull}; + + /* ***************************************************************************** ** BASIC MATH FUNCTIONS ** @@ -457,21 +480,40 @@ NPY_INPLACE @type@ npy_frexp@c@(@type@ x, int* exp) * #c = l,,f# * #C = L,,F# */ + +/* + * On arm64 macOS, there's a bug with sin, cos, and tan where they don't + * raise "invalid" when given INFINITY as input. 
+ */ +#if defined(__APPLE__) && defined(__arm64__) +#define WORKAROUND_APPLE_TRIG_BUG 1 +#else +#define WORKAROUND_APPLE_TRIG_BUG 0 +#endif + /**begin repeat1 * #kind = sin,cos,tan,sinh,cosh,tanh,fabs,floor,ceil,rint,trunc,sqrt,log10, * log,exp,expm1,asin,acos,atan,asinh,acosh,atanh,log1p,exp2,log2# * #KIND = SIN,COS,TAN,SINH,COSH,TANH,FABS,FLOOR,CEIL,RINT,TRUNC,SQRT,LOG10, * LOG,EXP,EXPM1,ASIN,ACOS,ATAN,ASINH,ACOSH,ATANH,LOG1P,EXP2,LOG2# + * #TRIG_WORKAROUND = WORKAROUND_APPLE_TRIG_BUG*3, 0*22# */ #ifdef HAVE_@KIND@@C@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x) { +#if @TRIG_WORKAROUND@ + if (!npy_isfinite(x)) { + return (x - x); + } +#endif return @kind@@c@(x); } #endif /**end repeat1**/ +#undef WORKAROUND_APPLE_TRIG_BUG + /**begin repeat1 * #kind = atan2,hypot,pow,fmod,copysign# * #KIND = ATAN2,HYPOT,POW,FMOD,COPYSIGN# @@ -795,3 +837,66 @@ npy_rshift@u@@c@(npy_@u@@type@ a, npy_@u@@type@ b) } /**end repeat1**/ /**end repeat**/ + + +#define __popcnt32 __popcnt +/**begin repeat + * + * #type = ubyte, ushort, uint, ulong, ulonglong# + * #STYPE = BYTE, SHORT, INT, LONG, LONGLONG# + * #c = hh, h, , l, ll# + */ +#undef TO_BITS_LEN +#if 0 +/**begin repeat1 + * #len = 8, 16, 32, 64# + */ +#elif NPY_BITSOF_@STYPE@ == @len@ + #define TO_BITS_LEN(X) X##@len@ +/**end repeat1**/ +#endif + + +NPY_INPLACE uint8_t +npy_popcount_parallel@c@(npy_@type@ a) +{ + a = a - ((a >> 1) & (npy_@type@) TO_BITS_LEN(MAGIC)[0]); + a = ((a & (npy_@type@) TO_BITS_LEN(MAGIC)[1])) + ((a >> 2) & (npy_@type@) TO_BITS_LEN(MAGIC)[1]); + a = (a + (a >> 4)) & (npy_@type@) TO_BITS_LEN(MAGIC)[2]; + return (npy_@type@) (a * (npy_@type@) TO_BITS_LEN(MAGIC)[3]) >> ((NPY_SIZEOF_@STYPE@ - 1) * CHAR_BIT); +} + +NPY_INPLACE uint8_t +npy_popcountu@c@(npy_@type@ a) +{ +/* use built-in popcount if present, else use our implementation */ +#if (defined(__clang__) || defined(__GNUC__)) && NPY_BITSOF_@STYPE@ >= 32 + return __builtin_popcount@c@(a); +#elif defined(_MSC_VER) && NPY_BITSOF_@STYPE@ >= 16 + /* no builtin 
__popcnt64 for 32 bits */ + #if defined(_WIN64) || (defined(_WIN32) && NPY_BITSOF_@STYPE@ != 64) + return TO_BITS_LEN(__popcnt)(a); + /* split 64 bit number into two 32 bit ints and return sum of counts */ + #elif (defined(_WIN32) && NPY_BITSOF_@STYPE@ == 64) + npy_uint32 left = (npy_uint32) (a>>32); + npy_uint32 right = (npy_uint32) a; + return __popcnt32(left) + __popcnt32(right); + #endif +#else + return npy_popcount_parallel@c@(a); +#endif +} +/**end repeat**/ + +/**begin repeat + * + * #type = byte, short, int, long, longlong# + * #c = hh, h, , l, ll# + */ +NPY_INPLACE uint8_t +npy_popcount@c@(npy_@type@ a) +{ + /* Return popcount of abs(a) */ + return npy_popcountu@c@(a < 0 ? -a : a); +} +/**end repeat**/ diff --git a/numpy/core/src/npymath/npy_math_private.h b/numpy/core/src/npymath/npy_math_private.h index 212d11a0b..7ca0c5ba0 100644 --- a/numpy/core/src/npymath/npy_math_private.h +++ b/numpy/core/src/npymath/npy_math_private.h @@ -19,7 +19,13 @@ #define _NPY_MATH_PRIVATE_H_ #include <Python.h> +#ifdef __cplusplus +#include <cmath> +using std::isgreater; +using std::isless; +#else #include <math.h> +#endif #include "npy_config.h" #include "npy_fpmath.h" @@ -507,17 +513,29 @@ typedef union { #else /* !_MSC_VER */ typedef union { npy_cdouble npy_z; +#ifdef __cplusplus + std::complex<double> c99z; +#else complex double c99_z; +#endif } __npy_cdouble_to_c99_cast; typedef union { npy_cfloat npy_z; +#ifdef __cplusplus + std::complex<float> c99z; +#else complex float c99_z; +#endif } __npy_cfloat_to_c99_cast; typedef union { npy_clongdouble npy_z; +#ifdef __cplusplus + std::complex<long double> c99_z; +#else complex long double c99_z; +#endif } __npy_clongdouble_to_c99_cast; #endif /* !_MSC_VER */ diff --git a/numpy/core/src/npysort/radixsort.c.src b/numpy/core/src/npysort/radixsort.c.src deleted file mode 100644 index 99d8ed42a..000000000 --- a/numpy/core/src/npysort/radixsort.c.src +++ /dev/null @@ -1,231 +0,0 @@ -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - 
-#include "npy_sort.h" -#include "npysort_common.h" -#include <stdlib.h> - -/* - ***************************************************************************** - ** INTEGER SORTS ** - ***************************************************************************** - */ - - -/**begin repeat - * - * #TYPE = BOOL, BYTE, UBYTE, SHORT, USHORT, INT, UINT, LONG, ULONG, - * LONGLONG, ULONGLONG# - * #suff = bool, byte, ubyte, short, ushort, int, uint, long, ulong, - * longlong, ulonglong# - * #type = npy_ubyte, npy_ubyte, npy_ubyte, npy_ushort, npy_ushort, npy_uint, - * npy_uint, npy_ulong, npy_ulong, npy_ulonglong, npy_ulonglong# - * #sign = 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0# - * #floating = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0# - */ - -// Reference: https://github.com/eloj/radix-sorting#-key-derivation -#if @sign@ - // Floating-point is currently disabled. - // Floating-point tests succeed for double and float on macOS but not on Windows/Linux. - // Basic sorting tests succeed but others relying on sort fail. - // Possibly related to floating-point normalisation or multiple NaN reprs? Not sure. - #if @floating@ - // For floats, we invert the key if the sign bit is set, else we invert the sign bit. - #define KEY_OF(x) ((x) ^ (-((x) >> (sizeof(@type@) * 8 - 1)) | ((@type@)1 << (sizeof(@type@) * 8 - 1)))) - #else - // For signed ints, we flip the sign bit so the negatives are below the positives. 
- #define KEY_OF(x) ((x) ^ ((@type@)1 << (sizeof(@type@) * 8 - 1))) - #endif -#else - // For unsigned ints, the key is as-is - #define KEY_OF(x) (x) -#endif - -static inline npy_ubyte -nth_byte_@suff@(@type@ key, npy_intp l) { - return (key >> (l << 3)) & 0xFF; -} - -static @type@* -radixsort0_@suff@(@type@ *arr, @type@ *aux, npy_intp num) -{ - npy_intp cnt[sizeof(@type@)][1 << 8] = { { 0 } }; - npy_intp i; - size_t l; - @type@ key0 = KEY_OF(arr[0]); - size_t ncols = 0; - npy_ubyte cols[sizeof(@type@)]; - - for (i = 0; i < num; i++) { - @type@ k = KEY_OF(arr[i]); - - for (l = 0; l < sizeof(@type@); l++) { - cnt[l][nth_byte_@suff@(k, l)]++; - } - } - - for (l = 0; l < sizeof(@type@); l++) { - if (cnt[l][nth_byte_@suff@(key0, l)] != num) { - cols[ncols++] = l; - } - } - - for (l = 0; l < ncols; l++) { - npy_intp a = 0; - for (i = 0; i < 256; i++) { - npy_intp b = cnt[cols[l]][i]; - cnt[cols[l]][i] = a; - a += b; - } - } - - for (l = 0; l < ncols; l++) { - @type@* temp; - for (i = 0; i < num; i++) { - @type@ k = KEY_OF(arr[i]); - npy_intp dst = cnt[cols[l]][nth_byte_@suff@(k, cols[l])]++; - aux[dst] = arr[i]; - } - - temp = aux; - aux = arr; - arr = temp; - } - - return arr; -} - -NPY_NO_EXPORT int -radixsort_@suff@(void *start, npy_intp num, void *NPY_UNUSED(varr)) -{ - void *sorted; - @type@ *aux; - @type@ *arr = start; - @type@ k1, k2; - npy_bool all_sorted = 1; - - if (num < 2) { - return 0; - } - - k1 = KEY_OF(arr[0]); - for (npy_intp i = 1; i < num; i++) { - k2 = KEY_OF(arr[i]); - if (k1 > k2) { - all_sorted = 0; - break; - } - k1 = k2; - } - - if (all_sorted) { - return 0; - } - - aux = malloc(num * sizeof(@type@)); - if (aux == NULL) { - return -NPY_ENOMEM; - } - - sorted = radixsort0_@suff@(start, aux, num); - if (sorted != start) { - memcpy(start, sorted, num * sizeof(@type@)); - } - - free(aux); - return 0; -} - -static npy_intp* -aradixsort0_@suff@(@type@ *arr, npy_intp *aux, npy_intp *tosort, npy_intp num) -{ - npy_intp cnt[sizeof(@type@)][1 << 8] = { { 0 
} }; - npy_intp i; - size_t l; - @type@ key0 = KEY_OF(arr[0]); - size_t ncols = 0; - npy_ubyte cols[sizeof(@type@)]; - - for (i = 0; i < num; i++) { - @type@ k = KEY_OF(arr[i]); - - for (l = 0; l < sizeof(@type@); l++) { - cnt[l][nth_byte_@suff@(k, l)]++; - } - } - - for (l = 0; l < sizeof(@type@); l++) { - if (cnt[l][nth_byte_@suff@(key0, l)] != num) { - cols[ncols++] = l; - } - } - - for (l = 0; l < ncols; l++) { - npy_intp a = 0; - for (i = 0; i < 256; i++) { - npy_intp b = cnt[cols[l]][i]; - cnt[cols[l]][i] = a; - a += b; - } - } - - for (l = 0; l < ncols; l++) { - npy_intp* temp; - for (i = 0; i < num; i++) { - @type@ k = KEY_OF(arr[tosort[i]]); - npy_intp dst = cnt[cols[l]][nth_byte_@suff@(k, cols[l])]++; - aux[dst] = tosort[i]; - } - - temp = aux; - aux = tosort; - tosort = temp; - } - - return tosort; -} - -NPY_NO_EXPORT int -aradixsort_@suff@(void *start, npy_intp* tosort, npy_intp num, void *NPY_UNUSED(varr)) -{ - npy_intp *sorted; - npy_intp *aux; - @type@ *arr = start; - @type@ k1, k2; - npy_bool all_sorted = 1; - - if (num < 2) { - return 0; - } - - k1 = KEY_OF(arr[tosort[0]]); - for (npy_intp i = 1; i < num; i++) { - k2 = KEY_OF(arr[tosort[i]]); - if (k1 > k2) { - all_sorted = 0; - break; - } - k1 = k2; - } - - if (all_sorted) { - return 0; - } - - aux = malloc(num * sizeof(npy_intp)); - if (aux == NULL) { - return -NPY_ENOMEM; - } - - sorted = aradixsort0_@suff@(start, aux, tosort, num); - if (sorted != tosort) { - memcpy(tosort, sorted, num * sizeof(npy_intp)); - } - - free(aux); - return 0; -} - -#undef KEY_OF - -/**end repeat**/ diff --git a/numpy/core/src/npysort/radixsort.cpp b/numpy/core/src/npysort/radixsort.cpp new file mode 100644 index 000000000..017ea43b6 --- /dev/null +++ b/numpy/core/src/npysort/radixsort.cpp @@ -0,0 +1,354 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "npy_sort.h" +#include "npysort_common.h" + +#include "../common/numpy_tag.h" +#include <stdlib.h> +#include <type_traits> + +/* + 
***************************************************************************** + ** INTEGER SORTS ** + ***************************************************************************** + */ + +// Reference: https://github.com/eloj/radix-sorting#-key-derivation +template <class T> +T +KEY_OF(T x) +{ + // Floating-point is currently disabled. + // Floating-point tests succeed for double and float on macOS but not on + // Windows/Linux. Basic sorting tests succeed but others relying on sort + // fail. Possibly related to floating-point normalisation or multiple NaN + // reprs? Not sure. + if (std::is_floating_point<T>::value) { + // For floats, we invert the key if the sign bit is set, else we invert + // the sign bit. + return ((x) ^ (-((x) >> (sizeof(T) * 8 - 1)) | + ((T)1 << (sizeof(T) * 8 - 1)))); + } + else if (std::is_signed<T>::value) { + // For signed ints, we flip the sign bit so the negatives are below the + // positives. + return ((x) ^ ((T)1 << (sizeof(T) * 8 - 1))); + } + else { + return x; + } +} + +template <class T> +static inline npy_ubyte +nth_byte(T key, npy_intp l) +{ + return (key >> (l << 3)) & 0xFF; +} + +template <class T> +static T * +radixsort0(T *start, T *aux, npy_intp num) +{ + npy_intp cnt[sizeof(T)][1 << 8] = {{0}}; + T key0 = KEY_OF(start[0]); + + for (npy_intp i = 0; i < num; i++) { + T k = KEY_OF(start[i]); + + for (size_t l = 0; l < sizeof(T); l++) { + cnt[l][nth_byte(k, l)]++; + } + } + + size_t ncols = 0; + npy_ubyte cols[sizeof(T)]; + for (size_t l = 0; l < sizeof(T); l++) { + if (cnt[l][nth_byte(key0, l)] != num) { + cols[ncols++] = l; + } + } + + for (size_t l = 0; l < ncols; l++) { + npy_intp a = 0; + for (npy_intp i = 0; i < 256; i++) { + npy_intp b = cnt[cols[l]][i]; + cnt[cols[l]][i] = a; + a += b; + } + } + + for (size_t l = 0; l < ncols; l++) { + T *temp; + for (npy_intp i = 0; i < num; i++) { + T k = KEY_OF(start[i]); + npy_intp dst = cnt[cols[l]][nth_byte(k, cols[l])]++; + aux[dst] = start[i]; + } + + temp = aux; + aux = 
start; + start = temp; + } + + return start; +} + +template <class T> +static int +radixsort_(T *start, npy_intp num) +{ + if (num < 2) { + return 0; + } + + npy_bool all_sorted = 1; + T k1 = KEY_OF(start[0]), k2; + for (npy_intp i = 1; i < num; i++) { + k2 = KEY_OF(start[i]); + if (k1 > k2) { + all_sorted = 0; + break; + } + k1 = k2; + } + + if (all_sorted) { + return 0; + } + + T *aux = (T *)malloc(num * sizeof(T)); + if (aux == nullptr) { + return -NPY_ENOMEM; + } + + T *sorted = radixsort0(start, aux, num); + if (sorted != start) { + memcpy(start, sorted, num * sizeof(T)); + } + + free(aux); + return 0; +} + +template <class T> +static int +radixsort(void *start, npy_intp num) +{ + return radixsort_((T *)start, num); +} + +template <class T> +static npy_intp * +aradixsort0(T *start, npy_intp *aux, npy_intp *tosort, npy_intp num) +{ + npy_intp cnt[sizeof(T)][1 << 8] = {{0}}; + T key0 = KEY_OF(start[0]); + + for (npy_intp i = 0; i < num; i++) { + T k = KEY_OF(start[i]); + + for (size_t l = 0; l < sizeof(T); l++) { + cnt[l][nth_byte(k, l)]++; + } + } + + size_t ncols = 0; + npy_ubyte cols[sizeof(T)]; + for (size_t l = 0; l < sizeof(T); l++) { + if (cnt[l][nth_byte(key0, l)] != num) { + cols[ncols++] = l; + } + } + + for (size_t l = 0; l < ncols; l++) { + npy_intp a = 0; + for (npy_intp i = 0; i < 256; i++) { + npy_intp b = cnt[cols[l]][i]; + cnt[cols[l]][i] = a; + a += b; + } + } + + for (size_t l = 0; l < ncols; l++) { + npy_intp *temp; + for (npy_intp i = 0; i < num; i++) { + T k = KEY_OF(start[tosort[i]]); + npy_intp dst = cnt[cols[l]][nth_byte(k, cols[l])]++; + aux[dst] = tosort[i]; + } + + temp = aux; + aux = tosort; + tosort = temp; + } + + return tosort; +} + +template <class T> +static int +aradixsort_(T *start, npy_intp *tosort, npy_intp num) +{ + npy_intp *sorted; + npy_intp *aux; + T k1, k2; + npy_bool all_sorted = 1; + + if (num < 2) { + return 0; + } + + k1 = KEY_OF(start[tosort[0]]); + for (npy_intp i = 1; i < num; i++) { + k2 = 
KEY_OF(start[tosort[i]]); + if (k1 > k2) { + all_sorted = 0; + break; + } + k1 = k2; + } + + if (all_sorted) { + return 0; + } + + aux = (npy_intp *)malloc(num * sizeof(npy_intp)); + if (aux == NULL) { + return -NPY_ENOMEM; + } + + sorted = aradixsort0(start, aux, tosort, num); + if (sorted != tosort) { + memcpy(tosort, sorted, num * sizeof(npy_intp)); + } + + free(aux); + return 0; +} + +template <class T> +static int +aradixsort(void *start, npy_intp *tosort, npy_intp num) +{ + return aradixsort_((T *)start, tosort, num); +} + +extern "C" { +NPY_NO_EXPORT int +radixsort_bool(void *vec, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return radixsort<npy_bool>(vec, cnt); +} +NPY_NO_EXPORT int +radixsort_byte(void *vec, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return radixsort<npy_byte>(vec, cnt); +} +NPY_NO_EXPORT int +radixsort_ubyte(void *vec, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return radixsort<npy_ubyte>(vec, cnt); +} +NPY_NO_EXPORT int +radixsort_short(void *vec, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return radixsort<npy_short>(vec, cnt); +} +NPY_NO_EXPORT int +radixsort_ushort(void *vec, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return radixsort<npy_ushort>(vec, cnt); +} +NPY_NO_EXPORT int +radixsort_int(void *vec, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return radixsort<npy_int>(vec, cnt); +} +NPY_NO_EXPORT int +radixsort_uint(void *vec, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return radixsort<npy_uint>(vec, cnt); +} +NPY_NO_EXPORT int +radixsort_long(void *vec, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return radixsort<npy_long>(vec, cnt); +} +NPY_NO_EXPORT int +radixsort_ulong(void *vec, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return radixsort<npy_ulong>(vec, cnt); +} +NPY_NO_EXPORT int +radixsort_longlong(void *vec, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return radixsort<npy_longlong>(vec, cnt); +} +NPY_NO_EXPORT int +radixsort_ulonglong(void *vec, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return radixsort<npy_ulonglong>(vec, 
cnt); +} +NPY_NO_EXPORT int +aradixsort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return aradixsort<npy_bool>(vec, ind, cnt); +} +NPY_NO_EXPORT int +aradixsort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return aradixsort<npy_byte>(vec, ind, cnt); +} +NPY_NO_EXPORT int +aradixsort_ubyte(void *vec, npy_intp *ind, npy_intp cnt, + void *NPY_UNUSED(null)) +{ + return aradixsort<npy_ubyte>(vec, ind, cnt); +} +NPY_NO_EXPORT int +aradixsort_short(void *vec, npy_intp *ind, npy_intp cnt, + void *NPY_UNUSED(null)) +{ + return aradixsort<npy_short>(vec, ind, cnt); +} +NPY_NO_EXPORT int +aradixsort_ushort(void *vec, npy_intp *ind, npy_intp cnt, + void *NPY_UNUSED(null)) +{ + return aradixsort<npy_ushort>(vec, ind, cnt); +} +NPY_NO_EXPORT int +aradixsort_int(void *vec, npy_intp *ind, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return aradixsort<npy_int>(vec, ind, cnt); +} +NPY_NO_EXPORT int +aradixsort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return aradixsort<npy_uint>(vec, ind, cnt); +} +NPY_NO_EXPORT int +aradixsort_long(void *vec, npy_intp *ind, npy_intp cnt, void *NPY_UNUSED(null)) +{ + return aradixsort<npy_long>(vec, ind, cnt); +} +NPY_NO_EXPORT int +aradixsort_ulong(void *vec, npy_intp *ind, npy_intp cnt, + void *NPY_UNUSED(null)) +{ + return aradixsort<npy_ulong>(vec, ind, cnt); +} +NPY_NO_EXPORT int +aradixsort_longlong(void *vec, npy_intp *ind, npy_intp cnt, + void *NPY_UNUSED(null)) +{ + return aradixsort<npy_longlong>(vec, ind, cnt); +} +NPY_NO_EXPORT int +aradixsort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt, + void *NPY_UNUSED(null)) +{ + return aradixsort<npy_ulonglong>(vec, ind, cnt); +} +} diff --git a/numpy/core/src/umath/_operand_flag_tests.c.src b/numpy/core/src/umath/_operand_flag_tests.c.src index d22a5c507..c59e13baf 100644 --- a/numpy/core/src/umath/_operand_flag_tests.c.src +++ b/numpy/core/src/umath/_operand_flag_tests.c.src @@ -1,6 +1,7 @@ -#define 
NPY_NO_DEPRECATED_API NPY_API_VERSION - +#define PY_SSIZE_T_CLEAN #include <Python.h> + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION #include <numpy/arrayobject.h> #include <numpy/ufuncobject.h> #include "numpy/npy_3kcompat.h" diff --git a/numpy/core/src/umath/_rational_tests.c.src b/numpy/core/src/umath/_rational_tests.c.src index 7b1e5627a..bf50a2226 100644 --- a/numpy/core/src/umath/_rational_tests.c.src +++ b/numpy/core/src/umath/_rational_tests.c.src @@ -1,16 +1,16 @@ /* Fixed size rational numbers exposed to Python */ - -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - +#define PY_SSIZE_T_CLEAN #include <Python.h> #include <structmember.h> -#include <numpy/arrayobject.h> -#include <numpy/ufuncobject.h> -#include <numpy/npy_3kcompat.h> -#include <math.h> +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#include "numpy/arrayobject.h" +#include "numpy/ufuncobject.h" +#include "numpy/npy_3kcompat.h" #include "common.h" /* for error_converting */ +#include <math.h> + /* Relevant arithmetic exceptions */ diff --git a/numpy/core/src/umath/_scaled_float_dtype.c b/numpy/core/src/umath/_scaled_float_dtype.c index cbea378f0..b6c19362a 100644 --- a/numpy/core/src/umath/_scaled_float_dtype.c +++ b/numpy/core/src/umath/_scaled_float_dtype.c @@ -11,10 +11,10 @@ * NOTE: The tests were initially written using private API and ABI, ideally * they should be replaced/modified with versions using public API. */ - -#define _UMATHMODULE -#define _MULTIARRAYMODULE #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#define _UMATHMODULE + #include "numpy/ndarrayobject.h" #include "numpy/ufuncobject.h" @@ -398,6 +398,42 @@ float_to_from_sfloat_resolve_descriptors( } +/* + * Cast to boolean (for testing the logical functions a bit better). 
+ */ +static int +cast_sfloat_to_bool(PyArrayMethod_Context *NPY_UNUSED(context), + char *const data[], npy_intp const dimensions[], + npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) +{ + npy_intp N = dimensions[0]; + char *in = data[0]; + char *out = data[1]; + for (npy_intp i = 0; i < N; i++) { + *(npy_bool *)out = *(double *)in != 0; + in += strides[0]; + out += strides[1]; + } + return 0; +} + +static NPY_CASTING +sfloat_to_bool_resolve_descriptors( + PyArrayMethodObject *NPY_UNUSED(self), + PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]), + PyArray_Descr *given_descrs[2], + PyArray_Descr *loop_descrs[2]) +{ + Py_INCREF(given_descrs[0]); + loop_descrs[0] = given_descrs[0]; + if (loop_descrs[0] == NULL) { + return -1; + } + loop_descrs[1] = PyArray_DescrFromType(NPY_BOOL); /* cannot fail */ + return NPY_UNSAFE_CASTING; +} + + static int init_casts(void) { @@ -453,6 +489,22 @@ init_casts(void) return -1; } + slots[0].slot = NPY_METH_resolve_descriptors; + slots[0].pfunc = &sfloat_to_bool_resolve_descriptors; + slots[1].slot = NPY_METH_strided_loop; + slots[1].pfunc = &cast_sfloat_to_bool; + slots[2].slot = 0; + slots[2].pfunc = NULL; + + spec.name = "sfloat_to_bool_cast"; + dtypes[0] = &PyArray_SFloatDType; + dtypes[1] = PyArray_DTypeFromTypeNum(NPY_BOOL); + Py_DECREF(dtypes[1]); /* immortal anyway */ + + if (PyArray_AddCastingImplementation_FromSpec(&spec, 0)) { + return -1; + } + return 0; } @@ -733,9 +785,9 @@ NPY_NO_EXPORT PyObject * get_sfloat_dtype(PyObject *NPY_UNUSED(mod), PyObject *NPY_UNUSED(args)) { /* Allow calling the function multiple times. 
*/ - static npy_bool initalized = NPY_FALSE; + static npy_bool initialized = NPY_FALSE; - if (initalized) { + if (initialized) { Py_INCREF(&PyArray_SFloatDType); return (PyObject *)&PyArray_SFloatDType; } @@ -764,6 +816,6 @@ get_sfloat_dtype(PyObject *NPY_UNUSED(mod), PyObject *NPY_UNUSED(args)) return NULL; } - initalized = NPY_TRUE; + initialized = NPY_TRUE; return (PyObject *)&PyArray_SFloatDType; } diff --git a/numpy/core/src/umath/_struct_ufunc_tests.c.src b/numpy/core/src/umath/_struct_ufunc_tests.c.src index d602656c8..ee71c4698 100644 --- a/numpy/core/src/umath/_struct_ufunc_tests.c.src +++ b/numpy/core/src/umath/_struct_ufunc_tests.c.src @@ -1,11 +1,13 @@ -#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define PY_SSIZE_T_CLEAN +#include <Python.h> -#include "Python.h" -#include "math.h" +#define NPY_NO_DEPRECATED_API NPY_API_VERSION #include "numpy/ndarraytypes.h" #include "numpy/ufuncobject.h" #include "numpy/npy_3kcompat.h" +#include <math.h> + /* * struct_ufunc_test.c diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src index 2e79d377e..ce42fc271 100644 --- a/numpy/core/src/umath/_umath_tests.c.src +++ b/numpy/core/src/umath/_umath_tests.c.src @@ -5,9 +5,10 @@ ** INCLUDES ** ***************************************************************************** */ -#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define PY_SSIZE_T_CLEAN +#include <Python.h> -#include "Python.h" +#define NPY_NO_DEPRECATED_API NPY_API_VERSION #include "numpy/arrayobject.h" #include "numpy/ufuncobject.h" #include "numpy/npy_math.h" @@ -57,6 +58,19 @@ ***************************************************************************** */ +static void +always_error_loop( + char **NPY_UNUSED(args), npy_intp const *NPY_UNUSED(dimensions), + npy_intp const *NPY_UNUSED(steps), void *NPY_UNUSED(func)) +{ + NPY_ALLOW_C_API_DEF + NPY_ALLOW_C_API; + PyErr_SetString(PyExc_RuntimeError, "How unexpected :)!"); + NPY_DISABLE_C_API; + return; +} + + char 
*inner1d_signature = "(i),(i)->()"; /**begin repeat @@ -347,6 +361,9 @@ defdict = { */ +static PyUFuncGenericFunction always_error_functions[] = { always_error_loop }; +static void *always_error_data[] = { (void *)NULL }; +static char always_error_signatures[] = { NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE }; static PyUFuncGenericFunction inner1d_functions[] = { LONG_inner1d, DOUBLE_inner1d }; static void *inner1d_data[] = { (void *)NULL, (void *)NULL }; static char inner1d_signatures[] = { NPY_LONG, NPY_LONG, NPY_LONG, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE }; @@ -374,6 +391,25 @@ static int addUfuncs(PyObject *dictionary) { PyObject *f; + f = PyUFunc_FromFuncAndData(always_error_functions, always_error_data, + always_error_signatures, 1, 2, 1, PyUFunc_None, "always_error", + "simply, broken, ufunc that sets an error (but releases the GIL).", + 0); + if (f == NULL) { + return -1; + } + PyDict_SetItemString(dictionary, "always_error", f); + Py_DECREF(f); + f = PyUFunc_FromFuncAndDataAndSignature(always_error_functions, + always_error_data, always_error_signatures, 1, 2, 1, PyUFunc_None, + "always_error_gufunc", + "simply, broken, gufunc that sets an error (but releases the GIL).", + 0, "(i),()->()"); + if (f == NULL) { + return -1; + } + PyDict_SetItemString(dictionary, "always_error_gufunc", f); + Py_DECREF(f); f = PyUFunc_FromFuncAndDataAndSignature(inner1d_functions, inner1d_data, inner1d_signatures, 2, 2, 1, PyUFunc_None, "inner1d", "inner on the last dimension and broadcast on the rest \n" @@ -585,7 +621,7 @@ fail: return NULL; } -// Testing the utilites of the CPU dispatcher +// Testing the utilities of the CPU dispatcher #ifndef NPY_DISABLE_OPTIMIZATION #include "_umath_tests.dispatch.h" #endif diff --git a/numpy/core/src/umath/_umath_tests.dispatch.c b/numpy/core/src/umath/_umath_tests.dispatch.c index 85f365010..9d8df4c86 100644 --- a/numpy/core/src/umath/_umath_tests.dispatch.c +++ b/numpy/core/src/umath/_umath_tests.dispatch.c @@ -1,12 +1,14 @@ /** - * Testing the 
utilites of the CPU dispatcher + * Testing the utilities of the CPU dispatcher * * @targets $werror baseline * SSE2 SSE41 AVX2 * VSX VSX2 VSX3 * NEON ASIMD ASIMDHP */ +#define PY_SSIZE_T_CLEAN #include <Python.h> + #include "npy_cpu_dispatch.h" #ifndef NPY_DISABLE_OPTIMIZATION diff --git a/numpy/core/src/umath/clip.c.src b/numpy/core/src/umath/clip.c.src deleted file mode 100644 index 9c4bac2d1..000000000 --- a/numpy/core/src/umath/clip.c.src +++ /dev/null @@ -1,119 +0,0 @@ -/** - * This module provides the inner loops for the clip ufunc - */ -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include "Python.h" - -#include "numpy/halffloat.h" -#include "numpy/npy_math.h" -#include "numpy/ndarraytypes.h" -#include "numpy/npy_common.h" -#include "numpy/utils.h" -#include "fast_loop_macros.h" - -/* - * Produce macros that perform nan/nat-propagating min and max - */ - -/**begin repeat - * #name = BOOL, - * BYTE, UBYTE, SHORT, USHORT, INT, UINT, - * LONG, ULONG, LONGLONG, ULONGLONG# - */ -#define _NPY_@name@_MIN(a, b) PyArray_MIN(a, b) -#define _NPY_@name@_MAX(a, b) PyArray_MAX(a, b) -/**end repeat**/ - -#define _NPY_HALF_MIN(a, b) (npy_half_isnan(a) || npy_half_le(a, b) ? (a) : (b)) -#define _NPY_HALF_MAX(a, b) (npy_half_isnan(a) || npy_half_ge(a, b) ? (a) : (b)) - -/**begin repeat - * #name = FLOAT, DOUBLE, LONGDOUBLE# - */ -#define _NPY_@name@_MIN(a, b) (npy_isnan(a) ? (a) : PyArray_MIN(a, b)) -#define _NPY_@name@_MAX(a, b) (npy_isnan(a) ? (a) : PyArray_MAX(a, b)) -/**end repeat**/ - -/**begin repeat - * #name = CFLOAT, CDOUBLE, CLONGDOUBLE# - */ -#define _NPY_@name@_MIN(a, b) (npy_isnan((a).real) || npy_isnan((a).imag) || PyArray_CLT(a, b) ? (a) : (b)) -#define _NPY_@name@_MAX(a, b) (npy_isnan((a).real) || npy_isnan((a).imag) || PyArray_CGT(a, b) ? (a) : (b)) -/**end repeat**/ - -/**begin repeat - * #name = DATETIME, TIMEDELTA# - */ -#define _NPY_@name@_MIN(a, b) ( \ - (a) == NPY_DATETIME_NAT ? 
(a) : \ - (b) == NPY_DATETIME_NAT ? (b) : \ - (a) < (b) ? (a) : (b) \ -) -#define _NPY_@name@_MAX(a, b) ( \ - (a) == NPY_DATETIME_NAT ? (a) : \ - (b) == NPY_DATETIME_NAT ? (b) : \ - (a) > (b) ? (a) : (b) \ -) -/**end repeat**/ - -/**begin repeat - * - * #name = BOOL, - * BYTE, UBYTE, SHORT, USHORT, INT, UINT, - * LONG, ULONG, LONGLONG, ULONGLONG, - * HALF, FLOAT, DOUBLE, LONGDOUBLE, - * CFLOAT, CDOUBLE, CLONGDOUBLE, - * DATETIME, TIMEDELTA# - * #type = npy_bool, - * npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, - * npy_long, npy_ulong, npy_longlong, npy_ulonglong, - * npy_half, npy_float, npy_double, npy_longdouble, - * npy_cfloat, npy_cdouble, npy_clongdouble, - * npy_datetime, npy_timedelta# - */ - -#define _NPY_CLIP(x, min, max) \ - _NPY_@name@_MIN(_NPY_@name@_MAX((x), (min)), (max)) - -NPY_NO_EXPORT void -@name@_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (steps[1] == 0 && steps[2] == 0) { - /* min and max are constant throughout the loop, the most common case */ - /* NOTE: it may be possible to optimize these checks for nan */ - @type@ min_val = *(@type@ *)args[1]; - @type@ max_val = *(@type@ *)args[2]; - - char *ip1 = args[0], *op1 = args[3]; - npy_intp is1 = steps[0], os1 = steps[3]; - npy_intp n = dimensions[0]; - - /* contiguous, branch to let the compiler optimize */ - if (is1 == sizeof(@type@) && os1 == sizeof(@type@)) { - for(npy_intp i = 0; i < n; i++, ip1 += is1, op1 += os1) { - *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, min_val, max_val); - } - } - else { - for(npy_intp i = 0; i < n; i++, ip1 += is1, op1 += os1) { - *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, min_val, max_val); - } - } - } - else { - TERNARY_LOOP { - *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, *(@type@ *)ip2, *(@type@ *)ip3); - } - } - npy_clear_floatstatus_barrier((char*)dimensions); -} - -// clean up the macros we defined above -#undef _NPY_CLIP -#undef _NPY_@name@_MAX -#undef _NPY_@name@_MIN - -/**end 
repeat**/ diff --git a/numpy/core/src/umath/clip.cpp b/numpy/core/src/umath/clip.cpp new file mode 100644 index 000000000..19d05c848 --- /dev/null +++ b/numpy/core/src/umath/clip.cpp @@ -0,0 +1,282 @@ +/** + * This module provides the inner loops for the clip ufunc + */ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#include "numpy/halffloat.h" +#include "numpy/ndarraytypes.h" +#include "numpy/npy_common.h" +#include "numpy/npy_math.h" +#include "numpy/utils.h" + +#include "fast_loop_macros.h" + +#include "../common/numpy_tag.h" + +template <class T> +T +_NPY_MIN(T a, T b, npy::integral_tag const &) +{ + return PyArray_MIN(a, b); +} +template <class T> +T +_NPY_MAX(T a, T b, npy::integral_tag const &) +{ + return PyArray_MAX(a, b); +} + +npy_half +_NPY_MIN(npy_half a, npy_half b, npy::half_tag const &) +{ + return npy_half_isnan(a) || npy_half_le(a, b) ? (a) : (b); +} +npy_half +_NPY_MAX(npy_half a, npy_half b, npy::half_tag const &) +{ + return npy_half_isnan(a) || npy_half_ge(a, b) ? (a) : (b); +} + +template <class T> +T +_NPY_MIN(T a, T b, npy::floating_point_tag const &) +{ + return npy_isnan(a) ? (a) : PyArray_MIN(a, b); +} +template <class T> +T +_NPY_MAX(T a, T b, npy::floating_point_tag const &) +{ + return npy_isnan(a) ? (a) : PyArray_MAX(a, b); +} + +template <class T> +T +_NPY_MIN(T a, T b, npy::complex_tag const &) +{ + return npy_isnan((a).real) || npy_isnan((a).imag) || PyArray_CLT(a, b) + ? (a) + : (b); +} +template <class T> +T +_NPY_MAX(T a, T b, npy::complex_tag const &) +{ + return npy_isnan((a).real) || npy_isnan((a).imag) || PyArray_CGT(a, b) + ? (a) + : (b); +} + +template <class T> +T +_NPY_MIN(T a, T b, npy::date_tag const &) +{ + return (a) == NPY_DATETIME_NAT ? (a) + : (b) == NPY_DATETIME_NAT ? (b) + : (a) < (b) ? (a) + : (b); +} +template <class T> +T +_NPY_MAX(T a, T b, npy::date_tag const &) +{ + return (a) == NPY_DATETIME_NAT ? 
(a) + : (b) == NPY_DATETIME_NAT ? (b) + : (a) > (b) ? (a) + : (b); +} + +/* generic dispatcher */ +template <class Tag, class T = typename Tag::type> +T +_NPY_MIN(T const &a, T const &b) +{ + return _NPY_MIN(a, b, Tag{}); +} +template <class Tag, class T = typename Tag::type> +T +_NPY_MAX(T const &a, T const &b) +{ + return _NPY_MAX(a, b, Tag{}); +} + +template <class Tag, class T> +T +_NPY_CLIP(T x, T min, T max) +{ + return _NPY_MIN<Tag>(_NPY_MAX<Tag>((x), (min)), (max)); +} + +template <class Tag, class T = typename Tag::type> +static void +_npy_clip_(T **args, npy_intp const *dimensions, npy_intp const *steps) +{ + npy_intp n = dimensions[0]; + if (steps[1] == 0 && steps[2] == 0) { + /* min and max are constant throughout the loop, the most common case + */ + /* NOTE: it may be possible to optimize these checks for nan */ + T min_val = *args[1]; + T max_val = *args[2]; + + T *ip1 = args[0], *op1 = args[3]; + npy_intp is1 = steps[0] / sizeof(T), os1 = steps[3] / sizeof(T); + + /* contiguous, branch to let the compiler optimize */ + if (is1 == 1 && os1 == 1) { + for (npy_intp i = 0; i < n; i++, ip1++, op1++) { + *op1 = _NPY_CLIP<Tag>(*ip1, min_val, max_val); + } + } + else { + for (npy_intp i = 0; i < n; i++, ip1 += is1, op1 += os1) { + *op1 = _NPY_CLIP<Tag>(*ip1, min_val, max_val); + } + } + } + else { + T *ip1 = args[0], *ip2 = args[1], *ip3 = args[2], *op1 = args[3]; + npy_intp is1 = steps[0] / sizeof(T), is2 = steps[1] / sizeof(T), + is3 = steps[2] / sizeof(T), os1 = steps[3] / sizeof(T); + for (npy_intp i = 0; i < n; + i++, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1) + *op1 = _NPY_CLIP<Tag>(*ip1, *ip2, *ip3); + } + npy_clear_floatstatus_barrier((char *)dimensions); +} + +template <class Tag> +static void +_npy_clip(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ + using T = typename Tag::type; + return _npy_clip_<Tag>((T **)args, dimensions, steps); +} + +extern "C" { +NPY_NO_EXPORT void +BOOL_clip(char **args, npy_intp const 
*dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::bool_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +BYTE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::byte_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +UBYTE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::ubyte_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +SHORT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::short_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +USHORT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::ushort_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +INT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::int_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +UINT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::uint_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +LONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::long_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +ULONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::ulong_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +LONGLONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::longlong_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +ULONGLONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return 
_npy_clip<npy::ulonglong_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +HALF_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::half_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +FLOAT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::float_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +DOUBLE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::double_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +LONGDOUBLE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::longdouble_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +CFLOAT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::cfloat_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +CDOUBLE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::cdouble_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +CLONGDOUBLE_clip(char **args, npy_intp const *dimensions, + npy_intp const *steps, void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::clongdouble_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +DATETIME_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::datetime_tag>(args, dimensions, steps); +} +NPY_NO_EXPORT void +TIMEDELTA_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + return _npy_clip<npy::timedelta_tag>(args, dimensions, steps); +} +} diff --git a/numpy/core/src/umath/clip.h b/numpy/core/src/umath/clip.h new file mode 100644 index 000000000..f69ebd1e3 --- /dev/null +++ b/numpy/core/src/umath/clip.h @@ -0,0 +1,73 @@ +#ifndef 
_NPY_UMATH_CLIP_H_ +#define _NPY_UMATH_CLIP_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +NPY_NO_EXPORT void +BOOL_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +BYTE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +UBYTE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +SHORT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +USHORT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +INT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +UINT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +LONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +ULONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +LONGLONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +ULONGLONG_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +HALF_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +FLOAT_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +DOUBLE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +LONGDOUBLE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +CFLOAT_clip(char **args, npy_intp 
const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +CDOUBLE_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +CLONGDOUBLE_clip(char **args, npy_intp const *dimensions, + npy_intp const *steps, void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +DATETIME_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +TIMEDELTA_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/numpy/core/src/umath/clip.h.src b/numpy/core/src/umath/clip.h.src deleted file mode 100644 index f16856cdf..000000000 --- a/numpy/core/src/umath/clip.h.src +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _NPY_UMATH_CLIP_H_ -#define _NPY_UMATH_CLIP_H_ - - -/**begin repeat - * - * #name = BOOL, - * BYTE, UBYTE, SHORT, USHORT, INT, UINT, - * LONG, ULONG, LONGLONG, ULONGLONG, - * HALF, FLOAT, DOUBLE, LONGDOUBLE, - * CFLOAT, CDOUBLE, CLONGDOUBLE, - * DATETIME, TIMEDELTA# - */ -NPY_NO_EXPORT void -@name@_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat**/ - -#endif diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c index b97441b13..8e99c0420 100644 --- a/numpy/core/src/umath/dispatching.c +++ b/numpy/core/src/umath/dispatching.c @@ -34,11 +34,12 @@ * into the `signature` so that it is available to the ufunc loop. 
* */ -#include <Python.h> - -#define _UMATHMODULE -#define _MULTIARRAYMODULE #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#define _UMATHMODULE + +#define PY_SSIZE_T_CLEAN +#include <Python.h> #include "numpy/ndarraytypes.h" #include "common.h" @@ -192,6 +193,10 @@ resolve_implementation_info(PyUFuncObject *ufunc, /* Unspecified out always matches (see below for inputs) */ continue; } + if (resolver_dtype == (PyArray_DTypeMeta *)Py_None) { + /* always matches */ + continue; + } if (given_dtype == resolver_dtype) { continue; } @@ -266,8 +271,39 @@ resolve_implementation_info(PyUFuncObject *ufunc, * the subclass should be considered a better match * (subclasses are always more specific). */ + /* Whether this (normally output) dtype was specified at all */ + if (op_dtypes[i] == NULL) { + /* + * When DType is completely unspecified, prefer abstract + * over concrete, assuming it will resolve. + * Furthermore, we cannot decide which abstract/None + * is "better", only concrete ones which are subclasses + * of Abstract ones are defined as worse. 
+ */ + npy_bool prev_is_concrete = NPY_FALSE; + npy_bool new_is_concrete = NPY_FALSE; + if ((prev_dtype != Py_None) && + !NPY_DT_is_abstract((PyArray_DTypeMeta *)prev_dtype)) { + prev_is_concrete = NPY_TRUE; + } + if ((new_dtype != Py_None) && + !NPY_DT_is_abstract((PyArray_DTypeMeta *)new_dtype)) { + new_is_concrete = NPY_TRUE; + } + if (prev_is_concrete == new_is_concrete) { + best = -1; + } + else if (prev_is_concrete) { + unambiguously_equally_good = 0; + best = 1; + } + else { + unambiguously_equally_good = 0; + best = 0; + } + } /* If either is None, the other is strictly more specific */ - if (prev_dtype == Py_None) { + else if (prev_dtype == Py_None) { unambiguously_equally_good = 0; best = 1; } @@ -288,13 +324,29 @@ resolve_implementation_info(PyUFuncObject *ufunc, */ best = -1; } + else if (!NPY_DT_is_abstract((PyArray_DTypeMeta *)prev_dtype)) { + /* old is not abstract, so better (both not possible) */ + unambiguously_equally_good = 0; + best = 0; + } + else if (!NPY_DT_is_abstract((PyArray_DTypeMeta *)new_dtype)) { + /* new is not abstract, so better (both not possible) */ + unambiguously_equally_good = 0; + best = 1; + } /* - * TODO: Unreachable, but we will need logic for abstract - * DTypes to decide if one is a subclass of the other - * (And their subclass relation is well defined.) + * TODO: This will need logic for abstract DTypes to decide if + * one is a subclass of the other (And their subclass + * relation is well defined). For now, we bail out + * in cas someone manages to get here. */ else { - assert(0); + PyErr_SetString(PyExc_NotImplementedError, + "deciding which one of two abstract dtypes is " + "a better match is not yet implemented. 
This " + "will pick the better (or bail) in the future."); + *out_info = NULL; + return -1; } if ((current_best != -1) && (current_best != best)) { @@ -611,6 +663,35 @@ promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc, } return info; } + else if (info == NULL && op_dtypes[0] == NULL) { + /* + * If we have a reduction, fill in the unspecified input/array + * assuming it should have the same dtype as the operand input + * (or the output one if given). + * Then, try again. In some cases, this will choose different + * paths, such as `ll->?` instead of an `??->?` loop for `np.equal` + * when the input is `.l->.` (`.` meaning undefined). This will + * then cause an error. But cast to `?` would always lose + * information, and in many cases important information: + * + * ```python + * from operator import eq + * from functools import reduce + * + * reduce(eq, [1, 2, 3]) != reduce(eq, [True, True, True]) + * ``` + * + * The special cases being `logical_(and|or|xor)` which can always + * cast to boolean ahead of time and still give the right answer + * (unsafe cast to bool is fine here). We special case these at + * the time of this comment (NumPy 1.21). + */ + assert(ufunc->nin == 2 && ufunc->nout == 1); + op_dtypes[0] = op_dtypes[2] != NULL ? op_dtypes[2] : op_dtypes[1]; + Py_INCREF(op_dtypes[0]); + return promote_and_get_info_and_ufuncimpl(ufunc, + ops, signature, op_dtypes, allow_legacy_promotion, 1); + } } /* @@ -742,3 +823,94 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc, return method; } + + +/* + * Special promoter for the logical ufuncs. The logical ufuncs can always + * use the ??->? and still get the correct output (as long as the output + * is not supposed to be `object`). + */ +static int +logical_ufunc_promoter(PyUFuncObject *NPY_UNUSED(ufunc), + PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[], + PyArray_DTypeMeta *new_op_dtypes[]) +{ + /* + * If we find any object DType at all, we currently force to object. 
+ * However, if the output is specified and not object, there is no point, + * it should be just as well to cast the input rather than doing the + * unsafe out cast. + */ + int force_object = 0; + + for (int i = 0; i < 3; i++) { + PyArray_DTypeMeta *item; + if (signature[i] != NULL) { + item = signature[i]; + Py_INCREF(item); + if (item->type_num == NPY_OBJECT) { + force_object = 1; + } + } + else { + /* Always override to boolean */ + item = PyArray_DTypeFromTypeNum(NPY_BOOL); + if (op_dtypes[i] != NULL && op_dtypes[i]->type_num == NPY_OBJECT) { + force_object = 1; + } + } + new_op_dtypes[i] = item; + } + + if (!force_object || (op_dtypes[2] != NULL + && op_dtypes[2]->type_num != NPY_OBJECT)) { + return 0; + } + /* + * Actually, we have to use the OBJECT loop after all, set all we can + * to object (that might not work out, but try). + * + * NOTE: Change this to check for `op_dtypes[0] == NULL` to STOP + * returning `object` for `np.logical_and.reduce(obj_arr)` + * which will also affect `np.all` and `np.any`! 
+ */ + for (int i = 0; i < 3; i++) { + if (signature[i] != NULL) { + continue; + } + Py_SETREF(new_op_dtypes[i], PyArray_DTypeFromTypeNum(NPY_OBJECT)); + } + return 0; +} + + +NPY_NO_EXPORT int +install_logical_ufunc_promoter(PyObject *ufunc) +{ + if (PyObject_Type(ufunc) != (PyObject *)&PyUFunc_Type) { + PyErr_SetString(PyExc_RuntimeError, + "internal numpy array, logical ufunc was not a ufunc?!"); + return -1; + } + PyObject *dtype_tuple = PyTuple_Pack(3, + &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArrayDescr_Type, NULL); + if (dtype_tuple == NULL) { + return -1; + } + PyObject *promoter = PyCapsule_New(&logical_ufunc_promoter, + "numpy._ufunc_promoter", NULL); + if (promoter == NULL) { + Py_DECREF(dtype_tuple); + return -1; + } + + PyObject *info = PyTuple_Pack(2, dtype_tuple, promoter); + Py_DECREF(dtype_tuple); + Py_DECREF(promoter); + if (info == NULL) { + return -1; + } + + return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0); +} + diff --git a/numpy/core/src/umath/dispatching.h b/numpy/core/src/umath/dispatching.h index 8d116873c..2f314615d 100644 --- a/numpy/core/src/umath/dispatching.h +++ b/numpy/core/src/umath/dispatching.h @@ -26,4 +26,8 @@ NPY_NO_EXPORT PyObject * add_and_return_legacy_wrapping_ufunc_loop(PyUFuncObject *ufunc, PyArray_DTypeMeta *operation_dtypes[], int ignore_duplicate); +NPY_NO_EXPORT int +install_logical_ufunc_promoter(PyObject *ufunc); + + #endif /*_NPY_DISPATCHING_H */ diff --git a/numpy/core/src/umath/extobj.c b/numpy/core/src/umath/extobj.c index cd81f7734..6b9a27e26 100644 --- a/numpy/core/src/umath/extobj.c +++ b/numpy/core/src/umath/extobj.c @@ -1,7 +1,8 @@ -#define _UMATHMODULE -#define _MULTIARRAYMODULE #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#define _UMATHMODULE +#define PY_SSIZE_T_CLEAN #include <Python.h> #include "npy_config.h" diff --git a/numpy/core/src/umath/legacy_array_method.c b/numpy/core/src/umath/legacy_array_method.c index 4351f1d25..a423823d4 100644 --- 
a/numpy/core/src/umath/legacy_array_method.c +++ b/numpy/core/src/umath/legacy_array_method.c @@ -2,12 +2,13 @@ * This file defines most of the machinery in order to wrap legacy style * ufunc loops into new style arraymethods. */ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#define _UMATHMODULE +#define PY_SSIZE_T_CLEAN #include <Python.h> -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION #include "numpy/ndarraytypes.h" #include "convert_datatype.h" @@ -216,6 +217,25 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc, */ int any_output_flexible = 0; NPY_ARRAYMETHOD_FLAGS flags = 0; + if (ufunc->nargs == 3 && + signature[0]->type_num == NPY_BOOL && + signature[1]->type_num == NPY_BOOL && + signature[2]->type_num == NPY_BOOL && ( + strcmp(ufunc->name, "logical_or") == 0 || + strcmp(ufunc->name, "logical_and") == 0 || + strcmp(ufunc->name, "logical_xor") == 0)) { + /* + * This is a logical ufunc, and the `??->?` loop`. It is always OK + * to cast any input to bool, because that cast is defined by + * truthiness. + * This allows to ensure two things: + * 1. `np.all`/`np.any` know that force casting the input is OK + * (they must do this since there are no `?l->?`, etc. loops) + * 2. The logical functions automatically work for any DType + * implementing a cast to boolean. 
+ */ + flags = _NPY_METH_FORCE_CAST_INPUTS; + } for (int i = 0; i < ufunc->nin+ufunc->nout; i++) { if (signature[i]->singleton->flags & ( diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 8df439aca..7c0710819 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1,11 +1,11 @@ /* -*- c -*- */ +#define PY_SSIZE_T_CLEAN +#include <Python.h> #define _UMATHMODULE #define _MULTIARRAYMODULE #define NPY_NO_DEPRECATED_API NPY_API_VERSION -#include "Python.h" - #include "npy_config.h" #include "numpy/npy_common.h" #include "numpy/arrayobject.h" diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 02d749a5e..0938cd050 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -210,6 +210,32 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**end repeat1**/ /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_umath_fp.dispatch.h" +#endif + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = sin, cos# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +/**end repeat**/ + /**begin repeat * #TYPE = FLOAT, DOUBLE# */ diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src index cc0fd19bb..95cce553a 100644 --- a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src +++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src @@ -868,6 +868,32 @@ AVX512F_exp_DOUBLE(npy_double * op, * 
= p(r) * = 2((r/2) + 1/3*(r/2)^3 + 1/5*(r/2)^5 + ...) */ + +/* LLVM has a bug where AVX-512F intrinsic `_mm512_mask_mul_pd` emits an + * unmasked operation with a masked store. This can cause FP exceptions to + * occur for the lanes that are suppose to have been masked. + * + * See https://bugs.llvm.org/show_bug.cgi?id=51988 + * + * Note, this affects LLVM based compilers like Apple Clang, Clang, and Intel's + * ICX. + */ +#if defined(__clang__) + #if defined(__apple_build_version__) + // Apple Clang + #if __apple_build_version__ > 11000000 + // Apple Clang after v11 + #define WORKAROUND_LLVM__mm512_mask_mul_pd + #endif + #else + // Clang, not Apple Clang + #if __clang_major__ > 9 + // Clang v9+ + #define WORKAROUND_LLVM__mm512_mask_mul_pd + #endif + #endif +#endif + static void AVX512F_log_DOUBLE(npy_double * op, npy_double * ip, @@ -954,8 +980,12 @@ AVX512F_log_DOUBLE(npy_double * op, denormal_mask = _mm512_cmp_epi64_mask(top12, _mm512_set1_epi64(0), _CMP_EQ_OQ); denormal_mask = (~zero_mask) & denormal_mask; + __m512d masked_x = x; + #ifdef WORKAROUND_LLVM__mm512_mask_mul_pd + masked_x = avx512_set_masked_lanes_pd(masked_x, zeros_d, (~denormal_mask)); + #endif ix = _mm512_castpd_si512(_mm512_mask_mul_pd(x, denormal_mask, - x, _mm512_set1_pd(0x1p52))); + masked_x, _mm512_set1_pd(0x1p52))); ix = _mm512_mask_sub_epi64(ix, denormal_mask, ix, _mm512_set1_epi64(52ULL << 52)); @@ -1039,6 +1069,9 @@ AVX512F_log_DOUBLE(npy_double * op, npy_set_floatstatus_divbyzero(); } } + +#undef WORKAROUND_LLVM__mm512_mask_mul_pd + #endif // AVX512F_NOCLANG_BUG #ifdef SIMD_AVX512_SKX diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src new file mode 100644 index 000000000..852604655 --- /dev/null +++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src @@ -0,0 +1,141 @@ +/*@targets + ** $maxopt baseline avx512_skx + */ +#include "numpy/npy_math.h" +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" 
+#include "npy_svml.h" +#include "fast_loop_macros.h" + +#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) +/**begin repeat + * #sfx = f32, f64# + * #func_suffix = f16, 8# + */ +/**begin repeat1 + * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh# + * #default_val = 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0# + */ +static void +simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc, + npyv_lanetype_@sfx@ *dst, npy_intp sdst, npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + npyv_@sfx@ x; + #if @default_val@ + if (ssrc == 1) { + x = npyv_load_till_@sfx@(src, len, @default_val@); + } else { + x = npyv_loadn_till_@sfx@(src, ssrc, len, @default_val@); + } + #else + if (ssrc == 1) { + x = npyv_load_tillz_@sfx@(src, len); + } else { + x = npyv_loadn_tillz_@sfx@(src, ssrc, len); + } + #endif + npyv_@sfx@ out = __svml_@func@@func_suffix@(x); + if (sdst == 1) { + npyv_store_till_@sfx@(dst, len, out); + } else { + npyv_storen_till_@sfx@(dst, sdst, len, out); + } + } + npyv_cleanup(); +} +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = sin, cos# + */ +static void +simd_@func@_f64(const double *src, npy_intp ssrc, + double *dst, npy_intp sdst, npy_intp len) +{ + const int vstep = npyv_nlanes_f64; + for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + npyv_f64 x; + if (ssrc == 1) { + x = npyv_load_tillz_f64(src, len); + } else { + x = npyv_loadn_tillz_f64(src, ssrc, len); + } + npyv_f64 out = __svml_@func@8(x); + if (sdst == 1) { + npyv_store_till_f64(dst, len, out); + } else { + npyv_storen_till_f64(dst, sdst, len, out); + } + } + npyv_cleanup(); +} +/**end repeat**/ +#endif + +/**begin repeat + * #TYPE = DOUBLE, FLOAT# + * #type = npy_double, npy_float# + * #vsub = , f# + * #sfx = f64, f32# + */ +/**begin repeat1 + * #func = tanh, exp2, log2, log10, expm1, log1p, 
cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh# + * #intrin = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ +#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) + const @type@ *src = (@type@*)args[0]; + @type@ *dst = (@type@*)args[1]; + const int lsize = sizeof(src[0]); + const npy_intp ssrc = steps[0] / lsize; + const npy_intp sdst = steps[1] / lsize; + const npy_intp len = dimensions[0]; + assert(steps[0] % lsize == 0 && steps[1] % lsize == 0); + if (!is_mem_overlap(src, steps[0], dst, steps[1], len) && + npyv_loadable_stride_@sfx@(ssrc) && + npyv_storable_stride_@sfx@(sdst)) { + simd_@intrin@_@sfx@(src, ssrc, dst, sdst, len); + return; + } +#endif + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + *(@type@ *)op1 = npy_@intrin@@vsub@(in1); + } +} +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = sin, cos# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ +#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) + const double *src = (double*)args[0]; + double *dst = (double*)args[1]; + const int lsize = sizeof(src[0]); + const npy_intp ssrc = steps[0] / lsize; + const npy_intp sdst = steps[1] / lsize; + const npy_intp len = dimensions[0]; + assert(steps[0] % lsize == 0 && steps[1] % lsize == 0); + if (!is_mem_overlap(src, steps[0], dst, steps[1], len) && + npyv_loadable_stride_f64(ssrc) && + npyv_storable_stride_f64(sdst)) { + simd_@func@_f64(src, ssrc, dst, sdst, len); + return; + } +#endif + UNARY_LOOP { + const npy_double in1 = *(npy_double *)ip1; + *(npy_double *)op1 = npy_@func@(in1); + } +} +/**end repeat**/ diff --git 
a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src index 3a1ea82f9..2d5917282 100644 --- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src @@ -77,6 +77,56 @@ NPY_FINLINE double c_square_f64(double a) */ #define CONTIG 0 #define NCONTIG 1 + +/* + * clang has a bug that's present at -O1 or greater. When partially loading a + * vector register for a reciprocal operation, the remaining elements are set + * to 1 to avoid divide-by-zero. The partial load is paired with a partial + * store after the reciprocal operation. clang notices that the entire register + * is not needed for the store and optimizes out the fill of 1 to the remaining + * elements. This causes either a divide-by-zero or 0/0 with invalid exception + * that we were trying to avoid by filling. + * + * Using a dummy variable marked 'volatile' convinces clang not to ignore + * the explicit fill of remaining elements. If `-ftrapping-math` is + * supported, then it'll also avoid the bug. `-ftrapping-math` is supported + * on Apple clang v12+ for x86_64. It is not currently supported for arm64. + * `-ftrapping-math` is set by default of Numpy builds in + * numpy/distutils/ccompiler.py. 
+ * + * Note: Apple clang and clang upstream have different versions that overlap + */ +#if defined(__clang__) + #if defined(__apple_build_version__) + // Apple Clang + #if __apple_build_version__ < 12000000 + // Apple Clang before v12 + #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 + #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) + // Apple Clang after v12, targeting i386 or x86_64 + #define WORKAROUND_CLANG_RECIPROCAL_BUG 0 + #else + // Apple Clang after v12, not targeting i386 or x86_64 + #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 + #endif + #else + // Clang, not Apple Clang + #if __clang_major__ < 10 + // Clang before v10 + #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 + #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) + // Clang v10+, targeting i386 or x86_64 + #define WORKAROUND_CLANG_RECIPROCAL_BUG 0 + #else + // Clang v10+, not targeting i386 or x86_64 + #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 + #endif + #endif +#else +// Not a Clang compiler +#define WORKAROUND_CLANG_RECIPROCAL_BUG 0 +#endif + /**begin repeat * #TYPE = FLOAT, DOUBLE# * #sfx = f32, f64# @@ -87,6 +137,7 @@ NPY_FINLINE double c_square_f64(double a) * #kind = sqrt, absolute, square, reciprocal# * #intr = sqrt, abs, square, recip# * #repl_0w1 = 0, 0, 0, 1# + * #RECIP_WORKAROUND = 0, 0, 0, WORKAROUND_CLANG_RECIPROCAL_BUG# */ /**begin repeat2 * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG# @@ -101,6 +152,8 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@ const int vstep = npyv_nlanes_@sfx@; const int wstep = vstep * @unroll@; + + // unrolled iterations for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) { /**begin repeat3 * #N = 0, 1, 2, 3# @@ -126,7 +179,24 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@ #endif /**end repeat3**/ } - for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + + // vector-sized iterations + for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + #if @STYPE@ == CONTIG + npyv_@sfx@ v_src0 = 
npyv_load_@sfx@(src); + #else + npyv_@sfx@ v_src0 = npyv_loadn_@sfx@(src, ssrc); + #endif + npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0); + #if @DTYPE@ == CONTIG + npyv_store_@sfx@(dst, v_unary0); + #else + npyv_storen_@sfx@(dst, sdst, v_unary0); + #endif + } + + // last partial iteration, if needed + if(len > 0){ #if @STYPE@ == CONTIG #if @repl_0w1@ npyv_@sfx@ v_src0 = npyv_load_till_@sfx@(src, len, 1); @@ -140,6 +210,15 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@ npyv_@sfx@ v_src0 = npyv_loadn_tillz_@sfx@(src, ssrc, len); #endif #endif + #if @RECIP_WORKAROUND@ + /* + * Workaround clang bug. We use a dummy variable marked 'volatile' + * to convince clang that the entire vector is needed. We only + * want to do this for the last iteration / partial load-store of + * the loop since 'volatile' forces a refresh of the contents. + */ + volatile npyv_@sfx@ unused_but_workaround_bug = v_src0; + #endif // @RECIP_WORKAROUND@ npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0); #if @DTYPE@ == CONTIG npyv_store_till_@sfx@(dst, len, v_unary0); @@ -147,6 +226,7 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@ npyv_storen_till_@sfx@(dst, sdst, len, v_unary0); #endif } + npyv_cleanup(); } /**end repeat2**/ @@ -154,6 +234,8 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@ #endif // @VCHK@ /**end repeat**/ +#undef WORKAROUND_CLANG_RECIPROCAL_BUG + /******************************************************************************** ** Defining ufunc inner functions ********************************************************************************/ diff --git a/numpy/core/src/umath/loops_utils.h.src b/numpy/core/src/umath/loops_utils.h.src index 1a2a5a32b..762e9ee59 100644 --- a/numpy/core/src/umath/loops_utils.h.src +++ b/numpy/core/src/umath/loops_utils.h.src @@ -6,7 +6,7 @@ /** * Old versions of MSVC causes ambiguous link errors when we deal with large SIMD kernels - * which lead to break the build, probably releated to the following bug: + * which lead to break the build, 
probably related to the following bug: * https://developercommunity.visualstudio.com/content/problem/415095/internal-compiler-error-with-perfectly-forwarded-r.html */ #if defined(_MSC_VER) && _MSC_VER < 1916 diff --git a/numpy/core/src/umath/matmul.c.src b/numpy/core/src/umath/matmul.c.src index 0e47d1ab5..4dd0c4759 100644 --- a/numpy/core/src/umath/matmul.c.src +++ b/numpy/core/src/umath/matmul.c.src @@ -1,11 +1,11 @@ /* -*- c -*- */ +#define PY_SSIZE_T_CLEAN +#include <Python.h> #define _UMATHMODULE #define _MULTIARRAYMODULE #define NPY_NO_DEPRECATED_API NPY_API_VERSION -#include "Python.h" - #include "npy_config.h" #include "numpy/npy_common.h" #include "numpy/arrayobject.h" diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c index 86cc20eb1..c28c8abd8 100644 --- a/numpy/core/src/umath/reduction.c +++ b/numpy/core/src/umath/reduction.c @@ -6,15 +6,15 @@ * * See LICENSE.txt for the license. */ -#define _UMATHMODULE -#define _MULTIARRAYMODULE #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#define _UMATHMODULE #define PY_SSIZE_T_CLEAN #include <Python.h> #include "npy_config.h" -#include <numpy/arrayobject.h> +#include "numpy/arrayobject.h" #include "npy_pycompat.h" #include "ctors.h" @@ -145,14 +145,12 @@ PyArray_CopyInitialReduceValues( * boilerplate code, just calling the appropriate inner loop function where * necessary. * + * context : The ArrayMethod context (with ufunc, method, and descriptors). * operand : The array to be reduced. * out : NULL, or the array into which to place the result. * wheremask : NOT YET SUPPORTED, but this parameter is placed here * so that support can be added in the future without breaking * API compatibility. Pass in NULL. - * operand_dtype : The dtype the inner loop expects for the operand. - * result_dtype : The dtype the inner loop expects for the result. - * casting : The casting rule to apply to the operands. 
* axis_flags : Flags indicating the reduction axes of 'operand'. * reorderable : If True, the reduction being done is reorderable, which * means specifying multiple axes of reduction at once is ok, @@ -182,10 +180,8 @@ PyArray_CopyInitialReduceValues( * generalized ufuncs!) */ NPY_NO_EXPORT PyArrayObject * -PyUFunc_ReduceWrapper( +PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, PyArrayObject *operand, PyArrayObject *out, PyArrayObject *wheremask, - PyArray_Descr *operand_dtype, PyArray_Descr *result_dtype, - NPY_CASTING casting, npy_bool *axis_flags, int reorderable, int keepdims, PyObject *identity, PyArray_ReduceLoopFunc *loop, void *data, npy_intp buffersize, const char *funcname, int errormask) @@ -199,6 +195,8 @@ PyUFunc_ReduceWrapper( PyArrayObject *op[3]; PyArray_Descr *op_dtypes[3]; npy_uint32 it_flags, op_flags[3]; + /* Loop auxdata (must be freed on error) */ + NpyAuxData *auxdata = NULL; /* More than one axis means multiple orders are possible */ if (!reorderable && count_axes(PyArray_NDIM(operand), axis_flags) > 1) { @@ -221,8 +219,8 @@ PyUFunc_ReduceWrapper( /* Set up the iterator */ op[0] = out; op[1] = operand; - op_dtypes[0] = result_dtype; - op_dtypes[1] = operand_dtype; + op_dtypes[0] = context->descriptors[0]; + op_dtypes[1] = context->descriptors[1]; it_flags = NPY_ITER_BUFFERED | NPY_ITER_EXTERNAL_LOOP | @@ -291,7 +289,7 @@ PyUFunc_ReduceWrapper( } iter = NpyIter_AdvancedNew(wheremask == NULL ? 
2 : 3, op, it_flags, - NPY_KEEPORDER, casting, + NPY_KEEPORDER, NPY_UNSAFE_CASTING, op_flags, op_dtypes, PyArray_NDIM(operand), op_axes, NULL, buffersize); @@ -301,9 +299,29 @@ PyUFunc_ReduceWrapper( result = NpyIter_GetOperandArray(iter)[0]; - int needs_api = NpyIter_IterationNeedsAPI(iter); - /* Start with the floating-point exception flags cleared */ - npy_clear_floatstatus_barrier((char*)&iter); + PyArrayMethod_StridedLoop *strided_loop; + NPY_ARRAYMETHOD_FLAGS flags = 0; + npy_intp fixed_strides[3]; + NpyIter_GetInnerFixedStrideArray(iter, fixed_strides); + if (wheremask != NULL) { + if (PyArrayMethod_GetMaskedStridedLoop(context, + 1, fixed_strides, &strided_loop, &auxdata, &flags) < 0) { + goto fail; + } + } + else { + if (context->method->get_strided_loop(context, + 1, 0, fixed_strides, &strided_loop, &auxdata, &flags) < 0) { + goto fail; + } + } + + int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0; + needs_api |= NpyIter_IterationNeedsAPI(iter); + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* Start with the floating-point exception flags cleared */ + npy_clear_floatstatus_barrier((char*)&iter); + } /* * Initialize the result to the reduction unit if possible, @@ -345,16 +363,18 @@ PyUFunc_ReduceWrapper( strideptr = NpyIter_GetInnerStrideArray(iter); countptr = NpyIter_GetInnerLoopSizePtr(iter); - if (loop(iter, dataptr, strideptr, countptr, - iternext, needs_api, skip_first_count, data) < 0) { + if (loop(context, strided_loop, auxdata, + iter, dataptr, strideptr, countptr, iternext, + needs_api, skip_first_count) < 0) { goto fail; } } - /* Check whether any errors occurred during the loop */ - if (PyErr_Occurred() || - _check_ufunc_fperr(errormask, NULL, "reduce") < 0) { - goto fail; + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* NOTE: We could check float errors even on error */ + if (_check_ufunc_fperr(errormask, NULL, "reduce") < 0) { + goto fail; + } } if (out != NULL) { @@ -369,6 +389,7 @@ PyUFunc_ReduceWrapper( return 
result; fail: + NPY_AUXDATA_FREE(auxdata); if (iter != NULL) { NpyIter_Deallocate(iter); } diff --git a/numpy/core/src/umath/reduction.h b/numpy/core/src/umath/reduction.h index 372605dba..2170e27a7 100644 --- a/numpy/core/src/umath/reduction.h +++ b/numpy/core/src/umath/reduction.h @@ -19,93 +19,17 @@ typedef int (PyArray_AssignReduceIdentityFunc)(PyArrayObject *result, void *data); /* - * This is a function for the reduce loop. + * Inner definition of the reduce loop, only used for a static function. + * At some point around NumPy 1.6, there was probably an intention to make + * the reduce loop customizable at this level (per ufunc?). * - * The needs_api parameter indicates whether it's ok to release the GIL during - * the loop, such as when the iternext() function never calls - * a function which could raise a Python exception. - * - * The skip_first_count parameter indicates how many elements need to be - * skipped based on NpyIter_IsFirstVisit checks. This can only be positive - * when the 'assign_identity' parameter was NULL when calling - * PyArray_ReduceWrapper. 
- * - * The loop gets two data pointers and two strides, and should - * look roughly like this: - * { - * NPY_BEGIN_THREADS_DEF; - * if (!needs_api) { - * NPY_BEGIN_THREADS; - * } - * // This first-visit loop can be skipped if 'assign_identity' was non-NULL - * if (skip_first_count > 0) { - * do { - * char *data0 = dataptr[0], *data1 = dataptr[1]; - * npy_intp stride0 = strideptr[0], stride1 = strideptr[1]; - * npy_intp count = *countptr; - * - * // Skip any first-visit elements - * if (NpyIter_IsFirstVisit(iter, 0)) { - * if (stride0 == 0) { - * --count; - * --skip_first_count; - * data1 += stride1; - * } - * else { - * skip_first_count -= count; - * count = 0; - * } - * } - * - * while (count--) { - * *(result_t *)data0 = my_reduce_op(*(result_t *)data0, - * *(operand_t *)data1); - * data0 += stride0; - * data1 += stride1; - * } - * - * // Jump to the faster loop when skipping is done - * if (skip_first_count == 0) { - * if (iternext(iter)) { - * break; - * } - * else { - * goto finish_loop; - * } - * } - * } while (iternext(iter)); - * } - * do { - * char *data0 = dataptr[0], *data1 = dataptr[1]; - * npy_intp stride0 = strideptr[0], stride1 = strideptr[1]; - * npy_intp count = *countptr; - * - * while (count--) { - * *(result_t *)data0 = my_reduce_op(*(result_t *)data0, - * *(operand_t *)data1); - * data0 += stride0; - * data1 += stride1; - * } - * } while (iternext(iter)); - * finish_loop: - * if (!needs_api) { - * NPY_END_THREADS; - * } - * return (needs_api && PyErr_Occurred()) ? -1 : 0; - * } - * - * If needs_api is True, this function should call PyErr_Occurred() - * to check if an error occurred during processing, and return -1 for - * error, 0 for success. + * TODO: This should be refactored/removed. 
*/ -typedef int (PyArray_ReduceLoopFunc)(NpyIter *iter, - char **dataptr, - npy_intp const *strideptr, - npy_intp const *countptr, - NpyIter_IterNextFunc *iternext, - int needs_api, - npy_intp skip_first_count, - void *data); +typedef int (PyArray_ReduceLoopFunc)(PyArrayMethod_Context *context, + PyArrayMethod_StridedLoop *strided_loop, NpyAuxData *auxdata, + NpyIter *iter, char **dataptrs, npy_intp const *strides, + npy_intp const *countptr, NpyIter_IterNextFunc *iternext, + int needs_api, npy_intp skip_first_count); /* * This function executes all the standard NumPy reduction function @@ -138,16 +62,10 @@ typedef int (PyArray_ReduceLoopFunc)(NpyIter *iter, * errormask : forwarded from _get_bufsize_errmask */ NPY_NO_EXPORT PyArrayObject * -PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out, - PyArrayObject *wheremask, - PyArray_Descr *operand_dtype, - PyArray_Descr *result_dtype, - NPY_CASTING casting, - npy_bool *axis_flags, int reorderable, - int keepdims, - PyObject *identity, - PyArray_ReduceLoopFunc *loop, - void *data, npy_intp buffersize, const char *funcname, - int errormask); +PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, + PyArrayObject *operand, PyArrayObject *out, PyArrayObject *wheremask, + npy_bool *axis_flags, int reorderable, int keepdims, + PyObject *identity, PyArray_ReduceLoopFunc *loop, + void *data, npy_intp buffersize, const char *funcname, int errormask); #endif diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src index 5836545f8..402e6b561 100644 --- a/numpy/core/src/umath/scalarmath.c.src +++ b/numpy/core/src/umath/scalarmath.c.src @@ -5,12 +5,13 @@ but still supports error-modes. 
*/ +#define PY_SSIZE_T_CLEAN +#include <Python.h> #define _UMATHMODULE #define _MULTIARRAYMODULE #define NPY_NO_DEPRECATED_API NPY_API_VERSION -#include "Python.h" #include "npy_config.h" #include "numpy/arrayobject.h" #include "numpy/ufuncobject.h" diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 654ab81cc..d47be9a30 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -116,9 +116,8 @@ run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_in #endif return 0; } - - /**end repeat1**/ + /**end repeat**/ /**begin repeat @@ -1152,6 +1151,7 @@ NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d * #is_finite = 0, 1, 0, 0# * #is_signbit = 0, 0, 0, 1# */ + #if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps) diff --git a/numpy/core/src/umath/svml b/numpy/core/src/umath/svml new file mode 160000 +Subproject 9f8af767ed6c75455d9a382af829048f8dd1806 diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index ebc6bf02a..237af81b2 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -23,12 +23,14 @@ * Rick White * */ -#define _UMATHMODULE -#define _MULTIARRAYMODULE #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#define _UMATHMODULE + +#define PY_SSIZE_T_CLEAN +#include <Python.h> -#include "Python.h" -#include "stddef.h" +#include <stddef.h> #include "npy_config.h" #include "npy_pycompat.h" @@ -614,9 +616,24 @@ _is_same_name(const char* s1, const char* s2) } /* - * Sets core_num_dim_ix, core_num_dims, core_dim_ixs, core_offsets, - * and core_signature in PyUFuncObject "ufunc". Returns 0 unless an - * error occurred. 
+ * Sets the following fields in the PyUFuncObject 'ufunc': + * + * Field Type Array Length + * core_enabled int (effectively bool) N/A + * core_num_dim_ix int N/A + * core_dim_flags npy_uint32 * core_num_dim_ix + * core_dim_sizes npy_intp * core_num_dim_ix + * core_num_dims int * nargs (i.e. nin+nout) + * core_offsets int * nargs + * core_dim_ixs int * sum(core_num_dims) + * core_signature char * strlen(signature) + 1 + * + * The function assumes that the values that are arrays have not + * been set already, and sets these pointers to memory allocated + * with PyArray_malloc. These are freed when the ufunc dealloc + * method is called. + * + * Returns 0 unless an error occurred. */ static int _parse_signature(PyUFuncObject *ufunc, const char *signature) @@ -988,6 +1005,7 @@ convert_ufunc_arguments(PyUFuncObject *ufunc, } /* Convert and fill in output arguments */ + memset(out_op_DTypes + nin, 0, nout * sizeof(*out_op_DTypes)); if (full_args.out != NULL) { for (int i = 0; i < nout; i++) { obj = PyTuple_GET_ITEM(full_args.out, i); @@ -1045,6 +1063,7 @@ check_for_trivial_loop(PyArrayMethodObject *ufuncimpl, PyArrayObject **op, PyArray_Descr **dtypes, NPY_CASTING casting, npy_intp buffersize) { + int force_cast_input = ufuncimpl->flags & _NPY_METH_FORCE_CAST_INPUTS; int i, nin = ufuncimpl->nin, nop = nin + ufuncimpl->nout; for (i = 0; i < nop; ++i) { @@ -1060,15 +1079,21 @@ check_for_trivial_loop(PyArrayMethodObject *ufuncimpl, if (dtypes[i] != PyArray_DESCR(op[i])) { NPY_CASTING safety = PyArray_GetCastSafety( PyArray_DESCR(op[i]), dtypes[i], NULL); - if (safety < 0) { - /* A proper error during a cast check should be rare */ + if (safety < 0 && PyErr_Occurred()) { + /* A proper error during a cast check, should be rare */ return -1; } if (!(safety & _NPY_CAST_IS_VIEW)) { must_copy = 1; } - if (PyArray_MinCastSafety(safety, casting) != casting) { + if (force_cast_input && i < nin) { + /* + * ArrayMethod flagged to ignore casting (logical funcs + * can force cast to 
bool) + */ + } + else if (PyArray_MinCastSafety(safety, casting) != casting) { return 0; /* the cast is not safe enough */ } } @@ -1323,6 +1348,14 @@ try_trivial_single_output_loop(PyArrayMethod_Context *context, NPY_END_THREADS; NPY_AUXDATA_FREE(auxdata); + /* + * An error should only be possible if `res != 0` is already set. + * But this is not strictly correct for old-style ufuncs (e.g. `power` + * released the GIL but manually set an Exception). + */ + if (PyErr_Occurred()) { + res = -1; + } if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { /* NOTE: We could check float errors even when `res < 0` */ @@ -1350,8 +1383,15 @@ validate_casting(PyArrayMethodObject *method, PyUFuncObject *ufunc, */ return 0; } - if (PyUFunc_ValidateCasting(ufunc, casting, ops, descriptors) < 0) { - return -1; + if (method->flags & _NPY_METH_FORCE_CAST_INPUTS) { + if (PyUFunc_ValidateOutCasting(ufunc, casting, ops, descriptors) < 0) { + return -1; + } + } + else { + if (PyUFunc_ValidateCasting(ufunc, casting, ops, descriptors) < 0) { + return -1; + } } return 0; } @@ -2460,9 +2500,9 @@ PyUFunc_GeneralizedFunctionInternal(PyUFuncObject *ufunc, /* Final preparation of the arraymethod call */ PyArrayMethod_Context context = { - .caller = (PyObject *)ufunc, - .method = ufuncimpl, - .descriptors = operation_descrs, + .caller = (PyObject *)ufunc, + .method = ufuncimpl, + .descriptors = operation_descrs, }; PyArrayMethod_StridedLoop *strided_loop; NPY_ARRAYMETHOD_FLAGS flags = 0; @@ -2517,7 +2557,7 @@ PyUFunc_GeneralizedFunctionInternal(PyUFuncObject *ufunc, PyArray_free(inner_strides); NPY_AUXDATA_FREE(auxdata); - if (NpyIter_Deallocate(iter) < 0) { + if (!NpyIter_Deallocate(iter)) { retval = -1; } @@ -2582,9 +2622,9 @@ PyUFunc_GenericFunctionInternal(PyUFuncObject *ufunc, /* Final preparation of the arraymethod call */ PyArrayMethod_Context context = { - .caller = (PyObject *)ufunc, - .method = ufuncimpl, - .descriptors = operation_descrs, + .caller = (PyObject *)ufunc, + 
.method = ufuncimpl, + .descriptors = operation_descrs, }; /* Do the ufunc loop */ @@ -2651,195 +2691,129 @@ PyUFunc_GenericFunction(PyUFuncObject *NPY_UNUSED(ufunc), /* - * Given the output type, finds the specified binary op. The - * ufunc must have nin==2 and nout==1. The function may modify - * otype if the given type isn't found. + * Promote and resolve a reduction like operation. * - * Returns 0 on success, -1 on failure. + * @param ufunc + * @param arr The operation array + * @param out The output array or NULL if not provided. Note that NumPy always + * used out to mean the same as `dtype=out.dtype` and never passed + * the array itself to the type-resolution. + * @param signature The DType signature, which may already be set due to the + * dtype passed in by the user, or the special cases (add, multiply). + * (Contains strong references and may be modified.) + * @param enforce_uniform_args If `NPY_TRUE` fully uniform dtypes/descriptors + * are enforced as required for accumulate and (currently) reduceat. + * @param out_descrs New references to the resolved descriptors (on success). + * @param method The ufunc method, "reduce", "reduceat", or "accumulate". + + * @returns ufuncimpl The `ArrayMethod` implementation to use. Or NULL if an + * error occurred. 
*/ -static int -get_binary_op_function(PyUFuncObject *ufunc, int *otype, - PyUFuncGenericFunction *out_innerloop, - void **out_innerloopdata) +static PyArrayMethodObject * +reducelike_promote_and_resolve(PyUFuncObject *ufunc, + PyArrayObject *arr, PyArrayObject *out, + PyArray_DTypeMeta *signature[3], + npy_bool enforce_uniform_args, PyArray_Descr *out_descrs[3], + char *method) { - int i; - - NPY_UF_DBG_PRINT1("Getting binary op function for type number %d\n", - *otype); - - /* If the type is custom and there are userloops, search for it here */ - if (ufunc->userloops != NULL && PyTypeNum_ISUSERDEF(*otype)) { - PyObject *key, *obj; - key = PyLong_FromLong(*otype); - if (key == NULL) { - return -1; - } - obj = PyDict_GetItemWithError(ufunc->userloops, key); - Py_DECREF(key); - if (obj == NULL && PyErr_Occurred()) { - return -1; - } - else if (obj != NULL) { - PyUFunc_Loop1d *funcdata = PyCapsule_GetPointer(obj, NULL); - if (funcdata == NULL) { - return -1; - } - while (funcdata != NULL) { - int *types = funcdata->arg_types; - - if (types[0] == *otype && types[1] == *otype && - types[2] == *otype) { - *out_innerloop = funcdata->func; - *out_innerloopdata = funcdata->data; - return 0; - } + /* + * Note that the `ops` is not really correct. But legacy resolution + * cannot quite handle the correct ops (e.g. a NULL first item if `out` + * is NULL), and it should only matter in very strange cases. + */ + PyArrayObject *ops[3] = {arr, arr, NULL}; + /* + * TODO: If `out` is not provided, arguably `initial` could define + * the first DType (and maybe also the out one), that way + * `np.add.reduce([1, 2, 3], initial=3.4)` would return a float + * value. As of 1.20, it returned an integer, so that should + * probably go to an error/warning first. 
+ */ + PyArray_DTypeMeta *operation_DTypes[3] = { + NULL, NPY_DTYPE(PyArray_DESCR(arr)), NULL}; + Py_INCREF(operation_DTypes[1]); - funcdata = funcdata->next; - } - } + if (out != NULL) { + operation_DTypes[0] = NPY_DTYPE(PyArray_DESCR(out)); + Py_INCREF(operation_DTypes[0]); + operation_DTypes[2] = operation_DTypes[0]; + Py_INCREF(operation_DTypes[2]); } - /* Search for a function with compatible inputs */ - for (i = 0; i < ufunc->ntypes; ++i) { - char *types = ufunc->types + i*ufunc->nargs; - - NPY_UF_DBG_PRINT3("Trying loop with signature %d %d -> %d\n", - types[0], types[1], types[2]); - - if (PyArray_CanCastSafely(*otype, types[0]) && - types[0] == types[1] && - (*otype == NPY_OBJECT || types[0] != NPY_OBJECT)) { - /* If the signature is "xx->x", we found the loop */ - if (types[2] == types[0]) { - *out_innerloop = ufunc->functions[i]; - *out_innerloopdata = ufunc->data[i]; - *otype = types[0]; - return 0; - } - /* - * Otherwise, we found the natural type of the reduction, - * replace otype and search again - */ - else { - *otype = types[2]; - break; - } - } + PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc, + ops, signature, operation_DTypes, NPY_FALSE, NPY_TRUE); + Py_DECREF(operation_DTypes[1]); + if (out != NULL) { + Py_DECREF(operation_DTypes[0]); + Py_DECREF(operation_DTypes[2]); } - - /* Search for the exact function */ - for (i = 0; i < ufunc->ntypes; ++i) { - char *types = ufunc->types + i*ufunc->nargs; - - if (PyArray_CanCastSafely(*otype, types[0]) && - types[0] == types[1] && - types[1] == types[2] && - (*otype == NPY_OBJECT || types[0] != NPY_OBJECT)) { - /* Since the signature is "xx->x", we found the loop */ - *out_innerloop = ufunc->functions[i]; - *out_innerloopdata = ufunc->data[i]; - *otype = types[0]; - return 0; - } + if (ufuncimpl == NULL) { + return NULL; } - return -1; -} - -static int -reduce_type_resolver(PyUFuncObject *ufunc, PyArrayObject *arr, - PyArray_Descr *odtype, PyArray_Descr **out_dtype) -{ - int i, retcode; 
- PyArrayObject *op[3] = {arr, arr, NULL}; - PyArray_Descr *dtypes[3] = {NULL, NULL, NULL}; - const char *ufunc_name = ufunc_get_name_cstr(ufunc); - PyObject *type_tup = NULL; - - *out_dtype = NULL; - /* - * If odtype is specified, make a type tuple for the type - * resolution. + * Find the correct descriptors for the operation. We use unsafe casting + * for historic reasons: The logic ufuncs required it to cast everything to + * boolean. However, we now special case the logical ufuncs, so that the + * casting safety could in principle be set to the default same-kind. + * (although this should possibly happen through a deprecation) */ - if (odtype != NULL) { - type_tup = PyTuple_Pack(3, odtype, odtype, Py_None); - if (type_tup == NULL) { - return -1; - } - } - - /* Use the type resolution function to find our loop */ - retcode = ufunc->type_resolver( - ufunc, NPY_UNSAFE_CASTING, - op, type_tup, dtypes); - Py_DECREF(type_tup); - if (retcode == -1) { - return -1; - } - else if (retcode == -2) { - PyErr_Format(PyExc_RuntimeError, - "type resolution returned NotImplemented to " - "reduce ufunc %s", ufunc_name); - return -1; + if (resolve_descriptors(3, ufunc, ufuncimpl, + ops, out_descrs, signature, NPY_UNSAFE_CASTING) < 0) { + return NULL; } /* - * The first two type should be equivalent. Because of how - * reduce has historically behaved in NumPy, the return type - * could be different, and it is the return type on which the - * reduction occurs. + * The first operand and output should be the same array, so they should + * be identical. The second argument can be different for reductions, + * but is checked to be identical for accumulate and reduceat. 
*/ - if (!PyArray_EquivTypes(dtypes[0], dtypes[1])) { - for (i = 0; i < 3; ++i) { - Py_DECREF(dtypes[i]); - } - PyErr_Format(PyExc_RuntimeError, - "could not find a type resolution appropriate for " - "reduce ufunc %s", ufunc_name); - return -1; + if (out_descrs[0] != out_descrs[2] || ( + enforce_uniform_args && out_descrs[0] != out_descrs[1])) { + PyErr_Format(PyExc_TypeError, + "the resolved dtypes are not compatible with %s.%s", + ufunc_get_name_cstr(ufunc), method); + goto fail; + } + /* TODO: This really should _not_ be unsafe casting (same above)! */ + if (validate_casting(ufuncimpl, + ufunc, ops, out_descrs, NPY_UNSAFE_CASTING) < 0) { + goto fail; } - Py_DECREF(dtypes[0]); - Py_DECREF(dtypes[1]); - *out_dtype = dtypes[2]; + return ufuncimpl; - return 0; + fail: + for (int i = 0; i < 3; ++i) { + Py_DECREF(out_descrs[i]); + } + return NULL; } + static int -reduce_loop(NpyIter *iter, char **dataptrs, npy_intp const *strides, - npy_intp const *countptr, NpyIter_IterNextFunc *iternext, - int needs_api, npy_intp skip_first_count, void *data) +reduce_loop(PyArrayMethod_Context *context, + PyArrayMethod_StridedLoop *strided_loop, NpyAuxData *auxdata, + NpyIter *iter, char **dataptrs, npy_intp const *strides, + npy_intp const *countptr, NpyIter_IterNextFunc *iternext, + int needs_api, npy_intp skip_first_count) { - PyArray_Descr *dtypes[3], **iter_dtypes; - PyUFuncObject *ufunc = (PyUFuncObject *)data; - char *dataptrs_copy[3]; - npy_intp strides_copy[3]; + int retval; + char *dataptrs_copy[4]; + npy_intp strides_copy[4]; npy_bool masked; - /* The normal selected inner loop */ - PyUFuncGenericFunction innerloop = NULL; - void *innerloopdata = NULL; - NPY_BEGIN_THREADS_DEF; /* Get the number of operands, to determine whether "where" is used */ masked = (NpyIter_GetNOp(iter) == 3); - /* Get the inner loop */ - iter_dtypes = NpyIter_GetDescrArray(iter); - dtypes[0] = iter_dtypes[0]; - dtypes[1] = iter_dtypes[1]; - dtypes[2] = iter_dtypes[0]; - if 
(ufunc->legacy_inner_loop_selector(ufunc, dtypes, - &innerloop, &innerloopdata, &needs_api) < 0) { - return -1; + if (!needs_api) { + NPY_BEGIN_THREADS_THRESHOLDED(NpyIter_GetIterSize(iter)); } - NPY_BEGIN_THREADS_NDITER(iter); - if (skip_first_count > 0) { - do { + assert(!masked); /* Path currently not available for masked */ + while (1) { npy_intp count = *countptr; /* Skip any first-visit elements */ @@ -2862,27 +2836,23 @@ reduce_loop(NpyIter *iter, char **dataptrs, npy_intp const *strides, strides_copy[0] = strides[0]; strides_copy[1] = strides[1]; strides_copy[2] = strides[0]; - innerloop(dataptrs_copy, &count, - strides_copy, innerloopdata); - if (needs_api && PyErr_Occurred()) { + retval = strided_loop(context, + dataptrs_copy, &count, strides_copy, auxdata); + if (retval < 0) { goto finish_loop; } - /* Jump to the faster loop when skipping is done */ - if (skip_first_count == 0) { - if (iternext(iter)) { - break; - } - else { - goto finish_loop; - } + /* Advance loop, and abort on error (or finish) */ + if (!iternext(iter)) { + goto finish_loop; } - } while (iternext(iter)); - } - if (needs_api && PyErr_Occurred()) { - goto finish_loop; + /* When skipping is done break and continue with faster loop */ + if (skip_first_count == 0) { + break; + } + } } do { @@ -2893,42 +2863,23 @@ reduce_loop(NpyIter *iter, char **dataptrs, npy_intp const *strides, strides_copy[0] = strides[0]; strides_copy[1] = strides[1]; strides_copy[2] = strides[0]; - - if (!masked) { - innerloop(dataptrs_copy, countptr, - strides_copy, innerloopdata); + if (masked) { + dataptrs_copy[3] = dataptrs[2]; + strides_copy[3] = strides[2]; } - else { - npy_intp count = *countptr; - char *maskptr = dataptrs[2]; - npy_intp mask_stride = strides[2]; - /* Optimization for when the mask is broadcast */ - npy_intp n = mask_stride == 0 ? 
count : 1; - while (count) { - char mask = *maskptr; - maskptr += mask_stride; - while (n < count && mask == *maskptr) { - n++; - maskptr += mask_stride; - } - /* If mask set, apply inner loop on this contiguous region */ - if (mask) { - innerloop(dataptrs_copy, &n, - strides_copy, innerloopdata); - } - dataptrs_copy[0] += n * strides[0]; - dataptrs_copy[1] += n * strides[1]; - dataptrs_copy[2] = dataptrs_copy[0]; - count -= n; - n = 1; - } + + retval = strided_loop(context, + dataptrs_copy, countptr, strides_copy, auxdata); + if (retval < 0) { + goto finish_loop; } - } while (!(needs_api && PyErr_Occurred()) && iternext(iter)); + + } while (iternext(iter)); finish_loop: NPY_END_THREADS; - return (needs_api && PyErr_Occurred()) ? -1 : 0; + return retval; } /* @@ -2949,15 +2900,14 @@ finish_loop: * this function does not validate them. */ static PyArrayObject * -PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, - int naxes, int *axes, PyArray_Descr *odtype, int keepdims, +PyUFunc_Reduce(PyUFuncObject *ufunc, + PyArrayObject *arr, PyArrayObject *out, + int naxes, int *axes, PyArray_DTypeMeta *signature[3], int keepdims, PyObject *initial, PyArrayObject *wheremask) { int iaxes, ndim; npy_bool reorderable; npy_bool axis_flags[NPY_MAXDIMS]; - PyArray_Descr *dtype; - PyArrayObject *result; PyObject *identity; const char *ufunc_name = ufunc_get_name_cstr(ufunc); /* These parameters come from a TLS global */ @@ -2984,6 +2934,7 @@ PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, } /* Get the identity */ + /* TODO: Both of these should be provided by the ArrayMethod! 
*/ identity = _get_identity(ufunc, &reorderable); if (identity == NULL) { return NULL; @@ -3007,21 +2958,27 @@ PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, Py_INCREF(initial); /* match the reference count in the if above */ } - /* Get the reduction dtype */ - if (reduce_type_resolver(ufunc, arr, odtype, &dtype) < 0) { + PyArray_Descr *descrs[3]; + PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc, + arr, out, signature, NPY_FALSE, descrs, "reduce"); + if (ufuncimpl == NULL) { Py_DECREF(initial); return NULL; } - result = PyUFunc_ReduceWrapper(arr, out, wheremask, dtype, dtype, - NPY_UNSAFE_CASTING, - axis_flags, reorderable, - keepdims, - initial, - reduce_loop, - ufunc, buffersize, ufunc_name, errormask); + PyArrayMethod_Context context = { + .caller = (PyObject *)ufunc, + .method = ufuncimpl, + .descriptors = descrs, + }; - Py_DECREF(dtype); + PyArrayObject *result = PyUFunc_ReduceWrapper(&context, + arr, out, wheremask, axis_flags, reorderable, keepdims, + initial, reduce_loop, ufunc, buffersize, ufunc_name, errormask); + + for (int i = 0; i < 3; i++) { + Py_DECREF(descrs[i]); + } Py_DECREF(initial); return result; } @@ -3029,23 +2986,21 @@ PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, static PyObject * PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, - int axis, int otype) + int axis, PyArray_DTypeMeta *signature[3]) { PyArrayObject *op[2]; - PyArray_Descr *op_dtypes[2] = {NULL, NULL}; int op_axes_arrays[2][NPY_MAXDIMS]; int *op_axes[2] = {op_axes_arrays[0], op_axes_arrays[1]}; npy_uint32 op_flags[2]; - int idim, ndim, otype_final; + int idim, ndim; int needs_api, need_outer_iterator; - NpyIter *iter = NULL; + int res = 0; - /* The selected inner loop */ - PyUFuncGenericFunction innerloop = NULL; - void *innerloopdata = NULL; + PyArrayMethod_StridedLoop *strided_loop; + NpyAuxData *auxdata = NULL; - const char *ufunc_name = 
ufunc_get_name_cstr(ufunc); + NpyIter *iter = NULL; /* These parameters come from extobj= or from a TLS global */ int buffersize = 0, errormask = 0; @@ -3067,42 +3022,32 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, /* Take a reference to out for later returning */ Py_XINCREF(out); - otype_final = otype; - if (get_binary_op_function(ufunc, &otype_final, - &innerloop, &innerloopdata) < 0) { - PyArray_Descr *dtype = PyArray_DescrFromType(otype); - PyErr_Format(PyExc_ValueError, - "could not find a matching type for %s.accumulate, " - "requested type has type code '%c'", - ufunc_name, dtype ? dtype->type : '-'); - Py_XDECREF(dtype); - goto fail; + PyArray_Descr *descrs[3]; + PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc, + arr, out, signature, NPY_TRUE, descrs, "accumulate"); + if (ufuncimpl == NULL) { + return NULL; } - ndim = PyArray_NDIM(arr); + /* The below code assumes that all descriptors are identical: */ + assert(descrs[0] == descrs[1] && descrs[0] == descrs[2]); - /* - * Set up the output data type, using the input's exact - * data type if the type number didn't change to preserve - * metadata - */ - if (PyArray_DESCR(arr)->type_num == otype_final) { - if (PyArray_ISNBO(PyArray_DESCR(arr)->byteorder)) { - op_dtypes[0] = PyArray_DESCR(arr); - Py_INCREF(op_dtypes[0]); - } - else { - op_dtypes[0] = PyArray_DescrNewByteorder(PyArray_DESCR(arr), - NPY_NATIVE); - } - } - else { - op_dtypes[0] = PyArray_DescrFromType(otype_final); - } - if (op_dtypes[0] == NULL) { + if (PyDataType_REFCHK(descrs[2]) && descrs[2]->type_num != NPY_OBJECT) { + /* This can be removed, but the initial element copy needs fixing */ + PyErr_SetString(PyExc_TypeError, + "accumulation currently only supports `object` dtype with " + "references"); goto fail; } + PyArrayMethod_Context context = { + .caller = (PyObject *)ufunc, + .method = ufuncimpl, + .descriptors = descrs, + }; + + ndim = PyArray_NDIM(arr); + #if NPY_UF_DBG_TRACING 
printf("Found %s.accumulate inner loop with dtype : ", ufunc_name); PyObject_Print((PyObject *)op_dtypes[0], stdout, 0); @@ -3128,9 +3073,9 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, need_outer_iterator = (ndim > 1); /* We can't buffer, so must do UPDATEIFCOPY */ if (!PyArray_ISALIGNED(arr) || (out && !PyArray_ISALIGNED(out)) || - !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(arr)) || + !PyArray_EquivTypes(descrs[1], PyArray_DESCR(arr)) || (out && - !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(out)))) { + !PyArray_EquivTypes(descrs[0], PyArray_DESCR(out)))) { need_outer_iterator = 1; } /* If input and output overlap in memory, use iterator to figure it out */ @@ -3143,7 +3088,6 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, npy_uint32 flags = NPY_ITER_ZEROSIZE_OK| NPY_ITER_REFS_OK| NPY_ITER_COPY_IF_OVERLAP; - PyArray_Descr **op_dtypes_param = NULL; /* * The way accumulate is set up, we can't do buffering, @@ -3160,13 +3104,11 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, */ op_flags[0] |= NPY_ITER_UPDATEIFCOPY|NPY_ITER_ALIGNED|NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE; op_flags[1] |= NPY_ITER_COPY|NPY_ITER_ALIGNED|NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE; - op_dtypes_param = op_dtypes; - op_dtypes[1] = op_dtypes[0]; + NPY_UF_DBG_PRINT("Allocating outer iterator\n"); iter = NpyIter_AdvancedNew(2, op, flags, NPY_KEEPORDER, NPY_UNSAFE_CASTING, - op_flags, - op_dtypes_param, + op_flags, descrs, ndim_iter, op_axes, NULL, 0); if (iter == NULL) { goto fail; @@ -3184,14 +3126,14 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, } } - /* Get the output */ + /* Get the output from the iterator if it was allocated */ if (out == NULL) { if (iter) { op[0] = out = NpyIter_GetOperandArray(iter)[0]; Py_INCREF(out); } else { - PyArray_Descr *dtype = op_dtypes[0]; + PyArray_Descr *dtype = descrs[0]; Py_INCREF(dtype); op[0] = out = 
(PyArrayObject *)PyArray_NewFromDescr( &PyArray_Type, dtype, @@ -3200,10 +3142,31 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, if (out == NULL) { goto fail; } - } } + npy_intp fixed_strides[3]; + if (need_outer_iterator) { + NpyIter_GetInnerFixedStrideArray(iter, fixed_strides); + } + else { + fixed_strides[0] = PyArray_STRIDES(op[0])[axis]; + fixed_strides[1] = PyArray_STRIDES(op[1])[axis]; + fixed_strides[2] = fixed_strides[0]; + } + + + NPY_ARRAYMETHOD_FLAGS flags = 0; + if (ufuncimpl->get_strided_loop(&context, + 1, 0, fixed_strides, &strided_loop, &auxdata, &flags) < 0) { + goto fail; + } + needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0; + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* Start with the floating-point exception flags cleared */ + npy_clear_floatstatus_barrier((char*)&iter); + } + /* * If the reduction axis has size zero, either return the reduction * unit for UFUNC_REDUCE, or return the zero-sized output array @@ -3224,7 +3187,7 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, NpyIter_IterNextFunc *iternext; char **dataptr; - int itemsize = op_dtypes[0]->elsize; + int itemsize = descrs[0]->elsize; /* Get the variables needed for the loop */ iternext = NpyIter_GetIterNext(iter, NULL); @@ -3232,8 +3195,7 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, goto fail; } dataptr = NpyIter_GetDataPtrArray(iter); - needs_api = NpyIter_IterationNeedsAPI(iter); - + needs_api |= NpyIter_IterationNeedsAPI(iter); /* Execute the loop with just the outer iterator */ count_m1 = PyArray_DIM(op[1], axis)-1; @@ -3247,7 +3209,9 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, stride_copy[1] = stride1; stride_copy[2] = stride0; - NPY_BEGIN_THREADS_NDITER(iter); + if (!needs_api) { + NPY_BEGIN_THREADS_THRESHOLDED(NpyIter_GetIterSize(iter)); + } do { dataptr_copy[0] = dataptr[0]; @@ -3260,7 +3224,7 @@ 
PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, * Output (dataptr[0]) and input (dataptr[1]) may point to * the same memory, e.g. np.add.accumulate(a, out=a). */ - if (otype == NPY_OBJECT) { + if (descrs[2]->type_num == NPY_OBJECT) { /* * Incref before decref to avoid the possibility of the * reference count being zero temporarily. @@ -3280,18 +3244,17 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, dataptr_copy[2] += stride0; NPY_UF_DBG_PRINT1("iterator loop count %d\n", (int)count_m1); - innerloop(dataptr_copy, &count_m1, - stride_copy, innerloopdata); + res = strided_loop(&context, + dataptr_copy, &count_m1, stride_copy, auxdata); } - } while (!(needs_api && PyErr_Occurred()) && iternext(iter)); + } while (res == 0 && iternext(iter)); NPY_END_THREADS; } else if (iter == NULL) { char *dataptr_copy[3]; - npy_intp stride_copy[3]; - int itemsize = op_dtypes[0]->elsize; + int itemsize = descrs[0]->elsize; /* Execute the loop with no iterators */ npy_intp count = PyArray_DIM(op[1], axis); @@ -3305,15 +3268,11 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, PyArray_NDIM(op[0]))) { PyErr_SetString(PyExc_ValueError, "provided out is the wrong size " - "for the reduction"); + "for the accumulation."); goto fail; } stride0 = PyArray_STRIDE(op[0], axis); - stride_copy[0] = stride0; - stride_copy[1] = stride1; - stride_copy[2] = stride0; - /* Turn the two items into three for the inner loop */ dataptr_copy[0] = PyArray_BYTES(op[0]); dataptr_copy[1] = PyArray_BYTES(op[1]); @@ -3325,7 +3284,7 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, * Output (dataptr[0]) and input (dataptr[1]) may point to the * same memory, e.g. np.add.accumulate(a, out=a). */ - if (otype == NPY_OBJECT) { + if (descrs[2]->type_num == NPY_OBJECT) { /* * Incref before decref to avoid the possibility of the * reference count being zero temporarily. 
@@ -3346,25 +3305,34 @@ PyUFunc_Accumulate(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out, NPY_UF_DBG_PRINT1("iterator loop count %d\n", (int)count); - needs_api = PyDataType_REFCHK(op_dtypes[0]); + needs_api = PyDataType_REFCHK(descrs[0]); if (!needs_api) { NPY_BEGIN_THREADS_THRESHOLDED(count); } - innerloop(dataptr_copy, &count, - stride_copy, innerloopdata); + res = strided_loop(&context, + dataptr_copy, &count, fixed_strides, auxdata); NPY_END_THREADS; } } finish: - Py_XDECREF(op_dtypes[0]); - int res = 0; + NPY_AUXDATA_FREE(auxdata); + Py_DECREF(descrs[0]); + Py_DECREF(descrs[1]); + Py_DECREF(descrs[2]); + if (!NpyIter_Deallocate(iter)) { res = -1; } + + if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* NOTE: We could check float errors even when `res < 0` */ + res = _check_ufunc_fperr(errormask, NULL, "accumulate"); + } + if (res < 0) { Py_DECREF(out); return NULL; @@ -3374,7 +3342,11 @@ finish: fail: Py_XDECREF(out); - Py_XDECREF(op_dtypes[0]); + + NPY_AUXDATA_FREE(auxdata); + Py_XDECREF(descrs[0]); + Py_XDECREF(descrs[1]); + Py_XDECREF(descrs[2]); NpyIter_Deallocate(iter); @@ -3399,28 +3371,31 @@ fail: * indices[1::2] = range(1,len(array)) * * output shape is based on the size of indices + * + * TODO: Reduceat duplicates too much code from accumulate! 
*/ static PyObject * PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, - PyArrayObject *out, int axis, int otype) + PyArrayObject *out, int axis, PyArray_DTypeMeta *signature[3]) { PyArrayObject *op[3]; - PyArray_Descr *op_dtypes[3] = {NULL, NULL, NULL}; int op_axes_arrays[3][NPY_MAXDIMS]; int *op_axes[3] = {op_axes_arrays[0], op_axes_arrays[1], op_axes_arrays[2]}; npy_uint32 op_flags[3]; - int idim, ndim, otype_final; - int need_outer_iterator = 0; + int idim, ndim; + int needs_api, need_outer_iterator = 0; + + int res = 0; NpyIter *iter = NULL; + PyArrayMethod_StridedLoop *strided_loop; + NpyAuxData *auxdata = NULL; + /* The reduceat indices - ind must be validated outside this call */ npy_intp *reduceat_ind; npy_intp i, ind_size, red_axis_size; - /* The selected inner loop */ - PyUFuncGenericFunction innerloop = NULL; - void *innerloopdata = NULL; const char *ufunc_name = ufunc_get_name_cstr(ufunc); char *opname = "reduceat"; @@ -3460,42 +3435,32 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, /* Take a reference to out for later returning */ Py_XINCREF(out); - otype_final = otype; - if (get_binary_op_function(ufunc, &otype_final, - &innerloop, &innerloopdata) < 0) { - PyArray_Descr *dtype = PyArray_DescrFromType(otype); - PyErr_Format(PyExc_ValueError, - "could not find a matching type for %s.%s, " - "requested type has type code '%c'", - ufunc_name, opname, dtype ? 
dtype->type : '-'); - Py_XDECREF(dtype); - goto fail; + PyArray_Descr *descrs[3]; + PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc, + arr, out, signature, NPY_TRUE, descrs, "reduceat"); + if (ufuncimpl == NULL) { + return NULL; } - ndim = PyArray_NDIM(arr); + /* The below code assumes that all descriptors are identical: */ + assert(descrs[0] == descrs[1] && descrs[0] == descrs[2]); - /* - * Set up the output data type, using the input's exact - * data type if the type number didn't change to preserve - * metadata - */ - if (PyArray_DESCR(arr)->type_num == otype_final) { - if (PyArray_ISNBO(PyArray_DESCR(arr)->byteorder)) { - op_dtypes[0] = PyArray_DESCR(arr); - Py_INCREF(op_dtypes[0]); - } - else { - op_dtypes[0] = PyArray_DescrNewByteorder(PyArray_DESCR(arr), - NPY_NATIVE); - } - } - else { - op_dtypes[0] = PyArray_DescrFromType(otype_final); - } - if (op_dtypes[0] == NULL) { + if (PyDataType_REFCHK(descrs[2]) && descrs[2]->type_num != NPY_OBJECT) { + /* This can be removed, but the initial element copy needs fixing */ + PyErr_SetString(PyExc_TypeError, + "reduceat currently only supports `object` dtype with " + "references"); goto fail; } + PyArrayMethod_Context context = { + .caller = (PyObject *)ufunc, + .method = ufuncimpl, + .descriptors = descrs, + }; + + ndim = PyArray_NDIM(arr); + #if NPY_UF_DBG_TRACING printf("Found %s.%s inner loop with dtype : ", ufunc_name, opname); PyObject_Print((PyObject *)op_dtypes[0], stdout, 0); @@ -3522,11 +3487,13 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, op[2] = ind; if (out != NULL || ndim > 1 || !PyArray_ISALIGNED(arr) || - !PyArray_EquivTypes(op_dtypes[0], PyArray_DESCR(arr))) { + !PyArray_EquivTypes(descrs[0], PyArray_DESCR(arr))) { need_outer_iterator = 1; } if (need_outer_iterator) { + PyArray_Descr *op_dtypes[3] = {descrs[0], descrs[1], NULL}; + npy_uint32 flags = NPY_ITER_ZEROSIZE_OK| NPY_ITER_REFS_OK| NPY_ITER_MULTI_INDEX| @@ -3555,8 +3522,7 @@ 
PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, NPY_UF_DBG_PRINT("Allocating outer iterator\n"); iter = NpyIter_AdvancedNew(3, op, flags, NPY_KEEPORDER, NPY_UNSAFE_CASTING, - op_flags, - op_dtypes, + op_flags, op_dtypes, ndim, op_axes, NULL, 0); if (iter == NULL) { goto fail; @@ -3580,11 +3546,15 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, Py_INCREF(out); } } - /* Allocate the output for when there's no outer iterator */ - else if (out == NULL) { - Py_INCREF(op_dtypes[0]); + else { + /* + * Allocate the output for when there's no outer iterator, we always + * use the outer_iteration path when `out` is passed. + */ + assert(out == NULL); + Py_INCREF(descrs[0]); op[0] = out = (PyArrayObject *)PyArray_NewFromDescr( - &PyArray_Type, op_dtypes[0], + &PyArray_Type, descrs[0], 1, &ind_size, NULL, NULL, 0, NULL); if (out == NULL) { @@ -3592,6 +3562,28 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, } } + npy_intp fixed_strides[3]; + if (need_outer_iterator) { + NpyIter_GetInnerFixedStrideArray(iter, fixed_strides); + } + else { + fixed_strides[1] = PyArray_STRIDES(op[1])[axis]; + } + /* The reduce axis does not advance here in the strided-loop */ + fixed_strides[0] = 0; + fixed_strides[2] = 0; + + NPY_ARRAYMETHOD_FLAGS flags = 0; + if (ufuncimpl->get_strided_loop(&context, + 1, 0, fixed_strides, &strided_loop, &auxdata, &flags) < 0) { + goto fail; + } + needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0; + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* Start with the floating-point exception flags cleared */ + npy_clear_floatstatus_barrier((char*)&iter); + } + /* * If the output has zero elements, return now. 
*/ @@ -3609,8 +3601,8 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, npy_intp stride0, stride1; npy_intp stride0_ind = PyArray_STRIDE(op[0], axis); - int itemsize = op_dtypes[0]->elsize; - int needs_api = NpyIter_IterationNeedsAPI(iter); + int itemsize = descrs[0]->elsize; + needs_api |= NpyIter_IterationNeedsAPI(iter); /* Get the variables needed for the loop */ iternext = NpyIter_GetIterNext(iter, NULL); @@ -3630,10 +3622,11 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, stride_copy[1] = stride1; stride_copy[2] = stride0; - NPY_BEGIN_THREADS_NDITER(iter); + if (!needs_api) { + NPY_BEGIN_THREADS_THRESHOLDED(NpyIter_GetIterSize(iter)); + } do { - for (i = 0; i < ind_size; ++i) { npy_intp start = reduceat_ind[i], end = (i == ind_size-1) ? count_m1+1 : @@ -3651,7 +3644,7 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, * to the same memory, e.g. * np.add.reduceat(a, np.arange(len(a)), out=a). */ - if (otype == NPY_OBJECT) { + if (descrs[2]->type_num == NPY_OBJECT) { /* * Incref before decref to avoid the possibility of * the reference count being zero temporarily. 
@@ -3671,33 +3664,24 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, dataptr_copy[1] += stride1; NPY_UF_DBG_PRINT1("iterator loop count %d\n", (int)count); - innerloop(dataptr_copy, &count, - stride_copy, innerloopdata); + res = strided_loop(&context, + dataptr_copy, &count, stride_copy, auxdata); } } - } while (!(needs_api && PyErr_Occurred()) && iternext(iter)); + } while (res == 0 && iternext(iter)); NPY_END_THREADS; } else if (iter == NULL) { char *dataptr_copy[3]; - npy_intp stride_copy[3]; - int itemsize = op_dtypes[0]->elsize; + int itemsize = descrs[0]->elsize; npy_intp stride0_ind = PyArray_STRIDE(op[0], axis); - - /* Execute the loop with no iterators */ - npy_intp stride0 = 0, stride1 = PyArray_STRIDE(op[1], axis); - - int needs_api = PyDataType_REFCHK(op_dtypes[0]); + npy_intp stride1 = PyArray_STRIDE(op[1], axis); NPY_UF_DBG_PRINT("UFunc: Reduce loop with no iterators\n"); - stride_copy[0] = stride0; - stride_copy[1] = stride1; - stride_copy[2] = stride0; - if (!needs_api) { NPY_BEGIN_THREADS; } @@ -3719,7 +3703,7 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, * the same memory, e.g. * np.add.reduceat(a, np.arange(len(a)), out=a). */ - if (otype == NPY_OBJECT) { + if (descrs[2]->type_num == NPY_OBJECT) { /* * Incref before decref to avoid the possibility of the * reference count being zero temporarily. 
@@ -3739,8 +3723,11 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, dataptr_copy[1] += stride1; NPY_UF_DBG_PRINT1("iterator loop count %d\n", (int)count); - innerloop(dataptr_copy, &count, - stride_copy, innerloopdata); + res = strided_loop(&context, + dataptr_copy, &count, fixed_strides, auxdata); + if (res != 0) { + break; + } } } @@ -3748,8 +3735,21 @@ PyUFunc_Reduceat(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *ind, } finish: - Py_XDECREF(op_dtypes[0]); + NPY_AUXDATA_FREE(auxdata); + Py_DECREF(descrs[0]); + Py_DECREF(descrs[1]); + Py_DECREF(descrs[2]); + if (!NpyIter_Deallocate(iter)) { + res = -1; + } + + if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* NOTE: We could check float errors even when `res < 0` */ + res = _check_ufunc_fperr(errormask, NULL, "reduceat"); + } + + if (res < 0) { Py_DECREF(out); return NULL; } @@ -3758,9 +3758,14 @@ finish: fail: Py_XDECREF(out); - Py_XDECREF(op_dtypes[0]); + + NPY_AUXDATA_FREE(auxdata); + Py_XDECREF(descrs[0]); + Py_XDECREF(descrs[1]); + Py_XDECREF(descrs[2]); NpyIter_Deallocate(iter); + return NULL; } @@ -3858,7 +3863,7 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, PyArrayObject *mp = NULL, *wheremask = NULL, *ret = NULL; PyObject *op = NULL; PyArrayObject *indices = NULL; - PyArray_Descr *otype = NULL; + PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL}; PyArrayObject *out = NULL; int keepdims = 0; PyObject *initial = NULL; @@ -4002,13 +4007,10 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, } if (otype_obj && otype_obj != Py_None) { /* Use `_get_dtype` because `dtype` is a DType and not the instance */ - PyArray_DTypeMeta *dtype = _get_dtype(otype_obj); - if (dtype == NULL) { + signature[0] = _get_dtype(otype_obj); + if (signature[0] == NULL) { goto fail; } - otype = dtype->singleton; - Py_INCREF(otype); - Py_DECREF(dtype); } if (out_obj && !PyArray_OutputConverter(out_obj, &out)) { goto fail; @@ -4028,15 +4030,6 @@ 
PyUFunc_GenericReduction(PyUFuncObject *ufunc, ndim = PyArray_NDIM(mp); - /* Check to see that type (and otype) is not FLEXIBLE */ - if (PyArray_ISFLEXIBLE(mp) || - (otype && PyTypeNum_ISFLEXIBLE(otype->type_num))) { - PyErr_Format(PyExc_TypeError, - "cannot perform %s with flexible type", - _reduce_type[operation]); - goto fail; - } - /* Convert the 'axis' parameter into a list of axes */ if (axes_obj == NULL) { /* apply defaults */ @@ -4099,14 +4092,12 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, } /* - * If out is specified it determines otype - * unless otype already specified. + * If no dtype is specified and out is not specified, we override the + * integer and bool dtype used for add and multiply. + * + * TODO: The following should be handled by a promoter! */ - if (otype == NULL && out != NULL) { - otype = PyArray_DESCR(out); - Py_INCREF(otype); - } - if (otype == NULL) { + if (signature[0] == NULL && out == NULL) { /* * For integer types --- make sure at least a long * is used for add and multiply reduction to avoid overflow @@ -4126,16 +4117,17 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, typenum = NPY_LONG; } } + signature[0] = PyArray_DTypeFromTypeNum(typenum); } - otype = PyArray_DescrFromType(typenum); } - + Py_XINCREF(signature[0]); + signature[2] = signature[0]; switch(operation) { case UFUNC_REDUCE: - ret = PyUFunc_Reduce(ufunc, mp, out, naxes, axes, - otype, keepdims, initial, wheremask); - Py_XDECREF(wheremask); + ret = PyUFunc_Reduce(ufunc, + mp, out, naxes, axes, signature, keepdims, initial, wheremask); + Py_XSETREF(wheremask, NULL); break; case UFUNC_ACCUMULATE: if (ndim == 0) { @@ -4147,8 +4139,8 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, "accumulate does not allow multiple axes"); goto fail; } - ret = (PyArrayObject *)PyUFunc_Accumulate(ufunc, mp, out, axes[0], - otype->type_num); + ret = (PyArrayObject *)PyUFunc_Accumulate(ufunc, + mp, out, axes[0], signature); break; case UFUNC_REDUCEAT: if (ndim == 0) { @@ -4161,19 
+4153,22 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, goto fail; } ret = (PyArrayObject *)PyUFunc_Reduceat(ufunc, - mp, indices, out, axes[0], otype->type_num); + mp, indices, out, axes[0], signature); Py_SETREF(indices, NULL); break; } + if (ret == NULL) { + goto fail; + } + + Py_DECREF(signature[0]); + Py_DECREF(signature[1]); + Py_DECREF(signature[2]); + Py_DECREF(mp); - Py_DECREF(otype); Py_XDECREF(full_args.in); Py_XDECREF(full_args.out); - if (ret == NULL) { - return NULL; - } - /* Wrap and return the output */ { /* Find __array_wrap__ - note that these rules are different to the @@ -4201,7 +4196,10 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, } fail: - Py_XDECREF(otype); + Py_XDECREF(signature[0]); + Py_XDECREF(signature[1]); + Py_XDECREF(signature[2]); + Py_XDECREF(mp); Py_XDECREF(wheremask); Py_XDECREF(indices); @@ -5576,8 +5574,7 @@ PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc, /* Get entry for this user-defined type*/ cobj = PyDict_GetItemWithError(ufunc->userloops, key); if (cobj == NULL && PyErr_Occurred()) { - Py_DECREF(key); - return 0; + goto fail; } /* If it's not there, then make one and return. 
*/ else if (cobj == NULL) { @@ -5883,15 +5880,13 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) PyArrayObject *op2_array = NULL; PyArrayMapIterObject *iter = NULL; PyArrayIterObject *iter2 = NULL; - PyArray_Descr *dtypes[3] = {NULL, NULL, NULL}; PyArrayObject *operands[3] = {NULL, NULL, NULL}; PyArrayObject *array_operands[3] = {NULL, NULL, NULL}; - int needs_api = 0; + PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL}; + PyArray_DTypeMeta *operand_DTypes[3] = {NULL, NULL, NULL}; + PyArray_Descr *operation_descrs[3] = {NULL, NULL, NULL}; - PyUFuncGenericFunction innerloop; - void *innerloopdata; - npy_intp i; int nop; /* override vars */ @@ -5904,6 +5899,10 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) int buffersize; int errormask = 0; char * err_msg = NULL; + + PyArrayMethod_StridedLoop *strided_loop; + NpyAuxData *auxdata = NULL; + NPY_BEGIN_THREADS_DEF; if (ufunc->nin > 2) { @@ -5991,26 +5990,51 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) /* * Create dtypes array for either one or two input operands. - * The output operand is set to the first input operand + * Compare to the logic in `convert_ufunc_arguments`. + * TODO: It may be good to review some of this behaviour, since the + * operand array is special (it is written to) similar to reductions. + * Using unsafe-casting as done here, is likely not desirable. 
*/ operands[0] = op1_array; + operand_DTypes[0] = NPY_DTYPE(PyArray_DESCR(op1_array)); + Py_INCREF(operand_DTypes[0]); + int force_legacy_promotion = 0; + int allow_legacy_promotion = NPY_DT_is_legacy(operand_DTypes[0]); + if (op2_array != NULL) { operands[1] = op2_array; - operands[2] = op1_array; + operand_DTypes[1] = NPY_DTYPE(PyArray_DESCR(op2_array)); + Py_INCREF(operand_DTypes[1]); + allow_legacy_promotion &= NPY_DT_is_legacy(operand_DTypes[1]); + operands[2] = operands[0]; + operand_DTypes[2] = operand_DTypes[0]; + Py_INCREF(operand_DTypes[2]); + nop = 3; + if (allow_legacy_promotion && ((PyArray_NDIM(op1_array) == 0) + != (PyArray_NDIM(op2_array) == 0))) { + /* both are legacy and only one is 0-D: force legacy */ + force_legacy_promotion = should_use_min_scalar(2, operands, 0, NULL); + } } else { - operands[1] = op1_array; + operands[1] = operands[0]; + operand_DTypes[1] = operand_DTypes[0]; + Py_INCREF(operand_DTypes[1]); operands[2] = NULL; nop = 2; } - if (ufunc->type_resolver(ufunc, NPY_UNSAFE_CASTING, - operands, NULL, dtypes) < 0) { + PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc, + operands, signature, operand_DTypes, + force_legacy_promotion, allow_legacy_promotion); + if (ufuncimpl == NULL) { goto fail; } - if (ufunc->legacy_inner_loop_selector(ufunc, dtypes, - &innerloop, &innerloopdata, &needs_api) < 0) { + + /* Find the correct descriptors for the operation */ + if (resolve_descriptors(nop, ufunc, ufuncimpl, + operands, operation_descrs, signature, NPY_UNSAFE_CASTING) < 0) { goto fail; } @@ -6071,21 +6095,44 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) NPY_ITER_GROWINNER| NPY_ITER_DELAY_BUFALLOC, NPY_KEEPORDER, NPY_UNSAFE_CASTING, - op_flags, dtypes, + op_flags, operation_descrs, -1, NULL, NULL, buffersize); if (iter_buffer == NULL) { goto fail; } - needs_api = needs_api | NpyIter_IterationNeedsAPI(iter_buffer); - iternext = NpyIter_GetIterNext(iter_buffer, NULL); if (iternext == NULL) { NpyIter_Deallocate(iter_buffer); 
goto fail; } + PyArrayMethod_Context context = { + .caller = (PyObject *)ufunc, + .method = ufuncimpl, + .descriptors = operation_descrs, + }; + + NPY_ARRAYMETHOD_FLAGS flags; + /* Use contiguous strides; if there is such a loop it may be faster */ + npy_intp strides[3] = { + operation_descrs[0]->elsize, operation_descrs[1]->elsize, 0}; + if (nop == 3) { + strides[2] = operation_descrs[2]->elsize; + } + + if (ufuncimpl->get_strided_loop(&context, 1, 0, strides, + &strided_loop, &auxdata, &flags) < 0) { + goto fail; + } + int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0; + needs_api |= NpyIter_IterationNeedsAPI(iter_buffer); + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* Start with the floating-point exception flags cleared */ + npy_clear_floatstatus_barrier((char*)&iter); + } + if (!needs_api) { NPY_BEGIN_THREADS; } @@ -6094,14 +6141,13 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) * Iterate over first and second operands and call ufunc * for each pair of inputs */ - i = iter->size; - while (i > 0) + int res = 0; + for (npy_intp i = iter->size; i > 0; i--) { char *dataptr[3]; char **buffer_dataptr; /* one element at a time, no stride required but read by innerloop */ - npy_intp count[3] = {1, 0xDEADBEEF, 0xDEADBEEF}; - npy_intp stride[3] = {0xDEADBEEF, 0xDEADBEEF, 0xDEADBEEF}; + npy_intp count = 1; /* * Set up data pointers for either one or two input operands. 
@@ -6120,14 +6166,14 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) /* Reset NpyIter data pointers which will trigger a buffer copy */ NpyIter_ResetBasePointers(iter_buffer, dataptr, &err_msg); if (err_msg) { + res = -1; break; } buffer_dataptr = NpyIter_GetDataPtrArray(iter_buffer); - innerloop(buffer_dataptr, count, stride, innerloopdata); - - if (needs_api && PyErr_Occurred()) { + res = strided_loop(&context, buffer_dataptr, &count, strides, auxdata); + if (res != 0) { break; } @@ -6141,27 +6187,35 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) if (iter2 != NULL) { PyArray_ITER_NEXT(iter2); } - - i--; } NPY_END_THREADS; - if (err_msg) { + if (res != 0 && err_msg) { PyErr_SetString(PyExc_ValueError, err_msg); } + if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* NOTE: We could check float errors even when `res < 0` */ + res = _check_ufunc_fperr(errormask, NULL, "at"); + } + NPY_AUXDATA_FREE(auxdata); NpyIter_Deallocate(iter_buffer); Py_XDECREF(op2_array); Py_XDECREF(iter); Py_XDECREF(iter2); - for (i = 0; i < 3; i++) { - Py_XDECREF(dtypes[i]); + for (int i = 0; i < 3; i++) { + Py_XDECREF(operation_descrs[i]); Py_XDECREF(array_operands[i]); } - if (needs_api && PyErr_Occurred()) { + /* + * An error should only be possible if needs_api is true or `res != 0`, + * but this is not strictly correct for old-style ufuncs + * (e.g. `power` released the GIL but manually set an Exception). 
+ */ + if (res != 0 || PyErr_Occurred()) { return NULL; } else { @@ -6176,10 +6230,11 @@ fail: Py_XDECREF(op2_array); Py_XDECREF(iter); Py_XDECREF(iter2); - for (i = 0; i < 3; i++) { - Py_XDECREF(dtypes[i]); + for (int i = 0; i < 3; i++) { + Py_XDECREF(operation_descrs[i]); Py_XDECREF(array_operands[i]); } + NPY_AUXDATA_FREE(auxdata); return NULL; } diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c index a7d536656..9ed923cf5 100644 --- a/numpy/core/src/umath/ufunc_type_resolution.c +++ b/numpy/core/src/umath/ufunc_type_resolution.c @@ -20,19 +20,18 @@ * * See LICENSE.txt for the license. */ -#define _UMATHMODULE -#define _MULTIARRAYMODULE #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#define _UMATHMODULE + +#define PY_SSIZE_T_CLEAN +#include <Python.h> // printif debug tracing #ifndef NPY_UF_DBG_TRACING #define NPY_UF_DBG_TRACING 0 #endif -#include <stdbool.h> - -#include "Python.h" - #include "npy_config.h" #include "npy_pycompat.h" #include "npy_import.h" @@ -48,6 +47,8 @@ #include "cblasfuncs.h" #endif +#include <stdbool.h> + static PyObject * npy_casting_to_py_object(NPY_CASTING casting) { @@ -246,6 +247,28 @@ PyUFunc_ValidateCasting(PyUFuncObject *ufunc, } +/* + * Same as `PyUFunc_ValidateCasting` but only checks output casting. 
+ */ +NPY_NO_EXPORT int +PyUFunc_ValidateOutCasting(PyUFuncObject *ufunc, + NPY_CASTING casting, PyArrayObject **operands, PyArray_Descr **dtypes) +{ + int i, nin = ufunc->nin, nop = nin + ufunc->nout; + + for (i = nin; i < nop; ++i) { + if (operands[i] == NULL) { + continue; + } + if (!PyArray_CanCastTypeTo(dtypes[i], + PyArray_DESCR(operands[i]), casting)) { + return raise_output_casting_error( + ufunc, casting, dtypes[i], PyArray_DESCR(operands[i]), i); + } + } + return 0; +} + /*UFUNC_API * * This function applies the default type resolution rules @@ -2141,6 +2164,10 @@ type_tuple_type_resolver(PyUFuncObject *self, * `signature=(None,)*nin + (dtype,)*nout`. If the signature matches that * exactly (could be relaxed but that is not necessary for backcompat), * we also try `signature=(dtype,)*(nin+nout)`. + * Since reduction pass in `(dtype, None, dtype)` we broaden this to + * replacing all unspecified dtypes with the homogeneous output one. + * Note that this can (and often will) lead to unsafe casting. This is + * normally rejected (but not currently for reductions!). * This used to be the main meaning for `dtype=dtype`, but some calls broke * the expectation, and changing it allows for `dtype=dtype` to be useful * for ufuncs like `np.ldexp` in the future while also normalizing it to @@ -2159,13 +2186,12 @@ type_tuple_type_resolver(PyUFuncObject *self, if (homogeneous_type != NPY_NOTYPE) { for (int i = 0; i < nin; i++) { if (specified_types[i] != NPY_NOTYPE) { - homogeneous_type = NPY_NOTYPE; - break; + /* Never replace a specified type! */ + continue; } specified_types[i] = homogeneous_type; } - } - if (homogeneous_type != NPY_NOTYPE) { + /* Try again with the homogeneous specified types. 
*/ res = type_tuple_type_resolver_core(self, op, input_casting, casting, specified_types, any_object, diff --git a/numpy/core/src/umath/ufunc_type_resolution.h b/numpy/core/src/umath/ufunc_type_resolution.h index dd88a081a..84a2593f4 100644 --- a/numpy/core/src/umath/ufunc_type_resolution.h +++ b/numpy/core/src/umath/ufunc_type_resolution.h @@ -99,6 +99,10 @@ PyUFunc_DivmodTypeResolver(PyUFuncObject *ufunc, PyObject *type_tup, PyArray_Descr **out_dtypes); +NPY_NO_EXPORT int +PyUFunc_ValidateOutCasting(PyUFuncObject *ufunc, + NPY_CASTING casting, PyArrayObject **operands, PyArray_Descr **dtypes); + /* * Does a linear search for the best inner loop of the ufunc. * diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c index 6a718889b..272555704 100644 --- a/numpy/core/src/umath/umathmodule.c +++ b/numpy/core/src/umath/umathmodule.c @@ -1,25 +1,17 @@ /* -*- c -*- */ - -/* - * vim:syntax=c - */ - -/* - ***************************************************************************** - ** INCLUDES ** - ***************************************************************************** - */ +/* vim:syntax=c */ /* * _UMATHMODULE IS needed in __ufunc_api.h, included from numpy/ufuncobject.h. * This is a mess and it would be nice to fix it. It has nothing to do with * __ufunc_api.c */ -#define _UMATHMODULE -#define _MULTIARRAYMODULE #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#define _UMATHMODULE -#include "Python.h" +#define PY_SSIZE_T_CLEAN +#include <Python.h> #include "npy_config.h" @@ -30,6 +22,7 @@ #include "numpy/npy_math.h" #include "number.h" +#include "dispatching.h" static PyUFuncGenericFunction pyfunc_functions[] = {PyUFunc_On_Om}; @@ -313,5 +306,33 @@ int initumath(PyObject *m) return -1; } + /* + * Set up promoters for logical functions + * TODO: This should probably be done at a better place, or even in the + * code generator directly. 
+ */ + s = _PyDict_GetItemStringWithError(d, "logical_and"); + if (s == NULL) { + return -1; + } + if (install_logical_ufunc_promoter(s) < 0) { + return -1; + } + + s = _PyDict_GetItemStringWithError(d, "logical_or"); + if (s == NULL) { + return -1; + } + if (install_logical_ufunc_promoter(s) < 0) { + return -1; + } + + s = _PyDict_GetItemStringWithError(d, "logical_xor"); + if (s == NULL) { + return -1; + } + if (install_logical_ufunc_promoter(s) < 0) { + return -1; + } return 0; } diff --git a/numpy/core/tests/data/generate_umath_validation_data.cpp b/numpy/core/tests/data/generate_umath_validation_data.cpp index 9d97ff4ab..418eae670 100644 --- a/numpy/core/tests/data/generate_umath_validation_data.cpp +++ b/numpy/core/tests/data/generate_umath_validation_data.cpp @@ -1,41 +1,46 @@ -#include<math.h> -#include<stdio.h> -#include<iostream> -#include<algorithm> -#include<vector> -#include<random> -#include<fstream> -#include<time.h> +#include <algorithm> +#include <fstream> +#include <iostream> +#include <math.h> +#include <random> +#include <stdio.h> +#include <time.h> +#include <vector> struct ufunc { std::string name; - double (*f32func) (double); - long double (*f64func) (long double); + double (*f32func)(double); + long double (*f64func)(long double); float f32ulp; float f64ulp; }; -template<typename T> -T RandomFloat(T a, T b) { - T random = ((T) rand()) / (T) RAND_MAX; +template <typename T> +T +RandomFloat(T a, T b) +{ + T random = ((T)rand()) / (T)RAND_MAX; T diff = b - a; T r = random * diff; return a + r; } -template<typename T> -void append_random_array(std::vector<T>& arr, T min, T max, size_t N) +template <typename T> +void +append_random_array(std::vector<T> &arr, T min, T max, size_t N) { for (size_t ii = 0; ii < N; ++ii) arr.emplace_back(RandomFloat<T>(min, max)); } -template<typename T1, typename T2> -std::vector<T1> computeTrueVal(const std::vector<T1>& in, T2(*mathfunc)(T2)) { +template <typename T1, typename T2> +std::vector<T1> 
+computeTrueVal(const std::vector<T1> &in, T2 (*mathfunc)(T2)) +{ std::vector<T1> out; for (T1 elem : in) { - T2 elem_d = (T2) elem; - T1 out_elem = (T1) mathfunc(elem_d); + T2 elem_d = (T2)elem; + T1 out_elem = (T1)mathfunc(elem_d); out.emplace_back(out_elem); } return out; @@ -49,17 +54,20 @@ std::vector<T1> computeTrueVal(const std::vector<T1>& in, T2(*mathfunc)(T2)) { #define MINDEN std::numeric_limits<T>::denorm_min() #define MINFLT std::numeric_limits<T>::min() #define MAXFLT std::numeric_limits<T>::max() -#define INF std::numeric_limits<T>::infinity() -#define qNAN std::numeric_limits<T>::quiet_NaN() -#define sNAN std::numeric_limits<T>::signaling_NaN() +#define INF std::numeric_limits<T>::infinity() +#define qNAN std::numeric_limits<T>::quiet_NaN() +#define sNAN std::numeric_limits<T>::signaling_NaN() -template<typename T> -std::vector<T> generate_input_vector(std::string func) { - std::vector<T> input = {MINDEN, -MINDEN, MINFLT, -MINFLT, MAXFLT, -MAXFLT, - INF, -INF, qNAN, sNAN, -1.0, 1.0, 0.0, -0.0}; +template <typename T> +std::vector<T> +generate_input_vector(std::string func) +{ + std::vector<T> input = {MINDEN, -MINDEN, MINFLT, -MINFLT, MAXFLT, + -MAXFLT, INF, -INF, qNAN, sNAN, + -1.0, 1.0, 0.0, -0.0}; // [-1.0, 1.0] - if ((func == "arcsin") || (func == "arccos") || (func == "arctanh")){ + if ((func == "arcsin") || (func == "arccos") || (func == "arctanh")) { append_random_array<T>(input, -1.0, 1.0, 700); } // (0.0, INF] @@ -98,57 +106,62 @@ std::vector<T> generate_input_vector(std::string func) { return input; } -int main() { - srand (42); +int +main() +{ + srand(42); std::vector<struct ufunc> umathfunc = { - {"sin",sin,sin,2.37,3.3}, - {"cos",cos,cos,2.36,3.38}, - {"tan",tan,tan,3.91,3.93}, - {"arcsin",asin,asin,3.12,2.55}, - {"arccos",acos,acos,2.1,1.67}, - {"arctan",atan,atan,2.3,2.52}, - {"sinh",sinh,sinh,1.55,1.89}, - {"cosh",cosh,cosh,2.48,1.97}, - {"tanh",tanh,tanh,1.38,1.19}, - {"arcsinh",asinh,asinh,1.01,1.48}, - 
{"arccosh",acosh,acosh,1.16,1.05}, - {"arctanh",atanh,atanh,1.45,1.46}, - {"cbrt",cbrt,cbrt,1.94,1.82}, - //{"exp",exp,exp,3.76,1.53}, - {"exp2",exp2,exp2,1.01,1.04}, - {"expm1",expm1,expm1,2.62,2.1}, - //{"log",log,log,1.84,1.67}, - {"log10",log10,log10,3.5,1.92}, - {"log1p",log1p,log1p,1.96,1.93}, - {"log2",log2,log2,2.12,1.84}, + {"sin", sin, sin, 2.37, 3.3}, + {"cos", cos, cos, 2.36, 3.38}, + {"tan", tan, tan, 3.91, 3.93}, + {"arcsin", asin, asin, 3.12, 2.55}, + {"arccos", acos, acos, 2.1, 1.67}, + {"arctan", atan, atan, 2.3, 2.52}, + {"sinh", sinh, sinh, 1.55, 1.89}, + {"cosh", cosh, cosh, 2.48, 1.97}, + {"tanh", tanh, tanh, 1.38, 1.19}, + {"arcsinh", asinh, asinh, 1.01, 1.48}, + {"arccosh", acosh, acosh, 1.16, 1.05}, + {"arctanh", atanh, atanh, 1.45, 1.46}, + {"cbrt", cbrt, cbrt, 1.94, 1.82}, + //{"exp",exp,exp,3.76,1.53}, + {"exp2", exp2, exp2, 1.01, 1.04}, + {"expm1", expm1, expm1, 2.62, 2.1}, + //{"log",log,log,1.84,1.67}, + {"log10", log10, log10, 3.5, 1.92}, + {"log1p", log1p, log1p, 1.96, 1.93}, + {"log2", log2, log2, 2.12, 1.84}, }; for (int ii = 0; ii < umathfunc.size(); ++ii) { - // ignore sin/cos + // ignore sin/cos if ((umathfunc[ii].name != "sin") && (umathfunc[ii].name != "cos")) { - std::string fileName = "umath-validation-set-" + umathfunc[ii].name + ".csv"; + std::string fileName = + "umath-validation-set-" + umathfunc[ii].name + ".csv"; std::ofstream txtOut; - txtOut.open (fileName, std::ofstream::trunc); + txtOut.open(fileName, std::ofstream::trunc); txtOut << "dtype,input,output,ulperrortol" << std::endl; // Single Precision auto f32in = generate_input_vector<float>(umathfunc[ii].name); - auto f32out = computeTrueVal<float, double>(f32in, umathfunc[ii].f32func); + auto f32out = computeTrueVal<float, double>(f32in, + umathfunc[ii].f32func); for (int jj = 0; jj < f32in.size(); ++jj) { - txtOut << "np.float32" << std::hex << - ",0x" << *reinterpret_cast<uint32_t*>(&f32in[jj]) << - ",0x" << *reinterpret_cast<uint32_t*>(&f32out[jj]) << - "," << 
ceil(umathfunc[ii].f32ulp) << std::endl; + txtOut << "np.float32" << std::hex << ",0x" + << *reinterpret_cast<uint32_t *>(&f32in[jj]) << ",0x" + << *reinterpret_cast<uint32_t *>(&f32out[jj]) << "," + << ceil(umathfunc[ii].f32ulp) << std::endl; } // Double Precision auto f64in = generate_input_vector<double>(umathfunc[ii].name); - auto f64out = computeTrueVal<double, long double>(f64in, umathfunc[ii].f64func); + auto f64out = computeTrueVal<double, long double>( + f64in, umathfunc[ii].f64func); for (int jj = 0; jj < f64in.size(); ++jj) { - txtOut << "np.float64" << std::hex << - ",0x" << *reinterpret_cast<uint64_t*>(&f64in[jj]) << - ",0x" << *reinterpret_cast<uint64_t*>(&f64out[jj]) << - "," << ceil(umathfunc[ii].f64ulp) << std::endl; + txtOut << "np.float64" << std::hex << ",0x" + << *reinterpret_cast<uint64_t *>(&f64in[jj]) << ",0x" + << *reinterpret_cast<uint64_t *>(&f64out[jj]) << "," + << ceil(umathfunc[ii].f64ulp) << std::endl; } txtOut.close(); } diff --git a/numpy/core/tests/test__exceptions.py b/numpy/core/tests/test__exceptions.py index c87412aa4..10b87e052 100644 --- a/numpy/core/tests/test__exceptions.py +++ b/numpy/core/tests/test__exceptions.py @@ -40,7 +40,7 @@ class TestArrayMemoryError: # 1023.9999 Mib should round to 1 GiB assert f(int(Ki*Ki*Ki*0.9999)) == '1.00 GiB' assert f(Ki*Ki*Ki*Ki*Ki*Ki) == '1.00 EiB' - # larger than sys.maxsize, adding larger prefices isn't going to help + # larger than sys.maxsize, adding larger prefixes isn't going to help # anyway. assert f(Ki*Ki*Ki*Ki*Ki*Ki*123456) == '123456. 
EiB' diff --git a/numpy/core/tests/test_array_coercion.py b/numpy/core/tests/test_array_coercion.py index 076d8e43f..293f5a68f 100644 --- a/numpy/core/tests/test_array_coercion.py +++ b/numpy/core/tests/test_array_coercion.py @@ -376,7 +376,7 @@ class TestScalarDiscovery: def test_scalar_to_int_coerce_does_not_cast(self, dtype): """ Signed integers are currently different in that they do not cast other - NumPy scalar, but instead use scalar.__int__(). The harcoded + NumPy scalar, but instead use scalar.__int__(). The hardcoded exception to this rule is `np.array(scalar, dtype=integer)`. """ dtype = np.dtype(dtype) @@ -444,7 +444,7 @@ class TestTimeScalars: # never use casting. This is because casting will error in this # case, and traditionally in most cases the behaviour is maintained # like this. (`np.array(scalar, dtype="U6")` would have failed before) - # TODO: This discrepency _should_ be resolved, either by relaxing the + # TODO: This discrepancy _should_ be resolved, either by relaxing the # cast, or by deprecating the first part. scalar = np.datetime64(val, unit) dtype = np.dtype(dtype) diff --git a/numpy/core/tests/test_arraymethod.py b/numpy/core/tests/test_arraymethod.py index b1bc79b80..49aa9f6df 100644 --- a/numpy/core/tests/test_arraymethod.py +++ b/numpy/core/tests/test_arraymethod.py @@ -3,6 +3,10 @@ This file tests the generic aspects of ArrayMethod. At the time of writing this is private API, but when added, public API may be added here. 
""" +import sys +import types +from typing import Any, Type + import pytest import numpy as np @@ -56,3 +60,35 @@ class TestSimpleStridedCall: # This is private API, which may be modified freely with pytest.raises(error): self.method._simple_strided_call(*args) + + +@pytest.mark.skipif(sys.version_info < (3, 9), reason="Requires python 3.9") +class TestClassGetItem: + @pytest.mark.parametrize( + "cls", [np.ndarray, np.recarray, np.chararray, np.matrix, np.memmap] + ) + def test_class_getitem(self, cls: Type[np.ndarray]) -> None: + """Test `ndarray.__class_getitem__`.""" + alias = cls[Any, Any] + assert isinstance(alias, types.GenericAlias) + assert alias.__origin__ is cls + + @pytest.mark.parametrize("arg_len", range(4)) + def test_subscript_tuple(self, arg_len: int) -> None: + arg_tup = (Any,) * arg_len + if arg_len == 2: + assert np.ndarray[arg_tup] + else: + with pytest.raises(TypeError): + np.ndarray[arg_tup] + + def test_subscript_scalar(self) -> None: + with pytest.raises(TypeError): + np.ndarray[Any] + + +@pytest.mark.skipif(sys.version_info >= (3, 9), reason="Requires python 3.8") +def test_class_getitem_38() -> None: + match = "Type subscription requires python >= 3.9" + with pytest.raises(TypeError, match=match): + np.ndarray[Any, Any] diff --git a/numpy/core/tests/test_arrayprint.py b/numpy/core/tests/test_arrayprint.py index 09cc79f72..25826d8ed 100644 --- a/numpy/core/tests/test_arrayprint.py +++ b/numpy/core/tests/test_arrayprint.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- import sys import gc from hypothesis import given diff --git a/numpy/core/tests/test_casting_unittests.py b/numpy/core/tests/test_casting_unittests.py index a13e807e2..cb4792090 100644 --- a/numpy/core/tests/test_casting_unittests.py +++ b/numpy/core/tests/test_casting_unittests.py @@ -9,7 +9,6 @@ than integration tests. 
import pytest import textwrap import enum -import itertools import random import numpy as np @@ -127,7 +126,7 @@ CAST_TABLE = _get_cancast_table() class TestChanges: """ - These test cases excercise some behaviour changes + These test cases exercise some behaviour changes """ @pytest.mark.parametrize("string", ["S", "U"]) @pytest.mark.parametrize("floating", ["e", "f", "d", "g"]) @@ -699,9 +698,14 @@ class TestCasting: else: assert_array_equal(expected, arr_NULLs.astype(dtype)) - def test_float_to_bool(self): - # test case corresponding to gh-19514 - # simple test for casting bool_ to float16 - res = np.array([0, 3, -7], dtype=np.int8).view(bool) + @pytest.mark.parametrize("dtype", + np.typecodes["AllInteger"] + np.typecodes["AllFloat"]) + def test_nonstandard_bool_to_other(self, dtype): + # simple test for casting bool_ to numeric types, which should not + # expose the detail that NumPy bools can sometimes take values other + # than 0 and 1. See also gh-19514. + nonstandard_bools = np.array([0, 3, -7], dtype=np.int8).view(bool) + res = nonstandard_bools.astype(dtype) expected = [0, 1, 1] assert_array_equal(res, expected) + diff --git a/numpy/core/tests/test_cpu_dispatcher.py b/numpy/core/tests/test_cpu_dispatcher.py index 8712dee1a..2f7eac7e8 100644 --- a/numpy/core/tests/test_cpu_dispatcher.py +++ b/numpy/core/tests/test_cpu_dispatcher.py @@ -4,7 +4,7 @@ from numpy.testing import assert_equal def test_dispatcher(): """ - Testing the utilites of the CPU dispatcher + Testing the utilities of the CPU dispatcher """ targets = ( "SSE2", "SSE41", "AVX2", @@ -16,7 +16,7 @@ def test_dispatcher(): for feature in reversed(targets): # skip baseline features, by the default `CCompilerOpt` do not generate separated objects # for the baseline, just one object combined all of them via 'baseline' option - # within the configuration statments. + # within the configuration statements. 
if feature in __cpu_baseline__: continue # check compiler and running machine support diff --git a/numpy/core/tests/test_custom_dtypes.py b/numpy/core/tests/test_custom_dtypes.py index 5eb82bc93..6bcc45d6b 100644 --- a/numpy/core/tests/test_custom_dtypes.py +++ b/numpy/core/tests/test_custom_dtypes.py @@ -101,18 +101,52 @@ class TestSFloat: expected_view = a.view(np.float64) * b.view(np.float64) assert_array_equal(res.view(np.float64), expected_view) + def test_possible_and_impossible_reduce(self): + # For reductions to work, the first and last operand must have the + # same dtype. For this parametric DType that is not necessarily true. + a = self._get_array(2.) + # Addition reductin works (as of writing requires to pass initial + # because setting a scaled-float from the default `0` fails). + res = np.add.reduce(a, initial=0.) + assert res == a.astype(np.float64).sum() + + # But each multiplication changes the factor, so a reduction is not + # possible (the relaxed version of the old refusal to handle any + # flexible dtype). + with pytest.raises(TypeError, + match="the resolved dtypes are not compatible"): + np.multiply.reduce(a) + + def test_basic_ufunc_at(self): + float_a = np.array([1., 2., 3.]) + b = self._get_array(2.) + + float_b = b.view(np.float64).copy() + np.multiply.at(float_b, [1, 1, 1], float_a) + np.multiply.at(b, [1, 1, 1], float_a) + + assert_array_equal(b.view(np.float64), float_b) + def test_basic_multiply_promotion(self): float_a = np.array([1., 2., 3.]) b = self._get_array(2.) 
res1 = float_a * b res2 = b * float_a + # one factor is one, so we get the factor of b: assert res1.dtype == res2.dtype == b.dtype expected_view = float_a * b.view(np.float64) assert_array_equal(res1.view(np.float64), expected_view) assert_array_equal(res2.view(np.float64), expected_view) + # Check that promotion works when `out` is used: + np.multiply(b, float_a, out=res2) + with pytest.raises(TypeError): + # The promoter accepts this (maybe it should not), but the SFloat + # result cannot be cast to integer: + np.multiply(b, float_a, out=np.arange(3)) + def test_basic_addition(self): a = self._get_array(2.) b = self._get_array(4.) @@ -145,3 +179,23 @@ class TestSFloat: # Check that casting the output fails also (done by the ufunc here) with pytest.raises(TypeError): np.add(a, a, out=c, casting="safe") + + @pytest.mark.parametrize("ufunc", + [np.logical_and, np.logical_or, np.logical_xor]) + def test_logical_ufuncs_casts_to_bool(self, ufunc): + a = self._get_array(2.) + a[0] = 0. # make sure first element is considered False. 
+ + float_equiv = a.astype(float) + expected = ufunc(float_equiv, float_equiv) + res = ufunc(a, a) + assert_array_equal(res, expected) + + # also check that the same works for reductions: + expected = ufunc.reduce(float_equiv) + res = ufunc.reduce(a) + assert_array_equal(res, expected) + + # The output casting does not match the bool, bool -> bool loop: + with pytest.raises(TypeError): + ufunc(a, a, out=np.empty(a.shape, dtype=int), casting="equiv") diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py index 5a490646e..b95d669a8 100644 --- a/numpy/core/tests/test_datetime.py +++ b/numpy/core/tests/test_datetime.py @@ -63,6 +63,7 @@ class TestDateTime: assert_raises(TypeError, np.dtype, 'm7') assert_raises(TypeError, np.dtype, 'M16') assert_raises(TypeError, np.dtype, 'm16') + assert_raises(TypeError, np.dtype, 'M8[3000000000ps]') def test_datetime_casting_rules(self): # Cannot cast safely/same_kind between timedelta and datetime @@ -137,6 +138,42 @@ class TestDateTime: assert_(not np.can_cast('M8[h]', 'M8', casting='same_kind')) assert_(not np.can_cast('M8[h]', 'M8', casting='safe')) + def test_datetime_prefix_conversions(self): + # regression tests related to gh-19631; + # test metric prefixes from seconds down to + # attoseconds for bidirectional conversions + smaller_units = ['M8[7000ms]', + 'M8[2000us]', + 'M8[1000ns]', + 'M8[5000ns]', + 'M8[2000ps]', + 'M8[9000fs]', + 'M8[1000as]', + 'M8[2000000ps]', + 'M8[1000000as]', + 'M8[2000000000ps]', + 'M8[1000000000as]'] + larger_units = ['M8[7s]', + 'M8[2ms]', + 'M8[us]', + 'M8[5us]', + 'M8[2ns]', + 'M8[9ps]', + 'M8[1fs]', + 'M8[2us]', + 'M8[1ps]', + 'M8[2ms]', + 'M8[1ns]'] + for larger_unit, smaller_unit in zip(larger_units, smaller_units): + assert np.can_cast(larger_unit, smaller_unit, casting='safe') + assert np.can_cast(smaller_unit, larger_unit, casting='safe') + + @pytest.mark.parametrize("unit", [ + "s", "ms", "us", "ns", "ps", "fs", "as"]) + def 
test_prohibit_negative_datetime(self, unit): + with assert_raises(TypeError): + np.array([1], dtype=f"M8[-1{unit}]") + def test_compare_generic_nat(self): # regression tests for gh-6452 assert_(np.datetime64('NaT') != @@ -1992,6 +2029,21 @@ class TestDateTime: assert_equal(np.maximum.reduce(a), np.timedelta64(7, 's')) + def test_datetime_no_subtract_reducelike(self): + # subtracting two datetime64 works, but we cannot reduce it, since + # the result of that subtraction will have a different dtype. + arr = np.array(["2021-12-02", "2019-05-12"], dtype="M8[ms]") + msg = r"the resolved dtypes are not compatible with subtract\." + + with pytest.raises(TypeError, match=msg + "reduce"): + np.subtract.reduce(arr) + + with pytest.raises(TypeError, match=msg + "accumulate"): + np.subtract.accumulate(arr) + + with pytest.raises(TypeError, match=msg + "reduceat"): + np.subtract.reduceat(arr, [0]) + def test_datetime_busday_offset(self): # First Monday in June assert_equal( diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py index 44c76e0b8..a1b379d92 100644 --- a/numpy/core/tests/test_deprecations.py +++ b/numpy/core/tests/test_deprecations.py @@ -791,7 +791,7 @@ class TestFutureWarningArrayLikeNotIterable(_DeprecationTestCase): *not* define the sequence protocol. NOTE: Tests for the versions including __len__ and __getitem__ exist - in `test_array_coercion.py` and they can be modified or ammended + in `test_array_coercion.py` and they can be modified or amended when this deprecation expired. 
""" blueprint = np.arange(10) @@ -1192,3 +1192,41 @@ class TestUFuncForcedDTypeWarning(_DeprecationTestCase): np.maximum(arr, arr, dtype="m8[ns]") # previously used the "ns" with pytest.warns(DeprecationWarning, match=self.message): np.maximum.reduce(arr, dtype="m8[ns]") # never preserved the "ns" + + +PARTITION_DICT = { + "partition method": np.arange(10).partition, + "argpartition method": np.arange(10).argpartition, + "partition function": lambda kth: np.partition(np.arange(10), kth), + "argpartition function": lambda kth: np.argpartition(np.arange(10), kth), +} + + +@pytest.mark.parametrize("func", PARTITION_DICT.values(), ids=PARTITION_DICT) +class TestPartitionBoolIndex(_DeprecationTestCase): + # Deprecated 2021-09-29, NumPy 1.22 + warning_cls = DeprecationWarning + message = "Passing booleans as partition index is deprecated" + + def test_deprecated(self, func): + self.assert_deprecated(lambda: func(True)) + self.assert_deprecated(lambda: func([False, True])) + + def test_not_deprecated(self, func): + self.assert_not_deprecated(lambda: func(1)) + self.assert_not_deprecated(lambda: func([0, 1])) + + +class TestMachAr(_DeprecationTestCase): + # Deprecated 2021-10-19, NumPy 1.22 + warning_cls = DeprecationWarning + + def test_deprecated(self): + self.assert_deprecated(lambda: np.MachAr) + + def test_deprecated_module(self): + self.assert_deprecated(lambda: getattr(np.core, "machar")) + + def test_deprecated_attr(self): + finfo = np.finfo(float) + self.assert_deprecated(lambda: getattr(finfo, "machar")) diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py index 23269f01b..8fe859919 100644 --- a/numpy/core/tests/test_dtype.py +++ b/numpy/core/tests/test_dtype.py @@ -3,7 +3,8 @@ import operator import pytest import ctypes import gc -import warnings +import types +from typing import Any import numpy as np from numpy.core._rational_tests import rational @@ -111,9 +112,9 @@ class TestBuiltin: @pytest.mark.parametrize("dtype", ['Bool', 
'Bytes0', 'Complex32', 'Complex64', 'Datetime64', 'Float16', 'Float32', 'Float64', - 'Int8', 'Int16', 'Int32', 'Int64', + 'Int8', 'Int16', 'Int32', 'Int64', 'Object0', 'Str0', 'Timedelta64', - 'UInt8', 'UInt16', 'Uint32', 'UInt32', + 'UInt8', 'UInt16', 'Uint32', 'UInt32', 'Uint64', 'UInt64', 'Void0', "Float128", "Complex128"]) def test_numeric_style_types_are_invalid(self, dtype): @@ -876,14 +877,24 @@ class TestString: ('bright', '>f4', (8, 36))])], align=True) assert_equal(str(dt), - "{'names':['top','bottom'], " - "'formats':[([('tiles', ('>f4', (64, 64)), (1,)), " - "('rtile', '>f4', (64, 36))], (3,))," - "[('bleft', ('>f4', (8, 64)), (1,)), " - "('bright', '>f4', (8, 36))]], " - "'offsets':[0,76800], " - "'itemsize':80000, " - "'aligned':True}") + "{'names': ['top', 'bottom']," + " 'formats': [([('tiles', ('>f4', (64, 64)), (1,)), " + "('rtile', '>f4', (64, 36))], (3,)), " + "[('bleft', ('>f4', (8, 64)), (1,)), " + "('bright', '>f4', (8, 36))]]," + " 'offsets': [0, 76800]," + " 'itemsize': 80000," + " 'aligned': True}") + with np.printoptions(legacy='1.21'): + assert_equal(str(dt), + "{'names':['top','bottom'], " + "'formats':[([('tiles', ('>f4', (64, 64)), (1,)), " + "('rtile', '>f4', (64, 36))], (3,))," + "[('bleft', ('>f4', (8, 64)), (1,)), " + "('bright', '>f4', (8, 36))]], " + "'offsets':[0,76800], " + "'itemsize':80000, " + "'aligned':True}") assert_equal(np.dtype(eval(str(dt))), dt) dt = np.dtype({'names': ['r', 'g', 'b'], 'formats': ['u1', 'u1', 'u1'], @@ -900,22 +911,22 @@ class TestString: 'titles': ['Color', 'Red pixel', 'Green pixel', 'Blue pixel']}) assert_equal(str(dt), - "{'names':['rgba','r','g','b']," - " 'formats':['<u4','u1','u1','u1']," - " 'offsets':[0,0,1,2]," - " 'titles':['Color','Red pixel'," - "'Green pixel','Blue pixel']," - " 'itemsize':4}") + "{'names': ['rgba', 'r', 'g', 'b']," + " 'formats': ['<u4', 'u1', 'u1', 'u1']," + " 'offsets': [0, 0, 1, 2]," + " 'titles': ['Color', 'Red pixel', " + "'Green pixel', 'Blue pixel']," + " 
'itemsize': 4}") dt = np.dtype({'names': ['r', 'b'], 'formats': ['u1', 'u1'], 'offsets': [0, 2], 'titles': ['Red pixel', 'Blue pixel']}) assert_equal(str(dt), - "{'names':['r','b']," - " 'formats':['u1','u1']," - " 'offsets':[0,2]," - " 'titles':['Red pixel','Blue pixel']," - " 'itemsize':3}") + "{'names': ['r', 'b']," + " 'formats': ['u1', 'u1']," + " 'offsets': [0, 2]," + " 'titles': ['Red pixel', 'Blue pixel']," + " 'itemsize': 3}") dt = np.dtype([('a', '<m8[D]'), ('b', '<M8[us]')]) assert_equal(str(dt), @@ -948,23 +959,23 @@ class TestString: 'titles': ['Color', 'Red pixel', 'Green pixel', 'Blue pixel']}, align=True) assert_equal(repr(dt), - "dtype({'names':['rgba','r','g','b']," - " 'formats':['<u4','u1','u1','u1']," - " 'offsets':[0,0,1,2]," - " 'titles':['Color','Red pixel'," - "'Green pixel','Blue pixel']," - " 'itemsize':4}, align=True)") + "dtype({'names': ['rgba', 'r', 'g', 'b']," + " 'formats': ['<u4', 'u1', 'u1', 'u1']," + " 'offsets': [0, 0, 1, 2]," + " 'titles': ['Color', 'Red pixel', " + "'Green pixel', 'Blue pixel']," + " 'itemsize': 4}, align=True)") dt = np.dtype({'names': ['r', 'b'], 'formats': ['u1', 'u1'], 'offsets': [0, 2], 'titles': ['Red pixel', 'Blue pixel'], 'itemsize': 4}) assert_equal(repr(dt), - "dtype({'names':['r','b'], " - "'formats':['u1','u1'], " - "'offsets':[0,2], " - "'titles':['Red pixel','Blue pixel'], " - "'itemsize':4})") + "dtype({'names': ['r', 'b'], " + "'formats': ['u1', 'u1'], " + "'offsets': [0, 2], " + "'titles': ['Red pixel', 'Blue pixel'], " + "'itemsize': 4})") def test_repr_structured_datetime(self): dt = np.dtype([('a', '<M8[D]'), ('b', '<m8[us]')]) @@ -1549,3 +1560,45 @@ class TestUserDType: # Tests that a dtype must have its type field set up to np.dtype # or in this case a builtin instance. 
create_custom_field_dtype(blueprint, mytype, 2) + + +@pytest.mark.skipif(sys.version_info < (3, 9), reason="Requires python 3.9") +class TestClassGetItem: + def test_dtype(self) -> None: + alias = np.dtype[Any] + assert isinstance(alias, types.GenericAlias) + assert alias.__origin__ is np.dtype + + @pytest.mark.parametrize("code", np.typecodes["All"]) + def test_dtype_subclass(self, code: str) -> None: + cls = type(np.dtype(code)) + alias = cls[Any] + assert isinstance(alias, types.GenericAlias) + assert alias.__origin__ is cls + + @pytest.mark.parametrize("arg_len", range(4)) + def test_subscript_tuple(self, arg_len: int) -> None: + arg_tup = (Any,) * arg_len + if arg_len == 1: + assert np.dtype[arg_tup] + else: + with pytest.raises(TypeError): + np.dtype[arg_tup] + + def test_subscript_scalar(self) -> None: + assert np.dtype[Any] + + +def test_result_type_integers_and_unitless_timedelta64(): + # Regression test for gh-20077. The following call of `result_type` + # would cause a seg. fault. 
+ td = np.timedelta64(4) + result = np.result_type(0, td) + assert_dtype_equal(result, td.dtype) + + +@pytest.mark.skipif(sys.version_info >= (3, 9), reason="Requires python 3.8") +def test_class_getitem_38() -> None: + match = "Type subscription requires python >= 3.9" + with pytest.raises(TypeError, match=match): + np.dtype[Any] diff --git a/numpy/core/tests/test_einsum.py b/numpy/core/tests/test_einsum.py index c697d0c2d..78c5e527b 100644 --- a/numpy/core/tests/test_einsum.py +++ b/numpy/core/tests/test_einsum.py @@ -1025,7 +1025,7 @@ class TestEinsumPath: self.assert_path_equal(path, ['einsum_path', (0, 1), (0, 1, 2, 3, 4, 5)]) def test_path_type_input(self): - # Test explicit path handeling + # Test explicit path handling path_test = self.build_operands('dcc,fce,ea,dbf->ab') path, path_str = np.einsum_path(*path_test, optimize=False) diff --git a/numpy/core/tests/test_getlimits.py b/numpy/core/tests/test_getlimits.py index de7b3e769..c5148db2c 100644 --- a/numpy/core/tests/test_getlimits.py +++ b/numpy/core/tests/test_getlimits.py @@ -46,7 +46,7 @@ class TestFinfo: [np.float16, np.float32, np.float64, np.complex64, np.complex128])) for dt1, dt2 in dts: - for attr in ('bits', 'eps', 'epsneg', 'iexp', 'machar', 'machep', + for attr in ('bits', 'eps', 'epsneg', 'iexp', 'machep', 'max', 'maxexp', 'min', 'minexp', 'negep', 'nexp', 'nmant', 'precision', 'resolution', 'tiny', 'smallest_normal', 'smallest_subnormal'): diff --git a/numpy/core/tests/test_machar.py b/numpy/core/tests/test_machar.py index 673f309f1..3a66ec51f 100644 --- a/numpy/core/tests/test_machar.py +++ b/numpy/core/tests/test_machar.py @@ -3,7 +3,7 @@ Test machar. Given recent changes to hardcode type data, we might want to get rid of both MachAr and this test at some point. 
""" -from numpy.core.machar import MachAr +from numpy.core._machar import MachAr import numpy.core.numerictypes as ntypes from numpy import errstate, array diff --git a/numpy/core/tests/test_mem_policy.py b/numpy/core/tests/test_mem_policy.py new file mode 100644 index 000000000..7fec8897f --- /dev/null +++ b/numpy/core/tests/test_mem_policy.py @@ -0,0 +1,396 @@ +import asyncio +import gc +import os +import pytest +import numpy as np +import threading +import warnings +from numpy.testing import extbuild, assert_warns +import sys + + +@pytest.fixture +def get_module(tmp_path): + """ Add a memory policy that returns a false pointer 64 bytes into the + actual allocation, and fill the prefix with some text. Then check at each + memory manipulation that the prefix exists, to make sure all alloc/realloc/ + free/calloc go via the functions here. + """ + if sys.platform.startswith('cygwin'): + pytest.skip('link fails on cygwin') + functions = [ + ("set_secret_data_policy", "METH_NOARGS", """ + PyObject *secret_data = + PyCapsule_New(&secret_data_handler, "mem_handler", NULL); + if (secret_data == NULL) { + return NULL; + } + PyObject *old = PyDataMem_SetHandler(secret_data); + Py_DECREF(secret_data); + return old; + """), + ("set_old_policy", "METH_O", """ + PyObject *old; + if (args != NULL && PyCapsule_CheckExact(args)) { + old = PyDataMem_SetHandler(args); + } + else { + old = PyDataMem_SetHandler(NULL); + } + if (old == NULL) { + return NULL; + } + Py_DECREF(old); + Py_RETURN_NONE; + """), + ("get_array", "METH_NOARGS", """ + char *buf = (char *)malloc(20); + npy_intp dims[1]; + dims[0] = 20; + PyArray_Descr *descr = PyArray_DescrNewFromType(NPY_UINT8); + return PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, NULL, + buf, NPY_ARRAY_WRITEABLE, NULL); + """), + ("set_own", "METH_O", """ + if (!PyArray_Check(args)) { + PyErr_SetString(PyExc_ValueError, + "need an ndarray"); + return NULL; + } + PyArray_ENABLEFLAGS((PyArrayObject*)args, NPY_ARRAY_OWNDATA); + // Maybe 
try this too? + // PyArray_BASE(PyArrayObject *)args) = NULL; + Py_RETURN_NONE; + """), + ("get_array_with_base", "METH_NOARGS", """ + char *buf = (char *)malloc(20); + npy_intp dims[1]; + dims[0] = 20; + PyArray_Descr *descr = PyArray_DescrNewFromType(NPY_UINT8); + PyObject *arr = PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, + NULL, buf, + NPY_ARRAY_WRITEABLE, NULL); + if (arr == NULL) return NULL; + PyObject *obj = PyCapsule_New(buf, "buf capsule", + (PyCapsule_Destructor)&warn_on_free); + if (obj == NULL) { + Py_DECREF(arr); + return NULL; + } + if (PyArray_SetBaseObject((PyArrayObject *)arr, obj) < 0) { + Py_DECREF(arr); + Py_DECREF(obj); + return NULL; + } + return arr; + + """), + ] + prologue = ''' + #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION + #include <numpy/arrayobject.h> + /* + * This struct allows the dynamic configuration of the allocator funcs + * of the `secret_data_allocator`. It is provided here for + * demonstration purposes, as a valid `ctx` use-case scenario. 
+ */ + typedef struct { + void *(*malloc)(size_t); + void *(*calloc)(size_t, size_t); + void *(*realloc)(void *, size_t); + void (*free)(void *); + } SecretDataAllocatorFuncs; + + NPY_NO_EXPORT void * + shift_alloc(void *ctx, size_t sz) { + SecretDataAllocatorFuncs *funcs = (SecretDataAllocatorFuncs *)ctx; + char *real = (char *)funcs->malloc(sz + 64); + if (real == NULL) { + return NULL; + } + snprintf(real, 64, "originally allocated %ld", (unsigned long)sz); + return (void *)(real + 64); + } + NPY_NO_EXPORT void * + shift_zero(void *ctx, size_t sz, size_t cnt) { + SecretDataAllocatorFuncs *funcs = (SecretDataAllocatorFuncs *)ctx; + char *real = (char *)funcs->calloc(sz + 64, cnt); + if (real == NULL) { + return NULL; + } + snprintf(real, 64, "originally allocated %ld via zero", + (unsigned long)sz); + return (void *)(real + 64); + } + NPY_NO_EXPORT void + shift_free(void *ctx, void * p, npy_uintp sz) { + SecretDataAllocatorFuncs *funcs = (SecretDataAllocatorFuncs *)ctx; + if (p == NULL) { + return ; + } + char *real = (char *)p - 64; + if (strncmp(real, "originally allocated", 20) != 0) { + fprintf(stdout, "uh-oh, unmatched shift_free, " + "no appropriate prefix\\n"); + /* Make C runtime crash by calling free on the wrong address */ + funcs->free((char *)p + 10); + /* funcs->free(real); */ + } + else { + npy_uintp i = (npy_uintp)atoi(real +20); + if (i != sz) { + fprintf(stderr, "uh-oh, unmatched shift_free" + "(ptr, %ld) but allocated %ld\\n", sz, i); + /* This happens in some places, only print */ + funcs->free(real); + } + else { + funcs->free(real); + } + } + } + NPY_NO_EXPORT void * + shift_realloc(void *ctx, void * p, npy_uintp sz) { + SecretDataAllocatorFuncs *funcs = (SecretDataAllocatorFuncs *)ctx; + if (p != NULL) { + char *real = (char *)p - 64; + if (strncmp(real, "originally allocated", 20) != 0) { + fprintf(stdout, "uh-oh, unmatched shift_realloc\\n"); + return realloc(p, sz); + } + return (void *)((char *)funcs->realloc(real, sz + 64) + 64); + } + 
else { + char *real = (char *)funcs->realloc(p, sz + 64); + if (real == NULL) { + return NULL; + } + snprintf(real, 64, "originally allocated " + "%ld via realloc", (unsigned long)sz); + return (void *)(real + 64); + } + } + /* As an example, we use the standard {m|c|re}alloc/free funcs. */ + static SecretDataAllocatorFuncs secret_data_handler_ctx = { + malloc, + calloc, + realloc, + free + }; + static PyDataMem_Handler secret_data_handler = { + "secret_data_allocator", + { + &secret_data_handler_ctx, /* ctx */ + shift_alloc, /* malloc */ + shift_zero, /* calloc */ + shift_realloc, /* realloc */ + shift_free /* free */ + } + }; + void warn_on_free(void *capsule) { + PyErr_WarnEx(PyExc_UserWarning, "in warn_on_free", 1); + void * obj = PyCapsule_GetPointer(capsule, + PyCapsule_GetName(capsule)); + free(obj); + }; + ''' + more_init = "import_array();" + try: + import mem_policy + return mem_policy + except ImportError: + pass + # if it does not exist, build and load it + return extbuild.build_and_import_extension('mem_policy', + functions, + prologue=prologue, + include_dirs=[np.get_include()], + build_dir=tmp_path, + more_init=more_init) + + +def test_set_policy(get_module): + + get_handler_name = np.core.multiarray.get_handler_name + orig_policy_name = get_handler_name() + + a = np.arange(10).reshape((2, 5)) # a doesn't own its own data + assert get_handler_name(a) is None + assert get_handler_name(a.base) == orig_policy_name + + orig_policy = get_module.set_secret_data_policy() + + b = np.arange(10).reshape((2, 5)) # b doesn't own its own data + assert get_handler_name(b) is None + assert get_handler_name(b.base) == 'secret_data_allocator' + + if orig_policy_name == 'default_allocator': + get_module.set_old_policy(None) # tests PyDataMem_SetHandler(NULL) + assert get_handler_name() == 'default_allocator' + else: + get_module.set_old_policy(orig_policy) + assert get_handler_name() == orig_policy_name + + +def test_policy_propagation(get_module): + # The memory 
policy goes hand-in-hand with flags.owndata + + class MyArr(np.ndarray): + pass + + get_handler_name = np.core.multiarray.get_handler_name + orig_policy_name = get_handler_name() + a = np.arange(10).view(MyArr).reshape((2, 5)) + assert get_handler_name(a) is None + assert a.flags.owndata is False + + assert get_handler_name(a.base) is None + assert a.base.flags.owndata is False + + assert get_handler_name(a.base.base) == orig_policy_name + assert a.base.base.flags.owndata is True + + +async def concurrent_context1(get_module, orig_policy_name, event): + if orig_policy_name == 'default_allocator': + get_module.set_secret_data_policy() + assert np.core.multiarray.get_handler_name() == 'secret_data_allocator' + else: + get_module.set_old_policy(None) + assert np.core.multiarray.get_handler_name() == 'default_allocator' + event.set() + + +async def concurrent_context2(get_module, orig_policy_name, event): + await event.wait() + # the policy is not affected by changes in parallel contexts + assert np.core.multiarray.get_handler_name() == orig_policy_name + # change policy in the child context + if orig_policy_name == 'default_allocator': + get_module.set_secret_data_policy() + assert np.core.multiarray.get_handler_name() == 'secret_data_allocator' + else: + get_module.set_old_policy(None) + assert np.core.multiarray.get_handler_name() == 'default_allocator' + + +async def async_test_context_locality(get_module): + orig_policy_name = np.core.multiarray.get_handler_name() + + event = asyncio.Event() + # the child contexts inherit the parent policy + concurrent_task1 = asyncio.create_task( + concurrent_context1(get_module, orig_policy_name, event)) + concurrent_task2 = asyncio.create_task( + concurrent_context2(get_module, orig_policy_name, event)) + await concurrent_task1 + await concurrent_task2 + + # the parent context is not affected by child policy changes + assert np.core.multiarray.get_handler_name() == orig_policy_name + + +def test_context_locality(get_module): + 
if (sys.implementation.name == 'pypy' + and sys.pypy_version_info[:3] < (7, 3, 6)): + pytest.skip('no context-locality support in PyPy < 7.3.6') + asyncio.run(async_test_context_locality(get_module)) + + +def concurrent_thread1(get_module, event): + get_module.set_secret_data_policy() + assert np.core.multiarray.get_handler_name() == 'secret_data_allocator' + event.set() + + +def concurrent_thread2(get_module, event): + event.wait() + # the policy is not affected by changes in parallel threads + assert np.core.multiarray.get_handler_name() == 'default_allocator' + # change policy in the child thread + get_module.set_secret_data_policy() + + +def test_thread_locality(get_module): + orig_policy_name = np.core.multiarray.get_handler_name() + + event = threading.Event() + # the child threads do not inherit the parent policy + concurrent_task1 = threading.Thread(target=concurrent_thread1, + args=(get_module, event)) + concurrent_task2 = threading.Thread(target=concurrent_thread2, + args=(get_module, event)) + concurrent_task1.start() + concurrent_task2.start() + concurrent_task1.join() + concurrent_task2.join() + + # the parent thread is not affected by child policy changes + assert np.core.multiarray.get_handler_name() == orig_policy_name + + +@pytest.mark.slow +def test_new_policy(get_module): + a = np.arange(10) + orig_policy_name = np.core.multiarray.get_handler_name(a) + + orig_policy = get_module.set_secret_data_policy() + + b = np.arange(10) + assert np.core.multiarray.get_handler_name(b) == 'secret_data_allocator' + + # test array manipulation. 
This is slow + if orig_policy_name == 'default_allocator': + # when the np.core.test tests recurse into this test, the + # policy will be set so this "if" will be false, preventing + # infinite recursion + # + # if needed, debug this by + # - running tests with -- -s (to not capture stdout/stderr + # - setting extra_argv=['-vv'] here + assert np.core.test('full', verbose=2, extra_argv=['-vv']) + # also try the ma tests, the pickling test is quite tricky + assert np.ma.test('full', verbose=2, extra_argv=['-vv']) + + get_module.set_old_policy(orig_policy) + + c = np.arange(10) + assert np.core.multiarray.get_handler_name(c) == orig_policy_name + +@pytest.mark.xfail(sys.implementation.name == "pypy", + reason=("bad interaction between getenv and " + "os.environ inside pytest")) +@pytest.mark.parametrize("policy", ["0", "1", None]) +def test_switch_owner(get_module, policy): + a = get_module.get_array() + assert np.core.multiarray.get_handler_name(a) is None + get_module.set_own(a) + oldval = os.environ.get('NUMPY_WARN_IF_NO_MEM_POLICY', None) + if policy is None: + if 'NUMPY_WARN_IF_NO_MEM_POLICY' in os.environ: + os.environ.pop('NUMPY_WARN_IF_NO_MEM_POLICY') + else: + os.environ['NUMPY_WARN_IF_NO_MEM_POLICY'] = policy + try: + # The policy should be NULL, so we have to assume we can call + # "free". 
A warning is given if the policy == "1" + if policy == "1": + with assert_warns(RuntimeWarning) as w: + del a + gc.collect() + else: + del a + gc.collect() + + finally: + if oldval is None: + if 'NUMPY_WARN_IF_NO_MEM_POLICY' in os.environ: + os.environ.pop('NUMPY_WARN_IF_NO_MEM_POLICY') + else: + os.environ['NUMPY_WARN_IF_NO_MEM_POLICY'] = oldval + +def test_owner_is_base(get_module): + a = get_module.get_array_with_base() + with pytest.warns(UserWarning, match='warn_on_free'): + del a + gc.collect() diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 8f8043c30..fa7f254a6 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -2511,27 +2511,19 @@ class TestMethods: assert_(not isinstance(a.searchsorted(b, 'left', s), A)) assert_(not isinstance(a.searchsorted(b, 'right', s), A)) - def test_argpartition_out_of_range(self): + @pytest.mark.parametrize("dtype", np.typecodes["All"]) + def test_argpartition_out_of_range(self, dtype): # Test out of range values in kth raise an error, gh-5469 - d = np.arange(10) + d = np.arange(10).astype(dtype=dtype) assert_raises(ValueError, d.argpartition, 10) assert_raises(ValueError, d.argpartition, -11) - # Test also for generic type argpartition, which uses sorting - # and used to not bound check kth - d_obj = np.arange(10, dtype=object) - assert_raises(ValueError, d_obj.argpartition, 10) - assert_raises(ValueError, d_obj.argpartition, -11) - def test_partition_out_of_range(self): + @pytest.mark.parametrize("dtype", np.typecodes["All"]) + def test_partition_out_of_range(self, dtype): # Test out of range values in kth raise an error, gh-5469 - d = np.arange(10) + d = np.arange(10).astype(dtype=dtype) assert_raises(ValueError, d.partition, 10) assert_raises(ValueError, d.partition, -11) - # Test also for generic type partition, which uses sorting - # and used to not bound check kth - d_obj = np.arange(10, dtype=object) - assert_raises(ValueError, 
d_obj.partition, 10) - assert_raises(ValueError, d_obj.partition, -11) def test_argpartition_integer(self): # Test non-integer values in kth raise an error/ @@ -2551,26 +2543,30 @@ class TestMethods: d_obj = np.arange(10, dtype=object) assert_raises(TypeError, d_obj.partition, 9.) - def test_partition_empty_array(self): + @pytest.mark.parametrize("kth_dtype", np.typecodes["AllInteger"]) + def test_partition_empty_array(self, kth_dtype): # check axis handling for multidimensional empty arrays + kth = np.array(0, dtype=kth_dtype)[()] a = np.array([]) a.shape = (3, 2, 1, 0) for axis in range(-a.ndim, a.ndim): msg = 'test empty array partition with axis={0}'.format(axis) - assert_equal(np.partition(a, 0, axis=axis), a, msg) + assert_equal(np.partition(a, kth, axis=axis), a, msg) msg = 'test empty array partition with axis=None' - assert_equal(np.partition(a, 0, axis=None), a.ravel(), msg) + assert_equal(np.partition(a, kth, axis=None), a.ravel(), msg) - def test_argpartition_empty_array(self): + @pytest.mark.parametrize("kth_dtype", np.typecodes["AllInteger"]) + def test_argpartition_empty_array(self, kth_dtype): # check axis handling for multidimensional empty arrays + kth = np.array(0, dtype=kth_dtype)[()] a = np.array([]) a.shape = (3, 2, 1, 0) for axis in range(-a.ndim, a.ndim): msg = 'test empty array argpartition with axis={0}'.format(axis) - assert_equal(np.partition(a, 0, axis=axis), + assert_equal(np.partition(a, kth, axis=axis), np.zeros_like(a, dtype=np.intp), msg) msg = 'test empty array argpartition with axis=None' - assert_equal(np.partition(a, 0, axis=None), + assert_equal(np.partition(a, kth, axis=None), np.zeros_like(a.ravel(), dtype=np.intp), msg) def test_partition(self): @@ -2901,10 +2897,12 @@ class TestMethods: assert_array_equal(np.partition(d, kth)[kth], tgt, err_msg="data: %r\n kth: %r" % (d, kth)) - def test_argpartition_gh5524(self): + @pytest.mark.parametrize("kth_dtype", np.typecodes["AllInteger"]) + def test_argpartition_gh5524(self, 
kth_dtype): # A test for functionality of argpartition on lists. - d = [6,7,3,2,9,0] - p = np.argpartition(d,1) + kth = np.array(1, dtype=kth_dtype)[()] + d = [6, 7, 3, 2, 9, 0] + p = np.argpartition(d, kth) self.assert_partitioned(np.array(d)[p],[1]) def test_flatten(self): @@ -4200,7 +4198,7 @@ class TestArgmaxArgminCommon: (3, 4, 1, 2), (4, 1, 2, 3)] @pytest.mark.parametrize("size, axis", itertools.chain(*[[(size, axis) - for axis in list(range(-len(size), len(size))) + [None]] + for axis in list(range(-len(size), len(size))) + [None]] for size in sizes])) @pytest.mark.parametrize('method', [np.argmax, np.argmin]) def test_np_argmin_argmax_keepdims(self, size, axis, method): @@ -4221,7 +4219,7 @@ class TestArgmaxArgminCommon: assert_equal(res, res_orig) assert_(res.shape == new_shape) outarray = np.empty(res.shape, dtype=res.dtype) - res1 = method(arr, axis=axis, out=outarray, + res1 = method(arr, axis=axis, out=outarray, keepdims=True) assert_(res1 is outarray) assert_equal(res, outarray) @@ -4234,7 +4232,7 @@ class TestArgmaxArgminCommon: wrong_shape[0] = 2 wrong_outarray = np.empty(wrong_shape, dtype=res.dtype) with pytest.raises(ValueError): - method(arr.T, axis=axis, + method(arr.T, axis=axis, out=wrong_outarray, keepdims=True) # non-contiguous arrays @@ -4252,18 +4250,18 @@ class TestArgmaxArgminCommon: assert_(res.shape == new_shape) outarray = np.empty(new_shape[::-1], dtype=res.dtype) outarray = outarray.T - res1 = method(arr.T, axis=axis, out=outarray, + res1 = method(arr.T, axis=axis, out=outarray, keepdims=True) assert_(res1 is outarray) assert_equal(res, outarray) if len(size) > 0: - # one dimension lesser for non-zero sized + # one dimension lesser for non-zero sized # array should raise an error with pytest.raises(ValueError): - method(arr[0], axis=axis, + method(arr[0], axis=axis, out=outarray, keepdims=True) - + if len(size) > 0: wrong_shape = list(new_shape) if axis is not None: @@ -4272,7 +4270,7 @@ class TestArgmaxArgminCommon: wrong_shape[0] 
= 2 wrong_outarray = np.empty(wrong_shape, dtype=res.dtype) with pytest.raises(ValueError): - method(arr.T, axis=axis, + method(arr.T, axis=axis, out=wrong_outarray, keepdims=True) @pytest.mark.parametrize('method', ['max', 'min']) @@ -4287,7 +4285,7 @@ class TestArgmaxArgminCommon: axes.remove(i) assert_(np.all(a_maxmin == aarg_maxmin.choose( *a.transpose(i, *axes)))) - + @pytest.mark.parametrize('method', ['argmax', 'argmin']) def test_output_shape(self, method): # see also gh-616 @@ -4330,7 +4328,7 @@ class TestArgmaxArgminCommon: [('argmax', np.argmax), ('argmin', np.argmin)]) def test_np_vs_ndarray(self, arr_method, np_method): - # make sure both ndarray.argmax/argmin and + # make sure both ndarray.argmax/argmin and # numpy.argmax/argmin support out/axis args a = np.random.normal(size=(2, 3)) arg_method = getattr(a, arr_method) @@ -4344,7 +4342,7 @@ class TestArgmaxArgminCommon: # check keyword args out1 = np.zeros(3, dtype=int) out2 = np.zeros(3, dtype=int) - assert_equal(arg_method(out=out1, axis=0), + assert_equal(arg_method(out=out1, axis=0), np_method(a, out=out2, axis=0)) assert_equal(out1, out2) @@ -4438,7 +4436,7 @@ class TestArgmax: assert_equal(np.argmax(arr), pos, err_msg="%r" % arr) assert_equal(arr[np.argmax(arr)], val, err_msg="%r" % arr) - + def test_maximum_signed_integers(self): a = np.array([1, 2**7 - 1, -2**7], dtype=np.int8) diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py index 6b743ab27..ed775cac6 100644 --- a/numpy/core/tests/test_nditer.py +++ b/numpy/core/tests/test_nditer.py @@ -9,7 +9,7 @@ import numpy.core._multiarray_tests as _multiarray_tests from numpy import array, arange, nditer, all from numpy.testing import ( assert_, assert_equal, assert_array_equal, assert_raises, - HAS_REFCOUNT, suppress_warnings + HAS_REFCOUNT, suppress_warnings, break_cycles ) @@ -2819,7 +2819,7 @@ def test_iter_writemasked_decref(): for buf, mask_buf in it: buf[...] 
= (3, singleton) - del buf, mask_buf, it # delete everything to ensure corrrect cleanup + del buf, mask_buf, it # delete everything to ensure correct cleanup if HAS_REFCOUNT: # The buffer would have included additional items, they must be @@ -3128,6 +3128,8 @@ def test_warn_noclose(): assert len(sup.log) == 1 +@pytest.mark.skipif(sys.version_info[:2] == (3, 9) and sys.platform == "win32", + reason="Errors with Python 3.9 on Windows") @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts") @pytest.mark.parametrize(["in_dtype", "buf_dtype"], [("i", "O"), ("O", "i"), # most simple cases @@ -3148,6 +3150,8 @@ def test_partial_iteration_cleanup(in_dtype, buf_dtype, steps): # Note that resetting does not free references del it + break_cycles() + break_cycles() assert count == sys.getrefcount(value) # Repeat the test with `iternext` @@ -3157,6 +3161,8 @@ def test_partial_iteration_cleanup(in_dtype, buf_dtype, steps): it.iternext() del it # should ensure cleanup + break_cycles() + break_cycles() assert count == sys.getrefcount(value) @@ -3202,7 +3208,7 @@ def test_debug_print(capfd): Currently uses a subprocess to avoid dealing with the C level `printf`s. """ # the expected output with all addresses and sizes stripped (they vary - # and/or are platform dependend). + # and/or are platform dependent). 
expected = """ ------ BEGIN ITERATOR DUMP ------ | Iterator Address: diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py index 19de0a8aa..ad9437911 100644 --- a/numpy/core/tests/test_numeric.py +++ b/numpy/core/tests/test_numeric.py @@ -16,7 +16,7 @@ from numpy.testing import ( ) from numpy.core._rational_tests import rational -from hypothesis import assume, given, strategies as st +from hypothesis import given, strategies as st from hypothesis.extra import numpy as hynp @@ -646,7 +646,7 @@ class TestFloatExceptions: if np.dtype(ftype).kind == 'f': # Get some extreme values for the type fi = np.finfo(ftype) - ft_tiny = fi.machar.tiny + ft_tiny = fi._machar.tiny ft_max = fi.max ft_eps = fi.eps underflow = 'underflow' @@ -655,7 +655,7 @@ class TestFloatExceptions: # 'c', complex, corresponding real dtype rtype = type(ftype(0).real) fi = np.finfo(rtype) - ft_tiny = ftype(fi.machar.tiny) + ft_tiny = ftype(fi._machar.tiny) ft_max = ftype(fi.max) ft_eps = ftype(fi.eps) # The complex types raise different exceptions @@ -932,25 +932,6 @@ class TestTypes: # Promote with object: assert_equal(promote_types('O', S+'30'), np.dtype('O')) - @pytest.mark.parametrize(["dtype1", "dtype2"], - [[np.dtype("V6"), np.dtype("V10")], - [np.dtype([("name1", "i8")]), np.dtype([("name2", "i8")])], - [np.dtype("i8,i8"), np.dtype("i4,i4")], - ]) - def test_invalid_void_promotion(self, dtype1, dtype2): - # Mainly test structured void promotion, which currently allows - # byte-swapping, but nothing else: - with pytest.raises(TypeError): - np.promote_types(dtype1, dtype2) - - @pytest.mark.parametrize(["dtype1", "dtype2"], - [[np.dtype("V10"), np.dtype("V10")], - [np.dtype([("name1", "<i8")]), np.dtype([("name1", ">i8")])], - [np.dtype("i8,i8"), np.dtype("i8,>i8")], - ]) - def test_valid_void_promotion(self, dtype1, dtype2): - assert np.promote_types(dtype1, dtype2) is dtype1 - @pytest.mark.parametrize("dtype", list(np.typecodes["All"]) + ["i,i", "S3", "S100", "U3", 
"U100", rational]) @@ -1503,6 +1484,18 @@ class TestNonzero: a = np.array([[False], [TrueThenFalse()]]) assert_raises(RuntimeError, np.nonzero, a) + def test_nonzero_sideffects_structured_void(self): + # Checks that structured void does not mutate alignment flag of + # original array. + arr = np.zeros(5, dtype="i1,i8,i8") # `ones` may short-circuit + assert arr.flags.aligned # structs are considered "aligned" + assert not arr["f2"].flags.aligned + # make sure that nonzero/count_nonzero do not flip the flag: + np.nonzero(arr) + assert arr.flags.aligned + np.count_nonzero(arr) + assert arr.flags.aligned + def test_nonzero_exception_safe(self): # gh-13930 @@ -2893,6 +2886,21 @@ class TestLikeFuncs: self.check_like_function(np.full_like, 123.456, True) self.check_like_function(np.full_like, np.inf, True) + @pytest.mark.parametrize('likefunc', [np.empty_like, np.full_like, + np.zeros_like, np.ones_like]) + @pytest.mark.parametrize('dtype', [str, bytes]) + def test_dtype_str_bytes(self, likefunc, dtype): + # Regression test for gh-19860 + a = np.arange(16).reshape(2, 8) + b = a[:, ::2] # Ensure b is not contiguous. 
+ kwargs = {'fill_value': ''} if likefunc == np.full_like else {} + result = likefunc(b, dtype=dtype, **kwargs) + if dtype == str: + assert result.strides == (16, 4) + else: + # dtype is bytes + assert result.strides == (4, 1) + class TestCorrelate: def _setup(self, dt): @@ -3496,6 +3504,12 @@ class TestBroadcast: assert_raises(ValueError, np.broadcast, 1, **{'x': 1}) + def test_shape_mismatch_error_message(self): + with pytest.raises(ValueError, match=r"arg 0 with shape \(1, 3\) and " + r"arg 2 with shape \(2,\)"): + np.broadcast([[1, 2, 3]], [[4], [5]], [6, 7]) + + class TestKeepdims: class sub_array(np.ndarray): diff --git a/numpy/core/tests/test_scalar_methods.py b/numpy/core/tests/test_scalar_methods.py index 94b2dd3c9..eef4c1433 100644 --- a/numpy/core/tests/test_scalar_methods.py +++ b/numpy/core/tests/test_scalar_methods.py @@ -1,8 +1,11 @@ """ Test the scalar constructors, which also do type-coercion """ +import sys import fractions import platform +import types +from typing import Any, Type import pytest import numpy as np @@ -128,3 +131,73 @@ class TestIsInteger: if value == 0: continue assert not value.is_integer() + + +@pytest.mark.skipif(sys.version_info < (3, 9), reason="Requires python 3.9") +class TestClassGetItem: + @pytest.mark.parametrize("cls", [ + np.number, + np.integer, + np.inexact, + np.unsignedinteger, + np.signedinteger, + np.floating, + ]) + def test_abc(self, cls: Type[np.number]) -> None: + alias = cls[Any] + assert isinstance(alias, types.GenericAlias) + assert alias.__origin__ is cls + + def test_abc_complexfloating(self) -> None: + alias = np.complexfloating[Any, Any] + assert isinstance(alias, types.GenericAlias) + assert alias.__origin__ is np.complexfloating + + @pytest.mark.parametrize("cls", [np.generic, np.flexible, np.character]) + def test_abc_non_numeric(self, cls: Type[np.generic]) -> None: + with pytest.raises(TypeError): + cls[Any] + + @pytest.mark.parametrize("code", np.typecodes["All"]) + def test_concrete(self, code: 
str) -> None: + cls = np.dtype(code).type + with pytest.raises(TypeError): + cls[Any] + + @pytest.mark.parametrize("arg_len", range(4)) + def test_subscript_tuple(self, arg_len: int) -> None: + arg_tup = (Any,) * arg_len + if arg_len == 1: + assert np.number[arg_tup] + else: + with pytest.raises(TypeError): + np.number[arg_tup] + + def test_subscript_scalar(self) -> None: + assert np.number[Any] + + +@pytest.mark.skipif(sys.version_info >= (3, 9), reason="Requires python 3.8") +@pytest.mark.parametrize("cls", [np.number, np.complexfloating, np.int64]) +def test_class_getitem_38(cls: Type[np.number]) -> None: + match = "Type subscription requires python >= 3.9" + with pytest.raises(TypeError, match=match): + cls[Any] + + +class TestBitCount: + # derived in part from the cpython test "test_bit_count" + + @pytest.mark.parametrize("itype", np.sctypes['int']+np.sctypes['uint']) + def test_small(self, itype): + for a in range(max(np.iinfo(itype).min, 0), 128): + msg = f"Smoke test for {itype}({a}).bit_count()" + assert itype(a).bit_count() == bin(a).count("1"), msg + + def test_bit_count(self): + for exp in [10, 17, 63]: + a = 2**exp + assert np.uint64(a).bit_count() == 1 + assert np.uint64(a - 1).bit_count() == exp + assert np.uint64(a ^ 63).bit_count() == 7 + assert np.uint64((a - 1) ^ 510).bit_count() == exp - 8 diff --git a/numpy/core/tests/test_scalarinherit.py b/numpy/core/tests/test_scalarinherit.py index cc53eb244..98d7f7cde 100644 --- a/numpy/core/tests/test_scalarinherit.py +++ b/numpy/core/tests/test_scalarinherit.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """ Test printing of scalar types. 
""" diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py index becd65b11..90078a2ea 100644 --- a/numpy/core/tests/test_scalarmath.py +++ b/numpy/core/tests/test_scalarmath.py @@ -5,14 +5,14 @@ import itertools import operator import platform import pytest -from hypothesis import given, settings, Verbosity, assume +from hypothesis import given, settings, Verbosity from hypothesis.strategies import sampled_from import numpy as np from numpy.testing import ( assert_, assert_equal, assert_raises, assert_almost_equal, assert_array_equal, IS_PYPY, suppress_warnings, _gen_alignment_data, - assert_warns, assert_raises_regex, + assert_warns, ) types = [np.bool_, np.byte, np.ubyte, np.short, np.ushort, np.intc, np.uintc, diff --git a/numpy/core/tests/test_scalarprint.py b/numpy/core/tests/test_scalarprint.py index 2f1c3bc5e..ee21d4aa5 100644 --- a/numpy/core/tests/test_scalarprint.py +++ b/numpy/core/tests/test_scalarprint.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """ Test printing of scalar types. 
""" diff --git a/numpy/core/tests/test_simd.py b/numpy/core/tests/test_simd.py index f0c60953b..0270ad901 100644 --- a/numpy/core/tests/test_simd.py +++ b/numpy/core/tests/test_simd.py @@ -329,7 +329,7 @@ class _SIMD_FP(_Test_Utility): data_square = [x*x for x in data] square = self.square(vdata) assert square == data_square - + def test_max(self): """ Test intrinsics: @@ -818,6 +818,7 @@ class _SIMD_ALL(_Test_Utility): if self._is_fp(): return + int_min = self._int_min() def trunc_div(a, d): """ Divide towards zero works with large integers > 2^53, @@ -830,57 +831,31 @@ class _SIMD_ALL(_Test_Utility): return a // d return (a + sign_d - sign_a) // d + 1 - int_min = self._int_min() if self._is_signed() else 1 - int_max = self._int_max() - rdata = ( - 0, 1, self.nlanes, int_max-self.nlanes, - int_min, int_min//2 + 1 - ) - divisors = (1, 2, 9, 13, self.nlanes, int_min, int_max, int_max//2) - - for x, d in itertools.product(rdata, divisors): - data = self._data(x) - vdata = self.load(data) - data_divc = [trunc_div(a, d) for a in data] - divisor = self.divisor(d) - divc = self.divc(vdata, divisor) - assert divc == data_divc - - if not self._is_signed(): - return - - safe_neg = lambda x: -x-1 if -x > int_max else -x - # test round division for signed integers - for x, d in itertools.product(rdata, divisors): - d_neg = safe_neg(d) - data = self._data(x) - data_neg = [safe_neg(a) for a in data] - vdata = self.load(data) - vdata_neg = self.load(data_neg) - divisor = self.divisor(d) - divisor_neg = self.divisor(d_neg) - - # round towards zero - data_divc = [trunc_div(a, d_neg) for a in data] - divc = self.divc(vdata, divisor_neg) - assert divc == data_divc - data_divc = [trunc_div(a, d) for a in data_neg] - divc = self.divc(vdata_neg, divisor) + data = [1, -int_min] # to test overflow + data += range(0, 2**8, 2**5) + data += range(0, 2**8, 2**5-1) + bsize = self._scalar_size() + if bsize > 8: + data += range(2**8, 2**16, 2**13) + data += range(2**8, 2**16, 2**13-1) + if 
bsize > 16: + data += range(2**16, 2**32, 2**29) + data += range(2**16, 2**32, 2**29-1) + if bsize > 32: + data += range(2**32, 2**64, 2**61) + data += range(2**32, 2**64, 2**61-1) + # negate + data += [-x for x in data] + for dividend, divisor in itertools.product(data, data): + divisor = self.setall(divisor)[0] # cast + if divisor == 0: + continue + dividend = self.load(self._data(dividend)) + data_divc = [trunc_div(a, divisor) for a in dividend] + divisor_parms = self.divisor(divisor) + divc = self.divc(dividend, divisor_parms) assert divc == data_divc - # test truncate sign if the dividend is zero - vzero = self.zero() - for d in (-1, -10, -100, int_min//2, int_min): - divisor = self.divisor(d) - divc = self.divc(vzero, divisor) - assert divc == vzero - - # test overflow - vmin = self.setall(int_min) - divisor = self.divisor(-1) - divc = self.divc(vmin, divisor) - assert divc == vmin - def test_arithmetic_reduce_sum(self): """ Test reduce sum intrinsics: diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py index c3ea10d93..ef0bac957 100644 --- a/numpy/core/tests/test_ufunc.py +++ b/numpy/core/tests/test_ufunc.py @@ -1362,6 +1362,14 @@ class TestUfunc: np.array([[2]*i for i in [1, 3, 6, 10]], dtype=object), ) + def test_object_array_accumulate_failure(self): + # Typical accumulation on object works as expected: + res = np.add.accumulate(np.array([1, 0, 2], dtype=object)) + assert_array_equal(res, np.array([1, 1, 3], dtype=object)) + # But errors are propagated from the inner-loop if they occur: + with pytest.raises(TypeError): + np.add.accumulate([1, None, 2]) + def test_object_array_reduceat_inplace(self): # Checks that in-place reduceats work, see also gh-7465 arr = np.empty(4, dtype=object) @@ -1381,6 +1389,15 @@ class TestUfunc: np.add.reduceat(arr, np.arange(4), out=arr, axis=-1) assert_array_equal(arr, out) + def test_object_array_reduceat_failure(self): + # Reduceat works as expected when no invalid operation occurs (None is + # 
not involved in an operation here) + res = np.add.reduceat(np.array([1, None, 2], dtype=object), [1, 2]) + assert_array_equal(res, np.array([None, 2], dtype=object)) + # But errors when None would be involved in an operation: + with pytest.raises(TypeError): + np.add.reduceat([1, None, 2], [0, 2]) + def test_zerosize_reduction(self): # Test with default dtype and object dtype for a in [[], np.array([], dtype=object)]: @@ -2098,6 +2115,25 @@ class TestUfunc: with pytest.raises(TypeError): ufunc(a, a, signature=signature) + @pytest.mark.parametrize("ufunc", + [np.logical_and, np.logical_or, np.logical_xor]) + def test_logical_ufuncs_support_anything(self, ufunc): + # The logical ufuncs support even input that can't be promoted: + a = np.array('1') + c = np.array([1., 2.]) + assert_array_equal(ufunc(a, c), ufunc([True, True], True)) + assert ufunc.reduce(a) == True + + @pytest.mark.parametrize("ufunc", + [np.logical_and, np.logical_or, np.logical_xor]) + def test_logical_ufuncs_out_cast_check(self, ufunc): + a = np.array('1') + c = np.array([1., 2.]) + out = a.copy() + with pytest.raises(TypeError): + # It would be safe, but not equiv casting: + ufunc(a, c, out=out, casting="equiv") + def test_reduce_noncontig_output(self): # Check that reduction deals with non-contiguous output arrays # appropriately. @@ -2119,6 +2155,22 @@ class TestUfunc: assert_equal(y_base[1,:], y_base_copy[1,:]) assert_equal(y_base[3,:], y_base_copy[3,:]) + @pytest.mark.parametrize("with_cast", [True, False]) + def test_reduceat_and_accumulate_out_shape_mismatch(self, with_cast): + # Should raise an error mentioning "shape" or "size" + arr = np.arange(5) + out = np.arange(3) # definitely wrong shape + if with_cast: + # If a cast is necessary on the output, we can be sure to use + # the generic NpyIter (non-fast) path. 
+ out = out.astype(np.float64) + + with pytest.raises(ValueError, match="(shape|size)"): + np.add.reduceat(arr, [0, 3], out=out) + + with pytest.raises(ValueError, match="(shape|size)"): + np.add.accumulate(arr, out=out) + @pytest.mark.parametrize('out_shape', [(), (1,), (3,), (1, 1), (1, 3), (4, 3)]) @pytest.mark.parametrize('keepdims', [True, False]) @@ -2308,6 +2360,14 @@ def test_ufunc_casterrors(): assert out[-1] == 1 +def test_trivial_loop_invalid_cast(): + # This tests the fast-path "invalid cast", see gh-19904. + with pytest.raises(TypeError, + match="cast ufunc 'add' input 0"): + # the void dtype definitely cannot cast to double: + np.add(np.array(1, "i,i"), 3, signature="dd->d") + + @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts") @pytest.mark.parametrize("offset", [0, np.BUFSIZE//2, int(1.5*np.BUFSIZE)]) @@ -2323,8 +2383,9 @@ def test_reduce_casterrors(offset): out = np.array(-1, dtype=np.intp) count = sys.getrefcount(value) - with pytest.raises(ValueError): - # This is an unsafe cast, but we currently always allow that: + with pytest.raises(ValueError, match="invalid literal"): + # This is an unsafe cast, but we currently always allow that. + # Note that the double loop is picked, but the cast fails. np.add.reduce(arr, dtype=np.intp, out=out) assert count == sys.getrefcount(value) # If an error occurred during casting, the operation is done at most until @@ -2332,3 +2393,20 @@ def test_reduce_casterrors(offset): # if the error happened immediately. 
# This does not define behaviour, the output is invalid and thus undefined assert out[()] < value * offset + + +@pytest.mark.parametrize("method", + [np.add.accumulate, np.add.reduce, + pytest.param(lambda x: np.add.reduceat(x, [0]), id="reduceat"), + pytest.param(lambda x: np.log.at(x, [2]), id="at")]) +def test_ufunc_methods_floaterrors(method): + # adding inf and -inf (or log(-inf) creates an invalid float and warns + arr = np.array([np.inf, 0, -np.inf]) + with np.errstate(all="warn"): + with pytest.warns(RuntimeWarning, match="invalid value"): + method(arr) + + arr = np.array([np.inf, 0, -np.inf]) + with np.errstate(all="raise"): + with pytest.raises(FloatingPointError): + method(arr) diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 4f57c0088..8f5a85824 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -15,7 +15,7 @@ from numpy.testing import ( assert_, assert_equal, assert_raises, assert_raises_regex, assert_array_equal, assert_almost_equal, assert_array_almost_equal, assert_array_max_ulp, assert_allclose, assert_no_warnings, suppress_warnings, - _gen_alignment_data, assert_array_almost_equal_nulp, assert_warns + _gen_alignment_data, assert_array_almost_equal_nulp ) def get_glibc_version(): @@ -973,6 +973,12 @@ class TestLog: xf = np.log(x) assert_almost_equal(np.log(x, out=x), xf) + # test log() of max for dtype does not raise + for dt in ['f', 'd', 'g']: + with np.errstate(all='raise'): + x = np.finfo(dt).max + np.log(x) + def test_log_strides(self): np.random.seed(42) strides = np.array([-4,-3,-2,-1,1,2,3,4]) @@ -3852,3 +3858,39 @@ def test_outer_exceeds_maxdims(): with assert_raises(ValueError): np.add.outer(deep, deep) +def test_bad_legacy_ufunc_silent_errors(): + # legacy ufuncs can't report errors and NumPy can't check if the GIL + # is released. So NumPy has to check after the GIL is released just to + # cover all bases. `np.power` uses/used to use this. 
+ arr = np.arange(3).astype(np.float64) + + with pytest.raises(RuntimeError, match=r"How unexpected :\)!"): + ncu_tests.always_error(arr, arr) + + with pytest.raises(RuntimeError, match=r"How unexpected :\)!"): + # not contiguous means the fast-path cannot be taken + non_contig = arr.repeat(20).reshape(-1, 6)[:, ::2] + ncu_tests.always_error(non_contig, arr) + + with pytest.raises(RuntimeError, match=r"How unexpected :\)!"): + ncu_tests.always_error.outer(arr, arr) + + with pytest.raises(RuntimeError, match=r"How unexpected :\)!"): + ncu_tests.always_error.reduce(arr) + + with pytest.raises(RuntimeError, match=r"How unexpected :\)!"): + ncu_tests.always_error.reduceat(arr, [0, 1]) + + with pytest.raises(RuntimeError, match=r"How unexpected :\)!"): + ncu_tests.always_error.accumulate(arr) + + with pytest.raises(RuntimeError, match=r"How unexpected :\)!"): + ncu_tests.always_error.at(arr, [0, 1, 2], arr) + + +@pytest.mark.parametrize('x1', [np.arange(3.0), [0.0, 1.0, 2.0]]) +def test_bad_legacy_gufunc_silent_errors(x1): + # Verify that an exception raised in a gufunc loop propagates correctly. + # The signature of always_error_gufunc is '(i),()->()'. + with pytest.raises(RuntimeError, match=r"How unexpected :\)!"): + ncu_tests.always_error_gufunc(x1, 0.0) diff --git a/numpy/core/tests/test_umath_accuracy.py b/numpy/core/tests/test_umath_accuracy.py index a703c697a..32e2dca66 100644 --- a/numpy/core/tests/test_umath_accuracy.py +++ b/numpy/core/tests/test_umath_accuracy.py @@ -1,5 +1,4 @@ import numpy as np -import platform import os from os import path import sys |