Diffstat (limited to 'numpy/core/src')
148 files changed, 12001 insertions, 6602 deletions
diff --git a/numpy/core/src/_simd/_simd.c b/numpy/core/src/_simd/_simd.c index b1fdd4478..52b66e765 100644 --- a/numpy/core/src/_simd/_simd.c +++ b/numpy/core/src/_simd/_simd.c @@ -1,11 +1,33 @@ #include "_simd.h" +#include "numpy/npy_math.h" + +static PyObject * +get_floatstatus(PyObject* NPY_UNUSED(self), PyObject *NPY_UNUSED(args)) +{ + return PyLong_FromLong(npy_get_floatstatus()); +} + +static PyObject * +clear_floatstatus(PyObject* NPY_UNUSED(self), PyObject *NPY_UNUSED(args)) +{ + npy_clear_floatstatus(); + Py_RETURN_NONE; +} + +static PyMethodDef _simd_methods[] = { + {"get_floatstatus", get_floatstatus, METH_NOARGS, NULL}, + {"clear_floatstatus", clear_floatstatus, METH_NOARGS, NULL}, + {NULL, NULL, 0, NULL} +}; + PyMODINIT_FUNC PyInit__simd(void) { static struct PyModuleDef defs = { .m_base = PyModuleDef_HEAD_INIT, .m_name = "numpy.core._simd", - .m_size = -1 + .m_size = -1, + .m_methods = _simd_methods }; if (npy_cpu_init() < 0) { return NULL; diff --git a/numpy/core/src/_simd/_simd.dispatch.c.src b/numpy/core/src/_simd/_simd.dispatch.c.src index 48023af80..f532c9e02 100644 --- a/numpy/core/src/_simd/_simd.dispatch.c.src +++ b/numpy/core/src/_simd/_simd.dispatch.c.src @@ -42,23 +42,26 @@ */ SIMD_IMPL_INTRIN_1(@intrin@_@sfx@, v@sfx@, q@sfx@) /**end repeat1**/ +SIMD_IMPL_INTRIN_1(load_@sfx@x2, v@sfx@x2, q@sfx@) + /**begin repeat1 - * # intrin = store, storea, stores, storel, storeh# + * # intrin = store, storea, stores, storel, storeh, store# + * # x = ,,,,, x2# */ // special definition due to the nature of @intrin@ static PyObject * -simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) +simd__intrin_@intrin@_@sfx@@x@(PyObject* NPY_UNUSED(self), PyObject *args) { simd_arg seq_arg = {.dtype = simd_data_q@sfx@}; - simd_arg vec_arg = {.dtype = simd_data_v@sfx@}; + simd_arg vec_arg = {.dtype = simd_data_v@sfx@@x@}; if (!PyArg_ParseTuple( - args, "O&O&:@intrin@_@sfx@", + args, "O&O&:@intrin@_@sfx@@x@", simd_arg_converter, &seq_arg, simd_arg_converter, &vec_arg )) { return NULL; } - npyv_@intrin@_@sfx@(seq_arg.data.q@sfx@, vec_arg.data.v@sfx@); + npyv_@intrin@_@sfx@@x@(seq_arg.data.q@sfx@, vec_arg.data.v@sfx@@x@); // write-back if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.q@sfx@, simd_data_q@sfx@)) { simd_arg_free(&seq_arg); @@ -76,23 +79,35 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) // Partial Load SIMD_IMPL_INTRIN_3(load_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@) SIMD_IMPL_INTRIN_2(load_tillz_@sfx@, v@sfx@, q@sfx@, u32) +#if @size@ == 32 + SIMD_IMPL_INTRIN_4(load2_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@, @sfx@) + SIMD_IMPL_INTRIN_2(load2_tillz_@sfx@, v@sfx@, q@sfx@, u32) +#else + SIMD_IMPL_INTRIN_4(load2_till_@sfx@, v@sfx@, q@sfx@, u32, @sfx@, @sfx@) + SIMD_IMPL_INTRIN_2(load2_tillz_@sfx@, v@sfx@, q@sfx@, u32) +#endif // Partial Store +/**begin repeat1 + * #intrin = store_till, store2_till, store2_till# + * #chksize= 0, 32, 64# + */ +#if !@chksize@ || @chksize@ == @size@ static PyObject * -simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) +simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) { simd_arg seq_arg = {.dtype = simd_data_q@sfx@}; simd_arg nlane_arg = {.dtype = simd_data_u32}; simd_arg vec_arg = {.dtype = simd_data_v@sfx@}; if (!PyArg_ParseTuple( - args, "O&O&O&:store_till_@sfx@", + args, "O&O&O&:@intrin@_@sfx@", simd_arg_converter, &seq_arg, simd_arg_converter, &nlane_arg, simd_arg_converter, &vec_arg )) { return NULL; } - npyv_store_till_@sfx@( + npyv_@intrin@_@sfx@( 
seq_arg.data.q@sfx@, nlane_arg.data.u32, vec_arg.data.v@sfx@ ); // write-back @@ -103,14 +118,22 @@ simd__intrin_store_till_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) simd_arg_free(&seq_arg); Py_RETURN_NONE; } +#endif // chksize +/**end repeat1**/ // Non-contiguous Load /**begin repeat1 - * #intrin = loadn, loadn_till, loadn_tillz# - * #till = 0, 1, 1# - * #fill = 0, 1, 0# - * #format = , O&O&, O&# - */ + * #intrin = loadn, loadn2, loadn2, + * loadn_till, loadn2_till, loadn2_till, + * loadn_tillz, loadn2_tillz, loadn2_tillz# + * #scale = 1,2,2, 1,2,2, 1,2,2# + * #till = 0*3, 1*3, 1*3# + * #fill = 0*3, 1*3, 0*3# + # #fill2 = 0*3, 0,1,1, 0*3# + * #format = ,,, O&O&, O&O&O&*2,O&*3# + * #chksize= 0,32,64, 0,32,64, 0,32,64# + */ +#if !@chksize@ || @chksize@ == @size@ static PyObject * simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) { @@ -122,6 +145,9 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) #if @fill@ simd_arg fill_arg = {.dtype = simd_data_@sfx@}; #endif +#if @fill2@ + simd_arg fill2_arg = {.dtype = simd_data_@sfx@}; +#endif if (!PyArg_ParseTuple( args, "@format@O&O&:@intrin@_@sfx@", simd_arg_converter, &seq_arg, @@ -132,6 +158,9 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) #if @fill@ ,simd_arg_converter, &fill_arg #endif +#if @fill2@ + ,simd_arg_converter, &fill2_arg +#endif )) { return NULL; } @@ -140,7 +169,7 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr); Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@; if (stride < 0) { - seq_ptr += cur_seq_len -1; + seq_ptr += cur_seq_len - 1 * @scale@; min_seq_len = -min_seq_len; } if (cur_seq_len < min_seq_len) { @@ -159,6 +188,9 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) #if @fill@ , fill_arg.data.@sfx@ #endif + #if @fill2@ + , fill2_arg.data.@sfx@ + #endif ); simd_arg ret = { .dtype = simd_data_v@sfx@, .data = {.v@sfx@=rvec} @@ -169,14 +201,19 @@ err: simd_arg_free(&seq_arg); return NULL; } +#endif // chksize /**end repeat1**/ // Non-contiguous Store /**begin repeat1 - * #intrin = storen, storen_till# - * #till = 0, 1# - * #format = , O&# + * #intrin = storen, storen2, storen2, + storen_till, storen2_till, storen2_till# + * #scale = 1,2,2, 1,2,2# + * #till = 0*3, 1*3# + * #format = ,,, O&*3# + * #chksize= 0,32,64, 0,32,64# */ +#if !@chksize@ || @chksize@ == @size@ static PyObject * simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) { @@ -202,7 +239,7 @@ simd__intrin_@intrin@_@sfx@(PyObject* NPY_UNUSED(self), PyObject *args) Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr); Py_ssize_t min_seq_len = stride * npyv_nlanes_@sfx@; if (stride < 0) { - seq_ptr += cur_seq_len -1; + seq_ptr += cur_seq_len - 1*@scale@; min_seq_len = -min_seq_len; } // overflow guard @@ -231,6 +268,7 @@ err: simd_arg_free(&seq_arg); return NULL; } +#endif // chksize /**end repeat1**/ #endif // @ncont_sup@ @@ -300,7 +338,7 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@, v@sfx@, v@sfx@) /**end repeat1**/ /**begin repeat1 - * # intrin = combine, zip# + * # intrin = combine, zip, unzip# */ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@) /**end repeat1**/ @@ -309,6 +347,60 @@ SIMD_IMPL_INTRIN_2(@intrin@_@sfx@, v@sfx@x2, v@sfx@, v@sfx@) SIMD_IMPL_INTRIN_1(rev64_@sfx@, v@sfx@, v@sfx@) #endif +// special implementation to convert runtime constants to immediate values +#if @size@ == 32 +// one call for element index then gather them within one 
vector
+// instead of unrolling the 255 possible cases.
+NPY_FINLINE npyv_@sfx@
+npyv_permi128_@sfx@_(npyv_@sfx@ a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
+{
+    /**begin repeat1
+     * # en = e0, e1, e2, e3#
+     */
+    npyv_@sfx@ v@en@;
+    npyv_lanetype_@sfx@ d@en@[npyv_nlanes_@sfx@];
+    if (0) {}
+    /**begin repeat2
+     * # imm = 1, 2, 3#
+     */
+    else if (@en@ == @imm@) {
+        v@en@ = npyv_permi128_@sfx@(a, @imm@, @imm@, @imm@, @imm@);
+    }
+    /**end repeat2**/
+    else {
+        v@en@ = npyv_permi128_@sfx@(a, 0, 0, 0, 0);
+    }
+    npyv_store_@sfx@(d@en@, v@en@);
+    /**end repeat1**/
+    if (e0 == e1 && e0 == e2 && e0 == e3) {
+        return ve0;
+    }
+    for (int i = 0; i < npyv_nlanes_@sfx@; i += 4) {
+        de0[i+1] = de1[i+1];
+        de0[i+2] = de2[i+2];
+        de0[i+3] = de3[i+3];
+    }
+    return npyv_load_@sfx@(de0);
+}
+SIMD_IMPL_INTRIN_5(permi128_@sfx@_, v@sfx@, v@sfx@, u8, u8, u8, u8)
+#elif @size@ == 64
+NPY_FINLINE npyv_@sfx@
+npyv_permi128_@sfx@_(npyv_@sfx@ a, unsigned e0, unsigned e1)
+{
+    if (e0 == 1 && e1 == 0) {
+        return npyv_permi128_@sfx@(a, 1, 0);
+    }
+    else if (e0 == 0 && e1 == 1) {
+        return npyv_permi128_@sfx@(a, 0, 1);
+    }
+    else if (e0 == 1 && e1 == 1) {
+        return npyv_permi128_@sfx@(a, 1, 1);
+    }
+    return npyv_permi128_@sfx@(a, 0, 0);
+}
+SIMD_IMPL_INTRIN_3(permi128_@sfx@_, v@sfx@, v@sfx@, u8, u8)
+#endif
+
 /***************************
  * Operators
***************************/ @@ -658,7 +780,7 @@ SIMD_INTRIN_DEF(divc_@sfx@) #if @fused_sup@ /**begin repeat1 - * #intrin = muladd, mulsub, nmuladd, nmulsub# + * #intrin = muladd, mulsub, nmuladd, nmulsub, muladdsub# */ SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ @@ -708,6 +830,14 @@ SIMD_INTRIN_DEF(reduce_@intrin@_@sfx@) SIMD_INTRIN_DEF(@intrin@_@sfx@) /**end repeat1**/ +#if @fp_only@ +/**begin repeat1 + * #intrin = ifdiv, ifdivz# + */ +SIMD_INTRIN_DEF(@intrin@_@sfx@) +/**end repeat1**/ +#endif + #endif // simd_sup /**end repeat**/ /************************************************************************* diff --git a/numpy/core/src/_simd/_simd_easyintrin.inc b/numpy/core/src/_simd/_simd_easyintrin.inc index f2e0da26e..e300e5484 100644 --- a/numpy/core/src/_simd/_simd_easyintrin.inc +++ b/numpy/core/src/_simd/_simd_easyintrin.inc @@ -197,6 +197,39 @@ return simd_arg_to_obj(&ret); \ } +#define SIMD_IMPL_INTRIN_5(NAME, RET, IN0, IN1, IN2, IN3, IN4) \ + static PyObject *simd__intrin_##NAME \ + (PyObject* NPY_UNUSED(self), PyObject *args) \ + { \ + simd_arg arg1 = {.dtype = simd_data_##IN0}; \ + simd_arg arg2 = {.dtype = simd_data_##IN1}; \ + simd_arg arg3 = {.dtype = simd_data_##IN2}; \ + simd_arg arg4 = {.dtype = simd_data_##IN3}; \ + simd_arg arg5 = {.dtype = simd_data_##IN4}; \ + if (!PyArg_ParseTuple( \ + args, "O&O&O&O&O&:"NPY_TOSTRING(NAME), \ + simd_arg_converter, &arg1, \ + simd_arg_converter, &arg2, \ + simd_arg_converter, &arg3, \ + simd_arg_converter, &arg4, \ + simd_arg_converter, &arg5 \ + )) return NULL; \ + simd_data data = {.RET = npyv_##NAME( \ + arg1.data.IN0, arg2.data.IN1, \ + arg3.data.IN2, arg4.data.IN3, \ + arg5.data.IN4 \ + )}; \ + simd_arg_free(&arg1); \ + simd_arg_free(&arg2); \ + simd_arg_free(&arg3); \ + simd_arg_free(&arg4); \ + simd_arg_free(&arg5); \ + simd_arg ret = { \ + .data = data, .dtype = simd_data_##RET \ + }; \ + return simd_arg_to_obj(&ret); \ + } + /** * Helper macros for repeating and expand a certain macro. * Mainly used for converting a scalar to an immediate constant. diff --git a/numpy/core/src/common/common.hpp b/numpy/core/src/common/common.hpp new file mode 100644 index 000000000..44ba449d8 --- /dev/null +++ b/numpy/core/src/common/common.hpp @@ -0,0 +1,14 @@ +#ifndef NUMPY_CORE_SRC_COMMON_COMMON_HPP +#define NUMPY_CORE_SRC_COMMON_COMMON_HPP +/* + * The following C++ headers are safe to be used standalone, however, + * they are gathered to make it easy for us and for the future need to support PCH. + */ +#include "npdef.hpp" +#include "utils.hpp" +#include "npstd.hpp" +#include "half.hpp" +#include "meta.hpp" +#include "float_status.hpp" + +#endif // NUMPY_CORE_SRC_COMMON_COMMON_HPP diff --git a/numpy/core/src/common/dlpack/dlpack.h b/numpy/core/src/common/dlpack/dlpack.h index 29209aee1..f0cbf6136 100644 --- a/numpy/core/src/common/dlpack/dlpack.h +++ b/numpy/core/src/common/dlpack/dlpack.h @@ -1,5 +1,5 @@ // Taken from: -// https://github.com/dmlc/dlpack/blob/9b6176fdecb55e9bf39b16f08b96913ed3f275b4/include/dlpack/dlpack.h +// https://github.com/dmlc/dlpack/blob/ca4d00ad3e2e0f410eeab3264d21b8a39397f362/include/dlpack/dlpack.h /*! * Copyright (c) 2017 by Contributors * \file dlpack.h @@ -8,14 +8,20 @@ #ifndef DLPACK_DLPACK_H_ #define DLPACK_DLPACK_H_ +/** + * \brief Compatibility with C++ + */ #ifdef __cplusplus #define DLPACK_EXTERN_C extern "C" #else #define DLPACK_EXTERN_C #endif -/*! \brief The current version of dlpack */ -#define DLPACK_VERSION 050 +/*! 
\brief The current major version of dlpack */ +#define DLPACK_MAJOR_VERSION 1 + +/*! \brief The current minor version of dlpack */ +#define DLPACK_MINOR_VERSION 0 /*! \brief DLPACK_DLL prefix for windows */ #ifdef _WIN32 @@ -34,10 +40,41 @@ #ifdef __cplusplus extern "C" { #endif + +/*! + * \brief The DLPack version. + * + * A change in major version indicates that we have changed the + * data layout of the ABI - DLManagedTensorVersioned. + * + * A change in minor version indicates that we have added new + * code, such as a new device type, but the ABI is kept the same. + * + * If an obtained DLPack tensor has a major version that disagrees + * with the version number specified in this header file + * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter + * (and it is safe to do so). It is not safe to access any other fields + * as the memory layout will have changed. + * + * In the case of a minor version mismatch, the tensor can be safely used as + * long as the consumer knows how to interpret all fields. Minor version + * updates indicate the addition of enumeration values. + */ +typedef struct { + /*! \brief DLPack major version. */ + uint32_t major; + /*! \brief DLPack minor version. */ + uint32_t minor; +} DLPackVersion; + /*! * \brief The device type in DLDevice. */ +#ifdef __cplusplus +typedef enum : int32_t { +#else typedef enum { +#endif /*! \brief CPU device */ kDLCPU = 1, /*! \brief CUDA GPU device */ @@ -70,6 +107,17 @@ typedef enum { * \brief CUDA managed/unified memory allocated by cudaMallocManaged */ kDLCUDAManaged = 13, + /*! + * \brief Unified shared memory allocated on a oneAPI non-partititioned + * device. Call to oneAPI runtime is required to determine the device + * type, the USM allocation type and the sycl context it is bound to. + * + */ + kDLOneAPI = 14, + /*! \brief GPU support for next generation WebGPU standard. */ + kDLWebGPU = 15, + /*! \brief Qualcomm Hexagon DSP */ + kDLHexagon = 16, } DLDeviceType; /*! @@ -82,7 +130,7 @@ typedef struct { * \brief The device index. * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0. */ - int device_id; + int32_t device_id; } DLDevice; /*! @@ -107,16 +155,21 @@ typedef enum { * (C/C++/Python layout: compact struct per complex number) */ kDLComplex = 5U, + /*! \brief boolean */ + kDLBool = 6U, } DLDataTypeCode; /*! - * \brief The data type the tensor can hold. + * \brief The data type the tensor can hold. The data type is assumed to follow the + * native endian-ness. An explicit error message should be raised when attempting to + * export an array with non-native endianness * * Examples - * - float: type_code = 2, bits = 32, lanes=1 - * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4 - * - int8: type_code = 0, bits = 8, lanes=1 + * - float: type_code = 2, bits = 32, lanes = 1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4 + * - int8: type_code = 0, bits = 8, lanes = 1 * - std::complex<float>: type_code = 5, bits = 64, lanes = 1 + * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits) */ typedef struct { /*! @@ -138,9 +191,16 @@ typedef struct { */ typedef struct { /*! - * \brief The opaque data pointer points to the allocated data. This will be - * CUDA device pointer or cl_mem handle in OpenCL. This pointer is always - * aligned to 256 bytes as in CUDA. + * \brief The data pointer points to the allocated data. 
This will be CUDA
+   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
+   * types. This pointer is always aligned to 256 bytes as in CUDA. The
+   * `byte_offset` field should be used to point to the beginning of the data.
+   *
+   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
+   * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
+   * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed
+   * (after which this note will be updated); at the moment it is recommended
+   * to not rely on the data pointer being correctly aligned.
    *
    * For given DLTensor, the size of memory required to store the contents of
    * data is calculated as follows:
@@ -160,7 +220,7 @@ typedef struct {
   /*! \brief The device of the tensor */
   DLDevice device;
   /*! \brief Number of dimensions */
-  int ndim;
+  int32_t ndim;
   /*! \brief The data type of the pointer*/
   DLDataType dtype;
   /*! \brief The shape of the tensor */
@@ -180,6 +240,13 @@ typedef struct {
  * not meant to transfer the tensor. When the borrowing framework doesn't need
  * the tensor, it should call the deleter to notify the host that the resource
  * is no longer needed.
+ *
+ * \note This data structure is used as the legacy DLManagedTensor
+ *       in DLPack exchange and is deprecated after DLPack v0.8.
+ *       Use DLManagedTensorVersioned instead.
+ *       This data structure may get renamed or deleted in future versions.
+ *
+ * \sa DLManagedTensorVersioned
  */
 typedef struct DLManagedTensor {
   /*! \brief DLTensor which is being memory managed */
@@ -188,13 +255,65 @@ typedef struct DLManagedTensor {
    * which DLManagedTensor is used in the framework. It can also be NULL.
    */
   void * manager_ctx;
-  /*! \brief Destructor signature void (*)(void*) - this should be called
-   *   to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
-   *   if there is no way for the caller to provide a reasonable destructor.
-   *   The destructors deletes the argument self as well.
+  /*!
+   * \brief Destructor - this should be called
+   * to destruct the manager_ctx which backs the DLManagedTensor. It can be
+   * NULL if there is no way for the caller to provide a reasonable destructor.
+   * The destructor deletes the argument self as well.
    */
   void (*deleter)(struct DLManagedTensor * self);
 } DLManagedTensor;
+
+// bit masks used in the DLManagedTensorVersioned
+
+/*! \brief bit mask to indicate that the tensor is read only. */
+#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
+
+/*!
+ * \brief A versioned and managed C Tensor object that manages the memory of a DLTensor.
+ *
+ * This data structure is intended to facilitate the borrowing of DLTensor by
+ * another framework. It is not meant to transfer the tensor. When the borrowing
+ * framework doesn't need the tensor, it should call the deleter to notify the
+ * host that the resource is no longer needed.
+ *
+ * \note This is the current standard DLPack exchange data structure.
+ */
+struct DLManagedTensorVersioned {
+  /*!
+   * \brief The API and ABI version of the current managed Tensor
+   */
+  DLPackVersion version;
+  /*!
+   * \brief the context of the original host framework.
+   *
+   * Stores the context of the original host framework where
+   * DLManagedTensorVersioned is used. It can also be NULL.
+   */
+  void *manager_ctx;
+  /*!
+   * \brief Destructor.
+   *
+   * This should be called to destruct manager_ctx which holds the DLManagedTensorVersioned.
+   * It can be NULL if there is no way for the caller to provide a reasonable
+   * destructor. The destructor deletes the argument self as well.
+ */ + void (*deleter)(struct DLManagedTensorVersioned *self); + /*! + * \brief Additional bitmask flags information about the tensor. + * + * By default the flags should be set to 0. + * + * \note Future ABI changes should keep everything until this field + * stable, to ensure that deleter can be correctly called. + * + * \sa DLPACK_FLAG_BITMASK_READ_ONLY + */ + uint64_t flags; + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; +}; + #ifdef __cplusplus } // DLPACK_EXTERN_C #endif diff --git a/numpy/core/src/common/float_status.hpp b/numpy/core/src/common/float_status.hpp new file mode 100644 index 000000000..8e4d5e06a --- /dev/null +++ b/numpy/core/src/common/float_status.hpp @@ -0,0 +1,134 @@ +#ifndef NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP +#define NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP + +#include "npstd.hpp" + +#include <fenv.h> + +namespace np { + +/// @addtogroup cpp_core_utility +/// @{ +/** + * Class wraps floating-point environment operations, + * provides lazy access to its functionality. + */ +class FloatStatus { + public: +/* + * According to the C99 standard FE_DIVBYZERO, etc. may not be provided when + * unsupported. In such cases NumPy will not report these correctly, but we + * should still allow compiling (whether tests pass or not). + * By defining them as 0 locally, we make them no-ops. Unlike these defines, + * for example `musl` still defines all of the functions (as no-ops): + * https://git.musl-libc.org/cgit/musl/tree/src/fenv/fenv.c + * and does similar replacement in its tests: + * http://nsz.repo.hu/git/?p=libc-test;a=blob;f=src/common/mtest.h;h=706c1ba23ea8989b17a2f72ed1a919e187c06b6a;hb=HEAD#l30 + */ +#ifdef FE_DIVBYZERO + static constexpr int kDivideByZero = FE_DIVBYZERO; +#else + static constexpr int kDivideByZero = 0; +#endif +#ifdef FE_INVALID + static constexpr int kInvalid = FE_INVALID; +#else + static constexpr int kInvalid = 0; +#endif +#ifdef FE_INEXACT + static constexpr int kInexact = FE_INEXACT; +#else + static constexpr int kInexact = 0; +#endif +#ifdef FE_OVERFLOW + static constexpr int kOverflow = FE_OVERFLOW; +#else + static constexpr int kOverflow = 0; +#endif +#ifdef FE_UNDERFLOW + static constexpr int kUnderflow = FE_UNDERFLOW; +#else + static constexpr int kUnderflow = 0; +#endif + static constexpr int kAllExcept = (kDivideByZero | kInvalid | kInexact | + kOverflow | kUnderflow); + + FloatStatus(bool clear_on_dst=true) + : clear_on_dst_(clear_on_dst) + { + if constexpr (kAllExcept != 0) { + fpstatus_ = fetestexcept(kAllExcept); + } + else { + fpstatus_ = 0; + } + } + ~FloatStatus() + { + if constexpr (kAllExcept != 0) { + if (fpstatus_ != 0 && clear_on_dst_) { + feclearexcept(kAllExcept); + } + } + } + constexpr bool IsDivideByZero() const + { + return (fpstatus_ & kDivideByZero) != 0; + } + constexpr bool IsInexact() const + { + return (fpstatus_ & kInexact) != 0; + } + constexpr bool IsInvalid() const + { + return (fpstatus_ & kInvalid) != 0; + } + constexpr bool IsOverFlow() const + { + return (fpstatus_ & kOverflow) != 0; + } + constexpr bool IsUnderFlow() const + { + return (fpstatus_ & kUnderflow) != 0; + } + static void RaiseDivideByZero() + { + if constexpr (kDivideByZero != 0) { + feraiseexcept(kDivideByZero); + } + } + static void RaiseInexact() + { + if constexpr (kInexact != 0) { + feraiseexcept(kInexact); + } + } + static void RaiseInvalid() + { + if constexpr (kInvalid != 0) { + feraiseexcept(kInvalid); + } + } + static void RaiseOverflow() + { + if constexpr (kOverflow != 0) { + 
feraiseexcept(kOverflow);
+        }
+    }
+    static void RaiseUnderflow()
+    {
+        if constexpr (kUnderflow != 0) {
+            feraiseexcept(kUnderflow);
+        }
+    }
+
+  private:
+    bool clear_on_dst_;
+    int fpstatus_;
+};
+
+/// @} cpp_core_utility
+} // namespace np
+
+#endif // NUMPY_CORE_SRC_COMMON_FLOAT_STATUS_HPP
+
diff --git a/numpy/core/src/common/half.hpp b/numpy/core/src/common/half.hpp
new file mode 100644
index 000000000..4d16e3bcc
--- /dev/null
+++ b/numpy/core/src/common/half.hpp
@@ -0,0 +1,264 @@
+#ifndef NUMPY_CORE_SRC_COMMON_HALF_HPP
+#define NUMPY_CORE_SRC_COMMON_HALF_HPP
+
+#include "npstd.hpp"
+
+#include "npy_cpu_dispatch.h" // NPY_HAVE_CPU_FEATURES
+#include "half_private.hpp"
+
+// TODO(@seiko2plus):
+// - covers the half-precision operations that are supported by numpy/halffloat.h
+// - add support for arithmetic operations
+// - enabling __fp16 causes massive FP exceptions on aarch64;
+//   needs a deeper investigation
+
+namespace np {
+
+/// @addtogroup cpp_core_types
+/// @{
+
+/// Provides a type that implements 16-bit floating point (half-precision).
+/// This type is guaranteed to be 16 bits in size.
+#if 1 // ndef __ARM_FP16_FORMAT_IEEE
+class Half final {
+  public:
+    /// Whether `Half` has full native HW support.
+    static constexpr bool kNative = false;
+    /// Whether `Half` has native HW support for single/double conversion.
+    template<typename T>
+    static constexpr bool kNativeConversion = (
+        (
+            std::is_same_v<T, float> &&
+        #if defined(NPY_HAVE_FP16) || defined(NPY_HAVE_VSX3)
+            true
+        #else
+            false
+        #endif
+        ) || (
+            std::is_same_v<T, double> &&
+        #if defined(NPY_HAVE_AVX512FP16) || defined(NPY_HAVE_VSX3)
+            true
+        #else
+            false
+        #endif
+        )
+    );
+
+    /// Default constructor. Initializes nothing.
+    Half() = default;
+
+    /// Construct from float.
+    /// If no hardware optimization is available, rounding will always
+    /// be set to ties to even.
+    explicit Half(float f)
+    {
+    #if defined(NPY_HAVE_FP16)
+        __m128 mf = _mm_load_ss(&f);
+        bits_ = static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_cvtps_ph(mf, _MM_FROUND_TO_NEAREST_INT)));
+    #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+        __vector float vf32 = vec_splats(f);
+        __vector unsigned short vf16;
+        __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wa" (vf32));
+        bits_ = vec_extract(vf16, 0);
+    #else
+        bits_ = half_private::FromFloatBits(BitCast<uint32_t>(f));
+    #endif
+    }
+
+    /// Construct from double.
+    /// If no hardware optimization is available, rounding will always
+    /// be set to ties to even.
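+    /// For example (illustrative): Half(1.0).Bits() == 0x3c00u, while
+    /// Half(65520.0).Bits() == 0x7c00u, i.e. it rounds up to infinity.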
+    explicit Half(double f)
+    {
+    #if defined(NPY_HAVE_AVX512FP16)
+        __m128d md = _mm_load_sd(&f);
+        bits_ = static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_castph_si128(_mm_cvtpd_ph(md))));
+    #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+        __vector double vf64 = vec_splats(f);
+        __vector unsigned short vf16;
+        __asm__ __volatile__ ("xvcvdphp %x0,%x1" : "=wa" (vf16) : "wa" (vf64));
+        bits_ = vec_extract(vf16, 0);
+    #else
+        bits_ = half_private::FromDoubleBits(BitCast<uint64_t>(f));
+    #endif
+    }
+
+    /// Cast to float
+    explicit operator float() const
+    {
+    #if defined(NPY_HAVE_FP16)
+        float ret;
+        _mm_store_ss(&ret, _mm_cvtph_ps(_mm_cvtsi32_si128(bits_)));
+        return ret;
+    #elif defined(NPY_HAVE_VSX3) && defined(vec_extract_fp_from_shorth)
+        return vec_extract(vec_extract_fp_from_shorth(vec_splats(bits_)), 0);
+    #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+        __vector float vf32;
+        __asm__ __volatile__("xvcvhpsp %x0,%x1"
+                             : "=wa"(vf32)
+                             : "wa"(vec_splats(bits_)));
+        return vec_extract(vf32, 0);
+    #else
+        return BitCast<float>(half_private::ToFloatBits(bits_));
+    #endif
+    }
+
+    /// Cast to double
+    explicit operator double() const
+    {
+    #if defined(NPY_HAVE_AVX512FP16)
+        double ret;
+        _mm_store_sd(&ret, _mm_cvtph_pd(_mm_castsi128_ph(_mm_cvtsi32_si128(bits_))));
+        return ret;
+    #elif defined(NPY_HAVE_VSX3) && defined(NPY_HAVE_VSX_ASM)
+        __vector double vf64;
+        __asm__ __volatile__("xvcvhpdp %x0,%x1"
+                             : "=wa"(vf64)
+                             : "wa"(vec_splats(bits_)));
+        return vec_extract(vf64, 0);
+    #else
+        return BitCast<double>(half_private::ToDoubleBits(bits_));
+    #endif
+    }
+
+    /// Returns a new Half constructed from the IEEE 754 binary16.
+    static constexpr Half FromBits(uint16_t bits)
+    {
+        Half h{};
+        h.bits_ = bits;
+        return h;
+    }
+    /// Returns the IEEE 754 binary16 representation.
+    constexpr uint16_t Bits() const
+    {
+        return bits_;
+    }
+
+    /// @name Comparison operators (ordered)
+    /// @{
+    constexpr bool operator==(Half r) const
+    {
+        return !(IsNaN() || r.IsNaN()) && Equal(r);
+    }
+    constexpr bool operator<(Half r) const
+    {
+        return !(IsNaN() || r.IsNaN()) && Less(r);
+    }
+    constexpr bool operator<=(Half r) const
+    {
+        return !(IsNaN() || r.IsNaN()) && LessEqual(r);
+    }
+    constexpr bool operator>(Half r) const
+    {
+        return r < *this;
+    }
+    constexpr bool operator>=(Half r) const
+    {
+        return r <= *this;
+    }
+    /// @}
+
+    /// @name Comparison operators (unordered)
+    /// @{
+    constexpr bool operator!=(Half r) const
+    {
+        return !(*this == r);
+    }
+    /// @} Comparison operators
+
+    /// @name Comparison with no guarantee of NaN behavior
+    /// @{
+    constexpr bool Less(Half r) const
+    {
+        uint_fast16_t a = static_cast<uint_fast16_t>(bits_),
+                      b = static_cast<uint_fast16_t>(r.bits_);
+        bool sign_a = (a & 0x8000u) == 0x8000u;
+        bool sign_b = (b & 0x8000u) == 0x8000u;
+        // if both `a` and `b` have the same sign
+        //     Test if `a` > `b` when `a` has the sign
+        //     or `a` < `b` when it is not.
+        //     And make sure they are not equal to each other
+        //     in case both are equal to +-0
+        // else
+        //     Test if `a` has the sign.
+        //     and `a` != -0.0 and `b` != 0.0
+        return (sign_a == sign_b) ? (sign_a ^ (a < b)) && (a != b)
+                                  : sign_a && ((a | b) != 0x8000u);
+    }
+    constexpr bool LessEqual(Half r) const
+    {
+        uint_fast16_t a = static_cast<uint_fast16_t>(bits_),
+                      b = static_cast<uint_fast16_t>(r.bits_);
+        bool sign_a = (a & 0x8000u) == 0x8000u;
+        bool sign_b = (b & 0x8000u) == 0x8000u;
+        // if both `a` and `b` have the same sign
+        //     Test if `a` > `b` when `a` has the sign
+        //     or `a` < `b` when it is not.
+        //     or a == b (needed even if we used <= above instead,
+        //     since testing +-0 is still required)
+        // else
+        //     Test if `a` has the sign
+        //     or `a` and `b` equal to +-0.0
+        return (sign_a == sign_b) ? (sign_a ^ (a < b)) || (a == b)
+                                  : sign_a || ((a | b) == 0x8000u);
+    }
+    constexpr bool Equal(Half r) const
+    {
+        // a fast16 cast is not worth it, since an unpack op would be involved.
+        uint16_t a = bits_, b = r.bits_;
+        return a == b || ((a | b) == 0x8000u);
+    }
+    /// @} Comparison
+
+    /// @name Properties
+    /// @{
+    constexpr bool IsNaN() const
+    {
+        return ((bits_ & 0x7c00u) == 0x7c00u) &&
+               ((bits_ & 0x03ffu) != 0);
+    }
+    /// @} Properties
+
+  private:
+    uint16_t bits_;
+};
+#else // __ARM_FP16_FORMAT_IEEE
+class Half final {
+  public:
+    static constexpr bool kNative = true;
+    template<typename T>
+    static constexpr bool kNativeConversion = (
+        std::is_same_v<T, float> || std::is_same_v<T, double>
+    );
+    Half() = default;
+    constexpr Half(__fp16 h) : half_(h)
+    {}
+    constexpr operator __fp16() const
+    { return half_; }
+    static Half FromBits(uint16_t bits)
+    {
+        Half h;
+        h.half_ = BitCast<__fp16>(bits);
+        return h;
+    }
+    uint16_t Bits() const
+    { return BitCast<uint16_t>(half_); }
+    constexpr bool Less(Half r) const
+    { return half_ < r.half_; }
+    constexpr bool LessEqual(Half r) const
+    { return half_ <= r.half_; }
+    constexpr bool Equal(Half r) const
+    { return half_ == r.half_; }
+    constexpr bool IsNaN() const
+    { return half_ != half_; }
+
+  private:
+    __fp16 half_;
+};
+#endif // __ARM_FP16_FORMAT_IEEE
+
+/// @} cpp_core_types
+
+} // namespace np
+
+#endif // NUMPY_CORE_SRC_COMMON_HALF_HPP
diff --git a/numpy/core/src/common/half_private.hpp b/numpy/core/src/common/half_private.hpp
new file mode 100644
index 000000000..7a64eb397
--- /dev/null
+++ b/numpy/core/src/common/half_private.hpp
@@ -0,0 +1,330 @@
+#ifndef NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP
+#define NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP
+
+#include "npstd.hpp"
+#include "float_status.hpp"
+
+/*
+ * The following functions, which emulate float/double/half conversions,
+ * are copied from npymath without any changes to their functionality.
+ */
+namespace np { namespace half_private {
+
+template<bool gen_overflow=true, bool gen_underflow=true, bool round_even=true>
+inline uint16_t FromFloatBits(uint32_t f)
+{
+    uint32_t f_exp, f_sig;
+    uint16_t h_sgn, h_exp, h_sig;
+
+    h_sgn = (uint16_t) ((f&0x80000000u) >> 16);
+    f_exp = (f&0x7f800000u);
+
+    /* Exponent overflow/NaN converts to signed inf/NaN */
+    if (f_exp >= 0x47800000u) {
+        if (f_exp == 0x7f800000u) {
+            /* Inf or NaN */
+            f_sig = (f&0x007fffffu);
+            if (f_sig != 0) {
+                /* NaN - propagate the flag in the significand... */
+                uint16_t ret = (uint16_t) (0x7c00u + (f_sig >> 13));
+                /* ...but make sure it stays a NaN */
+                if (ret == 0x7c00u) {
+                    ret++;
+                }
+                return h_sgn + ret;
+            } else {
+                /* signed inf */
+                return (uint16_t) (h_sgn + 0x7c00u);
+            }
+        } else {
+            if constexpr (gen_overflow) {
+                /* overflow to signed inf */
+                FloatStatus::RaiseOverflow();
+            }
+            return (uint16_t) (h_sgn + 0x7c00u);
+        }
+    }
+
+    /* Exponent underflow converts to a subnormal half or signed zero */
+    if (f_exp <= 0x38000000u) {
+        /*
+         * Signed zeros, subnormal floats, and floats with small
+         * exponents all convert to signed zero half-floats.
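+         * For example (illustrative): 1.0e-8f has bits 0x322bcc77u, whose
+         * exponent field 0x32000000u is below the 0x33000000u cutoff checked
+         * next, so the result is the signed zero `h_sgn` (underflow raised).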
+ */ + if (f_exp < 0x33000000u) { + if constexpr (gen_underflow) { + /* If f != 0, it underflowed to 0 */ + if ((f&0x7fffffff) != 0) { + FloatStatus::RaiseUnderflow(); + } + } + return h_sgn; + } + /* Make the subnormal significand */ + f_exp >>= 23; + f_sig = (0x00800000u + (f&0x007fffffu)); + if constexpr (gen_underflow) { + /* If it's not exactly represented, it underflowed */ + if ((f_sig&(((uint32_t)1 << (126 - f_exp)) - 1)) != 0) { + FloatStatus::RaiseUnderflow(); + } + } + /* + * Usually the significand is shifted by 13. For subnormals an + * additional shift needs to occur. This shift is one for the largest + * exponent giving a subnormal `f_exp = 0x38000000 >> 23 = 112`, which + * offsets the new first bit. At most the shift can be 1+10 bits. + */ + f_sig >>= (113 - f_exp); + /* Handle rounding by adding 1 to the bit beyond half precision */ + if constexpr (round_even) { + /* + * If the last bit in the half significand is 0 (already even), and + * the remaining bit pattern is 1000...0, then we do not add one + * to the bit after the half significand. However, the (113 - f_exp) + * shift can lose up to 11 bits, so the || checks them in the original. + * In all other cases, we can just add one. + */ + if (((f_sig&0x00003fffu) != 0x00001000u) || (f&0x000007ffu)) { + f_sig += 0x00001000u; + } + } + else { + f_sig += 0x00001000u; + } + h_sig = (uint16_t) (f_sig >> 13); + /* + * If the rounding causes a bit to spill into h_exp, it will + * increment h_exp from zero to one and h_sig will be zero. + * This is the correct result. + */ + return (uint16_t) (h_sgn + h_sig); + } + + /* Regular case with no overflow or underflow */ + h_exp = (uint16_t) ((f_exp - 0x38000000u) >> 13); + /* Handle rounding by adding 1 to the bit beyond half precision */ + f_sig = (f&0x007fffffu); + if constexpr (round_even) { + /* + * If the last bit in the half significand is 0 (already even), and + * the remaining bit pattern is 1000...0, then we do not add one + * to the bit after the half significand. In all other cases, we do. + */ + if ((f_sig&0x00003fffu) != 0x00001000u) { + f_sig += 0x00001000u; + } + } + else { + f_sig += 0x00001000u; + } + h_sig = (uint16_t) (f_sig >> 13); + /* + * If the rounding causes a bit to spill into h_exp, it will + * increment h_exp by one and h_sig will be zero. This is the + * correct result. h_exp may increment to 15, at greatest, in + * which case the result overflows to a signed inf. + */ + if constexpr (gen_overflow) { + h_sig += h_exp; + if (h_sig == 0x7c00u) { + FloatStatus::RaiseOverflow(); + } + return h_sgn + h_sig; + } + else { + return h_sgn + h_exp + h_sig; + } +} + +template<bool gen_overflow=true, bool gen_underflow=true, bool round_even=true> +inline uint16_t FromDoubleBits(uint64_t d) +{ + uint64_t d_exp, d_sig; + uint16_t h_sgn, h_exp, h_sig; + + h_sgn = (d&0x8000000000000000ULL) >> 48; + d_exp = (d&0x7ff0000000000000ULL); + + /* Exponent overflow/NaN converts to signed inf/NaN */ + if (d_exp >= 0x40f0000000000000ULL) { + if (d_exp == 0x7ff0000000000000ULL) { + /* Inf or NaN */ + d_sig = (d&0x000fffffffffffffULL); + if (d_sig != 0) { + /* NaN - propagate the flag in the significand... 
*/ + uint16_t ret = (uint16_t) (0x7c00u + (d_sig >> 42)); + /* ...but make sure it stays a NaN */ + if (ret == 0x7c00u) { + ret++; + } + return h_sgn + ret; + } else { + /* signed inf */ + return h_sgn + 0x7c00u; + } + } else { + /* overflow to signed inf */ + if constexpr (gen_overflow) { + FloatStatus::RaiseOverflow(); + } + return h_sgn + 0x7c00u; + } + } + + /* Exponent underflow converts to subnormal half or signed zero */ + if (d_exp <= 0x3f00000000000000ULL) { + /* + * Signed zeros, subnormal floats, and floats with small + * exponents all convert to signed zero half-floats. + */ + if (d_exp < 0x3e60000000000000ULL) { + if constexpr (gen_underflow) { + /* If d != 0, it underflowed to 0 */ + if ((d&0x7fffffffffffffffULL) != 0) { + FloatStatus::RaiseUnderflow(); + } + } + return h_sgn; + } + /* Make the subnormal significand */ + d_exp >>= 52; + d_sig = (0x0010000000000000ULL + (d&0x000fffffffffffffULL)); + if constexpr (gen_underflow) { + /* If it's not exactly represented, it underflowed */ + if ((d_sig&(((uint64_t)1 << (1051 - d_exp)) - 1)) != 0) { + FloatStatus::RaiseUnderflow(); + } + } + /* + * Unlike floats, doubles have enough room to shift left to align + * the subnormal significand leading to no loss of the last bits. + * The smallest possible exponent giving a subnormal is: + * `d_exp = 0x3e60000000000000 >> 52 = 998`. All larger subnormals are + * shifted with respect to it. This adds a shift of 10+1 bits the final + * right shift when comparing it to the one in the normal branch. + */ + assert(d_exp - 998 >= 0); + d_sig <<= (d_exp - 998); + /* Handle rounding by adding 1 to the bit beyond half precision */ + if constexpr (round_even) { + /* + * If the last bit in the half significand is 0 (already even), and + * the remaining bit pattern is 1000...0, then we do not add one + * to the bit after the half significand. In all other cases, we do. + */ + if ((d_sig&0x003fffffffffffffULL) != 0x0010000000000000ULL) { + d_sig += 0x0010000000000000ULL; + } + } + else { + d_sig += 0x0010000000000000ULL; + } + h_sig = (uint16_t) (d_sig >> 53); + /* + * If the rounding causes a bit to spill into h_exp, it will + * increment h_exp from zero to one and h_sig will be zero. + * This is the correct result. + */ + return h_sgn + h_sig; + } + + /* Regular case with no overflow or underflow */ + h_exp = (uint16_t) ((d_exp - 0x3f00000000000000ULL) >> 42); + /* Handle rounding by adding 1 to the bit beyond half precision */ + d_sig = (d&0x000fffffffffffffULL); + if constexpr (round_even) { + /* + * If the last bit in the half significand is 0 (already even), and + * the remaining bit pattern is 1000...0, then we do not add one + * to the bit after the half significand. In all other cases, we do. + */ + if ((d_sig&0x000007ffffffffffULL) != 0x0000020000000000ULL) { + d_sig += 0x0000020000000000ULL; + } + } + else { + d_sig += 0x0000020000000000ULL; + } + h_sig = (uint16_t) (d_sig >> 42); + + /* + * If the rounding causes a bit to spill into h_exp, it will + * increment h_exp by one and h_sig will be zero. This is the + * correct result. h_exp may increment to 15, at greatest, in + * which case the result overflows to a signed inf. 
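+     * For example (illustrative): 65520.0 (the midpoint between the largest
+     * finite half 65504.0 and 65536.0) rounds up into the exponent, giving
+     * 0x7c00u (inf) and raising overflow.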
+ */ + if constexpr (gen_overflow) { + h_sig += h_exp; + if (h_sig == 0x7c00u) { + FloatStatus::RaiseOverflow(); + } + return h_sgn + h_sig; + } + else { + return h_sgn + h_exp + h_sig; + } +} + +constexpr uint32_t ToFloatBits(uint16_t h) +{ + uint16_t h_exp = (h&0x7c00u); + uint32_t f_sgn = ((uint32_t)h&0x8000u) << 16; + switch (h_exp) { + case 0x0000u: { // 0 or subnormal + uint16_t h_sig = (h&0x03ffu); + // Signed zero + if (h_sig == 0) { + return f_sgn; + } + // Subnormal + h_sig <<= 1; + while ((h_sig&0x0400u) == 0) { + h_sig <<= 1; + h_exp++; + } + uint32_t f_exp = ((uint32_t)(127 - 15 - h_exp)) << 23; + uint32_t f_sig = ((uint32_t)(h_sig&0x03ffu)) << 13; + return f_sgn + f_exp + f_sig; + } + case 0x7c00u: // inf or NaN + // All-ones exponent and a copy of the significand + return f_sgn + 0x7f800000u + (((uint32_t)(h&0x03ffu)) << 13); + default: // normalized + // Just need to adjust the exponent and shift + return f_sgn + (((uint32_t)(h&0x7fffu) + 0x1c000u) << 13); + } +} + +constexpr uint64_t ToDoubleBits(uint16_t h) +{ + uint16_t h_exp = (h&0x7c00u); + uint64_t d_sgn = ((uint64_t)h&0x8000u) << 48; + switch (h_exp) { + case 0x0000u: { // 0 or subnormal + uint16_t h_sig = (h&0x03ffu); + // Signed zero + if (h_sig == 0) { + return d_sgn; + } + // Subnormal + h_sig <<= 1; + while ((h_sig&0x0400u) == 0) { + h_sig <<= 1; + h_exp++; + } + uint64_t d_exp = ((uint64_t)(1023 - 15 - h_exp)) << 52; + uint64_t d_sig = ((uint64_t)(h_sig&0x03ffu)) << 42; + return d_sgn + d_exp + d_sig; + } + case 0x7c00u: // inf or NaN + // All-ones exponent and a copy of the significand + return d_sgn + 0x7ff0000000000000ULL + (((uint64_t)(h&0x03ffu)) << 42); + default: // normalized + // Just need to adjust the exponent and shift + return d_sgn + (((uint64_t)(h&0x7fffu) + 0xfc000u) << 42); + } +} + +}} // namespace np::half_private +#endif // NUMPY_CORE_SRC_COMMON_HALF_PRIVATE_HPP diff --git a/numpy/core/src/common/meta.hpp b/numpy/core/src/common/meta.hpp new file mode 100644 index 000000000..27ea1857e --- /dev/null +++ b/numpy/core/src/common/meta.hpp @@ -0,0 +1,54 @@ +#ifndef NUMPY_CORE_SRC_COMMON_META_HPP +#define NUMPY_CORE_SRC_COMMON_META_HPP + +#include "npstd.hpp" + +namespace np { namespace meta { +/// @addtogroup cpp_core_meta +/// @{ + +namespace details { +template<int size, bool unsig> +struct IntBySize; + +template<bool unsig> +struct IntBySize<sizeof(uint8_t), unsig> { + using Type = typename std::conditional< + unsig, uint8_t, int8_t>::type; +}; +template<bool unsig> +struct IntBySize<sizeof(uint16_t), unsig> { + using Type = typename std::conditional< + unsig, uint16_t, int16_t>::type; +}; +template<bool unsig> +struct IntBySize<sizeof(uint32_t), unsig> { + using Type = typename std::conditional< + unsig, uint32_t, int32_t>::type; +}; +template<bool unsig> +struct IntBySize<sizeof(uint64_t), unsig> { + using Type = typename std::conditional< + unsig, uint64_t, int64_t>::type; +}; +} // namespace details + +/// Provides safe conversion of any integer type synonyms +/// to a fixed-width integer type. 
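+/// For example (illustrative): FixedWidth<long>::Type is int32_t or int64_t,
+/// depending on sizeof(long) on the target; non-integral types map to themselves.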
+template<typename T> +struct FixedWidth { + using TF_ = typename details::IntBySize< + sizeof(T), std::is_unsigned<T>::value + >::Type; + + using Type = typename std::conditional< + std::is_integral<T>::value, TF_, T + >::type; +}; + +/// @} cpp_core_meta + +}} // namespace np::meta + +#endif // NUMPY_CORE_SRC_COMMON_META_HPP + diff --git a/numpy/core/src/common/npdef.hpp b/numpy/core/src/common/npdef.hpp new file mode 100644 index 000000000..56a0df52e --- /dev/null +++ b/numpy/core/src/common/npdef.hpp @@ -0,0 +1,28 @@ +#ifndef NUMPY_CORE_SRC_COMMON_NPDEF_HPP +#define NUMPY_CORE_SRC_COMMON_NPDEF_HPP + +#if !defined(__cplusplus) || __cplusplus < 201703L + #error "NumPy requires a compiler with at least C++17 enabled" +#endif + +/// @addtogroup cpp_core_defs +/// @{ + +/// Whether compiler supports C++20 +#if __cplusplus > 202002L + #define NP_HAS_CPP20 1 +#else + #define NP_HAS_CPP20 0 +#endif + +/// Wraps `__has_builtin` +#if defined(__has_builtin) + #define NP_HAS_BUILTIN(INTRIN) __has_builtin(INTRIN) +#else + #define NP_HAS_BUILTIN(INTRIN) 0 +#endif + +/// @} cpp_core_defs + +#endif // NUMPY_CORE_SRC_COMMON_NPDEF_HPP + diff --git a/numpy/core/src/common/npstd.hpp b/numpy/core/src/common/npstd.hpp new file mode 100644 index 000000000..92e3d5cc5 --- /dev/null +++ b/numpy/core/src/common/npstd.hpp @@ -0,0 +1,57 @@ +#ifndef NUMPY_CORE_SRC_COMMON_NPSTD_HPP +#define NUMPY_CORE_SRC_COMMON_NPSTD_HPP + +#include <cstddef> +#include <cstring> +#include <cctype> +#include <cstdint> + +#include <string> +#include <algorithm> +#include <utility> +#include <cstdlib> +#include <cmath> +#include <complex> +#include <type_traits> + +#include <numpy/npy_common.h> + +#include "npy_config.h" + +namespace np { +/// @addtogroup cpp_core_types +/// @{ +using std::uint8_t; +using std::int8_t; +using std::uint16_t; +using std::int16_t; +using std::uint32_t; +using std::int32_t; +using std::uint64_t; +using std::int64_t; +using std::uintptr_t; +using std::intptr_t; +using std::complex; +using std::uint_fast16_t; +using std::uint_fast32_t; + +/** Guard for long double. + * + * The C implementation defines long double as double + * on MinGW to provide compatibility with MSVC to unify + * one behavior under Windows OS, which makes npy_longdouble + * not fit to be used with template specialization or overloading. + * + * This type will be set to `void` when `npy_longdouble` is not defined + * as `long double`. + */ +using LongDouble = typename std::conditional< + !std::is_same<npy_longdouble, long double>::value, + void, npy_longdouble +>::type; +/// @} cpp_core_types + +} // namespace np + +#endif // NUMPY_CORE_SRC_COMMON_NPSTD_HPP + diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h index d6886c5ea..715b17777 100644 --- a/numpy/core/src/common/npy_config.h +++ b/numpy/core/src/common/npy_config.h @@ -2,6 +2,7 @@ #define NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_ #include "config.h" +#include "npy_cpu_dispatch.h" // brings NPY_HAVE_[CPU features] #include "numpy/numpyconfig.h" #include "numpy/utils.h" #include "numpy/npy_os.h" diff --git a/numpy/core/src/common/npy_cpu_features.c b/numpy/core/src/common/npy_cpu_features.c index 7563979d1..64a85f6fb 100644 --- a/numpy/core/src/common/npy_cpu_features.c +++ b/numpy/core/src/common/npy_cpu_features.c @@ -14,13 +14,15 @@ static unsigned char npy__cpu_have[NPY_CPU_FEATURE_MAX]; static void npy__cpu_init_features(void); /* - * Disable CPU dispatched features at runtime if environment variable - * 'NPY_DISABLE_CPU_FEATURES' is defined. 
+ * Enable or disable dispatched CPU features at runtime. Whether the environment
+ * variable `NPY_ENABLE_CPU_FEATURES` or `NPY_DISABLE_CPU_FEATURES` is consulted
+ * depends on the value of the boolean parameter `disable` (toggle).
+ *
  * Multiple features can be present, and separated by space, comma, or tab.
- * Raises an error if parsing fails or if the feature was not enabled
+ * Raises an error if parsing fails or if the feature could not be enabled or disabled
  */
 static int
-npy__cpu_try_disable_env(void);
+npy__cpu_check_env(int disable, const char *env);
 
 /* Ensure the build's CPU baseline features are supported at runtime */
 static int
@@ -43,9 +45,22 @@ npy_cpu_init(void)
     if (npy__cpu_validate_baseline() < 0) {
         return -1;
     }
-    if (npy__cpu_try_disable_env() < 0) {
+    char *enable_env = getenv("NPY_ENABLE_CPU_FEATURES");
+    char *disable_env = getenv("NPY_DISABLE_CPU_FEATURES");
+    int is_enable = enable_env && enable_env[0];
+    int is_disable = disable_env && disable_env[0];
+    if (is_enable & is_disable) {
+        PyErr_Format(PyExc_ImportError,
+            "Both NPY_DISABLE_CPU_FEATURES and NPY_ENABLE_CPU_FEATURES "
+            "environment variables cannot be set simultaneously."
+        );
         return -1;
     }
+    if (is_enable | is_disable) {
+        if (npy__cpu_check_env(is_disable, is_disable ? disable_env : enable_env) < 0) {
+            return -1;
+        }
+    }
     return 0;
 }
 
@@ -81,12 +96,14 @@ static struct {
     {NPY_CPU_FEATURE_AVX512VBMI, "AVX512VBMI"},
     {NPY_CPU_FEATURE_AVX512VBMI2, "AVX512VBMI2"},
     {NPY_CPU_FEATURE_AVX512BITALG, "AVX512BITALG"},
+    {NPY_CPU_FEATURE_AVX512FP16, "AVX512FP16"},
     {NPY_CPU_FEATURE_AVX512_KNL, "AVX512_KNL"},
     {NPY_CPU_FEATURE_AVX512_KNM, "AVX512_KNM"},
     {NPY_CPU_FEATURE_AVX512_SKX, "AVX512_SKX"},
     {NPY_CPU_FEATURE_AVX512_CLX, "AVX512_CLX"},
     {NPY_CPU_FEATURE_AVX512_CNL, "AVX512_CNL"},
     {NPY_CPU_FEATURE_AVX512_ICL, "AVX512_ICL"},
+    {NPY_CPU_FEATURE_AVX512_SPR, "AVX512_SPR"},
     {NPY_CPU_FEATURE_VSX, "VSX"},
     {NPY_CPU_FEATURE_VSX2, "VSX2"},
     {NPY_CPU_FEATURE_VSX3, "VSX3"},
@@ -208,7 +225,8 @@ npy__cpu_validate_baseline(void)
         *(fptr-1) = '\0'; // trim the last space
         PyErr_Format(PyExc_RuntimeError,
             "NumPy was built with baseline optimizations: \n"
-            "(" NPY_WITH_CPU_BASELINE ") but your machine doesn't support:\n(%s).",
+            "(" NPY_WITH_CPU_BASELINE ") but your machine "
+            "doesn't support:\n(%s).",
             baseline_failure
         );
         return -1;
@@ -218,27 +236,31 @@
 }
 
 static int
-npy__cpu_try_disable_env(void)
-{
-    char *disenv = getenv("NPY_DISABLE_CPU_FEATURES");
-    if (disenv == NULL || disenv[0] == 0) {
-        return 0;
-    }
-    #define NPY__CPU_ENV_ERR_HEAD \
-        "During parsing environment variable 'NPY_DISABLE_CPU_FEATURES':\n"
+npy__cpu_check_env(int disable, const char *env) {
+
+    static const char *names[] = {
+        "enable", "disable",
+        "NPY_ENABLE_CPU_FEATURES", "NPY_DISABLE_CPU_FEATURES",
+        "During parsing environment variable: 'NPY_ENABLE_CPU_FEATURES':\n",
+        "During parsing environment variable: 'NPY_DISABLE_CPU_FEATURES':\n"
    };
    disable = disable ? 
1 : 0; + const char *act_name = names[disable]; + const char *env_name = names[disable + 2]; + const char *err_head = names[disable + 4]; #if !defined(NPY_DISABLE_OPTIMIZATION) && NPY_WITH_CPU_DISPATCH_N > 0 #define NPY__MAX_VAR_LEN 1024 // More than enough for this era - size_t var_len = strlen(disenv) + 1; + size_t var_len = strlen(env) + 1; if (var_len > NPY__MAX_VAR_LEN) { PyErr_Format(PyExc_RuntimeError, - "Length of environment variable 'NPY_DISABLE_CPU_FEATURES' is %d, only %d accepted", - var_len, NPY__MAX_VAR_LEN - 1 + "Length of environment variable '%s' is %d, only %d accepted", + env_name, var_len, NPY__MAX_VAR_LEN ); return -1; } - char disable_features[NPY__MAX_VAR_LEN]; - memcpy(disable_features, disenv, var_len); + char features[NPY__MAX_VAR_LEN]; + memcpy(features, env, var_len); char nexist[NPY__MAX_VAR_LEN]; char *nexist_cur = &nexist[0]; @@ -248,17 +270,19 @@ npy__cpu_try_disable_env(void) //comma and space including (htab, vtab, CR, LF, FF) const char *delim = ", \t\v\r\n\f"; - char *feature = strtok(disable_features, delim); + char *feature = strtok(features, delim); while (feature) { - if (npy__cpu_baseline_fid(feature) > 0) { - PyErr_Format(PyExc_RuntimeError, - NPY__CPU_ENV_ERR_HEAD - "You cannot disable CPU feature '%s', since it is part of " - "the baseline optimizations:\n" - "(" NPY_WITH_CPU_BASELINE ").", - feature - ); - return -1; + if (npy__cpu_baseline_fid(feature) > 0){ + if (disable) { + PyErr_Format(PyExc_RuntimeError, + "%s" + "You cannot disable CPU feature '%s', since it is part of " + "the baseline optimizations:\n" + "(" NPY_WITH_CPU_BASELINE ").", + err_head, feature + ); + return -1; + } goto next; } // check if the feature is part of dispatched features int feature_id = npy__cpu_dispatch_fid(feature); @@ -275,47 +299,58 @@ npy__cpu_try_disable_env(void) notsupp_cur[flen] = ' '; notsupp_cur += flen + 1; goto next; } - // Finally we can disable it - npy__cpu_have[feature_id] = 0; + // Finally we can disable or mark for enabling + npy__cpu_have[feature_id] = disable ? 0:2; next: feature = strtok(NULL, delim); } + if (!disable){ + // Disables any unmarked dispatched feature. 
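+        // (Features requested via NPY_ENABLE_CPU_FEATURES were marked 2 above,
+        // while features that were detected but not requested remain 1; the
+        // decrement below therefore maps 2 -> 1 (kept) and 1 -> 0 (disabled).)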
+ #define NPY__CPU_DISABLE_DISPATCH_CB(FEATURE, DUMMY) \ + if(npy__cpu_have[NPY_CAT(NPY_CPU_FEATURE_, FEATURE)] != 0)\ + {npy__cpu_have[NPY_CAT(NPY_CPU_FEATURE_, FEATURE)]--;}\ + + NPY_WITH_CPU_DISPATCH_CALL(NPY__CPU_DISABLE_DISPATCH_CB, DUMMY) // extra arg for msvc + } *nexist_cur = '\0'; if (nexist[0] != '\0') { *(nexist_cur-1) = '\0'; // trim the last space - if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1, - NPY__CPU_ENV_ERR_HEAD - "You cannot disable CPU features (%s), since " - "they are not part of the dispatched optimizations\n" - "(" NPY_WITH_CPU_DISPATCH ").", - nexist + if (PyErr_WarnFormat(PyExc_ImportWarning, 1, + "%sYou cannot %s CPU features (%s), since " + "they are not part of the dispatched optimizations\n" + "(" NPY_WITH_CPU_DISPATCH ").", + err_head, act_name, nexist ) < 0) { return -1; } + return 0; } + #define NOTSUPP_BODY \ + "%s" \ + "You cannot %s CPU features (%s), since " \ + "they are not supported by your machine.", \ + err_head, act_name, notsupp + *notsupp_cur = '\0'; if (notsupp[0] != '\0') { *(notsupp_cur-1) = '\0'; // trim the last space - if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1, - NPY__CPU_ENV_ERR_HEAD - "You cannot disable CPU features (%s), since " - "they are not supported by your machine.", - notsupp - ) < 0) { + if (!disable){ + PyErr_Format(PyExc_RuntimeError, NOTSUPP_BODY); return -1; } } #else - if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1, - NPY__CPU_ENV_ERR_HEAD - "You cannot use environment variable 'NPY_DISABLE_CPU_FEATURES', since " + if (PyErr_WarnFormat(PyExc_ImportWarning, 1, + "%s" + "You cannot use environment variable '%s', since " #ifdef NPY_DISABLE_OPTIMIZATION - "the NumPy library was compiled with optimization disabled." + "the NumPy library was compiled with optimization disabled.", #else - "the NumPy library was compiled without any dispatched optimizations." 
+ "the NumPy library was compiled without any dispatched optimizations.", #endif + err_head, env_name, act_name ) < 0) { return -1; } @@ -506,6 +541,11 @@ npy__cpu_init_features(void) npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI2] && npy__cpu_have[NPY_CPU_FEATURE_AVX512BITALG] && npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ]; + // Sapphire Rapids + npy__cpu_have[NPY_CPU_FEATURE_AVX512FP16] = (reg[3] & (1 << 23)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512_SPR] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_ICL] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512FP16]; + } } @@ -513,7 +553,10 @@ npy__cpu_init_features(void) #elif defined(NPY_CPU_PPC64) || defined(NPY_CPU_PPC64LE) -#ifdef __linux__ +#if defined(__linux__) || defined(__FreeBSD__) + #ifdef __FreeBSD__ + #include <machine/cpu.h> // defines PPC_FEATURE_HAS_VSX + #endif #include <sys/auxv.h> #ifndef AT_HWCAP2 #define AT_HWCAP2 26 @@ -533,12 +576,21 @@ static void npy__cpu_init_features(void) { memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX); +#if defined(__linux__) || defined(__FreeBSD__) #ifdef __linux__ unsigned int hwcap = getauxval(AT_HWCAP); if ((hwcap & PPC_FEATURE_HAS_VSX) == 0) return; hwcap = getauxval(AT_HWCAP2); +#else + unsigned long hwcap; + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + if ((hwcap & PPC_FEATURE_HAS_VSX) == 0) + return; + + elf_aux_info(AT_HWCAP2, &hwcap, sizeof(hwcap)); +#endif // __linux__ if (hwcap & PPC_FEATURE2_ARCH_3_1) { npy__cpu_have[NPY_CPU_FEATURE_VSX] = @@ -551,7 +603,7 @@ npy__cpu_init_features(void) npy__cpu_have[NPY_CPU_FEATURE_VSX2] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0; npy__cpu_have[NPY_CPU_FEATURE_VSX3] = (hwcap & PPC_FEATURE2_ARCH_3_00) != 0; npy__cpu_have[NPY_CPU_FEATURE_VSX4] = (hwcap & PPC_FEATURE2_ARCH_3_1) != 0; -// TODO: AIX, FreeBSD +// TODO: AIX, OpenBSD #else npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1; #if defined(NPY_CPU_PPC64LE) || defined(NPY_HAVE_VSX2) diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h index 96c543e70..9180670c4 100644 --- a/numpy/core/src/common/npy_cpu_features.h +++ b/numpy/core/src/common/npy_cpu_features.h @@ -43,6 +43,7 @@ enum npy_cpu_features NPY_CPU_FEATURE_AVX512VNNI = 42, NPY_CPU_FEATURE_AVX512VBMI2 = 43, NPY_CPU_FEATURE_AVX512BITALG = 44, + NPY_CPU_FEATURE_AVX512FP16 = 45, // X86 CPU Groups // Knights Landing (F,CD,ER,PF) @@ -57,6 +58,8 @@ enum npy_cpu_features NPY_CPU_FEATURE_AVX512_CNL = 105, // Ice Lake (F,CD,BW,DQ,VL,IFMA,VBMI,VNNI,VBMI2,BITALG,VPOPCNTDQ) NPY_CPU_FEATURE_AVX512_ICL = 106, + // Sapphire Rapids (Ice Lake, AVX512FP16) + NPY_CPU_FEATURE_AVX512_SPR = 107, // IBM/POWER VSX // POWER7 @@ -103,13 +106,25 @@ enum npy_cpu_features * - detects runtime CPU features * - check that baseline CPU features are present * - uses 'NPY_DISABLE_CPU_FEATURES' to disable dispatchable features + * - uses 'NPY_ENABLE_CPU_FEATURES' to enable dispatchable features * * It will set a RuntimeError when * - CPU baseline features from the build are not supported at runtime * - 'NPY_DISABLE_CPU_FEATURES' tries to disable a baseline feature - * and will warn if 'NPY_DISABLE_CPU_FEATURES' tries to disable a feature that - * is not disabled (the machine or build does not support it, or the project was - * not built with any feature optimization support) + * - 'NPY_DISABLE_CPU_FEATURES' and 'NPY_ENABLE_CPU_FEATURES' are + * simultaneously set + * - 'NPY_ENABLE_CPU_FEATURES' tries to enable a feature that is not supported + * by the machine or build + * - 'NPY_ENABLE_CPU_FEATURES' tries to enable a feature 
when the project was + * not built with any feature optimization support + * + * It will set an ImportWarning when: + * - 'NPY_DISABLE_CPU_FEATURES' tries to disable a feature that is not supported + * by the machine or build + * - 'NPY_DISABLE_CPU_FEATURES' or 'NPY_ENABLE_CPU_FEATURES' tries to + * disable/enable a feature when the project was not built with any feature + * optimization support + * * return 0 on success otherwise return -1 */ NPY_VISIBILITY_HIDDEN int diff --git a/numpy/core/src/common/numpyos.c b/numpy/core/src/common/numpyos.c index 5ee95191d..2fec06e1c 100644 --- a/numpy/core/src/common/numpyos.c +++ b/numpy/core/src/common/numpyos.c @@ -779,3 +779,25 @@ NumPyOS_strtoull(const char *str, char **endptr, int base) return strtoull(str, endptr, base); } +#ifdef _MSC_VER + +#include <stdlib.h> + +#if _MSC_VER >= 1900 +/* npy3k_compat.h uses this function in the _Py_BEGIN/END_SUPPRESS_IPH + * macros. It does not need to be defined when building using MSVC + * earlier than 14.0 (_MSC_VER == 1900). + */ + +static void __cdecl _silent_invalid_parameter_handler( + wchar_t const* expression, + wchar_t const* function, + wchar_t const* file, + unsigned int line, + uintptr_t pReserved) { } + +_invalid_parameter_handler _Py_silent_invalid_parameter_handler = _silent_invalid_parameter_handler; + +#endif + +#endif diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h index ad9688338..58d842a6d 100644 --- a/numpy/core/src/common/simd/avx2/arithmetic.h +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -247,6 +247,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) // negate multiply and subtract, -(a*b) - c #define npyv_nmulsub_f32 _mm256_fnmsub_ps #define npyv_nmulsub_f64 _mm256_fnmsub_pd + // multiply, add for odd elements and subtract even elements. + // (a * b) -+ c + #define npyv_muladdsub_f32 _mm256_fmaddsub_ps + #define npyv_muladdsub_f64 _mm256_fmaddsub_pd #else // multiply and add, a*b + c NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) @@ -274,6 +278,13 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) npyv_f64 neg_a = npyv_xor_f64(a, npyv_setall_f64(-0.0)); return npyv_sub_f64(npyv_mul_f64(neg_a, b), c); } + // multiply, add for odd elements and subtract even elements. 
+ // (a * b) -+ c + NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { return _mm256_addsub_ps(npyv_mul_f32(a, b), c); } + NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { return _mm256_addsub_pd(npyv_mul_f64(a, b), c); } + #endif // !NPY_HAVE_FMA3 /*************************** diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h index 8cb74df2b..d64f3c6d6 100644 --- a/numpy/core/src/common/simd/avx2/avx2.h +++ b/numpy/core/src/common/simd/avx2/avx2.h @@ -11,6 +11,7 @@ #define NPY_SIMD_FMA3 0 // fast emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 0 // Enough limit to allow us to use _mm256_i32gather_* #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 8) diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h index 410c35dc8..81144a36b 100644 --- a/numpy/core/src/common/simd/avx2/memory.h +++ b/numpy/core/src/common/simd/avx2/memory.h @@ -84,17 +84,6 @@ NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) { return _mm256_castsi256_ps(npyv_loadn_u32((const npy_uint32*)ptr, stride)); } //// 64 -#if 0 // slower -NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) -{ - const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride); - return _mm256_i64gather_epi64((const void*)ptr, idx, 8); -} -NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) -{ return npyv_loadn_u64((const npy_uint64*)ptr, stride); } -NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) -{ return _mm256_castsi256_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); } -#endif NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) { __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)); @@ -107,6 +96,32 @@ NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) { return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); } NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) { return _mm256_castpd_si256(npyv_loadn_f64((const double*)ptr, stride)); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ + __m128d a0 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)); + __m128d a2 = _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2))); + __m128d a01 = _mm_loadh_pd(a0, (const double*)(ptr + stride)); + __m128d a23 = _mm_loadh_pd(a2, (const double*)(ptr + stride*3)); + return _mm256_castpd_ps(_mm256_insertf128_pd(_mm256_castpd128_pd256(a01), a23, 1)); +} +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ return _mm256_castps_si256(npyv_loadn2_f32((const float*)ptr, stride)); } +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return _mm256_castps_si256(npyv_loadn2_f32((const float*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ + __m256d a = npyv_loadl_f64(ptr); + return _mm256_insertf128_pd(a, _mm_loadu_pd(ptr + stride), 1); +} +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ return _mm256_castpd_si256(npyv_loadn2_f64((const double*)ptr, stride)); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ return 
_mm256_castpd_si256(npyv_loadn2_f64((const double*)ptr, stride)); } + /*************************** * Non-contiguous Store ***************************/ @@ -143,6 +158,32 @@ NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) { npyv_storen_f64((double*)ptr, stride, _mm256_castsi256_pd(a)); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a)); + __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1); + _mm_storel_pd((double*)ptr, a0); + _mm_storeh_pd((double*)(ptr + stride), a0); + _mm_storel_pd((double*)(ptr + stride*2), a1); + _mm_storeh_pd((double*)(ptr + stride*3), a1); +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm256_castps_si256(a)); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ + npyv_storel_u64(ptr, a); + npyv_storeh_u64(ptr + stride, a); +} +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ npyv_storen2_u64((npy_uint64*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ npyv_storen2_u64((npy_uint64*)ptr, stride, _mm256_castpd_si256(a)); } + /********************************* * Partial Load *********************************/ @@ -174,7 +215,7 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask); + __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); return _mm256_blendv_epi8(vfill, payload, mask); } // fill zero to rest lanes @@ -184,7 +225,45 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - return _mm256_maskload_epi64((const void*)ptr, mask); + return _mm256_maskload_epi64((const long long*)ptr, mask); +} + +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + const __m256i vfill = npyv_set_s32( + fill_lo, fill_hi, fill_lo, fill_hi, + fill_lo, fill_hi, fill_lo, fill_hi + ); + const __m256i steps = npyv_set_s64(0, 1, 2, 3); + __m256i vnlane = npyv_setall_s64(nlane > 4 ? 
4 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); + __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); + return _mm256_blendv_epi8(vfill, payload, mask); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); } + +/// 128-bit nlane +NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ + assert(nlane > 0); + npy_int64 m = -((npy_int64)(nlane > 1)); + __m256i mask = npyv_set_s64(-1, -1, m, m); + return _mm256_maskload_epi64((const long long*)ptr, mask); +} +// fill zero to rest lanes +NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ + const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi); + npy_int64 m = -((npy_int64)(nlane > 1)); + __m256i mask = npyv_set_s64(-1, -1, m, m); + __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); + return _mm256_blendv_epi8(vfill, payload, mask); } /********************************* * Non-contiguous partial load @@ -216,12 +295,53 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8); + return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8); } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s64(ptr, stride, nlane, 0); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + const __m256i vfill = npyv_set_s32( + fill_lo, fill_hi, fill_lo, fill_hi, + fill_lo, fill_hi, fill_lo, fill_hi + ); + const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride); + const __m256i steps = npyv_set_s64(0, 1, 2, 3); + __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); + return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ + assert(nlane > 0); + __m256i a = npyv_loadl_s64(ptr); +#if defined(_MSC_VER) && defined(_M_IX86) + __m128i fill =_mm_setr_epi32( + (int)fill_lo, (int)(fill_lo >> 32), + (int)fill_hi, (int)(fill_hi >> 32) + ); +#else + __m128i fill = _mm_set_epi64x(fill_hi, fill_lo); +#endif + __m128i b = nlane > 1 ? 
_mm_loadu_si128((const __m128i*)(ptr + stride)) : fill; + return _mm256_inserti128_si256(a, b, 1); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); } + /********************************* * Partial store *********************************/ @@ -241,7 +361,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - _mm256_maskstore_epi64((void*)ptr, mask, a); + _mm256_maskstore_epi64((long long*)ptr, mask, a); +} + +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); } + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + npyv_storel_s64(ptr, a); + if (nlane > 1) { + npyv_storeh_s64(ptr + 2, a); + } } /********************************* * Non-contiguous partial store @@ -252,23 +386,52 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp assert(nlane > 0); __m128i a0 = _mm256_castsi256_si128(a); __m128i a1 = _mm256_extracti128_si256(a, 1); + + ptr[stride*0] = _mm_extract_epi32(a0, 0); switch(nlane) { - default: - ptr[stride*7] = _mm_extract_epi32(a1, 3); - case 7: - ptr[stride*6] = _mm_extract_epi32(a1, 2); - case 6: - ptr[stride*5] = _mm_extract_epi32(a1, 1); + case 1: + return; + case 2: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + return; + case 3: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + ptr[stride*2] = _mm_extract_epi32(a0, 2); + return; + case 4: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + ptr[stride*2] = _mm_extract_epi32(a0, 2); + ptr[stride*3] = _mm_extract_epi32(a0, 3); + return; case 5: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + ptr[stride*2] = _mm_extract_epi32(a0, 2); + ptr[stride*3] = _mm_extract_epi32(a0, 3); ptr[stride*4] = _mm_extract_epi32(a1, 0); - case 4: + return; + case 6: + ptr[stride*1] = _mm_extract_epi32(a0, 1); + ptr[stride*2] = _mm_extract_epi32(a0, 2); ptr[stride*3] = _mm_extract_epi32(a0, 3); - case 3: + ptr[stride*4] = _mm_extract_epi32(a1, 0); + ptr[stride*5] = _mm_extract_epi32(a1, 1); + return; + case 7: + ptr[stride*1] = _mm_extract_epi32(a0, 1); ptr[stride*2] = _mm_extract_epi32(a0, 2); - case 2: + ptr[stride*3] = _mm_extract_epi32(a0, 3); + ptr[stride*4] = _mm_extract_epi32(a1, 0); + ptr[stride*5] = _mm_extract_epi32(a1, 1); + ptr[stride*6] = _mm_extract_epi32(a1, 2); + return; + default: ptr[stride*1] = _mm_extract_epi32(a0, 1); - case 1: - ptr[stride*0] = _mm_extract_epi32(a0, 0); + ptr[stride*2] = _mm_extract_epi32(a0, 2); + ptr[stride*3] = _mm_extract_epi32(a0, 3); + ptr[stride*4] = _mm_extract_epi32(a1, 0); + ptr[stride*5] = _mm_extract_epi32(a1, 1); + ptr[stride*6] = _mm_extract_epi32(a1, 2); + ptr[stride*7] = _mm_extract_epi32(a1, 3); } } //// 64 @@ -277,19 +440,60 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp assert(nlane > 0); __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a)); __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1); + double *dptr = (double*)ptr; + _mm_storel_pd(dptr, a0); switch(nlane) { - default: - _mm_storeh_pd(dptr + stride * 3, a1); + case 1: + return; + case 2: + _mm_storeh_pd(dptr + stride * 1, a0); + return; case 3: + 
_mm_storeh_pd(dptr + stride * 1, a0); _mm_storel_pd(dptr + stride * 2, a1); - case 2: + return; + default: _mm_storeh_pd(dptr + stride * 1, a0); + _mm_storel_pd(dptr + stride * 2, a1); + _mm_storeh_pd(dptr + stride * 3, a1); + } +} + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + __m128d a0 = _mm256_castpd256_pd128(_mm256_castsi256_pd(a)); + __m128d a1 = _mm256_extractf128_pd(_mm256_castsi256_pd(a), 1); + + _mm_storel_pd((double*)ptr, a0); + switch(nlane) { case 1: - _mm_storel_pd(dptr + stride * 0, a0); + return; + case 2: + _mm_storeh_pd((double*)(ptr + stride * 1), a0); + return; + case 3: + _mm_storeh_pd((double*)(ptr + stride * 1), a0); + _mm_storel_pd((double*)(ptr + stride * 2), a1); + return; + default: + _mm_storeh_pd((double*)(ptr + stride * 1), a0); + _mm_storel_pd((double*)(ptr + stride * 2), a1); + _mm_storeh_pd((double*)(ptr + stride * 3), a1); } } +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + npyv_storel_s64(ptr, a); + if (nlane > 1) { + npyv_storeh_s64(ptr + stride, a); + } +} /***************************************************************************** * Implement partial load/store for u32/f32/u64/f64... via reinterpret cast *****************************************************************************/ @@ -300,7 +504,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ )); \ @@ -312,7 +517,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ )); \ @@ -353,6 +559,110 @@ NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_AVX2_REST_PARTIAL_TYPES(f64, s64) +// 128-bit/64-bit stride (load/store pair) +#define NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return 
npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(u32, s32) +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(f32, s32) +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(u64, s64) +NPYV_IMPL_AVX2_REST_PARTIAL_TYPES_PAIR(f64, s64) + +/************************************************************ + * de-interlave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_AVX2_MEM_INTERLEAVE(SFX, ZSFX) \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + return npyv_unzip_##ZSFX( \ + npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \ + ); \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \ + npyv_store_##SFX(ptr, zip.val[0]); \ + npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \ + } + +NPYV_IMPL_AVX2_MEM_INTERLEAVE(u8, u8) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(s8, u8) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(u16, u16) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(s16, u16) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(u32, u32) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(s32, u32) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(u64, u64) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(s64, u64) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(f32, f32) +NPYV_IMPL_AVX2_MEM_INTERLEAVE(f64, f64) + /********************************* * Lookup tables *********************************/ diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h index c10267b21..7b9b6a344 100644 --- a/numpy/core/src/common/simd/avx2/operators.h +++ b/numpy/core/src/common/simd/avx2/operators.h @@ -205,7 +205,7 @@ NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b) #define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A) #define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A) -// precision comparison +// precision comparison (ordered) #define npyv_cmpeq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_EQ_OQ)) #define npyv_cmpeq_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_EQ_OQ)) #define npyv_cmpneq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_NEQ_UQ)) diff --git a/numpy/core/src/common/simd/avx2/reorder.h 
b/numpy/core/src/common/simd/avx2/reorder.h index 4d6ec8f75..9ebe0e7f4 100644 --- a/numpy/core/src/common/simd/avx2/reorder.h +++ b/numpy/core/src/common/simd/avx2/reorder.h @@ -94,6 +94,75 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b) return npyv_combine_f64(ab0, ab1); } +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ + const __m256i idx = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx); + __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx); + npyv_u8x2 ab_lh = npyv_combine_u8(ab_03, ab_12); + npyv_u8x2 r; + r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]); + return r; +} +#define npyv_unzip_s8 npyv_unzip_u8 + +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ + const __m256i idx = _mm256_setr_epi8( + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15, + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 + ); + __m256i ab_03 = _mm256_shuffle_epi8(ab0, idx); + __m256i ab_12 = _mm256_shuffle_epi8(ab1, idx); + npyv_u16x2 ab_lh = npyv_combine_u16(ab_03, ab_12); + npyv_u16x2 r; + r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]); + return r; +} +#define npyv_unzip_s16 npyv_unzip_u16 + +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7); + __m256i abl = _mm256_permutevar8x32_epi32(ab0, idx); + __m256i abh = _mm256_permutevar8x32_epi32(ab1, idx); + return npyv_combine_u32(abl, abh); +} +#define npyv_unzip_s32 npyv_unzip_u32 + +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ + npyv_u64x2 ab_lh = npyv_combine_u64(ab0, ab1); + npyv_u64x2 r; + r.val[0] = _mm256_unpacklo_epi64(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_epi64(ab_lh.val[0], ab_lh.val[1]); + return r; +} +#define npyv_unzip_s64 npyv_unzip_u64 + +NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) +{ + const __m256i idx = npyv_set_u32(0, 2, 4, 6, 1, 3, 5, 7); + __m256 abl = _mm256_permutevar8x32_ps(ab0, idx); + __m256 abh = _mm256_permutevar8x32_ps(ab1, idx); + return npyv_combine_f32(abl, abh); +} + +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ + npyv_f64x2 ab_lh = npyv_combine_f64(ab0, ab1); + npyv_f64x2 r; + r.val[0] = _mm256_unpacklo_pd(ab_lh.val[0], ab_lh.val[1]); + r.val[1] = _mm256_unpackhi_pd(ab_lh.val[0], ab_lh.val[1]); + return r; +} + // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) { @@ -126,4 +195,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) return _mm256_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); } +// Permuting the elements of each 128-bit lane by immediate index for +// each element. 
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + _mm256_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_s32 npyv_permi128_u32 + +#define npyv_permi128_u64(A, E0, E1) \ + _mm256_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1))) + +#define npyv_permi128_s64 npyv_permi128_u64 + +#define npyv_permi128_f32(A, E0, E1, E2, E3) \ + _mm256_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_f64(A, E0, E1) \ + _mm256_permute_pd(A, ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0)) + #endif // _NPY_SIMD_AVX2_REORDER_H diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h index 1290dc0ad..a63da87d0 100644 --- a/numpy/core/src/common/simd/avx512/arithmetic.h +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -349,6 +349,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) // negate multiply and subtract, -(a*b) - c #define npyv_nmulsub_f32 _mm512_fnmsub_ps #define npyv_nmulsub_f64 _mm512_fnmsub_pd +// multiply, add for odd elements and subtract even elements. +// (a * b) -+ c +#define npyv_muladdsub_f32 _mm512_fmaddsub_ps +#define npyv_muladdsub_f64 _mm512_fmaddsub_pd /*************************** * Summation: Calculates the sum of all vector elements. diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h index 0946e6443..aa6abe256 100644 --- a/numpy/core/src/common/simd/avx512/avx512.h +++ b/numpy/core/src/common/simd/avx512/avx512.h @@ -7,6 +7,7 @@ #define NPY_SIMD_F64 1 #define NPY_SIMD_FMA3 1 // native support #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 0 // Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_* #define NPY_SIMD_MAXLOAD_STRIDE32 (0x7fffffff / 16) #define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16) diff --git a/numpy/core/src/common/simd/avx512/maskop.h b/numpy/core/src/common/simd/avx512/maskop.h index d1c188390..88fa4a68c 100644 --- a/numpy/core/src/common/simd/avx512/maskop.h +++ b/numpy/core/src/common/simd/avx512/maskop.h @@ -51,4 +51,17 @@ NPYV_IMPL_AVX512_MASK_ADDSUB(s64, b64, epi64) NPYV_IMPL_AVX512_MASK_ADDSUB(f32, b32, ps) NPYV_IMPL_AVX512_MASK_ADDSUB(f64, b64, pd) +// division, m ? a / b : c +NPY_FINLINE npyv_f32 npyv_ifdiv_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ return _mm512_mask_div_ps(c, m, a, b); } +// conditional division, m ? a / b : 0 +NPY_FINLINE npyv_f32 npyv_ifdivz_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b) +{ return _mm512_maskz_div_ps(m, a, b); } +// division, m ? a / b : c +NPY_FINLINE npyv_f64 npyv_ifdiv_f64(npyv_b32 m, npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ return _mm512_mask_div_pd(c, m, a, b); } +// conditional division, m ? 
a / b : 0 +NPY_FINLINE npyv_f64 npyv_ifdivz_f64(npyv_b32 m, npyv_f64 a, npyv_f64 b) +{ return _mm512_maskz_div_pd(m, a, b); } + #endif // _NPY_SIMD_AVX512_MASKOP_H diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h index 03fcb4630..fdf96a92c 100644 --- a/numpy/core/src/common/simd/avx512/memory.h +++ b/numpy/core/src/common/simd/avx512/memory.h @@ -120,6 +120,52 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) { return npyv_loadn_u64((const npy_uint64*)ptr, stride); } NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) { return _mm512_castsi512_pd(npyv_loadn_u64((const npy_uint64*)ptr, stride)); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ + __m128d a = _mm_loadh_pd( + _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)ptr)), + (const double*)(ptr + stride) + ); + __m128d b = _mm_loadh_pd( + _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*2))), + (const double*)(ptr + stride*3) + ); + __m128d c = _mm_loadh_pd( + _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*4))), + (const double*)(ptr + stride*5) + ); + __m128d d = _mm_loadh_pd( + _mm_castsi128_pd(_mm_loadl_epi64((const __m128i*)(ptr + stride*6))), + (const double*)(ptr + stride*7) + ); + return _mm512_castpd_si512(npyv512_combine_pd256( + _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1), + _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1) + )); +} +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return npyv_loadn2_u32((const npy_uint32*)ptr, stride); } +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ return _mm512_castsi512_ps(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ + __m128d a = _mm_loadu_pd(ptr); + __m128d b = _mm_loadu_pd(ptr + stride); + __m128d c = _mm_loadu_pd(ptr + stride * 2); + __m128d d = _mm_loadu_pd(ptr + stride * 3); + return npyv512_combine_pd256( + _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1), + _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1) + ); +} +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ return npyv_reinterpret_u64_f64(npyv_loadn2_f64((const double*)ptr, stride)); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ return npyv_loadn2_u64((const npy_uint64*)ptr, stride); } /*************************** * Non-contiguous Store ***************************/ @@ -151,6 +197,48 @@ NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) { npyv_storen_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + __m256d lo = _mm512_castpd512_pd256(_mm512_castsi512_pd(a)); + __m256d hi = _mm512_extractf64x4_pd(_mm512_castsi512_pd(a), 1); + __m128d e0 = _mm256_castpd256_pd128(lo); + __m128d e1 = _mm256_extractf128_pd(lo, 1); + __m128d e2 = _mm256_castpd256_pd128(hi); + __m128d e3 = _mm256_extractf128_pd(hi, 1); + _mm_storel_pd((double*)(ptr + stride * 0), e0); + _mm_storeh_pd((double*)(ptr + stride * 1), e0); + _mm_storel_pd((double*)(ptr + stride * 2), e1); + _mm_storeh_pd((double*)(ptr + stride * 3), e1); + _mm_storel_pd((double*)(ptr + 
stride * 4), e2); + _mm_storeh_pd((double*)(ptr + stride * 5), e2); + _mm_storel_pd((double*)(ptr + stride * 6), e3); + _mm_storeh_pd((double*)(ptr + stride * 7), e3); +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm512_castps_si512(a)); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ + __m256i lo = npyv512_lower_si256(a); + __m256i hi = npyv512_higher_si256(a); + __m128i e0 = _mm256_castsi256_si128(lo); + __m128i e1 = _mm256_extracti128_si256(lo, 1); + __m128i e2 = _mm256_castsi256_si128(hi); + __m128i e3 = _mm256_extracti128_si256(hi, 1); + _mm_storeu_si128((__m128i*)(ptr + stride * 0), e0); + _mm_storeu_si128((__m128i*)(ptr + stride * 1), e1); + _mm_storeu_si128((__m128i*)(ptr + stride * 2), e2); + _mm_storeu_si128((__m128i*)(ptr + stride * 3), e3); +} +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ npyv_storen2_u64((npy_uint64*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ npyv_storen2_u64((npy_uint64*)ptr, stride, _mm512_castpd_si512(a)); } + /********************************* * Partial Load *********************************/ @@ -159,14 +247,14 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n { assert(nlane > 0); const __m512i vfill = _mm512_set1_epi32(fill); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr); } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) { assert(nlane > 0); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr); } //// 64 @@ -174,14 +262,44 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n { assert(nlane > 0); const __m512i vfill = npyv_setall_s64(fill); - const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { assert(nlane > 0); - const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr); +} + +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo); + const __mmask8 mask = nlane > 7 ? 
-1 : (1 << nlane) - 1; + return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); } + +//// 128-bit nlane +NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ + assert(nlane > 0); + const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo); + const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; + return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ + assert(nlane > 0); + const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr); } /********************************* @@ -198,7 +316,7 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_ ); const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride)); const __m512i vfill = _mm512_set1_epi32(fill); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4); } // fill zero to rest lanes @@ -215,13 +333,48 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ 4*stride, 5*stride, 6*stride, 7*stride ); const __m512i vfill = npyv_setall_s64(fill); - const __mmask8 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8); } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s64(ptr, stride, nlane, 0); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + const __m512i idx = npyv_set_s64( + 0*stride, 1*stride, 2*stride, 3*stride, + 4*stride, 5*stride, 6*stride, 7*stride + ); + const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo); + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ + assert(nlane > 0); + const __m512i idx = npyv_set_s64( + 0, 1, stride, stride+1, + stride*2, stride*2+1, stride*3, stride*3+1 + ); + const __mmask8 mask = nlane > 3 ? 
-1 : (1 << (nlane*2)) - 1; + const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo); + return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); } + /********************************* * Partial store *********************************/ @@ -229,14 +382,30 @@ npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; _mm512_mask_storeu_epi32((__m512i*)ptr, mask, a); } //// 64 NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) { assert(nlane > 0); - const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a); +} + +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a); +} + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; _mm512_mask_storeu_epi64((__m512i*)ptr, mask, a); } /********************************* @@ -251,7 +420,7 @@ NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ); const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride)); - const __mmask16 mask = nlane > 31 ? -1 : (1 << nlane) - 1; + const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; _mm512_mask_i32scatter_epi32((__m512i*)ptr, mask, idx, a, 4); } //// 64 @@ -262,7 +431,31 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp 0*stride, 1*stride, 2*stride, 3*stride, 4*stride, 5*stride, 6*stride, 7*stride ); - const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8); +} + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + const __m512i idx = npyv_set_s64( + 0*stride, 1*stride, 2*stride, 3*stride, + 4*stride, 5*stride, 6*stride, 7*stride + ); + const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; + _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 4); +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + const __m512i idx = npyv_set_s64( + 0, 1, stride, stride+1, + 2*stride, 2*stride+1, 3*stride, 3*stride+1 + ); + const __mmask8 mask = nlane > 3 ? 
-1 : (1 << (nlane*2)) - 1; _mm512_mask_i64scatter_epi64((__m512i*)ptr, mask, idx, a, 8); } @@ -331,6 +524,110 @@ NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_AVX512_REST_PARTIAL_TYPES(f64, s64) +// 128-bit/64-bit stride (pair load/store) +#define NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(u32, s32) +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(f32, s32) +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(u64, s64) +NPYV_IMPL_AVX512_REST_PARTIAL_TYPES_PAIR(f64, s64) + +/************************************************************ + * de-interlave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_AVX512_MEM_INTERLEAVE(SFX, ZSFX) \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + return npyv_unzip_##ZSFX( \ + npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \ + ); \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \ + 
npyv_store_##SFX(ptr, zip.val[0]); \ + npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \ + } + +NPYV_IMPL_AVX512_MEM_INTERLEAVE(u8, u8) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(s8, u8) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(u16, u16) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(s16, u16) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(u32, u32) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(s32, u32) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(u64, u64) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(s64, u64) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(f32, f32) +NPYV_IMPL_AVX512_MEM_INTERLEAVE(f64, f64) + /************************************************** * Lookup table *************************************************/ diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h index c0b2477f3..27e66b5e7 100644 --- a/numpy/core/src/common/simd/avx512/reorder.h +++ b/numpy/core/src/common/simd/avx512/reorder.h @@ -167,6 +167,140 @@ NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b) return r; } +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ + npyv_u8x2 r; +#ifdef NPY_HAVE_AVX512VBMI + const __m512i idx_a = npyv_set_u8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, + 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126 + ); + const __m512i idx_b = npyv_set_u8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, + 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127 + ); + r.val[0] = _mm512_permutex2var_epi8(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi8(ab0, idx_b, ab1); +#else + #ifdef NPY_HAVE_AVX512BW + const __m512i idx = npyv_set_u8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m512i abl = _mm512_shuffle_epi8(ab0, idx); + __m512i abh = _mm512_shuffle_epi8(ab1, idx); + #else + const __m256i idx = _mm256_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 + ); + __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx); + __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx); + __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx); + __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx); + __m512i abl = npyv512_combine_si256(abl_lo, abl_hi); + __m512i abh = npyv512_combine_si256(abh_lo, abh_hi); + #endif + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh); + r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh); +#endif + return r; +} +#define npyv_unzip_s8 npyv_unzip_u8 + +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ + npyv_u16x2 r; +#ifdef NPY_HAVE_AVX512BW + const __m512i idx_a = npyv_set_u16( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 + ); + const __m512i idx_b = npyv_set_u16( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 
59, 61, 63 + ); + r.val[0] = _mm512_permutex2var_epi16(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi16(ab0, idx_b, ab1); +#else + const __m256i idx = _mm256_setr_epi8( + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15, + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 + ); + __m256i abl_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab0), idx); + __m256i abl_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab0), idx); + __m256i abh_lo = _mm256_shuffle_epi8(npyv512_lower_si256(ab1), idx); + __m256i abh_hi = _mm256_shuffle_epi8(npyv512_higher_si256(ab1), idx); + __m512i abl = npyv512_combine_si256(abl_lo, abl_hi); + __m512i abh = npyv512_combine_si256(abh_lo, abh_hi); + + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + r.val[0] = _mm512_permutex2var_epi64(abl, idx_a, abh); + r.val[1] = _mm512_permutex2var_epi64(abl, idx_b, abh); +#endif + return r; +} +#define npyv_unzip_s16 npyv_unzip_u16 + +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + const __m512i idx_a = npyv_set_u32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ); + const __m512i idx_b = npyv_set_u32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + ); + npyv_u32x2 r; + r.val[0] = _mm512_permutex2var_epi32(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi32(ab0, idx_b, ab1); + return r; +} +#define npyv_unzip_s32 npyv_unzip_u32 + +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + npyv_u64x2 r; + r.val[0] = _mm512_permutex2var_epi64(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_epi64(ab0, idx_b, ab1); + return r; +} +#define npyv_unzip_s64 npyv_unzip_u64 + +NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) +{ + const __m512i idx_a = npyv_set_u32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ); + const __m512i idx_b = npyv_set_u32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + ); + npyv_f32x2 r; + r.val[0] = _mm512_permutex2var_ps(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_ps(ab0, idx_b, ab1); + return r; +} +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ + const __m512i idx_a = npyv_set_u64(0, 2, 4, 6, 8, 10, 12, 14); + const __m512i idx_b = npyv_set_u64(1, 3, 5, 7, 9, 11, 13, 15); + npyv_f64x2 r; + r.val[0] = _mm512_permutex2var_pd(ab0, idx_a, ab1); + r.val[1] = _mm512_permutex2var_pd(ab0, idx_b, ab1); + return r; +} + // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) { @@ -223,4 +357,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) return _mm512_shuffle_ps(a, a, (_MM_PERM_ENUM)_MM_SHUFFLE(2, 3, 0, 1)); } +// Permuting the elements of each 128-bit lane by immediate index for +// each element. 
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + _mm512_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_s32 npyv_permi128_u32 + +#define npyv_permi128_u64(A, E0, E1) \ + _mm512_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1))) + +#define npyv_permi128_s64 npyv_permi128_u64 + +#define npyv_permi128_f32(A, E0, E1, E2, E3) \ + _mm512_permute_ps(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_f64(A, E0, E1) \ + _mm512_permute_pd(A, (((E1)<<7) | ((E0)<<6) | ((E1)<<5) | ((E0)<<4) | ((E1)<<3) | ((E0)<<2) | ((E1)<<1) | (E0))) + #endif // _NPY_SIMD_AVX512_REORDER_H diff --git a/numpy/core/src/common/simd/emulate_maskop.h b/numpy/core/src/common/simd/emulate_maskop.h index 2a808a153..6627e5010 100644 --- a/numpy/core/src/common/simd/emulate_maskop.h +++ b/numpy/core/src/common/simd/emulate_maskop.h @@ -42,5 +42,39 @@ NPYV_IMPL_EMULATE_MASK_ADDSUB(s64, b64) #if NPY_SIMD_F64 NPYV_IMPL_EMULATE_MASK_ADDSUB(f64, b64) #endif +#if NPY_SIMD_F32 + // conditional division, m ? a / b : c + NPY_FINLINE npyv_f32 + npyv_ifdiv_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b, npyv_f32 c) + { + const npyv_f32 one = npyv_setall_f32(1.0f); + npyv_f32 div = npyv_div_f32(a, npyv_select_f32(m, b, one)); + return npyv_select_f32(m, div, c); + } + // conditional division, m ? a / b : 0 + NPY_FINLINE npyv_f32 + npyv_ifdivz_f32(npyv_b32 m, npyv_f32 a, npyv_f32 b) + { + const npyv_f32 zero = npyv_zero_f32(); + return npyv_ifdiv_f32(m, a, b, zero); + } +#endif +#if NPY_SIMD_F64 + // conditional division, m ? a / b : c + NPY_FINLINE npyv_f64 + npyv_ifdiv_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b, npyv_f64 c) + { + const npyv_f64 one = npyv_setall_f64(1.0); + npyv_f64 div = npyv_div_f64(a, npyv_select_f64(m, b, one)); + return npyv_select_f64(m, div, c); + } + // conditional division, m ? a / b : 0 + NPY_FINLINE npyv_f64 + npyv_ifdivz_f64(npyv_b64 m, npyv_f64 a, npyv_f64 b) + { + const npyv_f64 zero = npyv_zero_f64(); + return npyv_ifdiv_f64(m, a, b, zero); + } +#endif #endif // _NPY_SIMD_EMULATE_MASKOP_H diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h index 00994806d..683620111 100644 --- a/numpy/core/src/common/simd/neon/arithmetic.h +++ b/numpy/core/src/common/simd/neon/arithmetic.h @@ -265,6 +265,14 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) { return vmlsq_f32(vnegq_f32(c), a, b); } #endif +// multiply, add for odd elements and subtract even elements. 
+// (a * b) -+ c +NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ + const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f); + return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c)); +} + /*************************** * FUSED F64 ***************************/ @@ -277,6 +285,11 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) { return vfmsq_f64(c, a, b); } NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { return vfmsq_f64(vnegq_f64(c), a, b); } + NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { + const npyv_f64 msign = npyv_set_f64(-0.0, 0.0); + return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c)); + } #endif // NPY_SIMD_F64 /*************************** diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index c0a771b5d..58d14809f 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -278,20 +278,25 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) return vrndnq_f32(a); #else // ARMv7 NEON only supports fp to int truncate conversion. - // a magic trick of adding 1.5 * 2**23 is used for rounding + // a magic trick of adding 1.5 * 2^23 is used for rounding // to nearest even and then subtract this magic number to get // the integer. - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); - const npyv_f32 magic = vdupq_n_f32(12582912.0f); // 1.5 * 2**23 - npyv_f32 round = vsubq_f32(vaddq_f32(a, magic), magic); - npyv_b32 overflow = vcleq_f32(vabsq_f32(a), vreinterpretq_f32_u32(vdupq_n_u32(0x4b000000))); - round = vbslq_f32(overflow, round, a); - // signed zero - round = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(round), - vandq_s32(vreinterpretq_s32_f32(a), szero) - )); - return round; + // + const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero); + const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23 + const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23 + npyv_u32 nnan_mask = vceqq_f32(a, a); + // eliminate nans to avoid invalid fp errors + npyv_f32 abs_x = vabsq_f32(vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)))); + // round by add magic number 1.5 * 2^23 + npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h); + // copysign + round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask )); + // a if |a| >= 2^23 or a == NaN + npyv_u32 mask = vcleq_f32(abs_x, two_power_23); + mask = vandq_u32(mask, nnan_mask); + return vbslq_f32(mask, round, a); #endif } #if NPY_SIMD_F64 @@ -302,33 +307,30 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_ASIMD #define npyv_ceil_f32 vrndpq_f32 #else - NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) - { - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) + { const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f)); - const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); - /** - * On armv7, vcvtq.f32 handles special cases as follows: - * NaN return 0 - * +inf or +outrange return 0x80000000(-0.0f) - * -inf or -outrange return 0x7fffffff(nan) - */ - npyv_s32 roundi = vcvtq_s32_f32(a); - npyv_f32 round = vcvtq_f32_s32(roundi); + const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero); + const npyv_f32 
two_power_23 = vdupq_n_f32(8388608.0); // 2^23 + const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23 + npyv_u32 nnan_mask = vceqq_f32(a, a); + npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))); + // eliminate nans to avoid invalid fp errors + npyv_f32 abs_x = vabsq_f32(x); + // round by add magic number 1.5 * 2^23 + npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h); + // copysign + round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask)); npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32( - vandq_u32(vcltq_f32(round, a), one)) - ); - // respect signed zero, e.g. -0.5 -> -0.0 - npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(ceil), - vandq_s32(vreinterpretq_s32_f32(a), szero) - )); - // if nan or overflow return a - npyv_u32 nnan = npyv_notnan_f32(a); - npyv_u32 overflow = vorrq_u32( - vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) + vandq_u32(vcltq_f32(round, x), one)) ); - return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + // respects signed zero + ceil = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(ceil), sign_mask)); + // a if |a| >= 2^23 or a == NaN + npyv_u32 mask = vcleq_f32(abs_x, two_power_23); + mask = vandq_u32(mask, nnan_mask); + return vbslq_f32(mask, ceil, a); } #endif #if NPY_SIMD_F64 @@ -339,29 +341,37 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_ASIMD #define npyv_trunc_f32 vrndq_f32 #else - NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) - { - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) + { const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); + const npyv_u32 exp_mask = vdupq_n_u32(0xff000000); + const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32( + vreinterpretq_u32_f32(a), vreinterpretq_u32_s32(szero)); + + npyv_u32 nfinite_mask = vshlq_n_u32(vreinterpretq_u32_f32(a), 1); + nfinite_mask = vandq_u32(nfinite_mask, exp_mask); + nfinite_mask = vceqq_u32(nfinite_mask, exp_mask); + // eliminate nans/inf to avoid invalid fp errors + npyv_f32 x = vreinterpretq_f32_u32( + veorq_u32(nfinite_mask, vreinterpretq_u32_f32(a))); /** * On armv7, vcvtq.f32 handles special cases as follows: * NaN return 0 * +inf or +outrange return 0x80000000(-0.0f) * -inf or -outrange return 0x7fffffff(nan) */ - npyv_s32 roundi = vcvtq_s32_f32(a); - npyv_f32 round = vcvtq_f32_s32(roundi); + npyv_s32 trunci = vcvtq_s32_f32(x); + npyv_f32 trunc = vcvtq_f32_s32(trunci); // respect signed zero, e.g. 
-0.5 -> -0.0 - npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(round), - vandq_s32(vreinterpretq_s32_f32(a), szero) - )); - // if nan or overflow return a - npyv_u32 nnan = npyv_notnan_f32(a); - npyv_u32 overflow = vorrq_u32( - vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) + trunc = vreinterpretq_f32_u32( + vorrq_u32(vreinterpretq_u32_f32(trunc), sign_mask)); + // if overflow return a + npyv_u32 overflow_mask = vorrq_u32( + vceqq_s32(trunci, szero), vceqq_s32(trunci, max_int) ); - return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + // a if a overflow or nonfinite + return vbslq_f32(vorrq_u32(nfinite_mask, overflow_mask), a, trunc); } #endif #if NPY_SIMD_F64 @@ -372,28 +382,31 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_ASIMD #define npyv_floor_f32 vrndmq_f32 #else - NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) - { - const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f)); + NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) + { const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f)); - const npyv_s32 max_int = vdupq_n_s32(0x7fffffff); + const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f)); + const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero); + const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0); // 2^23 + const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f); // 1.5 * 2^23 - npyv_s32 roundi = vcvtq_s32_f32(a); - npyv_f32 round = vcvtq_f32_s32(roundi); + npyv_u32 nnan_mask = vceqq_f32(a, a); + npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))); + // eliminate nans to avoid invalid fp errors + npyv_f32 abs_x = vabsq_f32(x); + // round by add magic number 1.5 * 2^23 + npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h); + // copysign + round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask)); npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32( - vandq_u32(vcgtq_f32(round, a), one) - )); - // respect signed zero - npyv_f32 rzero = vreinterpretq_f32_s32(vorrq_s32( - vreinterpretq_s32_f32(floor), - vandq_s32(vreinterpretq_s32_f32(a), szero) + vandq_u32(vcgtq_f32(round, x), one) )); - npyv_u32 nnan = npyv_notnan_f32(a); - npyv_u32 overflow = vorrq_u32( - vceqq_s32(roundi, szero), vceqq_s32(roundi, max_int) - ); - - return vbslq_f32(vbicq_u32(nnan, overflow), rzero, a); + // respects signed zero + floor = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(floor), sign_mask)); + // a if |a| >= 2^23 or a == NaN + npyv_u32 mask = vcleq_f32(abs_x, two_power_23); + mask = vandq_u32(mask, nnan_mask); + return vbslq_f32(mask, floor, a); } #endif // NPY_HAVE_ASIMD #if NPY_SIMD_F64 diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h index 7060ea628..6163440c3 100644 --- a/numpy/core/src/common/simd/neon/memory.h +++ b/numpy/core/src/common/simd/neon/memory.h @@ -102,6 +102,29 @@ NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) ); } #endif + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ + return vcombine_u32( + vld1_u32((const uint32_t*)ptr), vld1_u32((const uint32_t*)ptr + stride) + ); +} +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return npyv_reinterpret_s32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ return 
npyv_reinterpret_f32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_u64(ptr); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_s64(ptr); } +#if NPY_SIMD_F64 +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ (void)stride; return npyv_load_f64(ptr); } +#endif + /*************************** * Non-contiguous Store ***************************/ @@ -131,6 +154,32 @@ NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) { npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_f64(a)); } #endif +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ +#if NPY_SIMD_F64 + vst1q_lane_u64((uint64_t*)ptr, npyv_reinterpret_u64_u32(a), 0); + vst1q_lane_u64((uint64_t*)(ptr + stride), npyv_reinterpret_u64_u32(a), 1); +#else + // armhf strict to alignment + vst1_u32((uint32_t*)ptr, vget_low_u32(a)); + vst1_u32((uint32_t*)ptr + stride, vget_high_u32(a)); +#endif +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_s32(a)); } +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_f32(a)); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ (void)stride; npyv_store_u64(ptr, a); } +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ (void)stride; npyv_store_s64(ptr, a); } +#if NPY_SIMD_F64 +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ (void)stride; npyv_store_f64(ptr, a); } +#endif /********************************* * Partial Load *********************************/ @@ -168,6 +217,29 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { return npyv_load_till_s64(ptr, nlane, 0); } +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi}; + return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill)); + } + return npyv_load_s32(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return vreinterpretq_s32_s64(npyv_load_tillz_s64((const npy_int64*)ptr, nlane)); } + +//// 128-bit nlane +NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Non-contiguous partial load *********************************/ @@ -206,6 +278,34 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s64(ptr, stride, nlane, 0); } +//// 64-bit load over 32-bit 
stride +NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi}; + return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill)); + } + return npyv_loadn2_s32(ptr, stride); +} +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + return vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(0)); + } + return npyv_loadn2_s32(ptr, stride); +} + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Partial store *********************************/ @@ -238,6 +338,30 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a } npyv_store_s64(ptr, a); } + +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + if (nlane == 1) { + // armhf strict to alignment, may cause bus error + #if NPY_SIMD_F64 + vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0); + #else + npyv_storel_s32(ptr, a); + #endif + return; + } + npyv_store_s32(ptr, a); +} + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); (void)nlane; + npyv_store_s64(ptr, a); +} + /********************************* * Non-contiguous partial store *********************************/ @@ -245,16 +369,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); + vst1q_lane_s32((int32_t*)ptr, a, 0); switch(nlane) { - default: - vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3); + case 1: + return; + case 2: + vst1q_lane_s32((int32_t*)ptr + stride, a, 1); + return; case 3: + vst1q_lane_s32((int32_t*)ptr + stride, a, 1); vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); - case 2: + return; + default: vst1q_lane_s32((int32_t*)ptr + stride, a, 1); - case 1: - vst1q_lane_s32((int32_t*)ptr, a, 0); - break; + vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); + vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3); } } //// 64 @@ -268,6 +397,27 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp npyv_storen_s64(ptr, stride, a); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); +#if NPY_SIMD_F64 + vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0); + if (nlane > 1) { + vst1q_lane_s64((int64_t*)(ptr + stride), npyv_reinterpret_s64_s32(a), 1); + } +#else + npyv_storel_s32(ptr, a); + if (nlane > 1) { + npyv_storeh_s32(ptr + stride, a); + } +#endif +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); } + 
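Editorial note, not part of the patch: a scalar model of the strided pair-load just defined, assuming a 128-bit vector holding two pairs of int32 lanes. Here nlane counts 64-bit pairs; the first pair always comes from ptr, and the second pair comes from ptr + stride, or from the fill values when only one pair is requested. The model name is hypothetical.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Scalar model of npyv_loadn2_till_s32 on a 4 x int32 vector. */
static void loadn2_till_model(const int32_t *ptr, ptrdiff_t stride,
                              size_t nlane, int32_t fill_lo,
                              int32_t fill_hi, int32_t out[4])
{
    assert(nlane > 0);
    out[0] = ptr[0];          // first pair is always loaded
    out[1] = ptr[1];
    if (nlane == 1) {
        out[2] = fill_lo;     // trailing pair filled, not loaded
        out[3] = fill_hi;
    }
    else {
        out[2] = ptr[stride]; // second pair over the 32-bit stride
        out[3] = ptr[stride + 1];
    }
}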
/***************************************************************** * Implement partial load/store for u32/f32/u64/f64... via casting *****************************************************************/ @@ -278,7 +428,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ )); \ @@ -290,7 +441,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ )); \ @@ -332,6 +484,131 @@ NPYV_IMPL_NEON_REST_PARTIAL_TYPES(u64, s64) #if NPY_SIMD_F64 NPYV_IMPL_NEON_REST_PARTIAL_TYPES(f64, s64) #endif + +// 128-bit/64-bit stride +#define NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(u32, s32) +NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(f32, s32) 
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(u64, s64)
+#if NPY_SIMD_F64
+NPYV_IMPL_NEON_REST_PARTIAL_TYPES_PAIR(f64, s64)
+#endif
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_NEON_MEM_INTERLEAVE(SFX, T_PTR) \
+    NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+        const npyv_lanetype_##SFX *ptr \
+    ) { \
+        return vld2q_##SFX((const T_PTR*)ptr); \
+    } \
+    NPY_FINLINE void npyv_store_##SFX##x2( \
+        npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+    ) { \
+        vst2q_##SFX((T_PTR*)ptr, v); \
+    }
+
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u8, uint8_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s8, int8_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u16, uint16_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s16, int16_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(u32, uint32_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(s32, int32_t)
+NPYV_IMPL_NEON_MEM_INTERLEAVE(f32, float)
+
+#if NPY_SIMD_F64
+    NPYV_IMPL_NEON_MEM_INTERLEAVE(f64, double)
+    NPYV_IMPL_NEON_MEM_INTERLEAVE(u64, uint64_t)
+    NPYV_IMPL_NEON_MEM_INTERLEAVE(s64, int64_t)
+#else
+    #define NPYV_IMPL_NEON_MEM_INTERLEAVE_64(SFX) \
+        NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+            const npyv_lanetype_##SFX *ptr) \
+        { \
+            npyv_##SFX a = npyv_load_##SFX(ptr); \
+            npyv_##SFX b = npyv_load_##SFX(ptr + 2); \
+            npyv_##SFX##x2 r; \
+            r.val[0] = vcombine_##SFX(vget_low_##SFX(a), vget_low_##SFX(b)); \
+            r.val[1] = vcombine_##SFX(vget_high_##SFX(a), vget_high_##SFX(b)); \
+            return r; \
+        } \
+        NPY_FINLINE void npyv_store_##SFX##x2( \
+            npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v) \
+        { \
+            npyv_store_##SFX(ptr, vcombine_##SFX( \
+                vget_low_##SFX(v.val[0]), vget_low_##SFX(v.val[1]))); \
+            npyv_store_##SFX(ptr + 2, vcombine_##SFX( \
+                vget_high_##SFX(v.val[0]), vget_high_##SFX(v.val[1]))); \
+        }
+    NPYV_IMPL_NEON_MEM_INTERLEAVE_64(u64)
+    NPYV_IMPL_NEON_MEM_INTERLEAVE_64(s64)
+#endif
 /*********************************
  * Lookup table
  *********************************/
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
index b08071527..49c35c415 100644
--- a/numpy/core/src/common/simd/neon/neon.h
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -16,6 +16,7 @@
     #define NPY_SIMD_FMA3 0 // HW emulated
 #endif
 #define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 1
 
 typedef uint8x16_t npyv_u8;
 typedef int8x16_t npyv_s8;
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
index 249621bd6..e18ea94b8 100644
--- a/numpy/core/src/common/simd/neon/operators.h
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -240,10 +240,35 @@
 
 // check special cases
 NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a)
-{ return vceqq_f32(a, a); }
+{
+#if defined(__clang__)
+/**
+ * To avoid signaling on quiet NaNs, work around clang's bug with
+ * symmetric inputs; check https://github.com/numpy/numpy/issues/22933
+ * for more clarification.
+ */ + npyv_b32 ret; + #if NPY_SIMD_F64 + __asm("fcmeq %0.4s, %1.4s, %1.4s" : "=w" (ret) : "w" (a)); + #else + __asm("vceq.f32 %q0, %q1, %q1" : "=w" (ret) : "w" (a)); + #endif + return ret; +#else + return vceqq_f32(a, a); +#endif +} #if NPY_SIMD_F64 NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a) - { return vceqq_f64(a, a); } + { + #if defined(__clang__) + npyv_b64 ret; + __asm("fcmeq %0.2d, %1.2d, %1.2d" : "=w" (ret) : "w" (a)); + return ret; + #else + return vceqq_f64(a, a); + #endif + } #endif // Test cross all vector lanes diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h index 50b06ed11..8bf68f5be 100644 --- a/numpy/core/src/common/simd/neon/reorder.h +++ b/numpy/core/src/common/simd/neon/reorder.h @@ -76,36 +76,45 @@ NPYV_IMPL_NEON_COMBINE(npyv_f32, f32) NPYV_IMPL_NEON_COMBINE(npyv_f64, f64) #endif -// interleave two vectors -#define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \ - NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ - { \ - T_VEC##x2 r; \ - r.val[0] = vzip1q_##SFX(a, b); \ - r.val[1] = vzip2q_##SFX(a, b); \ - return r; \ - } - +// interleave & deinterleave two vectors #ifdef __aarch64__ - NPYV_IMPL_NEON_ZIP(npyv_u8, u8) - NPYV_IMPL_NEON_ZIP(npyv_s8, s8) - NPYV_IMPL_NEON_ZIP(npyv_u16, u16) - NPYV_IMPL_NEON_ZIP(npyv_s16, s16) - NPYV_IMPL_NEON_ZIP(npyv_u32, u32) - NPYV_IMPL_NEON_ZIP(npyv_s32, s32) - NPYV_IMPL_NEON_ZIP(npyv_f32, f32) - NPYV_IMPL_NEON_ZIP(npyv_f64, f64) + #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \ + NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = vzip1q_##SFX(a, b); \ + r.val[1] = vzip2q_##SFX(a, b); \ + return r; \ + } \ + NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = vuzp1q_##SFX(a, b); \ + r.val[1] = vuzp2q_##SFX(a, b); \ + return r; \ + } #else - #define npyv_zip_u8 vzipq_u8 - #define npyv_zip_s8 vzipq_s8 - #define npyv_zip_u16 vzipq_u16 - #define npyv_zip_s16 vzipq_s16 - #define npyv_zip_u32 vzipq_u32 - #define npyv_zip_s32 vzipq_s32 - #define npyv_zip_f32 vzipq_f32 + #define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \ + NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ + { return vzipq_##SFX(a, b); } \ + NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \ + { return vuzpq_##SFX(a, b); } #endif + +NPYV_IMPL_NEON_ZIP(npyv_u8, u8) +NPYV_IMPL_NEON_ZIP(npyv_s8, s8) +NPYV_IMPL_NEON_ZIP(npyv_u16, u16) +NPYV_IMPL_NEON_ZIP(npyv_s16, s16) +NPYV_IMPL_NEON_ZIP(npyv_u32, u32) +NPYV_IMPL_NEON_ZIP(npyv_s32, s32) +NPYV_IMPL_NEON_ZIP(npyv_f32, f32) + #define npyv_zip_u64 npyv_combine_u64 #define npyv_zip_s64 npyv_combine_s64 +#define npyv_zip_f64 npyv_combine_f64 +#define npyv_unzip_u64 npyv_combine_u64 +#define npyv_unzip_s64 npyv_combine_s64 +#define npyv_unzip_f64 npyv_combine_f64 // Reverse elements of each 64-bit lane #define npyv_rev64_u8 vrev64q_u8 @@ -116,4 +125,65 @@ NPYV_IMPL_NEON_COMBINE(npyv_f64, f64) #define npyv_rev64_s32 vrev64q_s32 #define npyv_rev64_f32 vrev64q_f32 +// Permuting the elements of each 128-bit lane by immediate index for +// each element. 
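Editorial note, not part of the patch: before the platform-specific definitions that follow, a scalar model of the permi128 contract may help. Lane i of the result takes lane Ei of the input, so (0, 1, 2, 3) is the identity permutation and (3, 2, 1, 0) reverses the four 32-bit lanes of each 128-bit vector. The model name is hypothetical.

#include <stdio.h>

/* Scalar model of npyv_permi128_u32(A, E0, E1, E2, E3). */
static void permi128_u32_model(const unsigned in[4], const int e[4],
                               unsigned out[4])
{
    for (int i = 0; i < 4; ++i) {
        out[i] = in[e[i]];  // result lane i <- input lane e[i]
    }
}

int main(void)
{
    const unsigned a[4] = {10, 20, 30, 40};
    const int rev[4] = {3, 2, 1, 0};
    unsigned r[4];
    permi128_u32_model(a, rev, r);
    printf("%u %u %u %u\n", r[0], r[1], r[2], r[3]); // 40 30 20 10
    return 0;
}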
+#ifdef __clang__
+    #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+        __builtin_shufflevector(A, A, E0, E1, E2, E3)
+#elif defined(__GNUC__)
+    #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+        __builtin_shuffle(A, npyv_set_u32(E0, E1, E2, E3))
+#else
+    #define npyv_permi128_u32(A, E0, E1, E2, E3) \
+        npyv_set_u32( \
+            vgetq_lane_u32(A, E0), vgetq_lane_u32(A, E1), \
+            vgetq_lane_u32(A, E2), vgetq_lane_u32(A, E3) \
+        )
+    #define npyv_permi128_s32(A, E0, E1, E2, E3) \
+        npyv_set_s32( \
+            vgetq_lane_s32(A, E0), vgetq_lane_s32(A, E1), \
+            vgetq_lane_s32(A, E2), vgetq_lane_s32(A, E3) \
+        )
+    #define npyv_permi128_f32(A, E0, E1, E2, E3) \
+        npyv_set_f32( \
+            vgetq_lane_f32(A, E0), vgetq_lane_f32(A, E1), \
+            vgetq_lane_f32(A, E2), vgetq_lane_f32(A, E3) \
+        )
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+    #define npyv_permi128_s32 npyv_permi128_u32
+    #define npyv_permi128_f32 npyv_permi128_u32
+#endif
+
+#ifdef __clang__
+    #define npyv_permi128_u64(A, E0, E1) \
+        __builtin_shufflevector(A, A, E0, E1)
+#elif defined(__GNUC__)
+    #define npyv_permi128_u64(A, E0, E1) \
+        __builtin_shuffle(A, npyv_set_u64(E0, E1))
+#else
+    #define npyv_permi128_u64(A, E0, E1) \
+        npyv_set_u64( \
+            vgetq_lane_u64(A, E0), vgetq_lane_u64(A, E1) \
+        )
+    #define npyv_permi128_s64(A, E0, E1) \
+        npyv_set_s64( \
+            vgetq_lane_s64(A, E0), vgetq_lane_s64(A, E1) \
+        )
+    #define npyv_permi128_f64(A, E0, E1) \
+        npyv_set_f64( \
+            vgetq_lane_f64(A, E0), vgetq_lane_f64(A, E1) \
+        )
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+    #define npyv_permi128_s64 npyv_permi128_u64
+    #define npyv_permi128_f64 npyv_permi128_u64
+#endif
+
+#if !NPY_SIMD_F64
+    #undef npyv_permi128_f64
+#endif
+
 #endif // _NPY_SIMD_NEON_REORDER_H
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
index 92a77ad80..8c9b14251 100644
--- a/numpy/core/src/common/simd/simd.h
+++ b/numpy/core/src/common/simd/simd.h
@@ -82,6 +82,9 @@ typedef double npyv_lanetype_f64;
     #define NPY_SIMD_FMA3 0
     /// 1 if the enabled SIMD extension is running on big-endian mode otherwise 0.
     #define NPY_SIMD_BIGENDIAN 0
+    /// 1 if the supported comparison intrinsics (lt, le, gt, ge)
+    /// raise an FP invalid exception for quiet NaNs.
+    #define NPY_SIMD_CMPSIGNAL 0
 #endif
 
 // enable emulated mask operations for all SIMD extension except for AVX512
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
index bced35108..72a87eac1 100644
--- a/numpy/core/src/common/simd/sse/arithmetic.h
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -282,6 +282,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
     // negate multiply and subtract, -(a*b) - c
     #define npyv_nmulsub_f32 _mm_fnmsub_ps
     #define npyv_nmulsub_f64 _mm_fnmsub_pd
+    // multiply, add for odd elements and subtract even elements.
+    // (a * b) -+ c
+    #define npyv_muladdsub_f32 _mm_fmaddsub_ps
+    #define npyv_muladdsub_f64 _mm_fmaddsub_pd
 #elif defined(NPY_HAVE_FMA4)
     // multiply and add, a*b + c
     #define npyv_muladd_f32 _mm_macc_ps
     #define npyv_muladd_f64 _mm_macc_pd
@@ -292,6 +296,10 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
     // negate multiply and add, -(a*b) + c
     #define npyv_nmuladd_f32 _mm_nmacc_ps
     #define npyv_nmuladd_f64 _mm_nmacc_pd
+    // multiply, add for odd elements and subtract even elements.
+ // (a * b) -+ c + #define npyv_muladdsub_f32 _mm_maddsub_ps + #define npyv_muladdsub_f64 _mm_maddsub_pd #else // multiply and add, a*b + c NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) @@ -308,6 +316,28 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) { return npyv_sub_f32(c, npyv_mul_f32(a, b)); } NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { return npyv_sub_f64(c, npyv_mul_f64(a, b)); } + // multiply, add for odd elements and subtract even elements. + // (a * b) -+ c + NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) + { + npyv_f32 m = npyv_mul_f32(a, b); + #if NPY_HAVE_SSE3 + return _mm_addsub_ps(m, c); + #else + const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f); + return npyv_add_f32(m, npyv_xor_f32(msign, c)); + #endif + } + NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) + { + npyv_f64 m = npyv_mul_f64(a, b); + #if NPY_HAVE_SSE3 + return _mm_addsub_pd(m, c); + #else + const npyv_f64 msign = npyv_set_f64(-0.0, 0.0); + return npyv_add_f64(m, npyv_xor_f64(msign, c)); + #endif + } #endif // NPY_HAVE_FMA3 #ifndef NPY_HAVE_FMA3 // for FMA4 and NON-FMA3 // negate multiply and subtract, -(a*b) - c diff --git a/numpy/core/src/common/simd/sse/math.h b/numpy/core/src/common/simd/sse/math.h index 967240d09..b51c935af 100644 --- a/numpy/core/src/common/simd/sse/math.h +++ b/numpy/core/src/common/simd/sse/math.h @@ -269,13 +269,23 @@ NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) #ifdef NPY_HAVE_SSE41 return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); #else - const npyv_f32 szero = _mm_set1_ps(-0.0f); - __m128i roundi = _mm_cvtps_epi32(a); - __m128i overflow = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); - __m128 r = _mm_cvtepi32_ps(roundi); - // respect sign of zero - r = _mm_or_ps(r, _mm_and_ps(a, szero)); - return npyv_select_f32(overflow, a, r); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i roundi = _mm_cvtps_epi32(x); + __m128 round = _mm_cvtepi32_ps(roundi); + // respect signed zero + round = _mm_or_ps(round, _mm_and_ps(a, szero)); + // if overflow return a + __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, round); #endif } @@ -285,16 +295,20 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #ifdef NPY_HAVE_SSE41 return _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT); #else - const npyv_f64 szero = _mm_set1_pd(-0.0); - const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); - npyv_f64 abs_a = npyv_abs_f64(a); + const __m128d szero = _mm_set1_pd(-0.0); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // eliminate nans to avoid invalid fp errors within cmpge + __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a)); // round by add magic number 2^52 // assuming that MXCSR register is set to rounding - npyv_f64 abs_round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_a), two_power_52); + __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); // copysign - npyv_f64 round = _mm_or_pd(abs_round, 
_mm_and_pd(a, szero)); - // a if a >= 2^52 - return npyv_select_f64(npyv_cmpge_f64(abs_a, two_power_52), a, round); + round = _mm_or_pd(round, _mm_and_pd(a, szero)); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, round); #endif } // ceil @@ -304,24 +318,48 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #else NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) { - const npyv_f32 szero = _mm_set1_ps(-0.0f); - const npyv_f32 one = _mm_set1_ps(1.0f); - npyv_s32 roundi = _mm_cvttps_epi32(a); - npyv_f32 round = _mm_cvtepi32_ps(roundi); - npyv_f32 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, a), one)); - // respect signed zero, e.g. -0.5 -> -0.0 - npyv_f32 rzero = _mm_or_ps(ceil, _mm_and_ps(a, szero)); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i roundi = _mm_cvtps_epi32(x); + __m128 round = _mm_cvtepi32_ps(roundi); + __m128 ceil = _mm_add_ps(round, _mm_and_ps(_mm_cmplt_ps(round, x), one)); + // respect signed zero + ceil = _mm_or_ps(ceil, _mm_and_ps(a, szero)); // if overflow return a - return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero); + __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, ceil); } NPY_FINLINE npyv_f64 npyv_ceil_f64(npyv_f64 a) { - const npyv_f64 szero = _mm_set1_pd(-0.0); - const npyv_f64 one = _mm_set1_pd(1.0); - npyv_f64 round = npyv_rint_f64(a); - npyv_f64 ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, a), one)); - // respect signed zero, e.g. 
-0.5 -> -0.0 - return _mm_or_pd(ceil, _mm_and_pd(a, szero)); + const __m128d one = _mm_set1_pd(1.0); + const __m128d szero = _mm_set1_pd(-0.0); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // eliminate nans to avoid invalid fp errors within cmpge + __m128d x = _mm_xor_pd(nan_mask, a); + __m128d abs_x = npyv_abs_f64(x); + __m128d sign_x = _mm_and_pd(x, szero); + // round by add magic number 2^52 + // assuming that MXCSR register is set to rounding + __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); + // copysign + round = _mm_or_pd(round, sign_x); + __m128d ceil = _mm_add_pd(round, _mm_and_pd(_mm_cmplt_pd(round, x), one)); + // respects sign of 0.0 + ceil = _mm_or_pd(ceil, sign_x); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, ceil); } #endif @@ -332,26 +370,43 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #else NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) { - const npyv_f32 szero = _mm_set1_ps(-0.0f); - npyv_s32 roundi = _mm_cvttps_epi32(a); - npyv_f32 trunc = _mm_cvtepi32_ps(roundi); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i trunci = _mm_cvttps_epi32(x); + __m128 trunc = _mm_cvtepi32_ps(trunci); // respect signed zero, e.g. -0.5 -> -0.0 - npyv_f32 rzero = _mm_or_ps(trunc, _mm_and_ps(a, szero)); + trunc = _mm_or_ps(trunc, _mm_and_ps(a, szero)); // if overflow return a - return npyv_select_f32(_mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)), a, rzero); + __m128i overflow_mask = _mm_cmpeq_epi32(trunci, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, trunc); } NPY_FINLINE npyv_f64 npyv_trunc_f64(npyv_f64 a) { - const npyv_f64 szero = _mm_set1_pd(-0.0); - const npyv_f64 one = _mm_set1_pd(1.0); - const npyv_f64 two_power_52 = _mm_set1_pd(0x10000000000000); - npyv_f64 abs_a = npyv_abs_f64(a); + const __m128d one = _mm_set1_pd(1.0); + const __m128d szero = _mm_set1_pd(-0.0); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // eliminate nans to avoid invalid fp errors within cmpge + __m128d abs_x = npyv_abs_f64(_mm_xor_pd(nan_mask, a)); // round by add magic number 2^52 - npyv_f64 abs_round = _mm_sub_pd(_mm_add_pd(abs_a, two_power_52), two_power_52); - npyv_f64 subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_a), one); - npyv_f64 trunc = _mm_or_pd(_mm_sub_pd(abs_round, subtrahend), _mm_and_pd(a, szero)); - // a if a >= 2^52 - return npyv_select_f64(npyv_cmpge_f64(abs_a, two_power_52), a, trunc); + // assuming that MXCSR register is set to rounding + __m128d abs_round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); + __m128d subtrahend = _mm_and_pd(_mm_cmpgt_pd(abs_round, abs_x), one); + __m128d trunc = _mm_sub_pd(abs_round, subtrahend); + // copysign + trunc = _mm_or_pd(trunc, _mm_and_pd(a, szero)); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, 
trunc); } #endif @@ -362,15 +417,46 @@ NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) #else NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) { - const npyv_f32 one = _mm_set1_ps(1.0f); - npyv_f32 round = npyv_rint_f32(a); - return _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, a), one)); + const __m128 one = _mm_set1_ps(1.0f); + const __m128 szero = _mm_set1_ps(-0.0f); + const __m128i exp_mask = _mm_set1_epi32(0xff000000); + + __m128i nfinite_mask = _mm_slli_epi32(_mm_castps_si128(a), 1); + nfinite_mask = _mm_and_si128(nfinite_mask, exp_mask); + nfinite_mask = _mm_cmpeq_epi32(nfinite_mask, exp_mask); + + // eliminate nans/inf to avoid invalid fp errors + __m128 x = _mm_xor_ps(a, _mm_castsi128_ps(nfinite_mask)); + __m128i roundi = _mm_cvtps_epi32(x); + __m128 round = _mm_cvtepi32_ps(roundi); + __m128 floor = _mm_sub_ps(round, _mm_and_ps(_mm_cmpgt_ps(round, x), one)); + // respect signed zero + floor = _mm_or_ps(floor, _mm_and_ps(a, szero)); + // if overflow return a + __m128i overflow_mask = _mm_cmpeq_epi32(roundi, _mm_castps_si128(szero)); + // a if a overflow or nonfinite + return npyv_select_f32(_mm_or_si128(nfinite_mask, overflow_mask), a, floor); } NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a) { - const npyv_f64 one = _mm_set1_pd(1.0); - npyv_f64 round = npyv_rint_f64(a); - return _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, a), one)); + const __m128d one = _mm_set1_pd(1.0f); + const __m128d szero = _mm_set1_pd(-0.0f); + const __m128d two_power_52 = _mm_set1_pd(0x10000000000000); + __m128d nan_mask = _mm_cmpunord_pd(a, a); + // eliminate nans to avoid invalid fp errors within cmpge + __m128d x = _mm_xor_pd(nan_mask, a); + __m128d abs_x = npyv_abs_f64(x); + __m128d sign_x = _mm_and_pd(x, szero); + // round by add magic number 2^52 + // assuming that MXCSR register is set to rounding + __m128d round = _mm_sub_pd(_mm_add_pd(two_power_52, abs_x), two_power_52); + // copysign + round = _mm_or_pd(round, sign_x); + __m128d floor = _mm_sub_pd(round, _mm_and_pd(_mm_cmpgt_pd(round, x), one)); + // a if |a| >= 2^52 or a == NaN + __m128d mask = _mm_cmpge_pd(abs_x, two_power_52); + mask = _mm_or_pd(mask, nan_mask); + return npyv_select_f64(_mm_castpd_si128(mask), a, floor); } #endif // NPY_HAVE_SSE41 diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h index 3ff64848d..4c8e86a6f 100644 --- a/numpy/core/src/common/simd/sse/memory.h +++ b/numpy/core/src/common/simd/sse/memory.h @@ -103,6 +103,28 @@ NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) { return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); } NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) { return _mm_castpd_si128(npyv_loadn_f64((const double*)ptr, stride)); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ + __m128d r = _mm_loadh_pd( + npyv_loadl_f64((const double*)ptr), (const double*)(ptr + stride) + ); + return _mm_castpd_ps(r); +} +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ return _mm_castps_si128(npyv_loadn2_f32((const float*)ptr, stride)); } +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return _mm_castps_si128(npyv_loadn2_f32((const float*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ (void)stride; return npyv_load_f64(ptr); } +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, 
npy_intp stride) +{ (void)stride; return npyv_load_u64(ptr); } +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_s64(ptr); } + /*************************** * Non-contiguous Store ***************************/ @@ -135,6 +157,24 @@ NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) { npyv_storen_f64((double*)ptr, stride, _mm_castsi128_pd(a)); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + _mm_storel_pd((double*)ptr, _mm_castsi128_pd(a)); + _mm_storeh_pd((double*)(ptr + stride), _mm_castsi128_pd(a)); +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, a); } +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, _mm_castps_si128(a)); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ (void)stride; npyv_store_u64(ptr, a); } +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ (void)stride; npyv_store_s64(ptr, a); } +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ (void)stride; npyv_store_f64(ptr, a); } /********************************* * Partial Load *********************************/ @@ -204,13 +244,14 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) return _mm_cvtsi32_si128(*ptr); case 2: return _mm_loadl_epi64((const __m128i*)ptr); - case 3:; - npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr); - #ifdef NPY_HAVE_SSE41 - return _mm_insert_epi32(a, ptr[2], 2); - #else - return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2])); - #endif + case 3: { + npyv_s32 a = _mm_loadl_epi64((const __m128i*)ptr); + #ifdef NPY_HAVE_SSE41 + return _mm_insert_epi32(a, ptr[2], 2); + #else + return _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[2])); + #endif + } default: return npyv_load_s32(ptr); } @@ -246,6 +287,32 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) } return npyv_load_s64(ptr); } + +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const __m128i vfill = npyv_set_s32(fill_lo, fill_hi, fill_lo, fill_hi); + return _mm_castpd_si128( + _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) + ); + } + return npyv_load_s32(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return npyv_load_tillz_s64((const npy_int64*)ptr, nlane); } + +//// 128-bit nlane +NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Non-contiguous partial load *********************************/ @@ -305,23 +372,27 @@ npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) case 1: return _mm_cvtsi32_si128(ptr[0]); case 2:; - npyv_s32 a = _mm_cvtsi32_si128(ptr[0]); -#ifdef NPY_HAVE_SSE41 - return _mm_insert_epi32(a, ptr[stride], 
1); -#else - return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); -#endif // NPY_HAVE_SSE41 - case 3:; - a = _mm_cvtsi32_si128(ptr[0]); -#ifdef NPY_HAVE_SSE41 - a = _mm_insert_epi32(a, ptr[stride], 1); - a = _mm_insert_epi32(a, ptr[stride*2], 2); - return a; -#else - a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); - a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2])); - return a; -#endif // NPY_HAVE_SSE41 + { + npyv_s32 a = _mm_cvtsi32_si128(ptr[0]); + #ifdef NPY_HAVE_SSE41 + return _mm_insert_epi32(a, ptr[stride], 1); + #else + return _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); + #endif // NPY_HAVE_SSE41 + } + case 3: + { + npyv_s32 a = _mm_cvtsi32_si128(ptr[0]); + #ifdef NPY_HAVE_SSE41 + a = _mm_insert_epi32(a, ptr[stride], 1); + a = _mm_insert_epi32(a, ptr[stride*2], 2); + return a; + #else + a = _mm_unpacklo_epi32(a, _mm_cvtsi32_si128(ptr[stride])); + a = _mm_unpacklo_epi64(a, _mm_cvtsi32_si128(ptr[stride*2])); + return a; + #endif // NPY_HAVE_SSE41 + } default: return npyv_loadn_s32(ptr, stride); } @@ -358,6 +429,37 @@ NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, } return npyv_loadn_s64(ptr, stride); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const __m128i vfill = npyv_set_s32(0, 0, fill_lo, fill_hi); + return _mm_castpd_si128( + _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) + ); + } + return npyv_loadn2_s32(ptr, stride); +} +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + return _mm_loadl_epi64((const __m128i*)ptr); + } + return npyv_loadn2_s32(ptr, stride); +} + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Partial store *********************************/ @@ -394,6 +496,17 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a } npyv_store_s64(ptr, a); } +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ npyv_store_till_s64((npy_int64*)ptr, nlane, a); } + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); (void)nlane; + npyv_store_s64(ptr, a); +} + /********************************* * Non-contiguous partial store *********************************/ @@ -401,25 +514,35 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); + ptr[stride*0] = _mm_cvtsi128_si32(a); switch(nlane) { + case 1: + return; #ifdef NPY_HAVE_SSE41 - default: - ptr[stride*3] = _mm_extract_epi32(a, 3); + case 2: + ptr[stride*1] = _mm_extract_epi32(a, 1); + return; case 3: + ptr[stride*1] = _mm_extract_epi32(a, 1); ptr[stride*2] = _mm_extract_epi32(a, 2); - case 2: + return; + default: ptr[stride*1] = 
_mm_extract_epi32(a, 1); + ptr[stride*2] = _mm_extract_epi32(a, 2); + ptr[stride*3] = _mm_extract_epi32(a, 3); #else - default: - ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3))); + case 2: + ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1))); + return; case 3: + ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1))); ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2))); - case 2: + return; + default: ptr[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 1))); + ptr[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 2))); + ptr[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 3))); #endif - case 1: - ptr[stride*0] = _mm_cvtsi128_si32(a); - break; } } //// 64 @@ -432,6 +555,21 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp } npyv_storen_s64(ptr, stride, a); } + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + npyv_storel_s32(ptr, a); + if (nlane > 1) { + npyv_storeh_s32(ptr + stride, a); + } +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); } + /***************************************************************** * Implement partial load/store for u32/f32/u64/f64... via casting *****************************************************************/ @@ -442,7 +580,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ )); \ @@ -454,7 +593,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ )); \ @@ -495,6 +635,110 @@ NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_SSE_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_SSE_REST_PARTIAL_TYPES(f64, s64) +// 128-bit/64-bit stride +#define NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX 
to_##T_SFX; \
+        }; \
+        union pun pun_lo; \
+        union pun pun_hi; \
+        pun_lo.from_##F_SFX = fill_lo; \
+        pun_hi.from_##F_SFX = fill_hi; \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \
+            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \
+            pun_hi.to_##T_SFX \
+        )); \
+    } \
+    NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \
+    (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \
+    { \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \
+            (const npyv_lanetype_##T_SFX *)ptr, nlane \
+        )); \
+    } \
+    NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \
+    (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \
+    { \
+        return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \
+            (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \
+        )); \
+    } \
+    NPY_FINLINE void npyv_store2_till_##F_SFX \
+    (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \
+    { \
+        npyv_store2_till_##T_SFX( \
+            (npyv_lanetype_##T_SFX *)ptr, nlane, \
+            npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+        ); \
+    } \
+    NPY_FINLINE void npyv_storen2_till_##F_SFX \
+    (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \
+    { \
+        npyv_storen2_till_##T_SFX( \
+            (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \
+            npyv_reinterpret_##T_SFX##_##F_SFX(a) \
+        ); \
+    }
+
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(u32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(f32, s32)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(u64, s64)
+NPYV_IMPL_SSE_REST_PARTIAL_TYPES_PAIR(f64, s64)
+
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_SSE_MEM_INTERLEAVE(SFX, ZSFX) \
+    NPY_FINLINE npyv_##ZSFX##x2 npyv_zip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+    NPY_FINLINE npyv_##ZSFX##x2 npyv_unzip_##ZSFX(npyv_##ZSFX, npyv_##ZSFX); \
+    NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \
+        const npyv_lanetype_##SFX *ptr \
+    ) { \
+        return npyv_unzip_##ZSFX( \
+            npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \
+        ); \
+    } \
+    NPY_FINLINE void npyv_store_##SFX##x2( \
+        npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \
+    ) { \
+        npyv_##SFX##x2 zip = npyv_zip_##ZSFX(v.val[0], v.val[1]); \
+        npyv_store_##SFX(ptr, zip.val[0]); \
+        npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \
+    }
+
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u8, u8)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s8, u8)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u16, u16)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s16, u16)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u32, u32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s32, u32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(u64, u64)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(s64, u64)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(f32, f32)
+NPYV_IMPL_SSE_MEM_INTERLEAVE(f64, f64)
+
 /*********************************
  * Lookup table
  *********************************/
diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h
index d96ab9c56..9a57f6489 100644
--- a/numpy/core/src/common/simd/sse/reorder.h
+++ b/numpy/core/src/common/simd/sse/reorder.h
@@ -81,6 +81,75 @@ NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64)
 NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps)
 NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd)
 
+// deinterleave two vectors
+NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1)
+{
+#ifdef NPY_HAVE_SSSE3
+    const __m128i idx = _mm_setr_epi8(
+        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+    );
+    __m128i abl = _mm_shuffle_epi8(ab0, idx);
+    __m128i abh =
_mm_shuffle_epi8(ab1, idx); + return npyv_combine_u8(abl, abh); +#else + __m128i ab_083b = _mm_unpacklo_epi8(ab0, ab1); + __m128i ab_4c6e = _mm_unpackhi_epi8(ab0, ab1); + __m128i ab_048c = _mm_unpacklo_epi8(ab_083b, ab_4c6e); + __m128i ab_36be = _mm_unpackhi_epi8(ab_083b, ab_4c6e); + __m128i ab_0346 = _mm_unpacklo_epi8(ab_048c, ab_36be); + __m128i ab_8bc8 = _mm_unpackhi_epi8(ab_048c, ab_36be); + npyv_u8x2 r; + r.val[0] = _mm_unpacklo_epi8(ab_0346, ab_8bc8); + r.val[1] = _mm_unpackhi_epi8(ab_0346, ab_8bc8); + return r; +#endif +} +#define npyv_unzip_s8 npyv_unzip_u8 + +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ +#ifdef NPY_HAVE_SSSE3 + const __m128i idx = _mm_setr_epi8( + 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 + ); + __m128i abl = _mm_shuffle_epi8(ab0, idx); + __m128i abh = _mm_shuffle_epi8(ab1, idx); + return npyv_combine_u16(abl, abh); +#else + __m128i ab_0415 = _mm_unpacklo_epi16(ab0, ab1); + __m128i ab_263f = _mm_unpackhi_epi16(ab0, ab1); + __m128i ab_0246 = _mm_unpacklo_epi16(ab_0415, ab_263f); + __m128i ab_135f = _mm_unpackhi_epi16(ab_0415, ab_263f); + npyv_u16x2 r; + r.val[0] = _mm_unpacklo_epi16(ab_0246, ab_135f); + r.val[1] = _mm_unpackhi_epi16(ab_0246, ab_135f); + return r; +#endif +} +#define npyv_unzip_s16 npyv_unzip_u16 + +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + __m128i abl = _mm_shuffle_epi32(ab0, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i abh = _mm_shuffle_epi32(ab1, _MM_SHUFFLE(3, 1, 2, 0)); + return npyv_combine_u32(abl, abh); +} +#define npyv_unzip_s32 npyv_unzip_u32 + +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ return npyv_combine_u64(ab0, ab1); } +#define npyv_unzip_s64 npyv_unzip_u64 + +NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) +{ + npyv_f32x2 r; + r.val[0] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(2, 0, 2, 0)); + r.val[1] = _mm_shuffle_ps(ab0, ab1, _MM_SHUFFLE(3, 1, 3, 1)); + return r; +} +NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1) +{ return npyv_combine_f64(ab0, ab1); } + // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a) { @@ -122,4 +191,22 @@ NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); } +// Permuting the elements of each 128-bit lane by immediate index for +// each element. 
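Editorial note, not part of the patch: the 64-bit permute defined just below reuses the 32-bit integer shuffle, so a 64-bit lane index E has to be expanded to the 32-bit element pair (2E, 2E+1); that is what the ((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1) arithmetic in the immediate does. A sketch under that assumption, with MODEL_* as hypothetical stand-ins for _MM_SHUFFLE and the generated immediate:

#include <stdio.h>

/* MODEL_MM_SHUFFLE packs four 2-bit element selectors, highest lane
 * first, matching the layout of _MM_SHUFFLE. */
#define MODEL_MM_SHUFFLE(z, y, x, w) \
    (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
/* Immediate for permuting two 64-bit lanes via a 32-bit shuffle. */
#define MODEL_PERMI128_U64_IMM(E0, E1) \
    MODEL_MM_SHUFFLE(((E1) << 1) + 1, ((E1) << 1), ((E0) << 1) + 1, ((E0) << 1))

int main(void)
{
    // Swapping the two 64-bit halves (E0=1, E1=0) selects the 32-bit
    // elements 2, 3, 0, 1, i.e. _MM_SHUFFLE(1, 0, 3, 2) == 0x4e.
    printf("0x%02x\n", MODEL_PERMI128_U64_IMM(1, 0)); // prints 0x4e
    return 0;
}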
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + _mm_shuffle_epi32(A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_s32 npyv_permi128_u32 + +#define npyv_permi128_u64(A, E0, E1) \ + _mm_shuffle_epi32(A, _MM_SHUFFLE(((E1)<<1)+1, ((E1)<<1), ((E0)<<1)+1, ((E0)<<1))) + +#define npyv_permi128_s64 npyv_permi128_u64 + +#define npyv_permi128_f32(A, E0, E1, E2, E3) \ + _mm_shuffle_ps(A, A, _MM_SHUFFLE(E3, E2, E1, E0)) + +#define npyv_permi128_f64(A, E0, E1) \ + _mm_shuffle_pd(A, A, _MM_SHUFFLE2(E1, E0)) + #endif // _NPY_SIMD_SSE_REORDER_H diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h index c21bbfda7..0c6b8cdba 100644 --- a/numpy/core/src/common/simd/sse/sse.h +++ b/numpy/core/src/common/simd/sse/sse.h @@ -12,6 +12,7 @@ #define NPY_SIMD_FMA3 0 // fast emulated #endif #define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 1 typedef __m128i npyv_u8; typedef __m128i npyv_s8; diff --git a/numpy/core/src/common/simd/vec/arithmetic.h b/numpy/core/src/common/simd/vec/arithmetic.h index a2e9d07eb..85f4d6b26 100644 --- a/numpy/core/src/common/simd/vec/arithmetic.h +++ b/numpy/core/src/common/simd/vec/arithmetic.h @@ -322,6 +322,20 @@ NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { return vec_neg(vec_madd(a, b, c)); } #endif +// multiply, add for odd elements and subtract even elements. +// (a * b) -+ c +#if NPY_SIMD_F32 +NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ + const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f); + return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c)); +} +#endif +NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ + const npyv_f64 msign = npyv_set_f64(-0.0, 0.0); + return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c)); +} /*************************** * Summation ***************************/ @@ -352,6 +366,7 @@ NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a)); return vec_extract(sum, 0) + vec_extract(sum, 1); + (void)sum; } #endif @@ -372,6 +387,7 @@ NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a) npyv_u32 four = vec_sum4s(a, zero); npyv_s32 one = vec_sums((npyv_s32)four, (npyv_s32)zero); return (npy_uint16)vec_extract(one, 3); + (void)one; #endif } @@ -386,6 +402,7 @@ NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a) npyv_u32 four = vec_add(eight.val[0], eight.val[1]); npyv_s32 one = vec_sums((npyv_s32)four, zero); return (npy_uint32)vec_extract(one, 3); + (void)one; #endif } diff --git a/numpy/core/src/common/simd/vec/conversion.h b/numpy/core/src/common/simd/vec/conversion.h index f0d625c55..922109f7b 100644 --- a/numpy/core/src/common/simd/vec/conversion.h +++ b/numpy/core/src/common/simd/vec/conversion.h @@ -96,6 +96,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 4); #endif + // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable] + (void)r; } NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a) { @@ -106,6 +108,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 8); #endif + // to suppress ambiguous warning: variable `r` but not used [-Wunused-but-set-variable] + (void)r; } NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a) { @@ -120,6 +124,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, #else return vec_extract(r, 8); #endif + // to suppress ambiguous warning: 
variable `r` set but not used [-Wunused-but-set-variable]
+    (void)r;
 }
 NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
 {
@@ -134,6 +140,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
 #else
     return vec_extract(r, 8);
 #endif
+    // to suppress warning: variable `r` set but not used [-Wunused-but-set-variable]
+    (void)r;
 }
 #else
 NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
@@ -170,7 +178,8 @@ npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
 #ifdef NPY_HAVE_VXE2
     return vec_signed(a);
 #elif defined(NPY_HAVE_VXE)
-    return vec_packs(vec_signed(npyv_doublee(a)), vec_signed(npyv_doublee(vec_mergel(a, a))));
+    return vec_packs(vec_signed(npyv_doublee(vec_mergeh(a, a))),
+                     vec_signed(npyv_doublee(vec_mergel(a, a))));
 // VSX
 #elif defined(__IBMC__)
     return vec_cts(a, 0);
diff --git a/numpy/core/src/common/simd/vec/math.h b/numpy/core/src/common/simd/vec/math.h
index 95b16fdf7..85690f76c 100644
--- a/numpy/core/src/common/simd/vec/math.h
+++ b/numpy/core/src/common/simd/vec/math.h
@@ -186,6 +186,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int32, s32)
     {                                                                    \
         npyv_##SFX r = vec_##INTRIN(a, vec_sld(a, a, 8));                \
         return (npy_##STYPE)vec_extract(r, 0);                           \
+        (void)r;                                                         \
     }
 NPY_IMPL_VEC_REDUCE_MINMAX(min, uint64, u64)
 NPY_IMPL_VEC_REDUCE_MINMAX(max, uint64, u64)
@@ -225,6 +226,7 @@ NPY_IMPL_VEC_REDUCE_MINMAX(max, int64, s64)
     {                                                                    \
         npyv_f64 r = vec_##INTRIN(a, vec_sld(a, a, 8));                  \
         return vec_extract(r, 0);                                        \
+        (void)r;                                                         \
     }                                                                    \
     NPY_FINLINE double npyv_reduce_##INTRIN##n_f64(npyv_f64 a)           \
     {                                                                    \
diff --git a/numpy/core/src/common/simd/vec/memory.h b/numpy/core/src/common/simd/vec/memory.h
index e8f588ef2..1ad39cead 100644
--- a/numpy/core/src/common/simd/vec/memory.h
+++ b/numpy/core/src/common/simd/vec/memory.h
@@ -46,14 +46,8 @@
 #endif
 
 // avoid aliasing rules
-#ifdef __cplusplus
-    template<typename T_PTR>
-    NPY_FINLINE npy_uint64 *npyv__ptr2u64(const T_PTR *ptr)
-    { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#else
-    NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
-    { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
-#endif // __cplusplus
+NPY_FINLINE npy_uint64 *npyv__ptr2u64(const void *ptr)
+{ npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
 
 // load lower part
 NPY_FINLINE npyv_u64 npyv__loadl(const void *ptr)
@@ -136,6 +130,24 @@ NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
 { return npyv_set_s64(ptr[0], ptr[stride]); }
 NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride)
 { return npyv_set_f64(ptr[0], ptr[stride]); }
+
+//// 64-bit load over 32-bit stride
+NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride)
+{ return (npyv_u32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride)
+{ return (npyv_s32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride)
+{ return (npyv_f32)npyv_set_u64(*(npy_uint64*)ptr, *(npy_uint64*)(ptr + stride)); }
+#endif
+//// 128-bit load over 64-bit stride
+NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_u64(ptr); }
+NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_s64(ptr); }
+NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride)
+{ (void)stride; return npyv_load_f64(ptr); }
+
 /***************************
 * Non-contiguous Store
***************************/ @@ -164,6 +176,26 @@ NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) { npyv_storen_u64((npy_uint64*)ptr, stride, (npyv_u64)a); } +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + *(npy_uint64*)ptr = vec_extract((npyv_u64)a, 0); + *(npy_uint64*)(ptr + stride) = vec_extract((npyv_u64)a, 1); +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); } +#if NPY_SIMD_F32 +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, (npyv_u32)a); } +#endif +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ (void)stride; npyv_store_u64(ptr, a); } +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ (void)stride; npyv_store_s64(ptr, a); } +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ (void)stride; npyv_store_f64(ptr, a); } + /********************************* * Partial Load *********************************/ @@ -173,9 +205,9 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n assert(nlane > 0); npyv_s32 vfill = npyv_setall_s32(fill); #ifdef NPY_HAVE_VX - const unsigned blane = (unsigned short)nlane; + const unsigned blane = (nlane > 4) ? 4 : nlane; const npyv_u32 steps = npyv_set_u32(0, 1, 2, 3); - const npyv_u32 vlane = npyv_setall_u32((unsigned)blane); + const npyv_u32 vlane = npyv_setall_u32(blane); const npyv_b32 mask = vec_cmpgt(vlane, steps); npyv_s32 a = vec_load_len(ptr, blane*4-1); return vec_sel(vfill, a, mask); @@ -201,8 +233,8 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) { #ifdef NPY_HAVE_VX - unsigned blane = ((unsigned short)nlane)*4 - 1; - return vec_load_len(ptr, blane); + unsigned blane = (nlane > 4) ? 4 : nlane; + return vec_load_len(ptr, blane*4-1); #else return npyv_load_till_s32(ptr, nlane, 0); #endif @@ -220,12 +252,33 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { #ifdef NPY_HAVE_VX - unsigned blane = (unsigned short)nlane; + unsigned blane = (nlane > 2) ? 
2 : nlane; return vec_load_len((const signed long long*)ptr, blane*8-1); #else return npyv_load_till_s64(ptr, nlane, 0); #endif } +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi); + } + return npyv_load_s32(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return (npyv_s32)npyv_load_tillz_s64((const npy_int64*)ptr, nlane); } + +//// 128-bit nlane +NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ (void)nlane; return npyv_load_s64(ptr); } /********************************* * Non-contiguous partial load *********************************/ @@ -265,6 +318,34 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s64(ptr, stride, nlane, 0); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi); + } + return npyv_loadn2_s32(ptr, stride); +} +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + return (npyv_s32)npyv_set_s64(*(npy_int64*)ptr, 0); + } + return npyv_loadn2_s32(ptr, stride); +} + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); } + /********************************* * Partial store *********************************/ @@ -273,7 +354,7 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a { assert(nlane > 0); #ifdef NPY_HAVE_VX - unsigned blane = (unsigned short)nlane; + unsigned blane = (nlane > 4) ? 4 : nlane; vec_store_len(a, ptr, blane*4-1); #else switch(nlane) { @@ -297,7 +378,7 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a { assert(nlane > 0); #ifdef NPY_HAVE_VX - unsigned blane = (unsigned short)nlane; + unsigned blane = (nlane > 2) ? 
2 : nlane; vec_store_len(a, (signed long long*)ptr, blane*8-1); #else if (nlane == 1) { @@ -307,6 +388,18 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a npyv_store_s64(ptr, a); #endif } + +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ npyv_store_till_s64((npy_int64*)ptr, nlane, (npyv_s64)a); } + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); (void)nlane; + npyv_store_s64(ptr, a); +} + /********************************* * Non-contiguous partial store *********************************/ @@ -314,16 +407,21 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); + ptr[stride*0] = vec_extract(a, 0); switch(nlane) { - default: - ptr[stride*3] = vec_extract(a, 3); - case 3: - ptr[stride*2] = vec_extract(a, 2); + case 1: + return; case 2: ptr[stride*1] = vec_extract(a, 1); - case 1: - ptr[stride*0] = vec_extract(a, 0); - break; + return; + case 3: + ptr[stride*1] = vec_extract(a, 1); + ptr[stride*2] = vec_extract(a, 2); + return; + default: + ptr[stride*1] = vec_extract(a, 1); + ptr[stride*2] = vec_extract(a, 2); + ptr[stride*3] = vec_extract(a, 3); } } //// 64 @@ -336,6 +434,21 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp } npyv_storen_s64(ptr, stride, a); } + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + npyv_storel_s32(ptr, a); + if (nlane > 1) { + npyv_storeh_s32(ptr + stride, a); + } +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); } + /***************************************************************** * Implement partial load/store for u32/f32/u64/f64... 
via casting *****************************************************************/ @@ -346,7 +459,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ )); \ @@ -358,7 +472,8 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp union { \ npyv_lanetype_##F_SFX from_##F_SFX; \ npyv_lanetype_##T_SFX to_##T_SFX; \ - } pun = {.from_##F_SFX = fill}; \ + } pun; \ + pun.from_##F_SFX = fill; \ return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ )); \ @@ -401,6 +516,114 @@ NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_VEC_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_VEC_REST_PARTIAL_TYPES(f64, s64) +// 128-bit/64-bit stride +#define NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(u32, s32) +#if NPY_SIMD_F32 +NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(f32, s32) +#endif +NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(u64, s64) +NPYV_IMPL_VEC_REST_PARTIAL_TYPES_PAIR(f64, s64) + 
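The `pun_lo`/`pun_hi` unions above reuse the signed-integer partial load/store kernels for unsigned and floating-point lanes by reinterpreting the fill values bit-for-bit. A minimal standalone sketch of the same idiom (illustrative only, not part of the patch):

    #include <stdint.h>

    /* Reinterpret a float fill value as its 32-bit lane pattern so an
     * integer partial-load kernel can be reused for float vectors.
     * Writing one union member and then reading the other is the classic
     * C type-punning idiom; it sidesteps the strict-aliasing issues of
     * pointer casts. */
    static inline int32_t fill_bits_f32(float fill)
    {
        union {
            float   from_f32;
            int32_t to_s32;
        } pun;
        pun.from_f32 = fill;
        return pun.to_s32;
    }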
+/************************************************************
+ * de-interleave load / interleave contiguous store
+ ************************************************************/
+// two channels
+#define NPYV_IMPL_VEC_MEM_INTERLEAVE(SFX)                                 \
+    NPY_FINLINE npyv_##SFX##x2 npyv_zip_##SFX(npyv_##SFX, npyv_##SFX);    \
+    NPY_FINLINE npyv_##SFX##x2 npyv_unzip_##SFX(npyv_##SFX, npyv_##SFX);  \
+    NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2(                       \
+        const npyv_lanetype_##SFX *ptr                                    \
+    ) {                                                                   \
+        return npyv_unzip_##SFX(                                          \
+            npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX)  \
+        );                                                                \
+    }                                                                     \
+    NPY_FINLINE void npyv_store_##SFX##x2(                                \
+        npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v                        \
+    ) {                                                                   \
+        npyv_##SFX##x2 zip = npyv_zip_##SFX(v.val[0], v.val[1]);          \
+        npyv_store_##SFX(ptr, zip.val[0]);                                \
+        npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]);            \
+    }
+
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u8)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s8)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u16)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s16)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u32)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s32)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(u64)
+NPYV_IMPL_VEC_MEM_INTERLEAVE(s64)
+#if NPY_SIMD_F32
+NPYV_IMPL_VEC_MEM_INTERLEAVE(f32)
+#endif
+NPYV_IMPL_VEC_MEM_INTERLEAVE(f64)
+
 /*********************************
 * Lookup table
 *********************************/
diff --git a/numpy/core/src/common/simd/vec/misc.h b/numpy/core/src/common/simd/vec/misc.h
index 7ea0f21f6..79c194d90 100644
--- a/numpy/core/src/common/simd/vec/misc.h
+++ b/numpy/core/src/common/simd/vec/misc.h
@@ -26,28 +26,28 @@
 #define NPYV_IMPL_VEC_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V})
 #define NPYV_IMPL_VEC_SPLTD(T_VEC, V) ((T_VEC){V, V})
 
-#define npyv_setall_u8(VAL)  NPYV_IMPL_VEC_SPLTB(npyv_u8,  (unsigned char)VAL)
-#define npyv_setall_s8(VAL)  NPYV_IMPL_VEC_SPLTB(npyv_s8,  (signed char)VAL)
-#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)VAL)
-#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)VAL)
-#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)VAL)
-#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)VAL)
+#define npyv_setall_u8(VAL)  NPYV_IMPL_VEC_SPLTB(npyv_u8,  (unsigned char)(VAL))
+#define npyv_setall_s8(VAL)  NPYV_IMPL_VEC_SPLTB(npyv_s8,  (signed char)(VAL))
+#define npyv_setall_u16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_u16, (unsigned short)(VAL))
+#define npyv_setall_s16(VAL) NPYV_IMPL_VEC_SPLTH(npyv_s16, (short)(VAL))
+#define npyv_setall_u32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_u32, (unsigned int)(VAL))
+#define npyv_setall_s32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_s32, (int)(VAL))
 #if NPY_SIMD_F32
-    #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, VAL)
+    #define npyv_setall_f32(VAL) NPYV_IMPL_VEC_SPLTW(npyv_f32, (VAL))
 #endif
-#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)VAL)
-#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)VAL)
+#define npyv_setall_u64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_u64, (npy_uint64)(VAL))
+#define npyv_setall_s64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_s64, (npy_int64)(VAL))
 #define npyv_setall_f64(VAL) NPYV_IMPL_VEC_SPLTD(npyv_f64, VAL)
 
 // vector with specific values set to each lane and
 // set a specific value to all remained lanes
-#define npyv_setf_u8(FILL, ...)  ((npyv_u8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_s8(FILL, ...)  ((npyv_s8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
-#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
+#define npyv_setf_u8(FILL, ...)
((npyv_u8){NPYV__SET_FILL_16(unsigned char, FILL, __VA_ARGS__)}) +#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(signed char, FILL, __VA_ARGS__)}) +#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(unsigned short, FILL, __VA_ARGS__)}) #define npyv_setf_s16(FILL, ...) ((npyv_s16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)}) -#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)}) +#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(unsigned int, FILL, __VA_ARGS__)}) #define npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)}) -#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)}) +#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__)}) #define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)}) #if NPY_SIMD_F32 #define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)}) diff --git a/numpy/core/src/common/simd/vec/reorder.h b/numpy/core/src/common/simd/vec/reorder.h index b60b9287d..3910980a2 100644 --- a/numpy/core/src/common/simd/vec/reorder.h +++ b/numpy/core/src/common/simd/vec/reorder.h @@ -68,6 +68,85 @@ NPYV_IMPL_VEC_COMBINE_ZIP(npyv_s64, s64) #endif NPYV_IMPL_VEC_COMBINE_ZIP(npyv_f64, f64) +// deinterleave two vectors +NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 ab0, npyv_u8 ab1) +{ + const npyv_u8 idx_even = npyv_set_u8( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + ); + const npyv_u8 idx_odd = npyv_set_u8( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + ); + npyv_u8x2 r; + r.val[0] = vec_perm(ab0, ab1, idx_even); + r.val[1] = vec_perm(ab0, ab1, idx_odd); + return r; +} +NPY_FINLINE npyv_s8x2 npyv_unzip_s8(npyv_s8 ab0, npyv_s8 ab1) +{ + npyv_u8x2 ru = npyv_unzip_u8((npyv_u8)ab0, (npyv_u8)ab1); + npyv_s8x2 r; + r.val[0] = (npyv_s8)ru.val[0]; + r.val[1] = (npyv_s8)ru.val[1]; + return r; +} +NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 ab0, npyv_u16 ab1) +{ + const npyv_u8 idx_even = npyv_set_u8( + 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 + ); + const npyv_u8 idx_odd = npyv_set_u8( + 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 + ); + npyv_u16x2 r; + r.val[0] = vec_perm(ab0, ab1, idx_even); + r.val[1] = vec_perm(ab0, ab1, idx_odd); + return r; +} +NPY_FINLINE npyv_s16x2 npyv_unzip_s16(npyv_s16 ab0, npyv_s16 ab1) +{ + npyv_u16x2 ru = npyv_unzip_u16((npyv_u16)ab0, (npyv_u16)ab1); + npyv_s16x2 r; + r.val[0] = (npyv_s16)ru.val[0]; + r.val[1] = (npyv_s16)ru.val[1]; + return r; +} +NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 ab0, npyv_u32 ab1) +{ + npyv_u32 m0 = vec_mergeh(ab0, ab1); + npyv_u32 m1 = vec_mergel(ab0, ab1); + npyv_u32 r0 = vec_mergeh(m0, m1); + npyv_u32 r1 = vec_mergel(m0, m1); + npyv_u32x2 r; + r.val[0] = r0; + r.val[1] = r1; + return r; +} +NPY_FINLINE npyv_s32x2 npyv_unzip_s32(npyv_s32 ab0, npyv_s32 ab1) +{ + npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1); + npyv_s32x2 r; + r.val[0] = (npyv_s32)ru.val[0]; + r.val[1] = (npyv_s32)ru.val[1]; + return r; +} +#if NPY_SIMD_F32 + NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 ab0, npyv_f32 ab1) + { + npyv_u32x2 ru = npyv_unzip_u32((npyv_u32)ab0, (npyv_u32)ab1); + npyv_f32x2 r; + r.val[0] = (npyv_f32)ru.val[0]; + r.val[1] = (npyv_f32)ru.val[1]; + return r; + } +#endif +NPY_FINLINE npyv_u64x2 npyv_unzip_u64(npyv_u64 ab0, npyv_u64 ab1) +{ return npyv_combine_u64(ab0, ab1); } +NPY_FINLINE npyv_s64x2 npyv_unzip_s64(npyv_s64 
ab0, npyv_s64 ab1)
+{ return npyv_combine_s64(ab0, ab1); }
+NPY_FINLINE npyv_f64x2 npyv_unzip_f64(npyv_f64 ab0, npyv_f64 ab1)
+{ return npyv_combine_f64(ab0, ab1); }
+
 // Reverse elements of each 64-bit lane
 NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a)
 {
@@ -111,4 +190,24 @@ NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a)
 { return (npyv_f32)npyv_rev64_u32((npyv_u32)a); }
 #endif
 
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3)        \
+    vec_perm(A, A, npyv_set_u8(                     \
+        (E0<<2), (E0<<2)+1, (E0<<2)+2, (E0<<2)+3,   \
+        (E1<<2), (E1<<2)+1, (E1<<2)+2, (E1<<2)+3,   \
+        (E2<<2), (E2<<2)+1, (E2<<2)+2, (E2<<2)+3,   \
+        (E3<<2), (E3<<2)+1, (E3<<2)+2, (E3<<2)+3    \
+    ))
+#define npyv_permi128_s32 npyv_permi128_u32
+#define npyv_permi128_f32 npyv_permi128_u32
+
+#if defined(__IBMC__) || defined(vec_permi)
+    #define npyv_permi128_u64(A, E0, E1) vec_permi(A, A, ((E0)<<1) | (E1))
+#else
+    #define npyv_permi128_u64(A, E0, E1) vec_xxpermdi(A, A, ((E0)<<1) | (E1))
+#endif
+#define npyv_permi128_s64 npyv_permi128_u64
+#define npyv_permi128_f64 npyv_permi128_u64
+
 #endif // _NPY_SIMD_VEC_REORDER_H
diff --git a/numpy/core/src/common/simd/vec/vec.h b/numpy/core/src/common/simd/vec/vec.h
index abcd33ce1..1d4508669 100644
--- a/numpy/core/src/common/simd/vec/vec.h
+++ b/numpy/core/src/common/simd/vec/vec.h
@@ -39,8 +39,10 @@
 
 #ifdef NPY_HAVE_VX
     #define NPY_SIMD_BIGENDIAN 1
+    #define NPY_SIMD_CMPSIGNAL 0
 #else
     #define NPY_SIMD_BIGENDIAN 0
+    #define NPY_SIMD_CMPSIGNAL 1
 #endif
 
 typedef __vector unsigned char npyv_u8;
diff --git a/numpy/core/src/common/ucsnarrow.h b/numpy/core/src/common/ucsnarrow.h
index 6fe157199..4b17a2809 100644
--- a/numpy/core/src/common/ucsnarrow.h
+++ b/numpy/core/src/common/ucsnarrow.h
@@ -2,6 +2,6 @@
 #define NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_
 
 NPY_NO_EXPORT PyUnicodeObject *
-PyUnicode_FromUCS4(char *src, Py_ssize_t size, int swap, int align);
+PyUnicode_FromUCS4(char const *src, Py_ssize_t size, int swap, int align);
 
 #endif /* NUMPY_CORE_SRC_COMMON_NPY_UCSNARROW_H_ */
diff --git a/numpy/core/src/common/utils.hpp b/numpy/core/src/common/utils.hpp
new file mode 100644
index 000000000..f847cab44
--- /dev/null
+++ b/numpy/core/src/common/utils.hpp
@@ -0,0 +1,51 @@
+#ifndef NUMPY_CORE_SRC_COMMON_UTILS_HPP
+#define NUMPY_CORE_SRC_COMMON_UTILS_HPP
+
+#include "npdef.hpp"
+
+#if NP_HAS_CPP20
+    #include <bit>
+#endif
+
+#include <type_traits>
+#include <string.h>
+
+namespace np {
+
+/** Create a value of type `To` from the bits of `from`.
+ *
+ * Similar to `std::bit_cast`, but compatible with C++17; it behaves like
+ * `*reinterpret_cast<To*>(&from)` implemented through safe punning,
+ * without invoking undefined behavior.
+ */ +template<typename To, typename From> +#if NP_HAS_BUILTIN(__builtin_bit_cast) || NP_HAS_CPP20 +[[nodiscard]] constexpr +#else +inline +#endif +To BitCast(const From &from) noexcept +{ + static_assert( + sizeof(To) == sizeof(From), + "both data types must have the same size"); + + static_assert( + std::is_trivially_copyable_v<To> && + std::is_trivially_copyable_v<From>, + "both data types must be trivially copyable"); + +#if NP_HAS_CPP20 + return std::bit_cast<To>(from); +#elif NP_HAS_BUILTIN(__builtin_bit_cast) + return __builtin_bit_cast(To, from); +#else + To to; + memcpy(&to, &from, sizeof(from)); + return to; +#endif +} + +} // namespace np +#endif // NUMPY_CORE_SRC_COMMON_UTILS_HPP + diff --git a/numpy/core/src/multiarray/abstractdtypes.h b/numpy/core/src/multiarray/abstractdtypes.h index b0850bd35..a3f6ceb05 100644 --- a/numpy/core/src/multiarray/abstractdtypes.h +++ b/numpy/core/src/multiarray/abstractdtypes.h @@ -58,7 +58,8 @@ npy_mark_tmp_array_if_pyscalar( } return 1; } - else if (PyComplex_Check(obj) && PyArray_TYPE(arr) == NPY_CDOUBLE) { + else if (PyComplex_Check(obj) && !PyArray_IsScalar(obj, CDouble) + && PyArray_TYPE(arr) == NPY_CDOUBLE) { ((PyArrayObject_fields *)arr)->flags |= NPY_ARRAY_WAS_PYTHON_COMPLEX; if (dtype != NULL) { Py_INCREF(&PyArray_PyComplexAbstractDType); diff --git a/numpy/core/src/multiarray/argfunc.dispatch.c.src b/numpy/core/src/multiarray/argfunc.dispatch.c.src index 1d7753275..d79be1df5 100644 --- a/numpy/core/src/multiarray/argfunc.dispatch.c.src +++ b/numpy/core/src/multiarray/argfunc.dispatch.c.src @@ -194,7 +194,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) npyv_@bsfx@ nnan_ab = npyv_and_@bsfx@(nnan_a, nnan_b); npyv_@bsfx@ nnan_cd = npyv_and_@bsfx@(nnan_c, nnan_d); npy_uint64 nnan = npyv_tobits_@bsfx@(npyv_and_@bsfx@(nnan_ab, nnan_cd)); - if (nnan != ((1LL << vstep) - 1)) { + if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { npy_uint64 nnan_4[4]; nnan_4[0] = npyv_tobits_@bsfx@(nnan_a); nnan_4[1] = npyv_tobits_@bsfx@(nnan_b); @@ -219,7 +219,7 @@ simd_@func@_@sfx@(npyv_lanetype_@sfx@ *ip, npy_intp len) #if @is_fp@ npyv_@bsfx@ nnan_a = npyv_notnan_@sfx@(a); npy_uint64 nnan = npyv_tobits_@bsfx@(nnan_a); - if (nnan != ((1LL << vstep) - 1)) { + if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) { for (int vi = 0; vi < vstep; ++vi) { if (!((nnan >> vi) & 1)) { return i + vi; diff --git a/numpy/core/src/multiarray/array_assign_array.c b/numpy/core/src/multiarray/array_assign_array.c index 9d5bf6875..7aee7d6e9 100644 --- a/numpy/core/src/multiarray/array_assign_array.c +++ b/numpy/core/src/multiarray/array_assign_array.c @@ -130,7 +130,7 @@ raw_array_assign_array(int ndim, npy_intp const *shape, } if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { - npy_clear_floatstatus_barrier(src_data); + npy_clear_floatstatus_barrier((char*)&src_data); } if (!(flags & NPY_METH_REQUIRES_PYAPI)) { NPY_BEGIN_THREADS; @@ -153,7 +153,7 @@ raw_array_assign_array(int ndim, npy_intp const *shape, NPY_cast_info_xfree(&cast_info); if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { - int fpes = npy_get_floatstatus_barrier(src_data); + int fpes = npy_get_floatstatus_barrier((char*)&src_data); if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) { return -1; } diff --git a/numpy/core/src/multiarray/array_coercion.c b/numpy/core/src/multiarray/array_coercion.c index 13d6682f7..ba9118052 100644 --- a/numpy/core/src/multiarray/array_coercion.c +++ b/numpy/core/src/multiarray/array_coercion.c @@ -16,6 +16,7 @@ #include "common_dtype.h" 
#include "dtypemeta.h" +#include "npy_argparse.h" #include "abstractdtypes.h" #include "array_coercion.h" #include "ctors.h" @@ -873,42 +874,62 @@ find_descriptor_from_array( /** * Given a dtype or DType object, find the correct descriptor to cast the - * array to. + * array to. In some places, this function is used with dtype=NULL which + * means that legacy behavior is used: The dtype instances "S0", "U0", and + * "V0" are converted to mean the DType classes instead. + * When dtype != NULL, this path is ignored, and the function does nothing + * unless descr == NULL. If both descr and dtype are null, it returns the + * descriptor for the array. * * This function is identical to normal casting using only the dtype, however, * it supports inspecting the elements when the array has object dtype * (and the given datatype describes a parametric DType class). * * @param arr - * @param dtype A dtype instance or class. + * @param dtype NULL or a dtype class + * @param descr A dtype instance, if the dtype is NULL the dtype class is + * found and e.g. "S0" is converted to denote only String. * @return A concrete dtype instance or NULL */ NPY_NO_EXPORT PyArray_Descr * -PyArray_AdaptDescriptorToArray(PyArrayObject *arr, PyObject *dtype) +PyArray_AdaptDescriptorToArray( + PyArrayObject *arr, PyArray_DTypeMeta *dtype, PyArray_Descr *descr) { /* If the requested dtype is flexible, adapt it */ - PyArray_Descr *new_dtype; - PyArray_DTypeMeta *new_DType; + PyArray_Descr *new_descr; int res; - res = PyArray_ExtractDTypeAndDescriptor((PyObject *)dtype, - &new_dtype, &new_DType); - if (res < 0) { - return NULL; + if (dtype != NULL && descr != NULL) { + /* descr was given and no special logic, return (call not necessary) */ + Py_INCREF(descr); + return descr; } - if (new_dtype == NULL) { - res = find_descriptor_from_array(arr, new_DType, &new_dtype); + if (dtype == NULL) { + res = PyArray_ExtractDTypeAndDescriptor(descr, &new_descr, &dtype); if (res < 0) { - Py_DECREF(new_DType); return NULL; } - if (new_dtype == NULL) { - /* This is an object array but contained no elements, use default */ - new_dtype = NPY_DT_CALL_default_descr(new_DType); + if (new_descr != NULL) { + Py_DECREF(dtype); + return new_descr; } } - Py_DECREF(new_DType); - return new_dtype; + else { + assert(descr == NULL); /* gueranteed above */ + Py_INCREF(dtype); + } + + res = find_descriptor_from_array(arr, dtype, &new_descr); + if (res < 0) { + Py_DECREF(dtype); + return NULL; + } + if (new_descr == NULL) { + /* This is an object array but contained no elements, use default */ + new_descr = NPY_DT_CALL_default_descr(dtype); + } + Py_XDECREF(dtype); + return new_descr; } @@ -1004,53 +1025,6 @@ PyArray_DiscoverDTypeAndShape_Recursive( Py_DECREF(arr); arr = NULL; } - else if (curr_dims > 0 && curr_dims != max_dims) { - /* - * Deprecated 2020-12-09, NumPy 1.20 - * - * See https://github.com/numpy/numpy/issues/17965 - * Shapely had objects which are not sequences but did export - * the array-interface (and so are arguably array-like). - * Previously numpy would not use array-like information during - * shape discovery, so that it ended up acting as if this was - * an (unknown) scalar but with the specified dtype. - * Thus we ignore "scalars" here, as the value stored in the - * array should be acceptable. 
- */ - if (PyArray_NDIM(arr) > 0 && NPY_UNLIKELY(!PySequence_Check(obj))) { - if (PyErr_WarnFormat(PyExc_FutureWarning, 1, - "The input object of type '%s' is an array-like " - "implementing one of the corresponding protocols " - "(`__array__`, `__array_interface__` or " - "`__array_struct__`); but not a sequence (or 0-D). " - "In the future, this object will be coerced as if it " - "was first converted using `np.array(obj)`. " - "To retain the old behaviour, you have to either " - "modify the type '%s', or assign to an empty array " - "created with `np.empty(correct_shape, dtype=object)`.", - Py_TYPE(obj)->tp_name, Py_TYPE(obj)->tp_name) < 0) { - Py_DECREF(arr); - return -1; - } - /* - * Strangely enough, even though we threw away the result here, - * we did use it during descriptor discovery, so promote it: - */ - if (update_shape(curr_dims, &max_dims, out_shape, - 0, NULL, NPY_FALSE, flags) < 0) { - *flags |= FOUND_RAGGED_ARRAY; - Py_DECREF(arr); - return max_dims; - } - if (!(*flags & DESCRIPTOR_WAS_SET) && handle_promotion( - out_descr, PyArray_DESCR(arr), fixed_DType, flags) < 0) { - Py_DECREF(arr); - return -1; - } - Py_DECREF(arr); - return max_dims; - } - } } if (arr != NULL) { /* @@ -1260,7 +1234,9 @@ PyArray_DiscoverDTypeAndShape( } if (requested_descr != NULL) { - assert(fixed_DType == NPY_DTYPE(requested_descr)); + if (fixed_DType != NULL) { + assert(fixed_DType == NPY_DTYPE(requested_descr)); + } /* The output descriptor must be the input. */ Py_INCREF(requested_descr); *out_descr = requested_descr; @@ -1380,105 +1356,25 @@ PyArray_DiscoverDTypeAndShape( } - -/** - * Check the descriptor is a legacy "flexible" DType instance, this is - * an instance which is (normally) not attached to an array, such as a string - * of length 0 or a datetime with no unit. - * These should be largely deprecated, and represent only the DType class - * for most `dtype` parameters. - * - * TODO: This function should eventually receive a deprecation warning and - * be removed. - * - * @param descr - * @return 1 if this is not a concrete dtype instance 0 otherwise - */ -static int -descr_is_legacy_parametric_instance(PyArray_Descr *descr) -{ - if (PyDataType_ISUNSIZED(descr)) { - return 1; - } - /* Flexible descr with generic time unit (which can be adapted) */ - if (PyDataType_ISDATETIME(descr)) { - PyArray_DatetimeMetaData *meta; - meta = get_datetime_metadata_from_dtype(descr); - if (meta->base == NPY_FR_GENERIC) { - return 1; - } - } - return 0; -} - - -/** - * Given either a DType instance or class, (or legacy flexible instance), - * ands sets output dtype instance and DType class. Both results may be - * NULL, but if `out_descr` is set `out_DType` will always be the - * corresponding class. 
- * - * @param dtype - * @param out_descr - * @param out_DType - * @return 0 on success -1 on failure - */ -NPY_NO_EXPORT int -PyArray_ExtractDTypeAndDescriptor(PyObject *dtype, - PyArray_Descr **out_descr, PyArray_DTypeMeta **out_DType) -{ - *out_DType = NULL; - *out_descr = NULL; - - if (dtype != NULL) { - if (PyObject_TypeCheck(dtype, (PyTypeObject *)&PyArrayDTypeMeta_Type)) { - assert(dtype != (PyObject * )&PyArrayDescr_Type); /* not np.dtype */ - *out_DType = (PyArray_DTypeMeta *)dtype; - Py_INCREF(*out_DType); - } - else if (PyObject_TypeCheck((PyObject *)Py_TYPE(dtype), - (PyTypeObject *)&PyArrayDTypeMeta_Type)) { - *out_DType = NPY_DTYPE(dtype); - Py_INCREF(*out_DType); - if (!descr_is_legacy_parametric_instance((PyArray_Descr *)dtype)) { - *out_descr = (PyArray_Descr *)dtype; - Py_INCREF(*out_descr); - } - } - else { - PyErr_SetString(PyExc_TypeError, - "dtype parameter must be a DType instance or class."); - return -1; - } - } - return 0; -} - - /* * Python API function to expose the dtype+shape discovery functionality * directly. */ NPY_NO_EXPORT PyObject * _discover_array_parameters(PyObject *NPY_UNUSED(self), - PyObject *args, PyObject *kwargs) + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) { - static char *kwlist[] = {"obj", "dtype", NULL}; - PyObject *obj; - PyObject *dtype = NULL; - PyArray_Descr *fixed_descriptor = NULL; - PyArray_DTypeMeta *fixed_DType = NULL; + npy_dtype_info dt_info = {NULL, NULL}; npy_intp shape[NPY_MAXDIMS]; - if (!PyArg_ParseTupleAndKeywords( - args, kwargs, "O|O:_discover_array_parameters", kwlist, - &obj, &dtype)) { - return NULL; - } - - if (PyArray_ExtractDTypeAndDescriptor(dtype, - &fixed_descriptor, &fixed_DType) < 0) { + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments( + "_discover_array_parameters", args, len_args, kwnames, + "", NULL, &obj, + "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info, + NULL, NULL, NULL) < 0) { + /* fixed is last to parse, so never necessary to clean up */ return NULL; } @@ -1487,9 +1383,9 @@ _discover_array_parameters(PyObject *NPY_UNUSED(self), int ndim = PyArray_DiscoverDTypeAndShape( obj, NPY_MAXDIMS, shape, &coercion_cache, - fixed_DType, fixed_descriptor, (PyArray_Descr **)&out_dtype, 0); - Py_XDECREF(fixed_DType); - Py_XDECREF(fixed_descriptor); + dt_info.dtype, dt_info.descr, (PyArray_Descr **)&out_dtype, 0); + Py_XDECREF(dt_info.dtype); + Py_XDECREF(dt_info.descr); if (ndim < 0) { return NULL; } diff --git a/numpy/core/src/multiarray/array_coercion.h b/numpy/core/src/multiarray/array_coercion.h index 63d543cf7..0757c1cb8 100644 --- a/numpy/core/src/multiarray/array_coercion.h +++ b/numpy/core/src/multiarray/array_coercion.h @@ -26,7 +26,8 @@ NPY_NO_EXPORT int PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value); NPY_NO_EXPORT PyArray_Descr * -PyArray_AdaptDescriptorToArray(PyArrayObject *arr, PyObject *dtype); +PyArray_AdaptDescriptorToArray( + PyArrayObject *arr, PyArray_DTypeMeta *dtype, PyArray_Descr *descr); NPY_NO_EXPORT int PyArray_DiscoverDTypeAndShape( @@ -36,14 +37,9 @@ PyArray_DiscoverDTypeAndShape( PyArray_DTypeMeta *fixed_DType, PyArray_Descr *requested_descr, PyArray_Descr **out_descr, int never_copy); -NPY_NO_EXPORT int -PyArray_ExtractDTypeAndDescriptor(PyObject *dtype, - PyArray_Descr **out_descr, PyArray_DTypeMeta **out_DType); - NPY_NO_EXPORT PyObject * _discover_array_parameters(PyObject *NPY_UNUSED(self), - PyObject *args, PyObject *kwargs); - + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames); /* Would make sense to inline the freeing 
functions everywhere */ /* Frees the coercion cache object recursively. */ diff --git a/numpy/core/src/multiarray/array_method.c b/numpy/core/src/multiarray/array_method.c index 79c588f25..87515bb03 100644 --- a/numpy/core/src/multiarray/array_method.c +++ b/numpy/core/src/multiarray/array_method.c @@ -27,15 +27,18 @@ * weak reference to the input DTypes. */ #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _UMATHMODULE #define _MULTIARRAYMODULE #include <npy_pycompat.h> #include "arrayobject.h" +#include "array_coercion.h" #include "array_method.h" #include "dtypemeta.h" #include "common_dtype.h" #include "convert_datatype.h" #include "common.h" +#include "numpy/ufuncobject.h" /* @@ -248,6 +251,7 @@ fill_arraymethod_from_slots( /* Set the defaults */ meth->get_strided_loop = &npy_default_get_strided_loop; meth->resolve_descriptors = &default_resolve_descriptors; + meth->get_reduction_initial = NULL; /* no initial/identity by default */ /* Fill in the slots passed by the user */ /* @@ -260,7 +264,7 @@ fill_arraymethod_from_slots( case NPY_METH_resolve_descriptors: meth->resolve_descriptors = slot->pfunc; continue; - case NPY_METH_get_loop: + case _NPY_METH_get_loop: /* * NOTE: get_loop is considered "unstable" in the public API, * I do not like the signature, and the `move_references` @@ -284,6 +288,12 @@ fill_arraymethod_from_slots( case NPY_METH_unaligned_contiguous_loop: meth->unaligned_contiguous_loop = slot->pfunc; continue; + case NPY_METH_get_reduction_initial: + meth->get_reduction_initial = slot->pfunc; + continue; + case NPY_METH_contiguous_indexed_loop: + meth->contiguous_indexed_loop = slot->pfunc; + continue; default: break; } @@ -334,27 +344,39 @@ fill_arraymethod_from_slots( } /* Check whether the provided loops make sense. */ + if (meth->flags & NPY_METH_SUPPORTS_UNALIGNED) { + if (meth->unaligned_strided_loop == NULL) { + PyErr_Format(PyExc_TypeError, + "Must provide unaligned strided inner loop when using " + "NPY_METH_SUPPORTS_UNALIGNED flag (in method: %s)", + spec->name); + return -1; + } + } + else { + if (meth->unaligned_strided_loop != NULL) { + PyErr_Format(PyExc_TypeError, + "Must not provide unaligned strided inner loop when not " + "using NPY_METH_SUPPORTS_UNALIGNED flag (in method: %s)", + spec->name); + return -1; + } + } + /* Fill in the blanks: */ + if (meth->unaligned_contiguous_loop == NULL) { + meth->unaligned_contiguous_loop = meth->unaligned_strided_loop; + } if (meth->strided_loop == NULL) { - PyErr_Format(PyExc_TypeError, - "Must provide a strided inner loop function. (method: %s)", - spec->name); - return -1; + meth->strided_loop = meth->unaligned_strided_loop; } if (meth->contiguous_loop == NULL) { meth->contiguous_loop = meth->strided_loop; } - if (meth->unaligned_contiguous_loop != NULL && - meth->unaligned_strided_loop == NULL) { - PyErr_Format(PyExc_TypeError, - "Must provide unaligned strided inner loop when providing " - "a contiguous version. (method: %s)", spec->name); - return -1; - } - if ((meth->unaligned_strided_loop == NULL) != - !(meth->flags & NPY_METH_SUPPORTS_UNALIGNED)) { + + if (meth->strided_loop == NULL) { PyErr_Format(PyExc_TypeError, - "Must provide unaligned strided inner loop when providing " - "a contiguous version. (method: %s)", spec->name); + "Must provide a strided inner loop function. 
(method: %s)", + spec->name); return -1; } @@ -883,7 +905,7 @@ generic_masked_strided_loop(PyArrayMethod_Context *context, /* * Fetches a strided-loop function that supports a boolean mask as additional * (last) operand to the strided-loop. It is otherwise largely identical to - * the `get_loop` method which it wraps. + * the `get_strided_loop` method which it wraps. * This is the core implementation for the ufunc `where=...` keyword argument. * * NOTE: This function does not support `move_references` or inner dimensions. diff --git a/numpy/core/src/multiarray/array_method.h b/numpy/core/src/multiarray/array_method.h index c9ec8903d..c82a968cd 100644 --- a/numpy/core/src/multiarray/array_method.h +++ b/numpy/core/src/multiarray/array_method.h @@ -11,40 +11,7 @@ extern "C" { #endif -typedef enum { - /* Flag for whether the GIL is required */ - NPY_METH_REQUIRES_PYAPI = 1 << 1, - /* - * Some functions cannot set floating point error flags, this flag - * gives us the option (not requirement) to skip floating point error - * setup/check. No function should set error flags and ignore them - * since it would interfere with chaining operations (e.g. casting). - */ - /* - * TODO: Change this into a positive flag? That would make "combing" - * multiple methods easier. OTOH, if we add more flags, the default - * would be 0 just like it is here. - */ - NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 2, - /* Whether the method supports unaligned access (not runtime) */ - NPY_METH_SUPPORTS_UNALIGNED = 1 << 3, - /* - * Private flag for now for *logic* functions. The logical functions - * `logical_or` and `logical_and` can always cast the inputs to booleans - * "safely" (because that is how the cast to bool is defined). - * @seberg: I am not sure this is the best way to handle this, so its - * private for now (also it is very limited anyway). - * There is one "exception". NA aware dtypes cannot cast to bool - * (hopefully), so the `??->?` loop should error even with this flag. - * But a second NA fallback loop will be necessary. - */ - _NPY_METH_FORCE_CAST_INPUTS = 1 << 17, - - /* All flags which can change at runtime */ - NPY_METH_RUNTIME_FLAGS = ( - NPY_METH_REQUIRES_PYAPI | - NPY_METH_NO_FLOATINGPOINT_ERRORS), -} NPY_ARRAYMETHOD_FLAGS; +#include "numpy/_dtype_api.h" /* @@ -60,119 +27,6 @@ typedef enum { ((flags1 | flags2) & ~PyArrayMethod_MINIMAL_FLAGS) \ | (flags1 & flags2))) - -struct PyArrayMethodObject_tag; - -/* - * This struct is specific to an individual (possibly repeated) call of - * the ArrayMethods strided operator, and as such is passed into the various - * methods of the ArrayMethod object (the resolve_descriptors function, - * the get_loop function and the individual lowlevel strided operator calls). - * It thus has to be persistent for one end-user call, and then be discarded. - * - * TODO: Before making this public, we should review which information should - * be stored on the Context/BoundArrayMethod vs. the ArrayMethod. - */ -typedef struct { - PyObject *caller; /* E.g. 
the original ufunc, may be NULL */ - struct PyArrayMethodObject_tag *method; - - /* Operand descriptors, filled in by resolve_descriptors */ - PyArray_Descr **descriptors; -} PyArrayMethod_Context; - - -typedef int (PyArrayMethod_StridedLoop)(PyArrayMethod_Context *context, - char *const *data, const npy_intp *dimensions, const npy_intp *strides, - NpyAuxData *transferdata); - - -typedef NPY_CASTING (resolve_descriptors_function)( - struct PyArrayMethodObject_tag *method, - PyArray_DTypeMeta **dtypes, - PyArray_Descr **given_descrs, - PyArray_Descr **loop_descrs, - npy_intp *view_offset); - - -typedef int (get_loop_function)( - PyArrayMethod_Context *context, - int aligned, int move_references, - const npy_intp *strides, - PyArrayMethod_StridedLoop **out_loop, - NpyAuxData **out_transferdata, - NPY_ARRAYMETHOD_FLAGS *flags); - - -/* - * The following functions are only used be the wrapping array method defined - * in umath/wrapping_array_method.c - */ - -/* - * The function to convert the given descriptors (passed in to - * `resolve_descriptors`) and translates them for the wrapped loop. - * The new descriptors MUST be viewable with the old ones, `NULL` must be - * supported (for outputs) and should normally be forwarded. - * - * The function must clean up on error. - * - * NOTE: We currently assume that this translation gives "viewable" results. - * I.e. there is no additional casting related to the wrapping process. - * In principle that could be supported, but not sure it is useful. - * This currently also means that e.g. alignment must apply identically - * to the new dtypes. - * - * TODO: Due to the fact that `resolve_descriptors` is also used for `can_cast` - * there is no way to "pass out" the result of this function. This means - * it will be called twice for every ufunc call. - * (I am considering including `auxdata` as an "optional" parameter to - * `resolve_descriptors`, so that it can be filled there if not NULL.) - */ -typedef int translate_given_descrs_func(int nin, int nout, - PyArray_DTypeMeta *wrapped_dtypes[], - PyArray_Descr *given_descrs[], PyArray_Descr *new_descrs[]); - -/** - * The function to convert the actual loop descriptors (as returned by the - * original `resolve_descriptors` function) to the ones the output array - * should use. - * This function must return "viewable" types, it must not mutate them in any - * form that would break the inner-loop logic. Does not need to support NULL. - * - * The function must clean up on error. - * - * @param nargs Number of arguments - * @param new_dtypes The DTypes of the output (usually probably not needed) - * @param given_descrs Original given_descrs to the resolver, necessary to - * fetch any information related to the new dtypes from the original. - * @param original_descrs The `loop_descrs` returned by the wrapped loop. - * @param loop_descrs The output descriptors, compatible to `original_descrs`. - * - * @returns 0 on success, -1 on failure. - */ -typedef int translate_loop_descrs_func(int nin, int nout, - PyArray_DTypeMeta *new_dtypes[], PyArray_Descr *given_descrs[], - PyArray_Descr *original_descrs[], PyArray_Descr *loop_descrs[]); - - -/* - * This struct will be public and necessary for creating a new ArrayMethod - * object (casting and ufuncs). - * We could version the struct, although since we allow passing arbitrary - * data using the slots, and have flags, that may be enough? - * (See also PyBoundArrayMethodObject.) 
- */
-typedef struct {
-    const char *name;
-    int nin, nout;
-    NPY_CASTING casting;
-    NPY_ARRAYMETHOD_FLAGS flags;
-    PyArray_DTypeMeta **dtypes;
-    PyType_Slot *slots;
-} PyArrayMethod_Spec;
-
-
 /*
 * Structure of the ArrayMethod. This structure should probably not be made
 * public. If necessary, we can make certain operations on it public
@@ -193,16 +47,20 @@ typedef struct PyArrayMethodObject_tag {
     NPY_ARRAYMETHOD_FLAGS flags;
     resolve_descriptors_function *resolve_descriptors;
     get_loop_function *get_strided_loop;
+    get_reduction_initial_function *get_reduction_initial;
     /* Typical loop functions (contiguous ones are used in current casts) */
     PyArrayMethod_StridedLoop *strided_loop;
     PyArrayMethod_StridedLoop *contiguous_loop;
     PyArrayMethod_StridedLoop *unaligned_strided_loop;
     PyArrayMethod_StridedLoop *unaligned_contiguous_loop;
+    PyArrayMethod_StridedLoop *contiguous_indexed_loop;
     /* Chunk only used for wrapping array method defined in umath */
     struct PyArrayMethodObject_tag *wrapped_meth;
     PyArray_DTypeMeta **wrapped_dtypes;
     translate_given_descrs_func *translate_given_descrs;
     translate_loop_descrs_func *translate_loop_descrs;
+    /* Chunk reserved for use by the legacy fallback arraymethod */
+    char legacy_initial[sizeof(npy_clongdouble)];  /* initial value storage */
 } PyArrayMethodObject;
 
 
@@ -226,18 +84,6 @@ typedef struct {
 extern NPY_NO_EXPORT PyTypeObject PyArrayMethod_Type;
 extern NPY_NO_EXPORT PyTypeObject PyBoundArrayMethod_Type;
 
-/*
- * SLOTS IDs For the ArrayMethod creation, one public, the IDs are fixed.
- * TODO: Before making it public, consider adding a large constant to private
- *       slots.
- */
-#define NPY_METH_resolve_descriptors 1
-#define NPY_METH_get_loop 2
-#define NPY_METH_strided_loop 3
-#define NPY_METH_contiguous_loop 4
-#define NPY_METH_unaligned_strided_loop 5
-#define NPY_METH_unaligned_contiguous_loop 6
-
 
 /*
 * Used internally (initially) for real to complex loops only
diff --git a/numpy/core/src/multiarray/arrayfunction_override.c b/numpy/core/src/multiarray/arrayfunction_override.c
index 2bb3fbe28..3c55e2164 100644
--- a/numpy/core/src/multiarray/arrayfunction_override.c
+++ b/numpy/core/src/multiarray/arrayfunction_override.c
@@ -1,11 +1,15 @@
 #define NPY_NO_DEPRECATED_API NPY_API_VERSION
 #define _MULTIARRAYMODULE
 
+#include <Python.h>
+#include "structmember.h"
+
 #include "npy_pycompat.h"
 #include "get_attr_string.h"
 #include "npy_import.h"
 #include "multiarraymodule.h"
+#include "arrayfunction_override.h"
 
 /* Return the ndarray.__array_function__ method. */
 static PyObject *
@@ -200,183 +204,67 @@ call_array_function(PyObject* argument, PyObject* method,
 }
 
 
-/**
- * Internal handler for the array-function dispatching. The helper returns
- * either the result, or NotImplemented (as a borrowed reference).
- *
- * @param public_api The public API symbol used for dispatching
- * @param relevant_args Arguments which may implement __array_function__
- * @param args Original arguments
- * @param kwargs Original keyword arguments
- *
- * @returns The result of the dispatched version, or a borrowed reference
- *          to NotImplemented to indicate the default implementation should
- *          be used.
+
+/*
+ * Helper to convert from the vectorcall convention, since the protocol
+ * requires args and kwargs to be passed explicitly as a tuple and a dict.
+ * A kwargs dict is always created, so one is always returned.
*/ -static PyObject * -array_implement_array_function_internal( - PyObject *public_api, PyObject *relevant_args, - PyObject *args, PyObject *kwargs) +static int +get_args_and_kwargs( + PyObject *const *fast_args, Py_ssize_t len_args, PyObject *kwnames, + PyObject **out_args, PyObject **out_kwargs) { - PyObject *implementing_args[NPY_MAXARGS]; - PyObject *array_function_methods[NPY_MAXARGS]; - PyObject *types = NULL; + len_args = PyVectorcall_NARGS(len_args); + PyObject *args = PyTuple_New(len_args); + PyObject *kwargs = NULL; - PyObject *result = NULL; - - static PyObject *errmsg_formatter = NULL; - - relevant_args = PySequence_Fast( - relevant_args, - "dispatcher for __array_function__ did not return an iterable"); - if (relevant_args == NULL) { - return NULL; + if (args == NULL) { + return -1; } - - /* Collect __array_function__ implementations */ - int num_implementing_args = get_implementing_args_and_methods( - relevant_args, implementing_args, array_function_methods); - if (num_implementing_args == -1) { - goto cleanup; + for (Py_ssize_t i = 0; i < len_args; i++) { + Py_INCREF(fast_args[i]); + PyTuple_SET_ITEM(args, i, fast_args[i]); } - - /* - * Handle the typical case of no overrides. This is merely an optimization - * if some arguments are ndarray objects, but is also necessary if no - * arguments implement __array_function__ at all (e.g., if they are all - * built-in types). - */ - int any_overrides = 0; - for (int j = 0; j < num_implementing_args; j++) { - if (!is_default_array_function(array_function_methods[j])) { - any_overrides = 1; - break; - } + kwargs = PyDict_New(); + if (kwargs == NULL) { + Py_DECREF(args); + return -1; } - if (!any_overrides) { - /* - * When the default implementation should be called, return - * `Py_NotImplemented` to indicate this. - */ - result = Py_NotImplemented; - goto cleanup; - } - - /* - * Create a Python object for types. - * We use a tuple, because it's the fastest Python collection to create - * and has the bonus of being immutable. - */ - types = PyTuple_New(num_implementing_args); - if (types == NULL) { - goto cleanup; - } - for (int j = 0; j < num_implementing_args; j++) { - PyObject *arg_type = (PyObject *)Py_TYPE(implementing_args[j]); - Py_INCREF(arg_type); - PyTuple_SET_ITEM(types, j, arg_type); - } - - /* Call __array_function__ methods */ - for (int j = 0; j < num_implementing_args; j++) { - PyObject *argument = implementing_args[j]; - PyObject *method = array_function_methods[j]; - - /* - * We use `public_api` instead of `implementation` here so - * __array_function__ implementations can do equality/identity - * comparisons. - */ - result = call_array_function( - argument, method, public_api, types, args, kwargs); - - if (result == Py_NotImplemented) { - /* Try the next one */ - Py_DECREF(result); - result = NULL; - } - else { - /* Either a good result, or an exception was raised. */ - goto cleanup; + if (kwnames != NULL) { + Py_ssize_t nkwargs = PyTuple_GET_SIZE(kwnames); + for (Py_ssize_t i = 0; i < nkwargs; i++) { + PyObject *key = PyTuple_GET_ITEM(kwnames, i); + PyObject *value = fast_args[i+len_args]; + if (PyDict_SetItem(kwargs, key, value) < 0) { + Py_DECREF(args); + Py_DECREF(kwargs); + return -1; + } } } + *out_args = args; + *out_kwargs = kwargs; + return 0; +} + +static void +set_no_matching_types_error(PyObject *public_api, PyObject *types) +{ + static PyObject *errmsg_formatter = NULL; /* No acceptable override found, raise TypeError. 
*/ npy_cache_import("numpy.core._internal", "array_function_errmsg_formatter", &errmsg_formatter); if (errmsg_formatter != NULL) { PyObject *errmsg = PyObject_CallFunctionObjArgs( - errmsg_formatter, public_api, types, NULL); + errmsg_formatter, public_api, types, NULL); if (errmsg != NULL) { PyErr_SetObject(PyExc_TypeError, errmsg); Py_DECREF(errmsg); } } - -cleanup: - for (int j = 0; j < num_implementing_args; j++) { - Py_DECREF(implementing_args[j]); - Py_DECREF(array_function_methods[j]); - } - Py_XDECREF(types); - Py_DECREF(relevant_args); - return result; -} - - -/* - * Implements the __array_function__ protocol for a Python function, as described in - * in NEP-18. See numpy.core.overrides for a full docstring. - */ -NPY_NO_EXPORT PyObject * -array_implement_array_function( - PyObject *NPY_UNUSED(dummy), PyObject *positional_args) -{ - PyObject *res, *implementation, *public_api, *relevant_args, *args, *kwargs; - - if (!PyArg_UnpackTuple( - positional_args, "implement_array_function", 5, 5, - &implementation, &public_api, &relevant_args, &args, &kwargs)) { - return NULL; - } - - /* - * Remove `like=` kwarg, which is NumPy-exclusive and thus not present - * in downstream libraries. If `like=` is specified but doesn't - * implement `__array_function__`, raise a `TypeError`. - */ - if (kwargs != NULL && PyDict_Contains(kwargs, npy_ma_str_like)) { - PyObject *like_arg = PyDict_GetItem(kwargs, npy_ma_str_like); - if (like_arg != NULL) { - PyObject *tmp_has_override = get_array_function(like_arg); - if (tmp_has_override == NULL) { - return PyErr_Format(PyExc_TypeError, - "The `like` argument must be an array-like that " - "implements the `__array_function__` protocol."); - } - Py_DECREF(tmp_has_override); - PyDict_DelItem(kwargs, npy_ma_str_like); - - /* - * If `like=` kwarg was removed, `implementation` points to the NumPy - * public API, as `public_api` is in that case the wrapper dispatcher - * function. For example, in the `np.full` case, `implementation` is - * `np.full`, whereas `public_api` is `_full_with_like`. This is done - * to ensure `__array_function__` implementations can do - * equality/identity comparisons when `like=` is present. - */ - public_api = implementation; - } - } - - res = array_implement_array_function_internal( - public_api, relevant_args, args, kwargs); - - if (res == Py_NotImplemented) { - return PyObject_Call(implementation, args, kwargs); - } - return res; } /* @@ -392,64 +280,52 @@ array_implement_c_array_function_creation( PyObject *args, PyObject *kwargs, PyObject *const *fast_args, Py_ssize_t len_args, PyObject *kwnames) { - PyObject *relevant_args = NULL; + PyObject *dispatch_types = NULL; PyObject *numpy_module = NULL; PyObject *public_api = NULL; PyObject *result = NULL; /* If `like` doesn't implement `__array_function__`, raise a `TypeError` */ - PyObject *tmp_has_override = get_array_function(like); - if (tmp_has_override == NULL) { + PyObject *method = get_array_function(like); + if (method == NULL) { return PyErr_Format(PyExc_TypeError, "The `like` argument must be an array-like that " "implements the `__array_function__` protocol."); } - Py_DECREF(tmp_has_override); - - if (fast_args != NULL) { + if (is_default_array_function(method)) { /* - * Convert from vectorcall convention, since the protocol requires - * the normal convention. We have to do this late to ensure the - * normal path where NotImplemented is returned is fast. + * Return a borrowed reference of Py_NotImplemented to defer back to + * the original function. 
*/ + Py_DECREF(method); + return Py_NotImplemented; + } + + /* We need args and kwargs (tuple and dict) to pass on to __array_function__. */ + if (fast_args != NULL) { assert(args == NULL); assert(kwargs == NULL); - args = PyTuple_New(len_args); - if (args == NULL) { - return NULL; - } - for (Py_ssize_t i = 0; i < len_args; i++) { - Py_INCREF(fast_args[i]); - PyTuple_SET_ITEM(args, i, fast_args[i]); - } - if (kwnames != NULL) { - kwargs = PyDict_New(); - if (kwargs == NULL) { - Py_DECREF(args); - return NULL; - } - Py_ssize_t nkwargs = PyTuple_GET_SIZE(kwnames); - for (Py_ssize_t i = 0; i < nkwargs; i++) { - PyObject *key = PyTuple_GET_ITEM(kwnames, i); - PyObject *value = fast_args[i+len_args]; - if (PyDict_SetItem(kwargs, key, value) < 0) { - Py_DECREF(args); - Py_DECREF(kwargs); - return NULL; - } - } + if (get_args_and_kwargs( + fast_args, len_args, kwnames, &args, &kwargs) < 0) { + goto finish; } } + else { + Py_INCREF(args); + Py_INCREF(kwargs); + } - relevant_args = PyTuple_Pack(1, like); - if (relevant_args == NULL) { + dispatch_types = PyTuple_Pack(1, Py_TYPE(like)); + if (dispatch_types == NULL) { goto finish; } + /* The like argument must be present in the keyword arguments; remove it */ if (PyDict_DelItem(kwargs, npy_ma_str_like) < 0) { goto finish; } + /* Fetch the actual symbol (the long way right now) */ numpy_module = PyImport_Import(npy_ma_str_numpy); if (numpy_module == NULL) { goto finish; } @@ -466,16 +342,21 @@ array_implement_c_array_function_creation( goto finish; } - result = array_implement_array_function_internal( - public_api, relevant_args, args, kwargs); + result = call_array_function(like, method, + public_api, dispatch_types, args, kwargs); - finish: - if (kwnames != NULL) { - /* args and kwargs were converted from vectorcall convention */ - Py_XDECREF(args); - Py_XDECREF(kwargs); + if (result == Py_NotImplemented) { + /* This shouldn't really happen as there is only one type, but... */ + Py_DECREF(result); + result = NULL; + set_no_matching_types_error(public_api, dispatch_types); } - Py_XDECREF(relevant_args); + + finish: + Py_DECREF(method); + Py_XDECREF(args); + Py_XDECREF(kwargs); + Py_XDECREF(dispatch_types); Py_XDECREF(public_api); return result; } @@ -530,3 +411,373 @@ cleanup: Py_DECREF(relevant_args); return result; } + + +typedef struct { + PyObject_HEAD + vectorcallfunc vectorcall; + PyObject *dict; + PyObject *relevant_arg_func; + PyObject *default_impl; + /* The following fields are used to clean up TypeError messages only: */ + PyObject *dispatcher_name; + PyObject *public_name; +} PyArray_ArrayFunctionDispatcherObject; + + +static void +dispatcher_dealloc(PyArray_ArrayFunctionDispatcherObject *self) +{ + Py_CLEAR(self->relevant_arg_func); + Py_CLEAR(self->default_impl); + Py_CLEAR(self->dict); + Py_CLEAR(self->dispatcher_name); + Py_CLEAR(self->public_name); + PyObject_FREE(self); +} + + +static void +fix_name_if_typeerror(PyArray_ArrayFunctionDispatcherObject *self) +{ + if (!PyErr_ExceptionMatches(PyExc_TypeError)) { + return; + } + + PyObject *exc, *val, *tb, *message; + PyErr_Fetch(&exc, &val, &tb); + + if (!PyUnicode_CheckExact(val)) { + /* + * We expect the error to be unnormalized, but maybe it isn't always + * the case, so normalize and fetch args[0] if it isn't a string.
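+ * (Illustration, at the Python level: an exception set via
+ * PyErr_SetString stays "unnormalized", with the value being the bare
+ * message string; after normalization the value is the exception
+ * instance and the message is its args[0], e.g.
+ *
+ *   assert TypeError("dispatch() takes 1 argument").args[0].startswith("dispatch")
+ *
+ * The tail-match and replacement below operate on exactly that string.)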
+ */ + PyErr_NormalizeException(&exc, &val, &tb); + + PyObject *args = PyObject_GetAttrString(val, "args"); + if (args == NULL || !PyTuple_CheckExact(args) + || PyTuple_GET_SIZE(args) != 1) { + Py_XDECREF(args); + goto restore_error; + } + message = PyTuple_GET_ITEM(args, 0); + Py_INCREF(message); + Py_DECREF(args); + if (!PyUnicode_CheckExact(message)) { + Py_DECREF(message); + goto restore_error; + } + } + else { + Py_INCREF(val); + message = val; + } + + Py_ssize_t cmp = PyUnicode_Tailmatch( + message, self->dispatcher_name, 0, -1, -1); + if (cmp <= 0) { + Py_DECREF(message); + goto restore_error; + } + Py_SETREF(message, PyUnicode_Replace( + message, self->dispatcher_name, self->public_name, 1)); + if (message == NULL) { + goto restore_error; + } + PyErr_SetObject(PyExc_TypeError, message); + Py_DECREF(exc); + Py_XDECREF(val); + Py_XDECREF(tb); + Py_DECREF(message); + return; + + restore_error: + /* replacement not successful, so restore original error */ + PyErr_Restore(exc, val, tb); +} + + +static PyObject * +dispatcher_vectorcall(PyArray_ArrayFunctionDispatcherObject *self, + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) +{ + PyObject *result = NULL; + PyObject *types = NULL; + PyObject *relevant_args = NULL; + + PyObject *public_api; + + /* __array_function__ passes args, kwargs. These may be filled: */ + PyObject *packed_args = NULL; + PyObject *packed_kwargs = NULL; + + PyObject *implementing_args[NPY_MAXARGS]; + PyObject *array_function_methods[NPY_MAXARGS]; + + int num_implementing_args; + + if (self->relevant_arg_func != NULL) { + public_api = (PyObject *)self; + + /* Typical path, need to call the relevant_arg_func and unpack them */ + relevant_args = PyObject_Vectorcall( + self->relevant_arg_func, args, len_args, kwnames); + if (relevant_args == NULL) { + fix_name_if_typeerror(self); + return NULL; + } + Py_SETREF(relevant_args, PySequence_Fast(relevant_args, + "dispatcher for __array_function__ did not return an iterable")); + if (relevant_args == NULL) { + return NULL; + } + + num_implementing_args = get_implementing_args_and_methods( + relevant_args, implementing_args, array_function_methods); + if (num_implementing_args < 0) { + Py_DECREF(relevant_args); + return NULL; + } + } + else { + /* For like= dispatching from Python, the public_symbol is the impl */ + public_api = self->default_impl; + + /* + * We are dealing with `like=` from Python. For simplicity, the + * Python code passes it on as the first argument. + */ + if (PyVectorcall_NARGS(len_args) == 0) { + PyErr_Format(PyExc_TypeError, + "`like` argument dispatching, but first argument is not " + "positional in call to %S.", self->default_impl); + return NULL; + } + + array_function_methods[0] = get_array_function(args[0]); + if (array_function_methods[0] == NULL) { + return PyErr_Format(PyExc_TypeError, + "The `like` argument must be an array-like that " + "implements the `__array_function__` protocol."); + } + num_implementing_args = 1; + implementing_args[0] = args[0]; + Py_INCREF(implementing_args[0]); + + /* do not pass the like argument */ + len_args = PyVectorcall_NARGS(len_args) - 1; + len_args |= PY_VECTORCALL_ARGUMENTS_OFFSET; + args++; + } + + /* + * Handle the typical case of no overrides. This is merely an optimization + * if some arguments are ndarray objects, but is also necessary if no + * arguments implement __array_function__ at all (e.g., if they are all + * built-in types). 
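+ * E.g. (illustrative) a call whose arguments are only ndarrays and
+ * builtins takes this fast path and never builds the args/kwargs
+ * tuple and dict:
+ *
+ *   import numpy as np
+ *   np.concatenate([np.ones(2), [3.0]])   # no override anywhere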
+ */ + int any_overrides = 0; + for (int j = 0; j < num_implementing_args; j++) { + if (!is_default_array_function(array_function_methods[j])) { + any_overrides = 1; + break; + } + } + if (!any_overrides) { + /* Directly call the actual implementation. */ + result = PyObject_Vectorcall(self->default_impl, args, len_args, kwnames); + goto cleanup; + } + + /* Find args and kwargs as tuple and dict, as we pass them out: */ + if (get_args_and_kwargs( + args, len_args, kwnames, &packed_args, &packed_kwargs) < 0) { + goto cleanup; + } + + /* + * Create a Python object for types. + * We use a tuple, because it's the fastest Python collection to create + * and has the bonus of being immutable. + */ + types = PyTuple_New(num_implementing_args); + if (types == NULL) { + goto cleanup; + } + for (int j = 0; j < num_implementing_args; j++) { + PyObject *arg_type = (PyObject *)Py_TYPE(implementing_args[j]); + Py_INCREF(arg_type); + PyTuple_SET_ITEM(types, j, arg_type); + } + + /* Call __array_function__ methods */ + for (int j = 0; j < num_implementing_args; j++) { + PyObject *argument = implementing_args[j]; + PyObject *method = array_function_methods[j]; + + result = call_array_function( + argument, method, public_api, types, + packed_args, packed_kwargs); + + if (result == Py_NotImplemented) { + /* Try the next one */ + Py_DECREF(result); + result = NULL; + } + else { + /* Either a good result, or an exception was raised. */ + goto cleanup; + } + } + + set_no_matching_types_error(public_api, types); + +cleanup: + for (int j = 0; j < num_implementing_args; j++) { + Py_DECREF(implementing_args[j]); + Py_DECREF(array_function_methods[j]); + } + Py_XDECREF(packed_args); + Py_XDECREF(packed_kwargs); + Py_XDECREF(types); + Py_XDECREF(relevant_args); + return result; +} + + +static PyObject * +dispatcher_new(PyTypeObject *NPY_UNUSED(cls), PyObject *args, PyObject *kwargs) +{ + PyArray_ArrayFunctionDispatcherObject *self; + + self = PyObject_New( + PyArray_ArrayFunctionDispatcherObject, + &PyArrayFunctionDispatcher_Type); + if (self == NULL) { + return PyErr_NoMemory(); + } + + char *kwlist[] = {"", "", NULL}; + if (!PyArg_ParseTupleAndKeywords( + args, kwargs, "OO:_ArrayFunctionDispatcher", kwlist, + &self->relevant_arg_func, &self->default_impl)) { + Py_DECREF(self); + return NULL; + } + + self->vectorcall = (vectorcallfunc)dispatcher_vectorcall; + Py_INCREF(self->default_impl); + self->dict = NULL; + self->dispatcher_name = NULL; + self->public_name = NULL; + + if (self->relevant_arg_func == Py_None) { + /* NULL in the relevant arg function means we use `like=` */ + Py_CLEAR(self->relevant_arg_func); + } + else { + /* Fetch names to clean up TypeErrors (show actual name) */ + Py_INCREF(self->relevant_arg_func); + self->dispatcher_name = PyObject_GetAttrString( + self->relevant_arg_func, "__qualname__"); + if (self->dispatcher_name == NULL) { + Py_DECREF(self); + return NULL; + } + self->public_name = PyObject_GetAttrString( + self->default_impl, "__qualname__"); + if (self->public_name == NULL) { + Py_DECREF(self); + return NULL; + } + } + + /* Need to be like a Python function that has arbitrary attributes */ + self->dict = PyDict_New(); + if (self->dict == NULL) { + Py_DECREF(self); + return NULL; + } + return (PyObject *)self; +} + + +static PyObject * +dispatcher_str(PyArray_ArrayFunctionDispatcherObject *self) +{ + return PyObject_Str(self->default_impl); +} + + +static PyObject * +dispatcher_repr(PyObject *self) +{ + PyObject *name = PyObject_GetAttrString(self, "__name__"); + if (name == NULL) { + 
return NULL; + } + /* Print like a normal function */ + return PyUnicode_FromFormat("<function %S at %p>", name, self); +} + + +static PyObject * +func_dispatcher___get__(PyObject *self, PyObject *obj, PyObject *cls) +{ + if (obj == NULL) { + /* Act like a static method, no need to bind */ + Py_INCREF(self); + return self; + } + return PyMethod_New(self, obj); +} + + +static PyObject * +dispatcher_get_implementation( + PyArray_ArrayFunctionDispatcherObject *self, void *NPY_UNUSED(closure)) +{ + Py_INCREF(self->default_impl); + return self->default_impl; +} + + +static PyObject * +dispatcher_reduce(PyObject *self, PyObject *NPY_UNUSED(args)) +{ + return PyObject_GetAttrString(self, "__qualname__"); +} + + +static struct PyMethodDef func_dispatcher_methods[] = { + {"__reduce__", + (PyCFunction)dispatcher_reduce, METH_NOARGS, NULL}, + {NULL, NULL, 0, NULL} +}; + + +static struct PyGetSetDef func_dispatcher_getset[] = { + {"__dict__", &PyObject_GenericGetDict, 0, NULL, 0}, + {"_implementation", (getter)&dispatcher_get_implementation, 0, NULL, 0}, + {0, 0, 0, 0, 0} +}; + + +NPY_NO_EXPORT PyTypeObject PyArrayFunctionDispatcher_Type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "numpy._ArrayFunctionDispatcher", + .tp_basicsize = sizeof(PyArray_ArrayFunctionDispatcherObject), + /* We have a dict, so in theory could traverse, but in practice... */ + .tp_dictoffset = offsetof(PyArray_ArrayFunctionDispatcherObject, dict), + .tp_dealloc = (destructor)dispatcher_dealloc, + .tp_new = (newfunc)dispatcher_new, + .tp_str = (reprfunc)dispatcher_str, + .tp_repr = (reprfunc)dispatcher_repr, + .tp_flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_VECTORCALL + | Py_TPFLAGS_METHOD_DESCRIPTOR), + .tp_methods = func_dispatcher_methods, + .tp_getset = func_dispatcher_getset, + .tp_descr_get = func_dispatcher___get__, + .tp_call = &PyVectorcall_Call, + .tp_vectorcall_offset = offsetof(PyArray_ArrayFunctionDispatcherObject, vectorcall), +}; diff --git a/numpy/core/src/multiarray/arrayfunction_override.h b/numpy/core/src/multiarray/arrayfunction_override.h index 09f7ee548..3b8b88bac 100644 --- a/numpy/core/src/multiarray/arrayfunction_override.h +++ b/numpy/core/src/multiarray/arrayfunction_override.h @@ -1,9 +1,7 @@ #ifndef NUMPY_CORE_SRC_MULTIARRAY_ARRAYFUNCTION_OVERRIDE_H_ #define NUMPY_CORE_SRC_MULTIARRAY_ARRAYFUNCTION_OVERRIDE_H_ -NPY_NO_EXPORT PyObject * -array_implement_array_function( - PyObject *NPY_UNUSED(dummy), PyObject *positional_args); +extern NPY_NO_EXPORT PyTypeObject PyArrayFunctionDispatcher_Type; NPY_NO_EXPORT PyObject * array__get_implementing_args( diff --git a/numpy/core/src/multiarray/arrayobject.c b/numpy/core/src/multiarray/arrayobject.c index 08e2cc683..46c4d90c9 100644 --- a/numpy/core/src/multiarray/arrayobject.c +++ b/numpy/core/src/multiarray/arrayobject.c @@ -56,6 +56,7 @@ maintainer email: oliphant.travis@ieee.org #include "alloc.h" #include "mem_overlap.h" #include "numpyos.h" +#include "refcount.h" #include "strfuncs.h" #include "binop_override.h" @@ -452,9 +453,11 @@ array_dealloc(PyArrayObject *self) } if ((fa->flags & NPY_ARRAY_OWNDATA) && fa->data) { - /* Free internal references if an Object array */ - if (PyDataType_FLAGCHK(fa->descr, NPY_ITEM_REFCOUNT)) { - PyArray_XDECREF(self); + /* Free any internal references */ + if (PyDataType_REFCHK(fa->descr)) { + if (PyArray_ClearArray(self) < 0) { + PyErr_WriteUnraisable(NULL); + } } if (fa->mem_handler == NULL) { char *env = getenv("NUMPY_WARN_IF_NO_MEM_POLICY"); @@ -994,7 +997,7 @@ array_richcompare(PyArrayObject *self, PyObject 
*other, int cmp_op) * TODO: If/once we correctly push structured comparisons into the ufunc * we could consider pushing this path into the ufunc itself as a * fallback loop (which ignores the input arrays). - * This would have the advantage that subclasses implemementing + * This would have the advantage that subclasses implementing * `__array_ufunc__` do not explicitly need `__eq__` and `__ne__`. */ if (result == NULL @@ -1220,7 +1223,7 @@ array_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds) (int)dims.len, dims.ptr, strides.ptr, NULL, is_f_order, NULL, NULL, - 0, 1); + _NPY_ARRAY_ALLOW_EMPTY_STRING); if (ret == NULL) { descr = NULL; goto fail; @@ -1257,7 +1260,7 @@ array_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds) subtype, descr, dims.len, dims.ptr, strides.ptr, offset + (char *)buffer.ptr, buffer.flags, NULL, buffer.base, - 0, 1); + _NPY_ARRAY_ALLOW_EMPTY_STRING); if (ret == NULL) { descr = NULL; goto fail; diff --git a/numpy/core/src/multiarray/arrayobject.h b/numpy/core/src/multiarray/arrayobject.h index f7d0734db..a6b3a32bd 100644 --- a/numpy/core/src/multiarray/arrayobject.h +++ b/numpy/core/src/multiarray/arrayobject.h @@ -40,6 +40,13 @@ static const int NPY_ARRAY_WARN_ON_WRITE = (1 << 31); static const int NPY_ARRAY_WAS_PYTHON_INT = (1 << 30); static const int NPY_ARRAY_WAS_PYTHON_FLOAT = (1 << 29); static const int NPY_ARRAY_WAS_PYTHON_COMPLEX = (1 << 28); +/* + * Mark that this was a huge int which was turned into an object array (or + * unsigned/non-default integer array), but then replaced by a temporary + * array for further processing. This flag is only used in the ufunc machinery + * where it is tricky to cover correctly all type resolution paths. + */ +static const int NPY_ARRAY_WAS_INT_AND_REPLACED = (1 << 27); static const int NPY_ARRAY_WAS_PYTHON_LITERAL = (1 << 30 | 1 << 29 | 1 << 28); #endif /* NUMPY_CORE_SRC_MULTIARRAY_ARRAYOBJECT_H_ */ diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index 3149ff36f..b84625c77 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -224,8 +224,6 @@ MyPyLong_AsUnsigned@Type@(PyObject *obj) ** GETITEM AND SETITEM ** ***************************************************************************** */ - -#define _ALIGN(type) offsetof(struct {char c; type v;}, v) /* * Disable harmless compiler warning "4116: unnamed type definition in * parentheses" which is caused by the _ALIGN macro. @@ -293,7 +291,7 @@ static int "Python integers to integer arrays. The conversion " "of %.100R to %S will fail in the future.\n" "For the old behavior, usually:\n" - " np.array(value).astype(dtype)`\n" + " np.array(value).astype(dtype)\n" "will give the desired result (the cast overflows).", obj, descr) < 0) { Py_DECREF(descr); @@ -4648,12 +4646,30 @@ set_typeinfo(PyObject *dict) * should be defined on the class and inherited to the scalar. * (NPY_HALF is the largest builtin one.) */ - for (i = 0; i <= NPY_HALF; i++) { - if (dtypemeta_wrap_legacy_descriptor(_builtin_descrs[i]) < 0) { - return -1; - } + /**begin repeat + * + * #NAME = BOOL, + * BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG, + * HALF, FLOAT, DOUBLE, LONGDOUBLE, + * CFLOAT, CDOUBLE, CLONGDOUBLE, + * OBJECT, STRING, UNICODE, VOID, + * DATETIME, TIMEDELTA# + */ + if (dtypemeta_wrap_legacy_descriptor( + _builtin_descrs[NPY_@NAME@], + "numpy.dtypes." NPY_@NAME@_Name "DType", +#ifdef NPY_@NAME@_alias + "numpy.dtypes." 
NPY_@NAME@_Alias "DType" +#else + NULL +#endif + ) < 0) { + return -1; } + /**end repeat**/ + /* * Add cast functions for the new types */ diff --git a/numpy/core/src/multiarray/arraytypes.h.src b/numpy/core/src/multiarray/arraytypes.h.src index aad464ccf..90a075dad 100644 --- a/numpy/core/src/multiarray/arraytypes.h.src +++ b/numpy/core/src/multiarray/arraytypes.h.src @@ -66,4 +66,102 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int @TYPE@_@func@, NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int BOOL_argmax, (npy_bool *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip)) + +/* + * Define DType and scalar type names and aliases as used in Python. + */ + +/**begin repeat + * #NAME = BOOL, + * HALF, FLOAT, DOUBLE, LONGDOUBLE, + * CFLOAT, CDOUBLE, CLONGDOUBLE, + * STRING, UNICODE, VOID, OBJECT, + * DATETIME, TIMEDELTA# + * #name = bool_, + * float16, float32, float64, longdouble, + * complex64, complex128, clongdouble, + * bytes_, str_, void, object_, + * datetime64, timedelta64# + * #Name = Bool, + * Float16, Float32, Float64, LongDouble, + * Complex64, Complex128, CLongDouble, + * Bytes, Str, Void, Object, + * DateTime64, TimeDelta64# + */ +#define NPY_@NAME@_name "@name@" +#define NPY_@NAME@_Name "@Name@" + +/**end repeat**/ + + +/* + * Give integers different names when they are the same size (gh-9799). + * `intX` always refers to the first int of that size in the sequence + * `['LONG', 'LONGLONG', 'INT', 'SHORT', 'BYTE']`. + * Unfortunately, since the bitsize names are not strictly fixed, we add + * the C name for all integer types (as aliases). + * + * Right now, we do not define the C aliases for floats (which are always + * the same). + */ + +#if NPY_SIZEOF_BYTE == NPY_SIZEOF_SHORT + #define BYTE_not_size_named +#endif +#if NPY_SIZEOF_SHORT == NPY_SIZEOF_INT + #define SHORT_not_size_named +#endif +#if NPY_SIZEOF_INT == NPY_SIZEOF_LONG + #define INT_not_size_named +#endif +#if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_LONG + #define LONGLONG_not_size_named +#endif + + +/**begin repeat + * #NAME = BYTE, SHORT, INT, LONG, LONGLONG, + * UBYTE, USHORT, UINT, ULONG, ULONGLONG# + * #CNAME = (BYTE, SHORT, INT, LONG, LONGLONG)*2# + * #cname = byte, short, intc, int_, longlong, + * ubyte, ushort, uintc, uint, ulonglong# + * #CName = Byte, Short, Int, Long, LongLong, + * UByte, UShort, UInt, ULong, ULongLong# + * #bitname = int*5, uint*5# + * #BitName = Int*5, UInt*5# + */ + +#ifdef @CNAME@_not_size_named + #define NPY_@NAME@_name "@cname@" + #define NPY_@NAME@_Name "@CName@" +#else + /* The C-name is considered just an alias for these: */ + #define NPY_@NAME@_alias "@cname@" + #define NPY_@NAME@_Alias "@CName@" + + /* The bitsof macro includes math, so cannot be stringified */ + #if NPY_BITSOF_@CNAME@ == 8 + #define NPY_@NAME@_name "@bitname@8" + #define NPY_@NAME@_Name "@BitName@8" + #elif NPY_BITSOF_@CNAME@ == 16 + #define NPY_@NAME@_name "@bitname@16" + #define NPY_@NAME@_Name "@BitName@16" + #elif NPY_BITSOF_@CNAME@ == 32 + #define NPY_@NAME@_name "@bitname@32" + #define NPY_@NAME@_Name "@BitName@32" + #elif NPY_BITSOF_@CNAME@ == 64 + #define NPY_@NAME@_name "@bitname@64" + #define NPY_@NAME@_Name "@BitName@64" + #else + #error "need to fix integer bit-length name code" + #endif +#endif + +/**end repeat**/ + +#undef BYTE_not_size_named +#undef SHORT_not_size_named +#undef INT_not_size_named +#undef LONGLONG_not_size_named + #endif /* NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ */ diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c index 76652550a..c9d881e16 100644 --- 
a/numpy/core/src/multiarray/buffer.c +++ b/numpy/core/src/multiarray/buffer.c @@ -17,6 +17,7 @@ #include "numpyos.h" #include "arrayobject.h" #include "scalartypes.h" +#include "dtypemeta.h" /************************************************************************* **************** Implement Buffer Protocol **************************** @@ -417,9 +418,16 @@ _buffer_format_string(PyArray_Descr *descr, _tmp_string_t *str, break; } default: - PyErr_Format(PyExc_ValueError, - "cannot include dtype '%c' in a buffer", - descr->type); + if (NPY_DT_is_legacy(NPY_DTYPE(descr))) { + PyErr_Format(PyExc_ValueError, + "cannot include dtype '%c' in a buffer", + descr->type); + } + else { + PyErr_Format(PyExc_ValueError, + "cannot include dtype '%s' in a buffer", + ((PyTypeObject*)NPY_DTYPE(descr))->tp_name); + } return -1; } } diff --git a/numpy/core/src/multiarray/calculation.c b/numpy/core/src/multiarray/calculation.c index a985a2308..62a994c64 100644 --- a/numpy/core/src/multiarray/calculation.c +++ b/numpy/core/src/multiarray/calculation.c @@ -7,6 +7,7 @@ #include "numpy/arrayobject.h" #include "lowlevel_strided_loops.h" +#include "dtypemeta.h" #include "npy_config.h" @@ -50,7 +51,7 @@ _PyArray_ArgMinMaxCommon(PyArrayObject *op, int axis_copy = axis; npy_intp _shape_buf[NPY_MAXDIMS]; npy_intp *out_shape; - // Keep the number of dimensions and shape of + // Keep the number of dimensions and shape of // original array. Helps when `keepdims` is True. npy_intp* original_op_shape = PyArray_DIMS(op); int out_ndim = PyArray_NDIM(op); @@ -87,9 +88,13 @@ _PyArray_ArgMinMaxCommon(PyArrayObject *op, op = ap; } - /* Will get native-byte order contiguous copy. */ - ap = (PyArrayObject *)PyArray_ContiguousFromAny((PyObject *)op, - PyArray_DESCR(op)->type_num, 1, 0); + // Will get native-byte order contiguous copy. + PyArray_Descr *descr = NPY_DT_CALL_ensure_canonical(PyArray_DESCR(op)); + if (descr == NULL) { + return NULL; + } + ap = (PyArrayObject *)PyArray_FromArray(op, descr, NPY_ARRAY_DEFAULT); + Py_DECREF(op); if (ap == NULL) { return NULL; @@ -106,9 +111,9 @@ _PyArray_ArgMinMaxCommon(PyArrayObject *op, for (int i = 0; i < out_ndim; i++) { out_shape[i] = 1; } - } + } else { - /* + /* * While `ap` may be transposed, we can ignore this for `out` because the * transpose only reorders the size 1 `axis` (not changing memory layout). */ diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c index da8d23a26..573d0d606 100644 --- a/numpy/core/src/multiarray/common.c +++ b/numpy/core/src/multiarray/common.c @@ -128,24 +128,6 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims, PyArray_Descr **out_dtype) } -NPY_NO_EXPORT int -_zerofill(PyArrayObject *ret) -{ - if (PyDataType_REFCHK(PyArray_DESCR(ret))) { - PyObject *zero = PyLong_FromLong(0); - PyArray_FillObjectArray(ret, zero); - Py_DECREF(zero); - if (PyErr_Occurred()) { - return -1; - } - } - else { - npy_intp n = PyArray_NBYTES(ret); - memset(PyArray_DATA(ret), 0, n); - } - return 0; -} - NPY_NO_EXPORT npy_bool _IsWriteable(PyArrayObject *ap) { @@ -459,3 +441,32 @@ new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2, PyArrayObject* out, } } +NPY_NO_EXPORT int +check_is_convertible_to_scalar(PyArrayObject *v) +{ + if (PyArray_NDIM(v) == 0) { + return 0; + } + + /* Remove this if-else block when the deprecation expires */ + if (PyArray_SIZE(v) == 1) { + /* Numpy 1.25.0, 2023-01-02 */ + if (DEPRECATE( + "Conversion of an array with ndim > 0 to a scalar " + "is deprecated, and will error in future. 
" + "Ensure you extract a single element from your array " + "before performing this operation. " + "(Deprecated NumPy 1.25.)") < 0) { + return -1; + } + return 0; + } else { + PyErr_SetString(PyExc_TypeError, + "only length-1 arrays can be converted to Python scalars"); + return -1; + } + + PyErr_SetString(PyExc_TypeError, + "only 0-dimensional arrays can be converted to Python scalars"); + return -1; +} diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h index 06322e902..cb9fadc4e 100644 --- a/numpy/core/src/multiarray/common.h +++ b/numpy/core/src/multiarray/common.h @@ -55,9 +55,6 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims, NPY_NO_EXPORT PyArray_Descr * _array_find_python_scalar_type(PyObject *op); -NPY_NO_EXPORT int -_zerofill(PyArrayObject *ret); - NPY_NO_EXPORT npy_bool _IsWriteable(PyArrayObject *ap); @@ -178,7 +175,19 @@ check_and_adjust_axis(int *axis, int ndim) } /* used for some alignment checks */ -#define _ALIGN(type) offsetof(struct {char c; type v;}, v) +/* + * GCC releases before GCC 4.9 had a bug in _Alignof. See GCC bug 52023 + * <https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52023>. + * clang versions < 8.0.0 have the same bug. + */ +#if (!defined __STDC_VERSION__ || __STDC_VERSION__ < 201112 \ + || (defined __GNUC__ && __GNUC__ < 4 + (__GNUC_MINOR__ < 9) \ + && !defined __clang__) \ + || (defined __clang__ && __clang_major__ < 8)) +# define _ALIGN(type) offsetof(struct {char c; type v;}, v) +#else +# define _ALIGN(type) _Alignof(type) +#endif #define _UINT_ALIGN(type) npy_uint_alignment(sizeof(type)) /* * Disable harmless compiler warning "4116: unnamed type definition in @@ -323,6 +332,13 @@ PyArray_TupleFromItems(int n, PyObject *const *items, int make_null_none) return tuple; } +/* + * Returns 0 if the array has rank 0, -1 otherwise. Prints a deprecation + * warning for arrays of _size_ 1. + */ +NPY_NO_EXPORT int +check_is_convertible_to_scalar(PyArrayObject *v); + #include "ucsnarrow.h" diff --git a/numpy/core/src/multiarray/compiled_base.c b/numpy/core/src/multiarray/compiled_base.c index 2ae36c13a..22f2547ad 100644 --- a/numpy/core/src/multiarray/compiled_base.c +++ b/numpy/core/src/multiarray/compiled_base.c @@ -8,6 +8,7 @@ #include "numpy/arrayobject.h" #include "numpy/npy_3kcompat.h" #include "numpy/npy_math.h" +#include "npy_argparse.h" #include "npy_config.h" #include "templ_common.h" /* for npy_mul_sizes_with_overflow */ #include "lowlevel_strided_loops.h" /* for npy_bswap8 */ @@ -109,7 +110,8 @@ minmax(const npy_intp *data, npy_intp data_len, npy_intp *mn, npy_intp *mx) * output array. 
*/ NPY_NO_EXPORT PyObject * -arr_bincount(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds) +arr_bincount(PyObject *NPY_UNUSED(self), PyObject *const *args, + Py_ssize_t len_args, PyObject *kwnames) { PyObject *list = NULL, *weight = Py_None, *mlength = NULL; PyArrayObject *lst = NULL, *ans = NULL, *wts = NULL; @@ -117,11 +119,14 @@ arr_bincount(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds) npy_intp minlength = 0; npy_intp i; double *weights , *dans; - static char *kwlist[] = {"list", "weights", "minlength", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO:bincount", - kwlist, &list, &weight, &mlength)) { - goto fail; + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("bincount", args, len_args, kwnames, + "list", NULL, &list, + "|weights", NULL, &weight, + "|minlength", NULL, &mlength, + NULL, NULL, NULL) < 0) { + return NULL; } lst = (PyArrayObject *)PyArray_ContiguousFromAny(list, NPY_INTP, 1, 1); @@ -265,7 +270,7 @@ arr__monotonicity(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds) * indicated by the mask */ NPY_NO_EXPORT PyObject * -arr_insert(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict) +arr_place(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict) { char *src, *dest; npy_bool *mask_data; @@ -489,7 +494,8 @@ binary_search_with_guess(const npy_double key, const npy_double *arr, #undef LIKELY_IN_CACHE_SIZE NPY_NO_EXPORT PyObject * -arr_interp(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict) +arr_interp(PyObject *NPY_UNUSED(self), PyObject *const *args, Py_ssize_t len_args, + PyObject *kwnames) { PyObject *fp, *xp, *x; @@ -500,12 +506,16 @@ arr_interp(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict) const npy_double *dy, *dx, *dz; npy_double *dres, *slopes = NULL; - static char *kwlist[] = {"x", "xp", "fp", "left", "right", NULL}; - NPY_BEGIN_THREADS_DEF; - if (!PyArg_ParseTupleAndKeywords(args, kwdict, "OOO|OO:interp", kwlist, - &x, &xp, &fp, &left, &right)) { + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("interp", args, len_args, kwnames, + "x", NULL, &x, + "xp", NULL, &xp, + "fp", NULL, &fp, + "|left", NULL, &left, + "|right", NULL, &right, + NULL, NULL, NULL) < 0) { return NULL; } @@ -654,7 +664,8 @@ fail: /* As for arr_interp but for complex fp values */ NPY_NO_EXPORT PyObject * -arr_interp_complex(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict) +arr_interp_complex(PyObject *NPY_UNUSED(self), PyObject *const *args, Py_ssize_t len_args, + PyObject *kwnames) { PyObject *fp, *xp, *x; @@ -667,12 +678,16 @@ arr_interp_complex(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwdict) npy_cdouble lval, rval; npy_cdouble *dres, *slopes = NULL; - static char *kwlist[] = {"x", "xp", "fp", "left", "right", NULL}; - NPY_BEGIN_THREADS_DEF; - if (!PyArg_ParseTupleAndKeywords(args, kwdict, "OOO|OO:interp_complex", - kwlist, &x, &xp, &fp, &left, &right)) { + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("interp_complex", args, len_args, kwnames, + "x", NULL, &x, + "xp", NULL, &xp, + "fp", NULL, &fp, + "|left", NULL, &left, + "|right", NULL, &right, + NULL, NULL, NULL) < 0) { return NULL; } @@ -1389,7 +1404,7 @@ fail: /* Can only be called if doc is currently NULL */ NPY_NO_EXPORT PyObject * -arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args) +arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len_args) { PyObject *obj; PyObject *str; @@ -1401,7 +1416,16 @@ arr_add_docstring(PyObject *NPY_UNUSED(dummy), PyObject *args) Py_RETURN_NONE; 
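    /*
     * Python-level effect of the stricter parsing below (illustrative;
     * `obj` stands for any object accepted by add_docstring):
     *
     *   from numpy.core.multiarray import add_docstring
     *   add_docstring(obj, 3)
     *   # TypeError: argument docstring of add_docstring should be a str
     */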
} - if (!PyArg_ParseTuple(args, "OO!:add_docstring", &obj, &PyUnicode_Type, &str)) { + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("add_docstring", args, len_args, NULL, + "", NULL, &obj, + "", NULL, &str, + NULL, NULL, NULL) < 0) { + return NULL; + } + if (!PyUnicode_Check(str)) { + PyErr_SetString(PyExc_TypeError, + "argument docstring of add_docstring should be a str"); return NULL; } diff --git a/numpy/core/src/multiarray/compiled_base.h b/numpy/core/src/multiarray/compiled_base.h index d3bc08cb2..e0e73ac79 100644 --- a/numpy/core/src/multiarray/compiled_base.h +++ b/numpy/core/src/multiarray/compiled_base.h @@ -4,21 +4,21 @@ #include "numpy/ndarraytypes.h" NPY_NO_EXPORT PyObject * -arr_insert(PyObject *, PyObject *, PyObject *); +arr_place(PyObject *, PyObject *, PyObject *); NPY_NO_EXPORT PyObject * -arr_bincount(PyObject *, PyObject *, PyObject *); +arr_bincount(PyObject *, PyObject *const *, Py_ssize_t, PyObject *); NPY_NO_EXPORT PyObject * arr__monotonicity(PyObject *, PyObject *, PyObject *kwds); NPY_NO_EXPORT PyObject * -arr_interp(PyObject *, PyObject *, PyObject *); +arr_interp(PyObject *, PyObject *const *, Py_ssize_t, PyObject *, PyObject *); NPY_NO_EXPORT PyObject * -arr_interp_complex(PyObject *, PyObject *, PyObject *); +arr_interp_complex(PyObject *, PyObject *const *, Py_ssize_t, PyObject *, PyObject *); NPY_NO_EXPORT PyObject * arr_ravel_multi_index(PyObject *, PyObject *, PyObject *); NPY_NO_EXPORT PyObject * arr_unravel_index(PyObject *, PyObject *, PyObject *); NPY_NO_EXPORT PyObject * -arr_add_docstring(PyObject *, PyObject *); +arr_add_docstring(PyObject *, PyObject *const *, Py_ssize_t); NPY_NO_EXPORT PyObject * io_pack(PyObject *, PyObject *, PyObject *); NPY_NO_EXPORT PyObject * diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c index 092a17c89..aef78ff5e 100644 --- a/numpy/core/src/multiarray/convert.c +++ b/numpy/core/src/multiarray/convert.c @@ -21,6 +21,7 @@ #include "convert.h" #include "array_coercion.h" +#include "refcount.h" int fallocate(int fd, int mode, off_t offset, off_t len); @@ -156,11 +157,35 @@ PyArray_ToFile(PyArrayObject *self, FILE *fp, char *sep, char *format) size = PyArray_SIZE(self); NPY_BEGIN_ALLOW_THREADS; -#if defined (_MSC_VER) && defined(_WIN64) - /* Workaround Win64 fwrite() bug. Ticket #1660 */ +#if defined(NPY_OS_WIN64) + /* + * Workaround Win64 fwrite() bug. Issue gh-2256 + * The native 64 windows runtime has this issue, the above will + * also trigger UCRT (which doesn't), so it could be more precise. + * + * If you touch this code, please run this test which is so slow + * it was removed from the test suite. 
Note that the original + * failure mode involves an infinite loop during tofile() + * + * import tempfile, numpy as np + * from numpy.testing import (assert_) + * fourgbplus = 2**32 + 2**16 + * testbytes = np.arange(8, dtype=np.int8) + * n = len(testbytes) + * flike = tempfile.NamedTemporaryFile() + * f = flike.file + * np.tile(testbytes, fourgbplus // testbytes.nbytes).tofile(f) + * flike.seek(0) + * a = np.fromfile(f, dtype=np.int8) + * flike.close() + * assert_(len(a) == fourgbplus) + * # check only start and end for speed: + * assert_((a[:n] == testbytes).all()) + * assert_((a[-n:] == testbytes).all()) + */ { - npy_intp maxsize = 2147483648 / PyArray_DESCR(self)->elsize; - npy_intp chunksize; + size_t maxsize = 2147483648 / (size_t)PyArray_DESCR(self)->elsize; + size_t chunksize; n = 0; while (size > 0) { @@ -168,7 +193,7 @@ PyArray_ToFile(PyArrayObject *self, FILE *fp, char *sep, char *format) n2 = fwrite((const void *) ((char *)PyArray_DATA(self) + (n * PyArray_DESCR(self)->elsize)), (size_t) PyArray_DESCR(self)->elsize, - (size_t) chunksize, fp); + chunksize, fp); if (n2 < chunksize) { break; } @@ -359,6 +384,11 @@ PyArray_ToString(PyArrayObject *self, NPY_ORDER order) NPY_NO_EXPORT int PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj) { + + if (PyArray_FailUnlessWriteable(arr, "assignment destination") < 0) { + return -1; + } + /* * If we knew that the output array has at least one element, we would * not actually need a helping buffer, we always null it, just in case. @@ -370,7 +400,7 @@ PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj) char *value = (char *)value_buffer_stack; PyArray_Descr *descr = PyArray_DESCR(arr); - if (descr->elsize > sizeof(value_buffer_stack)) { + if ((size_t)descr->elsize > sizeof(value_buffer_stack)) { /* We need a large temporary buffer... */ value_buffer_heap = PyObject_Calloc(1, descr->elsize); if (value_buffer_heap == NULL) { @@ -393,12 +423,17 @@ PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj) PyArray_BYTES(arr), PyArray_STRIDES(arr), descr, value); + if (PyDataType_REFCHK(descr)) { + PyArray_ClearBuffer(descr, value, 0, 1, 1); + } PyMem_FREE(value_buffer_heap); return retcode; } /* - * Fills an array with zeros. + * Internal function to fill an array with zeros. + * Used in einsum and dot, which ensures the dtype is, in some sense, numerical + * and not a str or struct * * dst: The destination array. * wheremask: If non-NULL, a boolean mask specifying where to set the values. 
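 * Conceptually: for object dtype every selected element is set to a
 * Python int 0, while all other dtypes are filled by safely casting a
 * raw boolean False. Python-visible consequence (illustrative):
 *
 *   import numpy as np
 *   np.dot(np.array([], dtype=object), np.array([], dtype=object))  # 0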
@@ -409,21 +444,26 @@ NPY_NO_EXPORT int PyArray_AssignZero(PyArrayObject *dst, PyArrayObject *wheremask) { - npy_bool value; - PyArray_Descr *bool_dtype; - int retcode; - - /* Create a raw bool scalar with the value False */ - bool_dtype = PyArray_DescrFromType(NPY_BOOL); - if (bool_dtype == NULL) { - return -1; + int retcode = 0; + if (PyArray_ISOBJECT(dst)) { + PyObject * pZero = PyLong_FromLong(0); + retcode = PyArray_AssignRawScalar(dst, PyArray_DESCR(dst), + (char *)&pZero, wheremask, NPY_SAFE_CASTING); + Py_DECREF(pZero); } - value = 0; + else { + /* Create a raw bool scalar with the value False */ + PyArray_Descr *bool_dtype = PyArray_DescrFromType(NPY_BOOL); + if (bool_dtype == NULL) { + return -1; + } + npy_bool value = 0; - retcode = PyArray_AssignRawScalar(dst, bool_dtype, (char *)&value, - wheremask, NPY_SAFE_CASTING); + retcode = PyArray_AssignRawScalar(dst, bool_dtype, (char *)&value, + wheremask, NPY_SAFE_CASTING); - Py_DECREF(bool_dtype); + Py_DECREF(bool_dtype); + } return retcode; } @@ -483,7 +523,7 @@ PyArray_View(PyArrayObject *self, PyArray_Descr *type, PyTypeObject *pytype) PyArray_NDIM(self), PyArray_DIMS(self), PyArray_STRIDES(self), PyArray_DATA(self), flags, (PyObject *)self, (PyObject *)self, - 0, 1); + _NPY_ARRAY_ENSURE_DTYPE_IDENTITY); if (ret == NULL) { Py_XDECREF(type); return NULL; diff --git a/numpy/core/src/multiarray/convert_datatype.c b/numpy/core/src/multiarray/convert_datatype.c index 3973fc795..de1cd075d 100644 --- a/numpy/core/src/multiarray/convert_datatype.c +++ b/numpy/core/src/multiarray/convert_datatype.c @@ -18,6 +18,7 @@ #include "can_cast_table.h" #include "common.h" #include "ctors.h" +#include "descriptor.h" #include "dtypemeta.h" #include "common_dtype.h" #include "scalartypes.h" @@ -31,6 +32,7 @@ #include "array_method.h" #include "usertypes.h" #include "dtype_transfer.h" +#include "dtype_traversal.h" #include "arrayobject.h" @@ -152,7 +154,7 @@ PyArray_GetCastingImpl(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to) { PyObject *res; if (from == to) { - res = NPY_DT_SLOTS(from)->within_dtype_castingimpl; + res = (PyObject *)NPY_DT_SLOTS(from)->within_dtype_castingimpl; } else { res = PyDict_GetItemWithError(NPY_DT_SLOTS(from)->castingimpls, (PyObject *)to); @@ -329,7 +331,7 @@ PyArray_CastToType(PyArrayObject *arr, PyArray_Descr *dtype, int is_f_order) return NULL; } - Py_SETREF(dtype, PyArray_AdaptDescriptorToArray(arr, (PyObject *)dtype)); + Py_SETREF(dtype, PyArray_AdaptDescriptorToArray(arr, NULL, dtype)); if (dtype == NULL) { return NULL; } @@ -1148,7 +1150,7 @@ PyArray_CastDescrToDType(PyArray_Descr *descr, PyArray_DTypeMeta *given_DType) */ NPY_NO_EXPORT PyArray_Descr * PyArray_FindConcatenationDescriptor( - npy_intp n, PyArrayObject **arrays, PyObject *requested_dtype) + npy_intp n, PyArrayObject **arrays, PyArray_Descr *requested_dtype) { if (requested_dtype == NULL) { return PyArray_ResultType(n, arrays, 0, NULL); @@ -2414,8 +2416,7 @@ PyArray_AddCastingImplementation(PyBoundArrayMethodObject *meth) return -1; } Py_INCREF(meth->method); - NPY_DT_SLOTS(meth->dtypes[0])->within_dtype_castingimpl = ( - (PyObject *)meth->method); + NPY_DT_SLOTS(meth->dtypes[0])->within_dtype_castingimpl = meth->method; return 0; } @@ -2668,7 +2669,7 @@ add_numeric_cast(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to) * consider moving this warning into the inner-loop at some point * for simplicity (this requires ensuring it is only emitted once). 
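 * Python-visible behavior of the cast in question, for illustration:
 *
 *   import numpy as np
 *   np.array([1+2j]).astype(np.float64)
 *   # ComplexWarning: Casting complex values to real discards the
 *   # imaginary part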
*/ - slots[5].slot = NPY_METH_get_loop; + slots[5].slot = _NPY_METH_get_loop; slots[5].pfunc = &complex_to_noncomplex_get_loop; slots[6].slot = 0; slots[6].pfunc = NULL; @@ -2689,7 +2690,7 @@ add_numeric_cast(PyArray_DTypeMeta *from, PyArray_DTypeMeta *to) /* When there is no casting (equivalent C-types) use byteswap loops */ slots[0].slot = NPY_METH_resolve_descriptors; slots[0].pfunc = &legacy_same_dtype_resolve_descriptors; - slots[1].slot = NPY_METH_get_loop; + slots[1].slot = _NPY_METH_get_loop; slots[1].pfunc = &get_byteswap_loop; slots[2].slot = 0; slots[2].pfunc = NULL; @@ -2873,14 +2874,14 @@ add_other_to_and_from_string_cast( */ PyArray_DTypeMeta *dtypes[2] = {other, string}; PyType_Slot slots[] = { - {NPY_METH_get_loop, &legacy_cast_get_strided_loop}, + {_NPY_METH_get_loop, &legacy_cast_get_strided_loop}, {NPY_METH_resolve_descriptors, &cast_to_string_resolve_descriptors}, {0, NULL}}; PyArrayMethod_Spec spec = { .name = "legacy_cast_to_string", .nin = 1, .nout = 1, - .flags = NPY_METH_REQUIRES_PYAPI, + .flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_NO_FLOATINGPOINT_ERRORS, .dtypes = dtypes, .slots = slots, }; @@ -3012,7 +3013,7 @@ PyArray_InitializeStringCasts(void) /* string<->string and unicode<->unicode have their own specialized casts */ PyArray_DTypeMeta *dtypes[2]; PyType_Slot slots[] = { - {NPY_METH_get_loop, &string_to_string_get_loop}, + {_NPY_METH_get_loop, &string_to_string_get_loop}, {NPY_METH_resolve_descriptors, &string_to_string_resolve_descriptors}, {0, NULL}}; PyArrayMethod_Spec spec = { @@ -3711,7 +3712,7 @@ PyArray_InitializeVoidToVoidCast(void) PyArray_DTypeMeta *Void = PyArray_DTypeFromTypeNum(NPY_VOID); PyArray_DTypeMeta *dtypes[2] = {Void, Void}; PyType_Slot slots[] = { - {NPY_METH_get_loop, &void_to_void_get_loop}, + {_NPY_METH_get_loop, &void_to_void_get_loop}, {NPY_METH_resolve_descriptors, &void_to_void_resolve_descriptors}, {0, NULL}}; PyArrayMethod_Spec spec = { @@ -3894,7 +3895,7 @@ PyArray_InitializeObjectToObjectCast(void) PyArray_DTypeMeta *Object = PyArray_DTypeFromTypeNum(NPY_OBJECT); PyArray_DTypeMeta *dtypes[2] = {Object, Object}; PyType_Slot slots[] = { - {NPY_METH_get_loop, &object_to_object_get_loop}, + {_NPY_METH_get_loop, &object_to_object_get_loop}, {0, NULL}}; PyArrayMethod_Spec spec = { .name = "object_to_object_cast", diff --git a/numpy/core/src/multiarray/convert_datatype.h b/numpy/core/src/multiarray/convert_datatype.h index 1a23965f8..a68c669ee 100644 --- a/numpy/core/src/multiarray/convert_datatype.h +++ b/numpy/core/src/multiarray/convert_datatype.h @@ -82,7 +82,7 @@ PyArray_CastDescrToDType(PyArray_Descr *descr, PyArray_DTypeMeta *given_DType); NPY_NO_EXPORT PyArray_Descr * PyArray_FindConcatenationDescriptor( - npy_intp n, PyArrayObject **arrays, PyObject *requested_dtype); + npy_intp n, PyArrayObject **arrays, PyArray_Descr *requested_dtype); NPY_NO_EXPORT int PyArray_AddCastingImplementation(PyBoundArrayMethodObject *meth); diff --git a/numpy/core/src/multiarray/ctors.c b/numpy/core/src/multiarray/ctors.c index 4f0bc7337..3c5b047c3 100644 --- a/numpy/core/src/multiarray/ctors.c +++ b/numpy/core/src/multiarray/ctors.c @@ -20,6 +20,8 @@ #include "common.h" #include "ctors.h" #include "convert_datatype.h" +#include "descriptor.h" +#include "dtypemeta.h" #include "shape.h" #include "npy_buffer.h" #include "lowlevel_strided_loops.h" @@ -626,8 +628,7 @@ NPY_NO_EXPORT PyObject * PyArray_NewFromDescr_int( PyTypeObject *subtype, PyArray_Descr *descr, int nd, npy_intp const *dims, npy_intp const *strides, void *data, - int flags, 
PyObject *obj, PyObject *base, int zeroed, - int allow_emptystring) + int flags, PyObject *obj, PyObject *base, _NPY_CREATION_FLAGS cflags) { PyArrayObject_fields *fa; npy_intp nbytes; @@ -642,44 +643,54 @@ PyArray_NewFromDescr_int( return NULL; } - if (descr->subarray) { - PyObject *ret; - npy_intp newdims[2*NPY_MAXDIMS]; - npy_intp *newstrides = NULL; - memcpy(newdims, dims, nd*sizeof(npy_intp)); - if (strides) { - newstrides = newdims + NPY_MAXDIMS; - memcpy(newstrides, strides, nd*sizeof(npy_intp)); - } - nd =_update_descr_and_dimensions(&descr, newdims, - newstrides, nd); - ret = PyArray_NewFromDescr_int( - subtype, descr, - nd, newdims, newstrides, data, - flags, obj, base, - zeroed, allow_emptystring); - return ret; - } - - /* Check datatype element size */ nbytes = descr->elsize; - if (PyDataType_ISUNSIZED(descr)) { - if (!PyDataType_ISFLEXIBLE(descr)) { - PyErr_SetString(PyExc_TypeError, "Empty data-type"); - Py_DECREF(descr); - return NULL; - } - else if (PyDataType_ISSTRING(descr) && !allow_emptystring && - data == NULL) { - PyArray_DESCR_REPLACE(descr); - if (descr == NULL) { + /* + * Unless explicitly asked not to, we do replace dtypes in some cases. + * This mainly means that we never create arrays with a subarray dtype + * (unless requested for internal use). And neither do we create + * S0/U0 arrays in most cases (the replacement happens only when + * data == NULL; otherwise this is probably a view, where growing the + * dtype would presumably be wrong). + */ + if (!(cflags & _NPY_ARRAY_ENSURE_DTYPE_IDENTITY)) { + if (descr->subarray) { + PyObject *ret; + npy_intp newdims[2*NPY_MAXDIMS]; + npy_intp *newstrides = NULL; + memcpy(newdims, dims, nd*sizeof(npy_intp)); + if (strides) { + newstrides = newdims + NPY_MAXDIMS; + memcpy(newstrides, strides, nd*sizeof(npy_intp)); + } + nd = _update_descr_and_dimensions(&descr, newdims, + newstrides, nd); + ret = PyArray_NewFromDescr_int( + subtype, descr, + nd, newdims, newstrides, data, + flags, obj, base, cflags); + return ret; + } + + /* Check datatype element size */ + if (PyDataType_ISUNSIZED(descr)) { + if (!PyDataType_ISFLEXIBLE(descr) && + NPY_DT_is_legacy(NPY_DTYPE(descr))) { + PyErr_SetString(PyExc_TypeError, "Empty data-type"); + Py_DECREF(descr); return NULL; } - if (descr->type_num == NPY_STRING) { - nbytes = descr->elsize = 1; - } - else { - nbytes = descr->elsize = sizeof(npy_ucs4); + else if (PyDataType_ISSTRING(descr) + && !(cflags & _NPY_ARRAY_ALLOW_EMPTY_STRING) + && data == NULL) { + PyArray_DESCR_REPLACE(descr); + if (descr == NULL) { + return NULL; + } + if (descr->type_num == NPY_STRING) { + nbytes = descr->elsize = 1; + } + else { + nbytes = descr->elsize = sizeof(npy_ucs4); + } } } } @@ -712,6 +723,11 @@ PyArray_NewFromDescr_int( fa->base = (PyObject *)NULL; fa->weakreflist = (PyObject *)NULL; + /* needed for zero-filling logic below, defined and initialized up here so cleanup logic can go in the fail block */ + NPY_traverse_info fill_zero_info; + NPY_traverse_info_init(&fill_zero_info); + if (nd > 0) { fa->dimensions = npy_alloc_cache_dim(2 * nd); if (fa->dimensions == NULL) { @@ -781,6 +797,32 @@ PyArray_NewFromDescr_int( if (data == NULL) { + /* float errors do not matter and we do not release GIL */ + NPY_ARRAYMETHOD_FLAGS zero_flags; + get_traverse_loop_function *get_fill_zero_loop = + NPY_DT_SLOTS(NPY_DTYPE(descr))->get_fill_zero_loop; + if (get_fill_zero_loop != NULL) { + if (get_fill_zero_loop( + NULL, descr, 1, descr->elsize, &(fill_zero_info.func), + &(fill_zero_info.auxdata), &zero_flags) < 0) { + goto fail; + } + } + + /* + * We
always want a zero-filled array allocated with calloc if + * NPY_NEEDS_INIT is set on the dtype, for safety. We also want a + * zero-filled array if zeroed is set and the zero-filling loop isn't + * defined, for better performance. + * + * If the zero-filling loop is defined and zeroed is set, allocate + * with malloc and let the zero-filling loop fill the array buffer + * with valid zero values for the dtype. + */ + int use_calloc = ( + PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT) || + ((cflags & _NPY_ARRAY_ZEROED) && (fill_zero_info.func == NULL))); + /* Store the handler in case the default is modified */ fa->mem_handler = PyDataMem_GetHandler(); if (fa->mem_handler == NULL) { @@ -798,11 +840,8 @@ PyArray_NewFromDescr_int( fa->strides[i] = 0; } } - /* - * It is bad to have uninitialized OBJECT pointers - * which could also be sub-fields of a VOID array - */ - if (zeroed || PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT)) { + + if (use_calloc) { data = PyDataMem_UserNEW_ZEROED(nbytes, 1, fa->mem_handler); } else { @@ -813,6 +852,19 @@ PyArray_NewFromDescr_int( goto fail; } + /* + * If the array needs special dtype-specific zero-filling logic, do that + */ + if (NPY_UNLIKELY((cflags & _NPY_ARRAY_ZEROED) + && (fill_zero_info.func != NULL))) { + npy_intp size = PyArray_MultiplyList(fa->dimensions, fa->nd); + if (fill_zero_info.func( + NULL, descr, data, size, descr->elsize, + fill_zero_info.auxdata) < 0) { + goto fail; + } + } + fa->flags |= NPY_ARRAY_OWNDATA; } else { @@ -907,9 +959,11 @@ PyArray_NewFromDescr_int( } } } + NPY_traverse_info_xfree(&fill_zero_info); return (PyObject *)fa; fail: + NPY_traverse_info_xfree(&fill_zero_info); Py_XDECREF(fa->mem_handler); Py_DECREF(fa); return NULL; @@ -957,7 +1011,7 @@ PyArray_NewFromDescrAndBase( { return PyArray_NewFromDescr_int(subtype, descr, nd, dims, strides, data, - flags, obj, base, 0, 0); + flags, obj, base, 0); } /* @@ -1440,167 +1494,44 @@ PyArray_GetArrayParamsFromObject(PyObject *NPY_UNUSED(op), } -/* - * This function is a legacy implementation to retain subarray dtype - * behaviour in array coercion. The behaviour here makes sense if tuples - * of matching dimensionality are being coerced. Due to the difficulty - * that the result is ill-defined for lists of array-likes, this is deprecated. - * - * WARNING: Do not use this function, it exists purely to support a deprecated - * code path. +/*NUMPY_API + * Does not check for NPY_ARRAY_ENSURECOPY and NPY_ARRAY_NOTSWAPPED in flags + * Steals a reference to newtype --- which can be NULL */ -static int -setArrayFromSequence(PyArrayObject *a, PyObject *s, - int dim, PyArrayObject * dst) +NPY_NO_EXPORT PyObject * +PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth, + int max_depth, int flags, PyObject *context) { - Py_ssize_t i, slen; - int res = -1; - - /* first recursion, view equal destination */ - if (dst == NULL) - dst = a; + npy_dtype_info dt_info = {NULL, NULL}; - /* - * This code is to ensure that the sequence access below will - * return a lower-dimensional sequence. - */ + int res = PyArray_ExtractDTypeAndDescriptor( + newtype, &dt_info.descr, &dt_info.dtype); - /* INCREF on entry DECREF on exit */ - Py_INCREF(s); - - PyObject *seq = NULL; - - if (PyArray_Check(s)) { - if (!(PyArray_CheckExact(s))) { - /* - * make sure a base-class array is used so that the dimensionality - * reduction assumption is correct. 
- */ - /* This will DECREF(s) if replaced */ - s = PyArray_EnsureArray(s); - if (s == NULL) { - goto fail; - } - } - - /* dst points to correct array subsection */ - if (PyArray_CopyInto(dst, (PyArrayObject *)s) < 0) { - goto fail; - } - - Py_DECREF(s); - return 0; - } - - if (dim > PyArray_NDIM(a)) { - PyErr_Format(PyExc_ValueError, - "setArrayFromSequence: sequence/array dimensions mismatch."); - goto fail; - } - - /* Try __array__ before using s as a sequence */ - PyObject *tmp = _array_from_array_like(s, NULL, 0, NULL, 0); - if (tmp == NULL) { - goto fail; - } - else if (tmp == Py_NotImplemented) { - Py_DECREF(tmp); - } - else { - int r = PyArray_CopyInto(dst, (PyArrayObject *)tmp); - Py_DECREF(tmp); - if (r < 0) { - goto fail; - } - Py_DECREF(s); - return 0; - } - - seq = PySequence_Fast(s, "Could not convert object to sequence"); - if (seq == NULL) { - goto fail; - } - slen = PySequence_Fast_GET_SIZE(seq); - - /* - * Either the dimensions match, or the sequence has length 1 and can - * be broadcast to the destination. - */ - if (slen != PyArray_DIMS(a)[dim] && slen != 1) { - PyErr_Format(PyExc_ValueError, - "cannot copy sequence with size %zd to array axis " - "with dimension %" NPY_INTP_FMT, slen, PyArray_DIMS(a)[dim]); - goto fail; - } - - /* Broadcast the one element from the sequence to all the outputs */ - if (slen == 1) { - PyObject *o = PySequence_Fast_GET_ITEM(seq, 0); - npy_intp alen = PyArray_DIM(a, dim); - - for (i = 0; i < alen; i++) { - if ((PyArray_NDIM(a) - dim) > 1) { - PyArrayObject * tmp = - (PyArrayObject *)array_item_asarray(dst, i); - if (tmp == NULL) { - goto fail; - } - - res = setArrayFromSequence(a, o, dim+1, tmp); - Py_DECREF(tmp); - } - else { - char * b = (PyArray_BYTES(dst) + i * PyArray_STRIDES(dst)[0]); - res = PyArray_SETITEM(dst, b, o); - } - if (res < 0) { - goto fail; - } - } - } - /* Copy element by element */ - else { - for (i = 0; i < slen; i++) { - PyObject * o = PySequence_Fast_GET_ITEM(seq, i); - if ((PyArray_NDIM(a) - dim) > 1) { - PyArrayObject * tmp = - (PyArrayObject *)array_item_asarray(dst, i); - if (tmp == NULL) { - goto fail; - } + Py_XDECREF(newtype); - res = setArrayFromSequence(a, o, dim+1, tmp); - Py_DECREF(tmp); - } - else { - char * b = (PyArray_BYTES(dst) + i * PyArray_STRIDES(dst)[0]); - res = PyArray_SETITEM(dst, b, o); - } - if (res < 0) { - goto fail; - } - } + if (res < 0) { + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); + return NULL; } - Py_DECREF(seq); - Py_DECREF(s); - return 0; + PyObject* ret = PyArray_FromAny_int(op, dt_info.descr, dt_info.dtype, + min_depth, max_depth, flags, context); - fail: - Py_XDECREF(seq); - Py_DECREF(s); - return res; + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); + return ret; } - - -/*NUMPY_API - * Does not check for NPY_ARRAY_ENSURECOPY and NPY_ARRAY_NOTSWAPPED in flags - * Steals a reference to newtype --- which can be NULL +/* + * Internal version of PyArray_FromAny that accepts a dtypemeta. Borrows + * references to the descriptor and dtype. 
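+ * (in_descr is a concrete descriptor instance, such as the one behind
+ * np.dtype("float64"), and in_DType its DType class; either or both may
+ * be NULL when the dtype is left to be discovered from `op`. E.g. both
+ * of the following Python calls funnel through here, `obj` being any
+ * array-like:
+ *
+ *   np.asarray(obj, dtype=np.float64)   # descriptor given
+ *   np.asarray(obj)                     # dtype discovered from obj
+ * )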
*/ + NPY_NO_EXPORT PyObject * -PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth, - int max_depth, int flags, PyObject *context) +PyArray_FromAny_int(PyObject *op, PyArray_Descr *in_descr, + PyArray_DTypeMeta *in_DType, int min_depth, int max_depth, + int flags, PyObject *context) { /* * This is the main code to make a NumPy array from a Python @@ -1617,89 +1548,14 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth, return NULL; } - PyArray_Descr *fixed_descriptor; - PyArray_DTypeMeta *fixed_DType; - if (PyArray_ExtractDTypeAndDescriptor((PyObject *)newtype, - &fixed_descriptor, &fixed_DType) < 0) { - Py_XDECREF(newtype); - return NULL; - } - Py_XDECREF(newtype); - ndim = PyArray_DiscoverDTypeAndShape(op, - NPY_MAXDIMS, dims, &cache, fixed_DType, fixed_descriptor, &dtype, + NPY_MAXDIMS, dims, &cache, in_DType, in_descr, &dtype, flags & NPY_ARRAY_ENSURENOCOPY); - Py_XDECREF(fixed_descriptor); - Py_XDECREF(fixed_DType); if (ndim < 0) { return NULL; } - if (NPY_UNLIKELY(fixed_descriptor != NULL && PyDataType_HASSUBARRAY(dtype))) { - /* - * When a subarray dtype was passed in, its dimensions are appended - * to the array dimension (causing a dimension mismatch). - * There is a problem with that, because if we coerce from non-arrays - * we do this correctly by element (as defined by tuples), but for - * arrays we first append the dimensions and then assign to the base - * dtype and then assign which causes the problem. - * - * Thus, we check if there is an array included, in that case we - * give a FutureWarning. - * When the warning is removed, PyArray_Pack will have to ensure - * that it does not append the dimensions when creating the subarrays - * to assign `arr[0] = obj[0]`. - */ - int includes_array = 0; - if (cache != NULL) { - /* This is not ideal, but it is a pretty special case */ - coercion_cache_obj *next = cache; - while (next != NULL) { - if (!next->sequence) { - includes_array = 1; - break; - } - next = next->next; - } - } - if (includes_array) { - npy_free_coercion_cache(cache); - - ret = (PyArrayObject *) PyArray_NewFromDescr( - &PyArray_Type, dtype, ndim, dims, NULL, NULL, - flags & NPY_ARRAY_F_CONTIGUOUS, NULL); - if (ret == NULL) { - return NULL; - } - assert(PyArray_NDIM(ret) != ndim); - - /* NumPy 1.20, 2020-10-01 */ - if (DEPRECATE_FUTUREWARNING( - "creating an array with a subarray dtype will behave " - "differently when the `np.array()` (or `asarray`, etc.) " - "call includes an array or array object.\n" - "If you are converting a single array or a list of arrays," - "you can opt-in to the future behaviour using:\n" - " np.array(arr, dtype=np.dtype(['f', dtype]))['f']\n" - " np.array([arr1, arr2], dtype=np.dtype(['f', dtype]))['f']\n" - "\n" - "By including a new field and indexing it after the " - "conversion.\n" - "This may lead to a different result or to current failures " - "succeeding. 
(FutureWarning since NumPy 1.20)") < 0) { - Py_DECREF(ret); - return NULL; - } - - if (setArrayFromSequence(ret, op, 0, NULL) < 0) { - Py_DECREF(ret); - return NULL; - } - return (PyObject *)ret; - } - } - if (dtype == NULL) { dtype = PyArray_DescrFromType(NPY_DEFAULT_TYPE); } @@ -1734,7 +1590,7 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth, } else if (cache == NULL && PyArray_IsScalar(op, Void) && !(((PyVoidScalarObject *)op)->flags & NPY_ARRAY_OWNDATA) && - newtype == NULL) { + ((in_descr == NULL) && (in_DType == NULL))) { /* * Special case, we return a *view* into void scalars, mainly to * allow things similar to the "reversed" assignment: @@ -1765,7 +1621,7 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth, return NULL; } - if (cache == NULL && newtype != NULL && + if (cache == NULL && in_descr != NULL && PyDataType_ISSIGNED(dtype) && PyArray_IsScalar(op, Generic)) { assert(ndim == 0); @@ -1896,7 +1752,6 @@ PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth, * NPY_ARRAY_FORCECAST will cause a cast to occur regardless of whether or not * it is safe. * - * context is passed through to PyArray_GetArrayParamsFromObject */ /*NUMPY_API @@ -1906,24 +1761,57 @@ NPY_NO_EXPORT PyObject * PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int min_depth, int max_depth, int requires, PyObject *context) { + npy_dtype_info dt_info = {NULL, NULL}; + + int res = PyArray_ExtractDTypeAndDescriptor( + descr, &dt_info.descr, &dt_info.dtype); + + Py_XDECREF(descr); + + if (res < 0) { + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); + return NULL; + } + + PyObject* ret = PyArray_CheckFromAny_int( + op, dt_info.descr, dt_info.dtype, min_depth, max_depth, requires, + context); + + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); + return ret; +} + +/* + * Internal version of PyArray_CheckFromAny that accepts a dtypemeta. Borrows + * references to the descriptor and dtype. 
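+ * When NPY_ARRAY_NOTSWAPPED is part of `requires`, any byte-swapped
+ * request is rewritten to native order below; e.g. asking for '>f8' on
+ * a little-endian machine effectively yields the '<f8' descriptor.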
+ */ + +NPY_NO_EXPORT PyObject * +PyArray_CheckFromAny_int(PyObject *op, PyArray_Descr *in_descr, + PyArray_DTypeMeta *in_DType, int min_depth, + int max_depth, int requires, PyObject *context) +{ PyObject *obj; if (requires & NPY_ARRAY_NOTSWAPPED) { - if (!descr && PyArray_Check(op) && + if (!in_descr && PyArray_Check(op) && PyArray_ISBYTESWAPPED((PyArrayObject* )op)) { - descr = PyArray_DescrNew(PyArray_DESCR((PyArrayObject *)op)); - if (descr == NULL) { + in_descr = PyArray_DescrNew(PyArray_DESCR((PyArrayObject *)op)); + if (in_descr == NULL) { return NULL; } } - else if (descr && !PyArray_ISNBO(descr->byteorder)) { - PyArray_DESCR_REPLACE(descr); + else if (in_descr && !PyArray_ISNBO(in_descr->byteorder)) { + PyArray_DESCR_REPLACE(in_descr); } - if (descr && descr->byteorder != NPY_IGNORE) { - descr->byteorder = NPY_NATIVE; + if (in_descr && in_descr->byteorder != NPY_IGNORE) { + in_descr->byteorder = NPY_NATIVE; } } - obj = PyArray_FromAny(op, descr, min_depth, max_depth, requires, context); + obj = PyArray_FromAny_int(op, in_descr, in_DType, min_depth, + max_depth, requires, context); if (obj == NULL) { return NULL; } @@ -2031,13 +1919,26 @@ PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int flags) if ((flags & NPY_ARRAY_ENSUREARRAY)) { subok = 0; } + Py_INCREF(newtype); ret = (PyArrayObject *)PyArray_NewLikeArray(arr, order, newtype, subok); if (ret == NULL) { + Py_DECREF(newtype); return NULL; } - if (PyArray_CopyInto(ret, arr) < 0) { + int actual_ndim = PyArray_NDIM(ret); + PyArray_Descr *actual_dtype = PyArray_DESCR(ret); + ((PyArrayObject_fields *)ret)->nd = PyArray_NDIM(arr); + ((PyArrayObject_fields *)ret)->descr = newtype; + + int success = PyArray_CopyInto(ret, arr); + + Py_DECREF(newtype); + ((PyArrayObject_fields *)ret)->nd = actual_ndim; + ((PyArrayObject_fields *)ret)->descr = actual_dtype; + + if (success < 0) { Py_DECREF(ret); return NULL; } @@ -2832,7 +2733,7 @@ PyArray_CopyAsFlat(PyArrayObject *dst, PyArrayObject *src, NPY_ORDER order) } if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { - int fpes = npy_get_floatstatus_barrier((char *)src_iter); + int fpes = npy_get_floatstatus_barrier((char *)&src_iter); if (fpes && PyUFunc_GiveFloatingpointErrors("cast", fpes) < 0) { return -1; } @@ -2956,23 +2857,13 @@ PyArray_Zeros(int nd, npy_intp const *dims, PyArray_Descr *type, int is_f_order) &PyArray_Type, type, nd, dims, NULL, NULL, is_f_order, NULL, NULL, - 1, 0); + _NPY_ARRAY_ZEROED); if (ret == NULL) { return NULL; } - /* handle objects */ - if (PyDataType_REFCHK(PyArray_DESCR(ret))) { - if (_zerofill(ret) < 0) { - Py_DECREF(ret); - return NULL; - } - } - - return (PyObject *)ret; - } /*NUMPY_API @@ -3620,7 +3511,7 @@ PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, npy_intp num, char *sep) &PyArray_Type, dtype, 1, &num, NULL, NULL, 0, NULL, NULL, - 0, 1); + _NPY_ARRAY_ALLOW_EMPTY_STRING); } if ((sep == NULL) || (strlen(sep) == 0)) { ret = array_fromfile_binary(fp, dtype, num, &nread); diff --git a/numpy/core/src/multiarray/ctors.h b/numpy/core/src/multiarray/ctors.h index 98160b1cc..a876ab2c0 100644 --- a/numpy/core/src/multiarray/ctors.h +++ b/numpy/core/src/multiarray/ctors.h @@ -13,12 +13,29 @@ PyArray_NewFromDescrAndBase( npy_intp const *dims, npy_intp const *strides, void *data, int flags, PyObject *obj, PyObject *base); + +/* Private options for NewFromDescriptor */ +typedef enum { + /* + * Indicate that the array should be zeroed; we may use calloc to do so + * (otherwise this behaves much like a plain, non-zeroed allocation).
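+ *
+ * (All members of the enum below are bit flags and may be combined; a
+ * hypothetical combined call, matching the declaration further down,
+ * could look like:)
+ *
+ *     PyArray_NewFromDescr_int(&PyArray_Type, descr, nd, dims,
+ *             NULL, NULL, 0, NULL, NULL,
+ *             _NPY_ARRAY_ZEROED | _NPY_ARRAY_ALLOW_EMPTY_STRING);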
+ */ + _NPY_ARRAY_ZEROED = 1 << 0, + /* Whether to allow empty strings (implied by ensure dtype identity) */ + _NPY_ARRAY_ALLOW_EMPTY_STRING = 1 << 1, + /* + * If we take a view into an existing array and use its dtype, then that + * dtype must be preserved (for example for subarray and S0, but also + * possibly for future dtypes that store more metadata). + */ + _NPY_ARRAY_ENSURE_DTYPE_IDENTITY = 1 << 2, +} _NPY_CREATION_FLAGS; + NPY_NO_EXPORT PyObject * PyArray_NewFromDescr_int( PyTypeObject *subtype, PyArray_Descr *descr, int nd, npy_intp const *dims, npy_intp const *strides, void *data, - int flags, PyObject *obj, PyObject *base, int zeroed, - int allow_emptystring); + int flags, PyObject *obj, PyObject *base, _NPY_CREATION_FLAGS cflags); NPY_NO_EXPORT PyObject * PyArray_NewLikeArrayWithShape( @@ -36,10 +53,20 @@ _array_from_array_like(PyObject *op, int never_copy); NPY_NO_EXPORT PyObject * +PyArray_FromAny_int(PyObject *op, PyArray_Descr *in_descr, + PyArray_DTypeMeta *in_DType, int min_depth, int max_depth, + int flags, PyObject *context); + +NPY_NO_EXPORT PyObject * PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int min_depth, int max_depth, int flags, PyObject *context); NPY_NO_EXPORT PyObject * +PyArray_CheckFromAny_int(PyObject *op, PyArray_Descr *in_descr, + PyArray_DTypeMeta *in_DType, int min_depth, + int max_depth, int requires, PyObject *context); + +NPY_NO_EXPORT PyObject * PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int min_depth, int max_depth, int requires, PyObject *context); diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c index 695b696c2..820f40fd8 100644 --- a/numpy/core/src/multiarray/datetime.c +++ b/numpy/core/src/multiarray/datetime.c @@ -4100,7 +4100,7 @@ PyArray_InitializeDatetimeCasts() }; slots[0].slot = NPY_METH_resolve_descriptors; slots[0].pfunc = &time_to_time_resolve_descriptors; - slots[1].slot = NPY_METH_get_loop; + slots[1].slot = _NPY_METH_get_loop; slots[1].pfunc = &time_to_time_get_loop; slots[2].slot = 0; slots[2].pfunc = NULL; @@ -4130,7 +4130,7 @@ PyArray_InitializeDatetimeCasts() slots[0].slot = NPY_METH_resolve_descriptors; slots[0].pfunc = &datetime_to_timedelta_resolve_descriptors; - slots[1].slot = NPY_METH_get_loop; + slots[1].slot = _NPY_METH_get_loop; slots[1].pfunc = &legacy_cast_get_strided_loop; slots[2].slot = 0; slots[2].pfunc = NULL; @@ -4203,7 +4203,7 @@ PyArray_InitializeDatetimeCasts() slots[0].slot = NPY_METH_resolve_descriptors; slots[0].pfunc = &time_to_string_resolve_descriptors; /* Strided loop differs for the two */ - slots[1].slot = NPY_METH_get_loop; + slots[1].slot = _NPY_METH_get_loop; slots[2].slot = 0; slots[2].pfunc = NULL; @@ -4252,7 +4252,7 @@ PyArray_InitializeDatetimeCasts() /* The default type resolution should work fine. 
*/ slots[0].slot = NPY_METH_resolve_descriptors; slots[0].pfunc = &string_to_datetime_cast_resolve_descriptors; - slots[1].slot = NPY_METH_get_loop; + slots[1].slot = _NPY_METH_get_loop; slots[1].pfunc = &string_to_datetime_cast_get_loop; slots[2].slot = 0; slots[2].pfunc = NULL; diff --git a/numpy/core/src/multiarray/datetime_busday.c b/numpy/core/src/multiarray/datetime_busday.c index d3e9e1451..93ed0972e 100644 --- a/numpy/core/src/multiarray/datetime_busday.c +++ b/numpy/core/src/multiarray/datetime_busday.c @@ -365,6 +365,7 @@ apply_business_day_count(npy_datetime date_begin, npy_datetime date_end, npy_datetime *holidays_begin, npy_datetime *holidays_end) { npy_int64 count, whole_weeks; + int day_of_week = 0; int swapped = 0; @@ -386,6 +387,10 @@ apply_business_day_count(npy_datetime date_begin, npy_datetime date_end, date_begin = date_end; date_end = tmp; swapped = 1; + // we swapped date_begin and date_end, so we need to correct for the + // original date_end that should not be included. gh-23197 + date_begin++; + date_end++; } /* Remove any earlier holidays */ diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c index d1973442a..68d398d64 100644 --- a/numpy/core/src/multiarray/descriptor.c +++ b/numpy/core/src/multiarray/descriptor.c @@ -74,9 +74,6 @@ _try_convert_from_ctypes_type(PyTypeObject *type) return (PyArray_Descr *)res; } -static PyArray_Descr * -_convert_from_any(PyObject *obj, int align); - /* * This function creates a dtype object when the object has a "dtype" attribute, * and it can be converted to a dtype object. @@ -1393,6 +1390,141 @@ PyArray_DescrConverter2(PyObject *obj, PyArray_Descr **at) } } + +/** + * Check whether the descriptor is a legacy "flexible" DType instance: an + * instance which is (normally) not attached to an array, such as a string + * of length 0 or a datetime with no unit. + * These should be largely deprecated, and represent only the DType class + * for most `dtype` parameters. + * + * TODO: This function should eventually receive a deprecation warning and + * be removed. + * + * @param descr + * @return 1 if this is not a concrete dtype instance, 0 otherwise + */ +static int +descr_is_legacy_parametric_instance(PyArray_Descr *descr, + PyArray_DTypeMeta *DType) +{ + if (!NPY_DT_is_legacy(DType)) { + return 0; + } + + if (PyDataType_ISUNSIZED(descr)) { + return 1; + } + /* Flexible descr with generic time unit (which can be adapted) */ + if (PyDataType_ISDATETIME(descr)) { + PyArray_DatetimeMetaData *meta; + meta = get_datetime_metadata_from_dtype(descr); + if (meta->base == NPY_FR_GENERIC) { + return 1; + } + } + return 0; +} + + +/** + * Given a descriptor (dtype instance), handles conversion of legacy flexible + * "unsized" descriptors to their DType. It returns the DType and descriptor; + * both results can be NULL (if the input is), but the DType is always set + * when a descriptor is set. + * + * @param dtype + * @param out_descr + * @param out_DType + * @return 0 on success, -1 on failure + */ +NPY_NO_EXPORT int +PyArray_ExtractDTypeAndDescriptor(PyArray_Descr *dtype, + PyArray_Descr **out_descr, PyArray_DTypeMeta **out_DType) +{ + *out_DType = NULL; + *out_descr = NULL; + + if (dtype != NULL) { + *out_DType = NPY_DTYPE(dtype); + Py_INCREF(*out_DType); + if (!descr_is_legacy_parametric_instance((PyArray_Descr *)dtype, + *out_DType)) { + *out_descr = (PyArray_Descr *)dtype; + Py_INCREF(*out_descr); + } + } + return 0; +} + + +/** + * Converter function filling in an npy_dtype_info struct on success.
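+ * Typical use is as an `O&` converter in `PyArg_ParseTuple`-style argument
+ * parsing, roughly (sketch; `args` and the error handling are hypothetical):
+ *
+ *     npy_dtype_info dt_info = {NULL, NULL};
+ *     if (!PyArg_ParseTuple(args, "O&",
+ *             PyArray_DTypeOrDescrConverterRequired, &dt_info)) {
+ *         return NULL;
+ *     }
+ *     // ... dt_info.dtype is always set, dt_info.descr may be NULL ...
+ *     Py_XDECREF(dt_info.descr);
+ *     Py_DECREF(dt_info.dtype);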
+ * + * @param obj representing a dtype instance (descriptor) or DType class. + * @param[out] dt_info filled with the DType class and dtype/descriptor + * instance. The class is always set while the instance may be NULL. + * On error, both will be NULL. + * @return 0 on failure and 1 on success (as a converter) + */ +NPY_NO_EXPORT int +PyArray_DTypeOrDescrConverterRequired(PyObject *obj, npy_dtype_info *dt_info) +{ + /* + * Allow dtype classes to pass through; this could also be generalized to + * at least some scalar types (right now most of these give instances) + */ + dt_info->dtype = NULL; + dt_info->descr = NULL; + + if (PyObject_TypeCheck(obj, &PyArrayDTypeMeta_Type)) { + Py_INCREF(obj); + dt_info->dtype = (PyArray_DTypeMeta *)obj; + dt_info->descr = NULL; + return NPY_SUCCEED; + } + PyArray_Descr *descr; + if (PyArray_DescrConverter(obj, &descr) != NPY_SUCCEED) { + return NPY_FAIL; + } + /* + * The above converts e.g. "S" or "S0" to the prototype instance; we make + * it behave the same as the DType. This is not fully correct: "S0" should + * be considered an instance with actual 0 length. + * TODO: It would be nice to fix that eventually. + */ + int res = PyArray_ExtractDTypeAndDescriptor( + descr, &dt_info->descr, &dt_info->dtype); + Py_DECREF(descr); + if (res < 0) { + return NPY_FAIL; + } + return NPY_SUCCEED; +} + + +/** + * Converter function filling in an npy_dtype_info struct on success. It + * accepts `None` and does nothing in that case (user must initialize to + * NULL anyway). + * + * @param obj None or obj representing a dtype instance (descr) or DType class. + * @param[out] dt_info filled with the DType class and dtype/descriptor + * instance. If `obj` is None, it is not modified. Otherwise the class + * is always set while the instance may be NULL. + * On error, both will be NULL. + * @return 0 on failure and 1 on success (as a converter) + */ +NPY_NO_EXPORT int +PyArray_DTypeOrDescrConverterOptional(PyObject *obj, npy_dtype_info *dt_info) +{ + if (obj == Py_None) { + /* caller must have initialized for the optional version */ + return NPY_SUCCEED; + } + return PyArray_DTypeOrDescrConverterRequired(obj, dt_info); + } + /** * Get a dtype instance from a python type */ @@ -2315,7 +2447,6 @@ arraydescr_new(PyTypeObject *subtype, PyErr_NoMemory(); return NULL; } - PyObject_Init((PyObject *)descr, subtype); descr->f = &NPY_DT_SLOTS(DType)->f; Py_XINCREF(DType->scalar_type); descr->typeobj = DType->scalar_type; @@ -3141,25 +3272,15 @@ arraydescr_newbyteorder(PyArray_Descr *self, PyObject *args) static PyObject * arraydescr_class_getitem(PyObject *cls, PyObject *args) { - PyObject *generic_alias; + const Py_ssize_t args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1; -#ifdef Py_GENERICALIASOBJECT_H - Py_ssize_t args_len; - - args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1; if (args_len != 1) { return PyErr_Format(PyExc_TypeError, "Too %s arguments for %s", args_len > 1 ?
"many" : "few", ((PyTypeObject *)cls)->tp_name); } - generic_alias = Py_GenericAlias(cls, args); -#else - PyErr_SetString(PyExc_TypeError, - "Type subscription requires python >= 3.9"); - generic_alias = NULL; -#endif - return generic_alias; + return Py_GenericAlias(cls, args); } static PyMethodDef arraydescr_methods[] = { diff --git a/numpy/core/src/multiarray/descriptor.h b/numpy/core/src/multiarray/descriptor.h index 7e6f212f2..b14edc3aa 100644 --- a/numpy/core/src/multiarray/descriptor.h +++ b/numpy/core/src/multiarray/descriptor.h @@ -1,6 +1,29 @@ #ifndef NUMPY_CORE_SRC_MULTIARRAY_DESCRIPTOR_H_ #define NUMPY_CORE_SRC_MULTIARRAY_DESCRIPTOR_H_ + +/* + * In some API calls we wish to allow users to pass a DType class or a + * dtype instances with different meanings. + * This struct is mainly used for the argument parsing in + * `PyArray_DTypeOrDescrConverter`. + */ +typedef struct { + PyArray_DTypeMeta *dtype; + PyArray_Descr *descr; +} npy_dtype_info; + + +NPY_NO_EXPORT int +PyArray_DTypeOrDescrConverterOptional(PyObject *, npy_dtype_info *dt_info); + +NPY_NO_EXPORT int +PyArray_DTypeOrDescrConverterRequired(PyObject *, npy_dtype_info *dt_info); + +NPY_NO_EXPORT int +PyArray_ExtractDTypeAndDescriptor(PyArray_Descr *dtype, + PyArray_Descr **out_descr, PyArray_DTypeMeta **out_DType); + NPY_NO_EXPORT PyObject *arraydescr_protocol_typestr_get( PyArray_Descr *, void *); NPY_NO_EXPORT PyObject *arraydescr_protocol_descr_get( diff --git a/numpy/core/src/multiarray/dlpack.c b/numpy/core/src/multiarray/dlpack.c index 1c6eb58f2..d26701df8 100644 --- a/numpy/core/src/multiarray/dlpack.c +++ b/numpy/core/src/multiarray/dlpack.c @@ -171,7 +171,10 @@ array_dlpack(PyArrayObject *self, managed_dtype.bits = 8 * itemsize; managed_dtype.lanes = 1; - if (PyDataType_ISSIGNED(dtype)) { + if (PyDataType_ISBOOL(dtype)) { + managed_dtype.code = kDLBool; + } + else if (PyDataType_ISSIGNED(dtype)) { managed_dtype.code = kDLInt; } else if (PyDataType_ISUNSIGNED(dtype)) { @@ -331,6 +334,11 @@ from_dlpack(PyObject *NPY_UNUSED(self), PyObject *obj) { const uint8_t bits = managed->dl_tensor.dtype.bits; const npy_intp itemsize = bits / 8; switch (managed->dl_tensor.dtype.code) { + case kDLBool: + if (bits == 8) { + typenum = NPY_BOOL; + } + break; case kDLInt: switch (bits) { diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c index 549300f73..f79662040 100644 --- a/numpy/core/src/multiarray/dtype_transfer.c +++ b/numpy/core/src/multiarray/dtype_transfer.c @@ -32,6 +32,7 @@ #include "shape.h" #include "dtype_transfer.h" +#include "dtype_traversal.h" #include "alloc.h" #include "dtypemeta.h" #include "array_method.h" @@ -73,18 +74,6 @@ _safe_print(PyObject *obj) } #endif -/* - * Returns a transfer function which DECREFs any references in src_type. - * - * Returns NPY_SUCCEED or NPY_FAIL. 
- */ -static int -get_decref_transfer_function(int aligned, - npy_intp src_stride, - PyArray_Descr *src_dtype, - NPY_cast_info *cast_info, - int *out_needs_api); - /*************************** COPY REFERENCES *******************************/ @@ -159,7 +148,7 @@ typedef struct { NpyAuxData base; PyArray_GetItemFunc *getitem; PyArrayObject_fields arr_fields; - NPY_cast_info decref_src; + NPY_traverse_info decref_src; } _any_to_object_auxdata; @@ -169,7 +158,7 @@ _any_to_object_auxdata_free(NpyAuxData *auxdata) _any_to_object_auxdata *data = (_any_to_object_auxdata *)auxdata; Py_DECREF(data->arr_fields.descr); - NPY_cast_info_xfree(&data->decref_src); + NPY_traverse_info_xfree(&data->decref_src); PyMem_Free(data); } @@ -187,7 +176,7 @@ _any_to_object_auxdata_clone(NpyAuxData *auxdata) Py_INCREF(res->arr_fields.descr); if (data->decref_src.func != NULL) { - if (NPY_cast_info_copy(&res->decref_src, &data->decref_src) < 0) { + if (NPY_traverse_info_copy(&res->decref_src, &data->decref_src) < 0) { NPY_AUXDATA_FREE((NpyAuxData *)res); return NULL; } @@ -228,8 +217,8 @@ _strided_to_strided_any_to_object( } if (data->decref_src.func != NULL) { /* If necessary, clear the input buffer (`move_references`) */ - if (data->decref_src.func(&data->decref_src.context, - &orig_src, &N, &src_stride, data->decref_src.auxdata) < 0) { + if (data->decref_src.func(NULL, data->decref_src.descr, + orig_src, N, src_stride, data->decref_src.auxdata) < 0) { return -1; } } @@ -265,18 +254,18 @@ any_to_object_get_loop( data->arr_fields.nd = 0; data->getitem = context->descriptors[0]->f->getitem; - NPY_cast_info_init(&data->decref_src); + NPY_traverse_info_init(&data->decref_src); if (move_references && PyDataType_REFCHK(context->descriptors[0])) { - int needs_api; - if (get_decref_transfer_function( + NPY_ARRAYMETHOD_FLAGS clear_flags; + if (PyArray_GetClearFunction( aligned, strides[0], context->descriptors[0], - &data->decref_src, - &needs_api) == NPY_FAIL) { + &data->decref_src, &clear_flags) < 0) { NPY_AUXDATA_FREE(*out_transferdata); *out_transferdata = NULL; return -1; } + *flags = PyArrayMethod_COMBINED_FLAGS(*flags, clear_flags); } return 0; } @@ -578,7 +567,7 @@ wrap_copy_swap_function( &PyArray_Type, dtype, 1, &shape, NULL, NULL, 0, NULL, NULL, - 0, 1); + _NPY_ARRAY_ENSURE_DTYPE_IDENTITY); if (data->arr == NULL) { PyMem_Free(data); return NPY_FAIL; @@ -1327,7 +1316,7 @@ get_legacy_dtype_cast_function( &PyArray_Type, tmp_dtype, 1, &shape, NULL, NULL, 0, NULL, NULL, - 0, 1); + _NPY_ARRAY_ENSURE_DTYPE_IDENTITY); if (data->aip == NULL) { PyMem_Free(data); return NPY_FAIL; @@ -1354,7 +1343,7 @@ get_legacy_dtype_cast_function( &PyArray_Type, tmp_dtype, 1, &shape, NULL, NULL, 0, NULL, NULL, - 0, 1); + _NPY_ARRAY_ENSURE_DTYPE_IDENTITY); if (data->aop == NULL) { Py_DECREF(data->aip); PyMem_Free(data); @@ -1393,7 +1382,7 @@ typedef struct { npy_intp N; NPY_cast_info wrapped; /* If finish->func is non-NULL the source needs a decref */ - NPY_cast_info decref_src; + NPY_traverse_info decref_src; } _one_to_n_data; /* transfer data free function */ @@ -1401,7 +1390,7 @@ static void _one_to_n_data_free(NpyAuxData *data) { _one_to_n_data *d = (_one_to_n_data *)data; NPY_cast_info_xfree(&d->wrapped); - NPY_cast_info_xfree(&d->decref_src); + NPY_traverse_info_xfree(&d->decref_src); PyMem_Free(data); } @@ -1420,7 +1409,7 @@ static NpyAuxData *_one_to_n_data_clone(NpyAuxData *data) newdata->base.clone = &_one_to_n_data_clone; newdata->N = d->N; /* Initialize in case of error, or if it is unused */ - 
NPY_cast_info_init(&newdata->decref_src); + NPY_traverse_info_init(&newdata->decref_src); if (NPY_cast_info_copy(&newdata->wrapped, &d->wrapped) < 0) { _one_to_n_data_free((NpyAuxData *)newdata); @@ -1430,7 +1419,7 @@ static NpyAuxData *_one_to_n_data_clone(NpyAuxData *data) return (NpyAuxData *)newdata; } - if (NPY_cast_info_copy(&newdata->decref_src, &d->decref_src) < 0) { + if (NPY_traverse_info_copy(&newdata->decref_src, &d->decref_src) < 0) { _one_to_n_data_free((NpyAuxData *)newdata); return NULL; } @@ -1490,8 +1479,8 @@ _strided_to_strided_one_to_n_with_finish( return -1; } - if (d->decref_src.func(&d->decref_src.context, - &src, &one_item, &zero_stride, d->decref_src.auxdata) < 0) { + if (d->decref_src.func(NULL, d->decref_src.descr, + src, one_item, zero_stride, d->decref_src.auxdata) < 0) { return -1; } @@ -1522,7 +1511,7 @@ get_one_to_n_transfer_function(int aligned, data->base.free = &_one_to_n_data_free; data->base.clone = &_one_to_n_data_clone; data->N = N; - NPY_cast_info_init(&data->decref_src); /* In case of error */ + NPY_traverse_info_init(&data->decref_src); /* In case of error */ /* * move_references is set to 0, handled in the wrapping transfer fn, @@ -1542,15 +1531,14 @@ get_one_to_n_transfer_function(int aligned, /* If the src object will need a DECREF, set src_dtype */ if (move_references && PyDataType_REFCHK(src_dtype)) { - *out_flags |= NPY_METH_REQUIRES_PYAPI; - if (get_decref_transfer_function(aligned, - src_stride, - src_dtype, - &data->decref_src, - NULL) != NPY_SUCCEED) { + NPY_ARRAYMETHOD_FLAGS clear_flags; + if (PyArray_GetClearFunction( + aligned, src_stride, src_dtype, + &data->decref_src, &clear_flags) < 0) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } + *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, clear_flags); } if (data->decref_src.func == NULL) { @@ -1741,8 +1729,8 @@ typedef struct { typedef struct { NpyAuxData base; NPY_cast_info wrapped; - NPY_cast_info decref_src; - NPY_cast_info decref_dst; /* The use-case should probably be deprecated */ + NPY_traverse_info decref_src; + NPY_traverse_info decref_dst; /* The use-case should probably be deprecated */ npy_intp src_N, dst_N; /* This gets a run-length encoded representation of the transfer */ npy_intp run_count; @@ -1755,8 +1743,8 @@ static void _subarray_broadcast_data_free(NpyAuxData *data) { _subarray_broadcast_data *d = (_subarray_broadcast_data *)data; NPY_cast_info_xfree(&d->wrapped); - NPY_cast_info_xfree(&d->decref_src); - NPY_cast_info_xfree(&d->decref_dst); + NPY_traverse_info_xfree(&d->decref_src); + NPY_traverse_info_xfree(&d->decref_dst); PyMem_Free(data); } @@ -1780,21 +1768,21 @@ static NpyAuxData *_subarray_broadcast_data_clone(NpyAuxData *data) newdata->run_count = d->run_count; memcpy(newdata->offsetruns, d->offsetruns, offsetruns_size); - NPY_cast_info_init(&newdata->decref_src); - NPY_cast_info_init(&newdata->decref_dst); + NPY_traverse_info_init(&newdata->decref_src); + NPY_traverse_info_init(&newdata->decref_dst); if (NPY_cast_info_copy(&newdata->wrapped, &d->wrapped) < 0) { _subarray_broadcast_data_free((NpyAuxData *)newdata); return NULL; } if (d->decref_src.func != NULL) { - if (NPY_cast_info_copy(&newdata->decref_src, &d->decref_src) < 0) { + if (NPY_traverse_info_copy(&newdata->decref_src, &d->decref_src) < 0) { _subarray_broadcast_data_free((NpyAuxData *) newdata); return NULL; } } if (d->decref_dst.func != NULL) { - if (NPY_cast_info_copy(&newdata->decref_dst, &d->decref_dst) < 0) { + if (NPY_traverse_info_copy(&newdata->decref_dst, &d->decref_dst) 
< 0) { _subarray_broadcast_data_free((NpyAuxData *) newdata); return NULL; } @@ -1883,8 +1871,8 @@ _strided_to_strided_subarray_broadcast_withrefs( } else { if (d->decref_dst.func != NULL) { - if (d->decref_dst.func(&d->decref_dst.context, - &dst_ptr, &count, &dst_subitemsize, + if (d->decref_dst.func(NULL, d->decref_dst.descr, + dst_ptr, count, dst_subitemsize, d->decref_dst.auxdata) < 0) { return -1; } @@ -1895,8 +1883,8 @@ _strided_to_strided_subarray_broadcast_withrefs( } if (d->decref_src.func != NULL) { - if (d->decref_src.func(&d->decref_src.context, - &src, &d->src_N, &src_subitemsize, + if (d->decref_src.func(NULL, d->decref_src.descr, + src, d->src_N, src_subitemsize, d->decref_src.auxdata) < 0) { return -1; } @@ -1939,8 +1927,8 @@ get_subarray_broadcast_transfer_function(int aligned, data->src_N = src_size; data->dst_N = dst_size; - NPY_cast_info_init(&data->decref_src); - NPY_cast_info_init(&data->decref_dst); + NPY_traverse_info_init(&data->decref_src); + NPY_traverse_info_init(&data->decref_dst); /* * move_references is set to 0, handled in the wrapping transfer fn, @@ -1959,12 +1947,9 @@ get_subarray_broadcast_transfer_function(int aligned, /* If the src object will need a DECREF */ if (move_references && PyDataType_REFCHK(src_dtype)) { - if (PyArray_GetDTypeTransferFunction(aligned, - src_dtype->elsize, 0, - src_dtype, NULL, - 1, - &data->decref_src, - out_flags) != NPY_SUCCEED) { + if (PyArray_GetClearFunction(aligned, + src_dtype->elsize, src_dtype, + &data->decref_src, out_flags) < 0) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } @@ -1972,12 +1957,9 @@ get_subarray_broadcast_transfer_function(int aligned, /* If the dst object needs a DECREF to set it to NULL */ if (PyDataType_REFCHK(dst_dtype)) { - if (PyArray_GetDTypeTransferFunction(aligned, - dst_dtype->elsize, 0, - dst_dtype, NULL, - 1, - &data->decref_dst, - out_flags) != NPY_SUCCEED) { + if (PyArray_GetClearFunction(aligned, + dst_dtype->elsize, dst_dtype, + &data->decref_dst, out_flags) < 0) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } @@ -2182,6 +2164,7 @@ typedef struct { typedef struct { NpyAuxData base; npy_intp field_count; + NPY_traverse_info decref_src; _single_field_transfer fields[]; } _field_transfer_data; @@ -2190,6 +2173,7 @@ typedef struct { static void _field_transfer_data_free(NpyAuxData *data) { _field_transfer_data *d = (_field_transfer_data *)data; + NPY_traverse_info_xfree(&d->decref_src); for (npy_intp i = 0; i < d->field_count; ++i) { NPY_cast_info_xfree(&d->fields[i].info); @@ -2213,6 +2197,10 @@ static NpyAuxData *_field_transfer_data_clone(NpyAuxData *data) } newdata->base = d->base; newdata->field_count = 0; + if (NPY_traverse_info_copy(&newdata->decref_src, &d->decref_src) < 0) { + PyMem_Free(newdata); + return NULL; + } /* Copy all the fields transfer data */ for (npy_intp i = 0; i < field_count; ++i) { @@ -2254,6 +2242,11 @@ _strided_to_strided_field_transfer( return -1; } } + if (d->decref_src.func != NULL && d->decref_src.func( + NULL, d->decref_src.descr, src, blocksize, src_stride, + d->decref_src.auxdata) < 0) { + return -1; + } N -= NPY_LOWLEVEL_BUFFER_BLOCKSIZE; src += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*src_stride; dst += NPY_LOWLEVEL_BUFFER_BLOCKSIZE*dst_stride; @@ -2267,6 +2260,11 @@ _strided_to_strided_field_transfer( return -1; } } + if (d->decref_src.func != NULL && d->decref_src.func( + NULL, d->decref_src.descr, src, blocksize, src_stride, + d->decref_src.auxdata) < 0) { + return -1; + } return 0; } } @@ -2313,6 +2311,7 @@ 
get_fields_transfer_function(int NPY_UNUSED(aligned), data->base.free = &_field_transfer_data_free; data->base.clone = &_field_transfer_data_clone; data->field_count = 0; + NPY_traverse_info_init(&data->decref_src); *out_flags = PyArrayMethod_MINIMAL_FLAGS; for (i = 0; i < field_count; ++i) { @@ -2340,23 +2339,17 @@ get_fields_transfer_function(int NPY_UNUSED(aligned), } /* - * If references should be decrefd in src, add another transfer - * function to do that. Since a decref function only uses a single - * input, the second one (normally output) just does not matter here. + * If references should be decrefd in src, add a clear function. */ if (move_references && PyDataType_REFCHK(src_dtype)) { - *out_flags |= NPY_METH_REQUIRES_PYAPI; - if (get_decref_transfer_function(0, - src_stride, - src_dtype, - &data->fields[field_count].info, - NULL) != NPY_SUCCEED) { + NPY_ARRAYMETHOD_FLAGS clear_flags; + if (PyArray_GetClearFunction( + 0, src_stride, src_dtype, &data->decref_src, + &clear_flags) < 0) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } - data->fields[field_count].src_offset = 0; - data->fields[field_count].dst_offset = 0; - data->field_count = field_count; + *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, clear_flags); } *out_stransfer = &_strided_to_strided_field_transfer; @@ -2384,6 +2377,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned), } data->base.free = &_field_transfer_data_free; data->base.clone = &_field_transfer_data_clone; + NPY_traverse_info_init(&data->decref_src); key = PyTuple_GET_ITEM(src_dtype->names, 0); tup = PyDict_GetItem(src_dtype->fields, key); @@ -2432,6 +2426,7 @@ get_fields_transfer_function(int NPY_UNUSED(aligned), data->base.free = &_field_transfer_data_free; data->base.clone = &_field_transfer_data_clone; data->field_count = 0; + NPY_traverse_info_init(&data->decref_src); *out_flags = PyArrayMethod_MINIMAL_FLAGS; /* set up the transfer function for each field */ @@ -2473,69 +2468,6 @@ get_fields_transfer_function(int NPY_UNUSED(aligned), return NPY_SUCCEED; } -static int -get_decref_fields_transfer_function(int NPY_UNUSED(aligned), - npy_intp src_stride, - PyArray_Descr *src_dtype, - PyArrayMethod_StridedLoop **out_stransfer, - NpyAuxData **out_transferdata, - int *out_needs_api) -{ - PyObject *names, *key, *tup, *title; - PyArray_Descr *src_fld_dtype; - npy_int i, structsize; - Py_ssize_t field_count; - int src_offset; - - names = src_dtype->names; - field_count = PyTuple_GET_SIZE(src_dtype->names); - - /* Over-allocating here: less fields may be used */ - structsize = sizeof(_field_transfer_data) + - field_count * sizeof(_single_field_transfer); - /* Allocate the data and populate it */ - _field_transfer_data *data = PyMem_Malloc(structsize); - if (data == NULL) { - PyErr_NoMemory(); - return NPY_FAIL; - } - data->base.free = &_field_transfer_data_free; - data->base.clone = &_field_transfer_data_clone; - data->field_count = 0; - - _single_field_transfer *field = data->fields; - for (i = 0; i < field_count; ++i) { - key = PyTuple_GET_ITEM(names, i); - tup = PyDict_GetItem(src_dtype->fields, key); - if (!PyArg_ParseTuple(tup, "Oi|O", &src_fld_dtype, - &src_offset, &title)) { - NPY_AUXDATA_FREE((NpyAuxData *)data); - return NPY_FAIL; - } - if (PyDataType_REFCHK(src_fld_dtype)) { - if (out_needs_api) { - *out_needs_api = 1; - } - if (get_decref_transfer_function(0, - src_stride, - src_fld_dtype, - &field->info, - out_needs_api) != NPY_SUCCEED) { - NPY_AUXDATA_FREE((NpyAuxData *)data); - return NPY_FAIL; - } - field->src_offset = 
src_offset; - data->field_count++; - field++; - } - } - - *out_stransfer = &_strided_to_strided_field_transfer; - *out_transferdata = (NpyAuxData *)data; - - return NPY_SUCCEED; -} - /************************* MASKED TRANSFER WRAPPER *************************/ @@ -2544,7 +2476,7 @@ typedef struct { /* The transfer function being wrapped (could likely be stored directly) */ NPY_cast_info wrapped; /* The src decref function if necessary */ - NPY_cast_info decref_src; + NPY_traverse_info decref_src; } _masked_wrapper_transfer_data; /* transfer data free function */ @@ -2553,7 +2485,7 @@ _masked_wrapper_transfer_data_free(NpyAuxData *data) { _masked_wrapper_transfer_data *d = (_masked_wrapper_transfer_data *)data; NPY_cast_info_xfree(&d->wrapped); - NPY_cast_info_xfree(&d->decref_src); + NPY_traverse_info_xfree(&d->decref_src); PyMem_Free(data); } @@ -2576,7 +2508,7 @@ _masked_wrapper_transfer_data_clone(NpyAuxData *data) return NULL; } if (d->decref_src.func != NULL) { - if (NPY_cast_info_copy(&newdata->decref_src, &d->decref_src) < 0) { + if (NPY_traverse_info_copy(&newdata->decref_src, &d->decref_src) < 0) { NPY_AUXDATA_FREE((NpyAuxData *)newdata); return NULL; } @@ -2586,7 +2518,7 @@ _masked_wrapper_transfer_data_clone(NpyAuxData *data) } static int -_strided_masked_wrapper_decref_transfer_function( +_strided_masked_wrapper_clear_function( PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, const npy_intp *dimensions, const npy_intp *strides, npy_bool *mask, npy_intp mask_stride, @@ -2603,8 +2535,8 @@ _strided_masked_wrapper_decref_transfer_function( /* Skip masked values, still calling decref for move_references */ mask = (npy_bool*)npy_memchr((char *)mask, 0, mask_stride, N, &subloopsize, 1); - if (d->decref_src.func(&d->decref_src.context, - &src, &subloopsize, &src_stride, d->decref_src.auxdata) < 0) { + if (d->decref_src.func(NULL, d->decref_src.descr, + src, subloopsize, src_stride, d->decref_src.auxdata) < 0) { return -1; } dst += subloopsize * dst_stride; @@ -2670,139 +2602,18 @@ _strided_masked_wrapper_transfer_function( } -/*************************** CLEAR SRC *******************************/ +/* A no-op function (currently only used for cleanup purposes really) */ static int -_dec_src_ref_nop( +_cast_no_op( PyArrayMethod_Context *NPY_UNUSED(context), char *const *NPY_UNUSED(args), const npy_intp *NPY_UNUSED(dimensions), const npy_intp *NPY_UNUSED(strides), NpyAuxData *NPY_UNUSED(auxdata)) { - /* NOP */ + /* Do nothing */ return 0; } -static int -_strided_to_null_dec_src_ref_reference( - PyArrayMethod_Context *NPY_UNUSED(context), - char *const *args, const npy_intp *dimensions, - const npy_intp *strides, NpyAuxData *NPY_UNUSED(auxdata)) -{ - char *src = args[0]; - npy_intp N = dimensions[0]; - npy_intp stride = strides[0]; - - PyObject *src_ref = NULL; - while (N > 0) { - /* Release the reference in src and set it to NULL */ - NPY_DT_DBG_REFTRACE("dec src ref (null dst)", src_ref); - memcpy(&src_ref, src, sizeof(src_ref)); - Py_XDECREF(src_ref); - memset(src, 0, sizeof(PyObject *)); - - src += stride; - --N; - } - return 0; -} - - -/* - * Get a function to decref. Currently, this uses a cast info slot, which - * means that the second (destination) descriptor is always set to NULL - * and generally does not have to be passed. - * Since we do not currently have an `ArrayMethod` representing this, the - * method is also set to NULL. 
- * - * TODO: this function should probably be moved onto the DType eventually, - * which would allow for user DTypes to include dynamic allocated - * memory or Python objects. - */ -static int -get_decref_transfer_function(int aligned, - npy_intp src_stride, - PyArray_Descr *src_dtype, - NPY_cast_info *cast_info, - int *out_needs_api) -{ - NPY_cast_info_init(cast_info); - - /* If there are no references, it's a nop */ - if (!PyDataType_REFCHK(src_dtype)) { - cast_info->func = &_dec_src_ref_nop; - cast_info->auxdata = NULL; - goto finalize; - } - /* If it's a single reference, it's one decref */ - else if (src_dtype->type_num == NPY_OBJECT) { - if (out_needs_api) { - *out_needs_api = 1; - } - - cast_info->func = &_strided_to_null_dec_src_ref_reference; - cast_info->auxdata = NULL; - goto finalize; - } - /* If there are subarrays, need to wrap it */ - else if (PyDataType_HASSUBARRAY(src_dtype)) { - PyArray_Dims src_shape = {NULL, -1}; - npy_intp src_size; - - if (out_needs_api) { - *out_needs_api = 1; - } - - if (!(PyArray_IntpConverter(src_dtype->subarray->shape, - &src_shape))) { - PyErr_SetString(PyExc_ValueError, - "invalid subarray shape"); - return NPY_FAIL; - } - src_size = PyArray_MultiplyList(src_shape.ptr, src_shape.len); - npy_free_cache_dim_obj(src_shape); - - NPY_ARRAYMETHOD_FLAGS ignored_flags; - if (get_n_to_n_transfer_function(aligned, - src_stride, 0, - src_dtype->subarray->base, NULL, 1, src_size, - &cast_info->func, &cast_info->auxdata, - &ignored_flags) != NPY_SUCCEED) { - return NPY_FAIL; - } - - goto finalize; - } - /* If there are fields, need to do each field */ - else if (PyDataType_HASFIELDS(src_dtype)) { - if (out_needs_api) { - *out_needs_api = 1; - } - - if (get_decref_fields_transfer_function(aligned, - src_stride, src_dtype, - &cast_info->func, &cast_info->auxdata, - out_needs_api) < 0) { - return NPY_FAIL; - } - goto finalize; - } - else { - PyErr_Format(PyExc_RuntimeError, - "Internal error, tried to fetch decref function for the " - "unsupported DType '%S'.", src_dtype); - return NPY_FAIL; - } - - finalize: - /* Make sure all important fields are either set or cleared */ - Py_INCREF(src_dtype); - cast_info->descriptors[0] = src_dtype; - cast_info->descriptors[1] = NULL; - cast_info->context.method = NULL; - cast_info->context.caller = NULL; - return NPY_SUCCEED; -} - /* * ********************* Generalized Multistep Cast ************************ @@ -3083,14 +2894,14 @@ _clear_cast_info_after_get_loop_failure(NPY_cast_info *cast_info) /* As public API we could choose to clear auxdata != NULL */ assert(cast_info->auxdata == NULL); /* Set func to be non-null so that `NPY_cast_info_xfree` does not skip */ - cast_info->func = &_dec_src_ref_nop; + cast_info->func = &_cast_no_op; NPY_cast_info_xfree(cast_info); } /* * Helper for PyArray_GetDTypeTransferFunction, which fetches a single - * transfer function from the each casting implementation (ArrayMethod). + * transfer function from each casting implementation (ArrayMethod). * May set the transfer function to NULL when the cast can be achieved using * a view. * TODO: Expand the view functionality for general offsets, not just 0: @@ -3120,6 +2931,8 @@ define_cast_for_descrs( int move_references, NPY_cast_info *cast_info, NPY_ARRAYMETHOD_FLAGS *out_flags) { + assert(dst_dtype != NULL); /* Was previously used for decref */ + /* Storage for all cast info in case multi-step casting is necessary */ _multistep_castdata castdata; /* Initialize funcs to NULL to simplify cleanup on error.
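+ * (Schematically the layout is: an optional "from" step, the actual cast,
+ *  and an optional "to" step; steps found to be unnecessary below are
+ *  marked with &_cast_no_op and freed again right away.)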
*/ @@ -3164,7 +2977,7 @@ define_cast_for_descrs( /* Prepare the actual cast (if necessary): */ if (from_view_offset == 0 && !must_wrap) { /* This step is not necessary and can be skipped */ - castdata.from.func = &_dec_src_ref_nop; /* avoid NULL */ + castdata.from.func = &_cast_no_op; /* avoid NULL */ NPY_cast_info_xfree(&castdata.from); } else { @@ -3203,7 +3016,7 @@ define_cast_for_descrs( /* Prepare the actual cast (if necessary): */ if (to_view_offset == 0 && !must_wrap) { /* This step is not necessary and can be skipped. */ - castdata.to.func = &_dec_src_ref_nop; /* avoid NULL */ + castdata.to.func = &_cast_no_op; /* avoid NULL */ NPY_cast_info_xfree(&castdata.to); } else { @@ -3279,33 +3092,6 @@ PyArray_GetDTypeTransferFunction(int aligned, NPY_cast_info *cast_info, NPY_ARRAYMETHOD_FLAGS *out_flags) { - assert(src_dtype != NULL); - - /* - * If one of the dtypes is NULL, we give back either a src decref - * function or a dst setzero function - * - * TODO: Eventually, we may wish to support user dtype with references - * (including and beyond bare `PyObject *` this may require extending - * the ArrayMethod API and those paths should likely be split out - * from this function.) - */ - if (dst_dtype == NULL) { - assert(move_references); - int needs_api = 0; - int res = get_decref_transfer_function(aligned, - src_dtype->elsize, - src_dtype, - cast_info, - &needs_api); - /* decref'ing never creates floating point errors, so just ignore it */ - *out_flags = PyArrayMethod_MINIMAL_FLAGS; - if (needs_api) { - *out_flags |= NPY_METH_REQUIRES_PYAPI; - } - return res; - } - if (define_cast_for_descrs(aligned, src_stride, dst_stride, src_dtype, dst_dtype, move_references, @@ -3563,20 +3349,19 @@ PyArray_GetMaskedDTypeTransferFunction(int aligned, /* If the src object will need a DECREF, get a function to handle that */ if (move_references && PyDataType_REFCHK(src_dtype)) { - *out_flags |= NPY_METH_REQUIRES_PYAPI; - if (get_decref_transfer_function(aligned, - src_stride, - src_dtype, - &data->decref_src, - NULL) != NPY_SUCCEED) { + NPY_ARRAYMETHOD_FLAGS clear_flags; + if (PyArray_GetClearFunction( + aligned, src_stride, src_dtype, + &data->decref_src, &clear_flags) < 0) { NPY_AUXDATA_FREE((NpyAuxData *)data); return NPY_FAIL; } + *out_flags = PyArrayMethod_COMBINED_FLAGS(*out_flags, clear_flags); cast_info->func = (PyArrayMethod_StridedLoop *) - &_strided_masked_wrapper_decref_transfer_function; + &_strided_masked_wrapper_clear_function; } else { - NPY_cast_info_init(&data->decref_src); + NPY_traverse_info_init(&data->decref_src); cast_info->func = (PyArrayMethod_StridedLoop *) &_strided_masked_wrapper_transfer_function; } diff --git a/numpy/core/src/multiarray/dtype_transfer.h b/numpy/core/src/multiarray/dtype_transfer.h index 6f7aa2837..04df5cb64 100644 --- a/numpy/core/src/multiarray/dtype_transfer.h +++ b/numpy/core/src/multiarray/dtype_transfer.h @@ -146,7 +146,6 @@ object_to_any_get_loop( NpyAuxData **out_transferdata, NPY_ARRAYMETHOD_FLAGS *flags); - NPY_NO_EXPORT int wrap_aligned_transferfunction( int aligned, int must_wrap, diff --git a/numpy/core/src/multiarray/dtype_traversal.c b/numpy/core/src/multiarray/dtype_traversal.c new file mode 100644 index 000000000..f76119f94 --- /dev/null +++ b/numpy/core/src/multiarray/dtype_traversal.c @@ -0,0 +1,651 @@ +/* + * This file is similar to the low-level loops for data type transfer + * in `dtype_transfer.c` but for those which only require visiting + * a single array (and mutating it in-place).
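+ *
+ * Every loop in this file implements the `traverse_loop_function`
+ * signature (shown here for reference as a sketch):
+ *
+ *     int traverse_loop(void *traverse_context, PyArray_Descr *descr,
+ *                       char *data, npy_intp size, npy_intp stride,
+ *                       NpyAuxData *auxdata);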
+ * + * As of writing, it is only used for CLEARing, which means mainly + * Python object DECREF/dealloc followed by NULL'ing the data + * (to support double clearing and ensure data is again in a usable state). + * However, memory initialization and traversal follow similar + * protocols (although traversal needs additional arguments). + */ + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#define _UMATHMODULE + +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#include <structmember.h> + +#include "numpy/ndarraytypes.h" +#include "numpy/arrayobject.h" + +#include "alloc.h" +#include "array_method.h" +#include "dtypemeta.h" +#include "dtype_traversal.h" + + +/* Buffer size with the same use case as the one in dtype_transfer.c */ +#define NPY_LOWLEVEL_BUFFER_BLOCKSIZE 128 + + +typedef int get_traverse_func_function( + void *traverse_context, PyArray_Descr *dtype, int aligned, + npy_intp stride, NPY_traverse_info *clear_info, + NPY_ARRAYMETHOD_FLAGS *flags); + +/* + * Generic Clear function helpers: + */ + +static int +get_clear_function( + void *traverse_context, PyArray_Descr *dtype, int aligned, + npy_intp stride, NPY_traverse_info *clear_info, + NPY_ARRAYMETHOD_FLAGS *flags) +{ + NPY_traverse_info_init(clear_info); + /* not that cleanup code bothers to check e.g. for floating point flags */ + *flags = PyArrayMethod_MINIMAL_FLAGS; + + get_traverse_loop_function *get_clear = NPY_DT_SLOTS(NPY_DTYPE(dtype))->get_clear_loop; + if (get_clear == NULL) { + PyErr_Format(PyExc_RuntimeError, + "Internal error, `get_clear_loop` not set for the DType '%S'", + dtype); + return -1; + } + + if (get_clear(traverse_context, dtype, aligned, stride, + &clear_info->func, &clear_info->auxdata, flags) < 0) { + /* callee should clean up, but make sure outside debug mode */ + assert(clear_info->func == NULL); + clear_info->func = NULL; + return -1; + } + Py_INCREF(dtype); + clear_info->descr = dtype; + + return 0; +} + +/* + * Helper to set up a strided loop used for clearing. Clearing means + * deallocating any references (e.g. via Py_DECREF) and resetting the data + * back into a usable/initialized state (e.g. by NULLing any references). + * + * The function will error when called on a dtype which does not have + * references (and thus has the get_clear_loop slot set to NULL). + * Note that old-style user-dtypes use the "void" version. + * + * NOTE: This function may have a use for a `traverse_context` at some point + * but right now, it is always NULL and only exists to allow adding it + * in the future without changing the strided-loop signature. + */ +NPY_NO_EXPORT int +PyArray_GetClearFunction( + int aligned, npy_intp stride, PyArray_Descr *dtype, + NPY_traverse_info *clear_info, NPY_ARRAYMETHOD_FLAGS *flags) +{ + return get_clear_function(NULL, dtype, aligned, stride, clear_info, flags); +} + + +/* + * Generic zerofill/fill function helper: + */ + +static int +get_zerofill_function( + void *traverse_context, PyArray_Descr *dtype, int aligned, + npy_intp stride, NPY_traverse_info *zerofill_info, + NPY_ARRAYMETHOD_FLAGS *flags) +{ + NPY_traverse_info_init(zerofill_info); + /* not that filling code bothers to check e.g.
for floating point flags */ + *flags = PyArrayMethod_MINIMAL_FLAGS; + + get_traverse_loop_function *get_zerofill = NPY_DT_SLOTS(NPY_DTYPE(dtype))->get_fill_zero_loop; + if (get_zerofill == NULL) { + /* Allowed to be NULL (and accept it here) */ + return 0; + } + + if (get_zerofill(traverse_context, dtype, aligned, stride, + &zerofill_info->func, &zerofill_info->auxdata, flags) < 0) { + /* callee should clean up, but make sure outside debug mode */ + assert(zerofill_info->func == NULL); + zerofill_info->func = NULL; + return -1; + } + if (zerofill_info->func == NULL) { + /* Zerofill also may return func=NULL without an error. */ + return 0; + } + + Py_INCREF(dtype); + zerofill_info->descr = dtype; + + return 0; +} + + +/****************** Python Object clear ***********************/ + +static int +clear_object_strided_loop( + void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr), + char *data, npy_intp size, npy_intp stride, + NpyAuxData *NPY_UNUSED(auxdata)) +{ + PyObject *aligned_copy = NULL; + while (size > 0) { + /* Release the reference in src and set it to NULL */ + memcpy(&aligned_copy, data, sizeof(PyObject *)); + Py_XDECREF(aligned_copy); + memset(data, 0, sizeof(PyObject *)); + + data += stride; + --size; + } + return 0; +} + + +NPY_NO_EXPORT int +npy_get_clear_object_strided_loop( + void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr), + int NPY_UNUSED(aligned), npy_intp NPY_UNUSED(fixed_stride), + traverse_loop_function **out_loop, NpyAuxData **out_auxdata, + NPY_ARRAYMETHOD_FLAGS *flags) +{ + *flags = NPY_METH_REQUIRES_PYAPI|NPY_METH_NO_FLOATINGPOINT_ERRORS; + *out_loop = &clear_object_strided_loop; + return 0; +} + + +/**************** Python Object zero fill *********************/ + +static int +fill_zero_object_strided_loop( + void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr), + char *data, npy_intp size, npy_intp stride, + NpyAuxData *NPY_UNUSED(auxdata)) +{ + PyObject *zero = PyLong_FromLong(0); + while (size--) { + Py_INCREF(zero); + // assumes `data` doesn't have a pre-existing object inside it + memcpy(data, &zero, sizeof(zero)); + data += stride; + } + Py_DECREF(zero); + return 0; +} + +NPY_NO_EXPORT int +npy_object_get_fill_zero_loop(void *NPY_UNUSED(traverse_context), + PyArray_Descr *NPY_UNUSED(descr), + int NPY_UNUSED(aligned), + npy_intp NPY_UNUSED(fixed_stride), + traverse_loop_function **out_loop, + NpyAuxData **NPY_UNUSED(out_auxdata), + NPY_ARRAYMETHOD_FLAGS *flags) +{ + *flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_NO_FLOATINGPOINT_ERRORS; + *out_loop = &fill_zero_object_strided_loop; + return 0; +} + +/**************** Structured DType generic functionality ***************/ + +/* + * Note that legacy user dtypes also make use of this. Someone managed to + * hack objects into them by adding a field that contains objects and this + * remains (somewhat) valid. + * (Unlike our voids, those fields must be hardcoded probably, but...) + * + * The below functionality mirrors the casting functionality relatively + * closely.
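+ *
+ * For orientation, consumers obtain and run a clear loop roughly like this
+ * (sketch with hypothetical variables, mirroring the call sites in
+ * dtype_transfer.c):
+ *
+ *     NPY_traverse_info clear_info;
+ *     NPY_ARRAYMETHOD_FLAGS flags;
+ *     if (PyArray_GetClearFunction(aligned, stride, descr,
+ *                                  &clear_info, &flags) < 0) {
+ *         return -1;
+ *     }
+ *     int res = clear_info.func(NULL, clear_info.descr,
+ *                               data, size, stride, clear_info.auxdata);
+ *     NPY_traverse_info_xfree(&clear_info);
+ *     return res;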
+ */ + +typedef struct { + npy_intp src_offset; + NPY_traverse_info info; +} single_field_traverse_data; + +typedef struct { + NpyAuxData base; + npy_intp field_count; + single_field_traverse_data fields[]; +} fields_traverse_data; + + +/* traverse data free function */ +static void +fields_traverse_data_free(NpyAuxData *data) +{ + fields_traverse_data *d = (fields_traverse_data *)data; + + for (npy_intp i = 0; i < d->field_count; ++i) { + NPY_traverse_info_xfree(&d->fields[i].info); + } + PyMem_Free(d); +} + + +/* traverse data copy function (untested due to no direct use currently) */ +static NpyAuxData * +fields_traverse_data_clone(NpyAuxData *data) +{ + fields_traverse_data *d = (fields_traverse_data *)data; + + npy_intp field_count = d->field_count; + npy_intp structsize = sizeof(fields_traverse_data) + + field_count * sizeof(single_field_traverse_data); + + /* Allocate the data and populate it */ + fields_traverse_data *newdata = PyMem_Malloc(structsize); + if (newdata == NULL) { + return NULL; + } + newdata->base = d->base; + newdata->field_count = 0; + + /* Copy all the fields transfer data */ + single_field_traverse_data *in_field = d->fields; + single_field_traverse_data *new_field = newdata->fields; + + for (; newdata->field_count < field_count; + newdata->field_count++, in_field++, new_field++) { + new_field->src_offset = in_field->src_offset; + + if (NPY_traverse_info_copy(&new_field->info, &in_field->info) < 0) { + fields_traverse_data_free((NpyAuxData *)newdata); + return NULL; + } + } + + return (NpyAuxData *)newdata; +} + + +static int +traverse_fields_function( + void *traverse_context, PyArray_Descr *NPY_UNUSED(descr), + char *data, npy_intp N, npy_intp stride, + NpyAuxData *auxdata) +{ + fields_traverse_data *d = (fields_traverse_data *)auxdata; + npy_intp i, field_count = d->field_count; + + /* Do the traversing a block at a time for better memory caching */ + const npy_intp blocksize = NPY_LOWLEVEL_BUFFER_BLOCKSIZE; + + for (;;) { + if (N > blocksize) { + for (i = 0; i < field_count; ++i) { + single_field_traverse_data field = d->fields[i]; + if (field.info.func(traverse_context, + field.info.descr, data + field.src_offset, + blocksize, stride, field.info.auxdata) < 0) { + return -1; + } + } + N -= blocksize; + data += blocksize * stride; + } + else { + for (i = 0; i < field_count; ++i) { + single_field_traverse_data field = d->fields[i]; + if (field.info.func(traverse_context, + field.info.descr, data + field.src_offset, + N, stride, field.info.auxdata) < 0) { + return -1; + } + } + return 0; + } + } +} + + +static int +get_fields_traverse_function( + void *traverse_context, PyArray_Descr *dtype, int NPY_UNUSED(aligned), + npy_intp stride, traverse_loop_function **out_func, + NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags, + get_traverse_func_function *get_traverse_func) +{ + PyObject *names, *key, *tup, *title; + PyArray_Descr *fld_dtype; + npy_int i, structsize; + Py_ssize_t field_count; + + names = dtype->names; + field_count = PyTuple_GET_SIZE(dtype->names); + + /* Over-allocating here: less fields may be used */ + structsize = (sizeof(fields_traverse_data) + + field_count * sizeof(single_field_traverse_data)); + /* Allocate the data and populate it */ + fields_traverse_data *data = PyMem_Malloc(structsize); + if (data == NULL) { + PyErr_NoMemory(); + return -1; + } + data->base.free = &fields_traverse_data_free; + data->base.clone = &fields_traverse_data_clone; + data->field_count = 0; + + single_field_traverse_data *field = data->fields; + for (i = 0; i 
< field_count; ++i) { + int offset; + + key = PyTuple_GET_ITEM(names, i); + tup = PyDict_GetItem(dtype->fields, key); + if (!PyArg_ParseTuple(tup, "Oi|O", &fld_dtype, &offset, &title)) { + NPY_AUXDATA_FREE((NpyAuxData *)data); + return -1; + } + if (get_traverse_func == &get_clear_function + && !PyDataType_REFCHK(fld_dtype)) { + /* No need to do clearing (could change to use NULL return) */ + continue; + } + NPY_ARRAYMETHOD_FLAGS clear_flags; + if (get_traverse_func( + traverse_context, fld_dtype, 0, + stride, &field->info, &clear_flags) < 0) { + NPY_AUXDATA_FREE((NpyAuxData *)data); + return -1; + } + if (field->info.func == NULL) { + /* zerofill allows NULL func as "default" memset to zero */ + continue; + } + *flags = PyArrayMethod_COMBINED_FLAGS(*flags, clear_flags); + field->src_offset = offset; + data->field_count++; + field++; + } + + *out_func = &traverse_fields_function; + *out_auxdata = (NpyAuxData *)data; + + return 0; +} + + +typedef struct { + NpyAuxData base; + npy_intp count; + NPY_traverse_info info; +} subarray_traverse_data; + + +/* traverse data free function */ +static void +subarray_traverse_data_free(NpyAuxData *data) +{ + subarray_traverse_data *d = (subarray_traverse_data *)data; + + NPY_traverse_info_xfree(&d->info); + PyMem_Free(d); +} + + +/* + * We seem to be neither using nor exposing this right now, so leave it NULL. + * (The implementation below should be functional.) + */ +#define subarray_traverse_data_clone NULL + +#ifndef subarray_traverse_data_clone +/* traverse data copy function */ +static NpyAuxData * +subarray_traverse_data_clone(NpyAuxData *data) +{ + subarray_traverse_data *d = (subarray_traverse_data *)data; + + /* Allocate the data and populate it */ + subarray_traverse_data *newdata = PyMem_Malloc(sizeof(subarray_traverse_data)); + if (newdata == NULL) { + return NULL; + } + newdata->base = d->base; + newdata->count = d->count; + + if (NPY_traverse_info_copy(&newdata->info, &d->info) < 0) { + PyMem_Free(newdata); + return NULL; + } + + return (NpyAuxData *)newdata; +} +#endif + + +static int +traverse_subarray_func( + void *traverse_context, PyArray_Descr *NPY_UNUSED(descr), + char *data, npy_intp N, npy_intp stride, + NpyAuxData *auxdata) +{ + subarray_traverse_data *subarr_data = (subarray_traverse_data *)auxdata; + + traverse_loop_function *func = subarr_data->info.func; + PyArray_Descr *sub_descr = subarr_data->info.descr; + npy_intp sub_N = subarr_data->count; + NpyAuxData *sub_auxdata = subarr_data->info.auxdata; + npy_intp sub_stride = sub_descr->elsize; + + while (N--) { + if (func(traverse_context, sub_descr, data, + sub_N, sub_stride, sub_auxdata) < 0) { + return -1; + } + data += stride; + } + return 0; +} + + +static int +get_subarray_traverse_func( + void *traverse_context, PyArray_Descr *dtype, int aligned, + npy_intp size, npy_intp stride, traverse_loop_function **out_func, + NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags, + get_traverse_func_function *get_traverse_func) +{ + subarray_traverse_data *auxdata = PyMem_Malloc(sizeof(subarray_traverse_data)); + if (auxdata == NULL) { + PyErr_NoMemory(); + return -1; + } + + auxdata->count = size; + auxdata->base.free = &subarray_traverse_data_free; + auxdata->base.clone = subarray_traverse_data_clone; + + if (get_traverse_func( + traverse_context, dtype, aligned, + dtype->elsize, &auxdata->info, flags) < 0) { + PyMem_Free(auxdata); + return -1; + } + if (auxdata->info.func == NULL) { + /* zerofill allows func to be NULL, in which case we need not do anything */
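+        /* A NULL *out_func tells the caller to use its default behaviour
+         * (for zerofill that is effectively a plain memset), so the
+         * auxdata prepared above is not needed either. */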
PyMem_Free(auxdata); + *out_func = NULL; + *out_auxdata = NULL; + return 0; + } + *out_func = &traverse_subarray_func; + *out_auxdata = (NpyAuxData *)auxdata; + + return 0; +} + + +static int +clear_no_op( + void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr), + char *NPY_UNUSED(data), npy_intp NPY_UNUSED(size), + npy_intp NPY_UNUSED(stride), NpyAuxData *NPY_UNUSED(auxdata)) +{ + return 0; +} + +NPY_NO_EXPORT int +npy_get_clear_void_and_legacy_user_dtype_loop( + void *traverse_context, PyArray_Descr *dtype, int aligned, + npy_intp stride, traverse_loop_function **out_func, + NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags) +{ + /* + * If there are no references, it's a nop. This path should not be hit + * but structured dtypes are tricky when a dtype which included references + * was sliced to not include any. + */ + if (!PyDataType_REFCHK(dtype)) { + *out_func = &clear_no_op; + return 0; + } + + if (PyDataType_HASSUBARRAY(dtype)) { + PyArray_Dims shape = {NULL, -1}; + npy_intp size; + + if (!(PyArray_IntpConverter(dtype->subarray->shape, &shape))) { + PyErr_SetString(PyExc_ValueError, + "invalid subarray shape"); + return -1; + } + size = PyArray_MultiplyList(shape.ptr, shape.len); + npy_free_cache_dim_obj(shape); + + if (get_subarray_traverse_func( + traverse_context, dtype->subarray->base, aligned, size, stride, + out_func, out_auxdata, flags, &get_clear_function) < 0) { + return -1; + } + + return 0; + } + /* If there are fields, need to do each field */ + else if (PyDataType_HASFIELDS(dtype)) { + if (get_fields_traverse_function( + traverse_context, dtype, aligned, stride, + out_func, out_auxdata, flags, &get_clear_function) < 0) { + return -1; + } + return 0; + } + else if (dtype->type_num == NPY_VOID) { + /* + * Void dtypes can have "ghosts" of objects marking the dtype because + * holes (or the raw bytes if fields are gone) may include objects. + * Paths that need those flags should probably be considered incorrect. + * But as long as this can happen (a V8 that indicates references) + * we need to make it a no-op here. + */ + *out_func = &clear_no_op; + return 0; + } + + PyErr_Format(PyExc_RuntimeError, + "Internal error, tried to fetch clear function for the " + "user dtype '%S' without fields or subarray (legacy support).", + dtype); + return -1; +} + +/**************** Structured DType zero fill ***************/ + + +static int +zerofill_fields_function( + void *traverse_context, PyArray_Descr *descr, + char *data, npy_intp N, npy_intp stride, + NpyAuxData *auxdata) +{ + npy_intp itemsize = descr->elsize; + + /* + * TODO: We could optimize this by chunking, but since we currently memset + * each element always, just loop manually. + */ + while (N--) { + memset(data, 0, itemsize); + if (traverse_fields_function( + traverse_context, descr, data, 1, stride, auxdata) < 0) { + return -1; + } + data +=stride; + } + return 0; +} + +/* + * Similar to other (e.g. clear) traversal loop getter, but unlike it, we + * do need to take care of zeroing out everything (in principle not gaps). + * So we add a memset before calling the actual traverse function for the + * structured path. 
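+ * Per element this amounts to (schematically):
+ *
+ *     memset(data, 0, descr->elsize);  // zero everything, including gaps
+ *     traverse_fields_function(traverse_context, descr,
+ *                              data, 1, stride, auxdata);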
+ */ +NPY_NO_EXPORT int +npy_get_zerofill_void_and_legacy_user_dtype_loop( + void *traverse_context, PyArray_Descr *dtype, int aligned, + npy_intp stride, traverse_loop_function **out_func, + NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags) +{ + if (PyDataType_HASSUBARRAY(dtype)) { + PyArray_Dims shape = {NULL, -1}; + npy_intp size; + + if (!(PyArray_IntpConverter(dtype->subarray->shape, &shape))) { + PyErr_SetString(PyExc_ValueError, + "invalid subarray shape"); + return -1; + } + size = PyArray_MultiplyList(shape.ptr, shape.len); + npy_free_cache_dim_obj(shape); + + if (get_subarray_traverse_func( + traverse_context, dtype->subarray->base, aligned, size, stride, + out_func, out_auxdata, flags, &get_zerofill_function) < 0) { + return -1; + } + + return 0; + } + /* If there are fields, need to do each field */ + else if (PyDataType_HASFIELDS(dtype)) { + if (get_fields_traverse_function( + traverse_context, dtype, aligned, stride, + out_func, out_auxdata, flags, &get_zerofill_function) < 0) { + return -1; + } + if (((fields_traverse_data *)*out_auxdata)->field_count == 0) { + /* If there are no fields, just return NULL for zerofill */ + NPY_AUXDATA_FREE(*out_auxdata); + *out_auxdata = NULL; + *out_func = NULL; + return 0; + } + /* + * Traversal skips fields that have no custom zeroing, so we need + * to take care of it. + */ + *out_func = &zerofill_fields_function; + return 0; + } + + /* Otherwise, assume there is nothing to do (user dtypes reach here) */ + *out_auxdata = NULL; + *out_func = NULL; + return 0; +} diff --git a/numpy/core/src/multiarray/dtype_traversal.h b/numpy/core/src/multiarray/dtype_traversal.h new file mode 100644 index 000000000..fc12c0f7b --- /dev/null +++ b/numpy/core/src/multiarray/dtype_traversal.h @@ -0,0 +1,98 @@ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_DTYPE_TRAVERSAL_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_DTYPE_TRAVERSAL_H_ + +#include "array_method.h" + +/* NumPy DType clear (object DECREF + NULLing) implementations */ + +NPY_NO_EXPORT int +npy_get_clear_object_strided_loop( + void *traverse_context, PyArray_Descr *descr, int aligned, + npy_intp fixed_stride, + traverse_loop_function **out_loop, NpyAuxData **out_traversedata, + NPY_ARRAYMETHOD_FLAGS *flags); + +NPY_NO_EXPORT int +npy_get_clear_void_and_legacy_user_dtype_loop( + void *traverse_context, PyArray_Descr *descr, int aligned, + npy_intp fixed_stride, + traverse_loop_function **out_loop, NpyAuxData **out_traversedata, + NPY_ARRAYMETHOD_FLAGS *flags); + +/* NumPy DType zero-filling implementations */ + +NPY_NO_EXPORT int +npy_object_get_fill_zero_loop( + void *NPY_UNUSED(traverse_context), PyArray_Descr *NPY_UNUSED(descr), + int NPY_UNUSED(aligned), npy_intp NPY_UNUSED(fixed_stride), + traverse_loop_function **out_loop, NpyAuxData **NPY_UNUSED(out_auxdata), + NPY_ARRAYMETHOD_FLAGS *flags); + +NPY_NO_EXPORT int +npy_get_zerofill_void_and_legacy_user_dtype_loop( + void *traverse_context, PyArray_Descr *dtype, int aligned, + npy_intp stride, traverse_loop_function **out_func, + NpyAuxData **out_auxdata, NPY_ARRAYMETHOD_FLAGS *flags); + + +/* Helper to deal with calling or nesting simple strided loops */ + +typedef struct { + traverse_loop_function *func; + NpyAuxData *auxdata; + PyArray_Descr *descr; +} NPY_traverse_info; + + +static inline void +NPY_traverse_info_init(NPY_traverse_info *cast_info) +{ + cast_info->func = NULL; /* mark as uninitialized. */ + cast_info->auxdata = NULL; /* allow leaving auxdata untouched */ + cast_info->descr = NULL; /* mark as uninitialized. 
*/ +} + + +static inline void +NPY_traverse_info_xfree(NPY_traverse_info *traverse_info) +{ + if (traverse_info->func == NULL) { + return; + } + traverse_info->func = NULL; + NPY_AUXDATA_FREE(traverse_info->auxdata); + Py_XDECREF(traverse_info->descr); +} + + +static inline int +NPY_traverse_info_copy( + NPY_traverse_info *traverse_info, NPY_traverse_info *original) +{ + traverse_info->func = NULL; + if (original->func == NULL) { + /* Allow copying also of unused clear info */ + return 0; + } + traverse_info->auxdata = NULL; + if (original->auxdata != NULL) { + traverse_info->auxdata = NPY_AUXDATA_CLONE(original->auxdata); + if (traverse_info->auxdata == NULL) { + return -1; + } + } + Py_INCREF(original->descr); + traverse_info->descr = original->descr; + traverse_info->func = original->func; + + return 0; +} + + +NPY_NO_EXPORT int +PyArray_GetClearFunction( + int aligned, npy_intp stride, PyArray_Descr *dtype, + NPY_traverse_info *clear_info, NPY_ARRAYMETHOD_FLAGS *flags); + + +#endif /* NUMPY_CORE_SRC_MULTIARRAY_DTYPE_TRAVERSAL_H_ */ diff --git a/numpy/core/src/multiarray/dtypemeta.c b/numpy/core/src/multiarray/dtypemeta.c index edc07bc92..9c1f384d8 100644 --- a/numpy/core/src/multiarray/dtypemeta.c +++ b/numpy/core/src/multiarray/dtypemeta.c @@ -9,7 +9,9 @@ #include <numpy/ndarraytypes.h> #include <numpy/arrayscalars.h> #include "npy_pycompat.h" +#include "npy_import.h" +#include "arraytypes.h" #include "common.h" #include "dtypemeta.h" #include "descriptor.h" @@ -18,6 +20,10 @@ #include "scalartypes.h" #include "convert_datatype.h" #include "usertypes.h" +#include "conversion_utils.h" +#include "templ_common.h" +#include "refcount.h" +#include "dtype_traversal.h" #include <assert.h> @@ -122,6 +128,50 @@ legacy_dtype_default_new(PyArray_DTypeMeta *self, return (PyObject *)self->singleton; } +static PyObject * +string_unicode_new(PyArray_DTypeMeta *self, PyObject *args, PyObject *kwargs) +{ + npy_intp size; + + static char *kwlist[] = {"", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O&", kwlist, + PyArray_IntpFromPyIntConverter, &size)) { + return NULL; + } + + if (size < 0) { + PyErr_Format(PyExc_ValueError, + "Strings cannot have a negative size but a size of " + "%"NPY_INTP_FMT" was given", size); + return NULL; + } + + PyArray_Descr *res = PyArray_DescrNewFromType(self->type_num); + + if (res == NULL) { + return NULL; + } + + if (self->type_num == NPY_UNICODE) { + // unicode strings are 4 bytes per character + if (npy_mul_sizes_with_overflow(&size, size, 4)) { + PyErr_SetString( + PyExc_TypeError, + "Strings too large to store inside array."); + return NULL; + } + } + + if (size > NPY_MAX_INT) { + PyErr_SetString(PyExc_TypeError, + "Strings too large to store inside array."); + return NULL; + } + + res->elsize = (int)size; + return (PyObject *)res; +} static PyArray_Descr * nonparametric_discover_descr_from_pyobject( @@ -151,7 +201,7 @@ string_discover_descr_from_pyobject( } if (itemsize > NPY_MAX_INT) { PyErr_SetString(PyExc_TypeError, - "string to large to store inside array."); + "string too large to store inside array."); } PyArray_Descr *res = PyArray_DescrNewFromType(cls->type_num); if (res == NULL) { @@ -478,6 +528,7 @@ void_common_instance(PyArray_Descr *descr1, PyArray_Descr *descr2) return NULL; } + NPY_NO_EXPORT int python_builtins_are_known_scalar_types( PyArray_DTypeMeta *NPY_UNUSED(cls), PyTypeObject *pytype) @@ -618,7 +669,7 @@ static PyArray_DTypeMeta * datetime_common_dtype(PyArray_DTypeMeta *cls, PyArray_DTypeMeta *other) { /* - * Timedelta/datetime 
shouldn't actuall promote at all. That they + * Timedelta/datetime shouldn't actually promote at all. That they * currently do means that we need additional hacks in the comparison * type resolver. For comparisons we have to make sure we reject it * nicely in order to return an array of True/False values. @@ -677,12 +728,17 @@ object_common_dtype( * be a HeapType and its instances should be exact PyArray_Descr structs. * * @param descr The descriptor that should be wrapped. - * @param name The name for the DType, if NULL the type character is used. + * @param name The name for the DType. + * @param alias A second name which is also set to the new class for builtins + * (i.e. `np.types.LongDType` for `np.types.Int64DType`). + * Some may have more aliases (e.g. `intp` is not its own type, + * only an alias); as of writing this, those are not added here. * * @returns 0 on success, -1 on failure. */ NPY_NO_EXPORT int -dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr) +dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr, + const char *name, const char *alias) { int has_type_set = Py_TYPE(descr) == &PyArrayDescr_Type; @@ -709,47 +765,14 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr) return -1; } - /* - * Note: we have no intention of freeing the memory again since this - * behaves identically to static type definition (see comment above). - * This is seems cleaner for the legacy API, in the new API both static - * and heap types are possible (some difficulty arises from the fact that - * these are instances of DTypeMeta and not type). - * In particular our own DTypes can be true static declarations. - * However, this function remains necessary for legacy user dtypes. - */ - - const char *scalar_name = descr->typeobj->tp_name; - /* - * We have to take only the name, and ignore the module to get - * a reasonable __name__, since static types are limited in this regard - * (this is not ideal, but not a big issue in practice). - * This is what Python does to print __name__ for static types.
- */ - const char *dot = strrchr(scalar_name, '.'); - if (dot) { - scalar_name = dot + 1; - } - Py_ssize_t name_length = strlen(scalar_name) + 14; - - char *tp_name = PyMem_Malloc(name_length); - if (tp_name == NULL) { - PyErr_NoMemory(); - return -1; - } - - snprintf(tp_name, name_length, "numpy.dtype[%s]", scalar_name); - NPY_DType_Slots *dt_slots = PyMem_Malloc(sizeof(NPY_DType_Slots)); if (dt_slots == NULL) { - PyMem_Free(tp_name); return -1; } memset(dt_slots, '\0', sizeof(NPY_DType_Slots)); PyArray_DTypeMeta *dtype_class = PyMem_Malloc(sizeof(PyArray_DTypeMeta)); if (dtype_class == NULL) { - PyMem_Free(tp_name); PyMem_Free(dt_slots); return -1; } @@ -771,13 +794,19 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr) .tp_flags = Py_TPFLAGS_DEFAULT, .tp_base = &PyArrayDescr_Type, .tp_new = (newfunc)legacy_dtype_default_new, + .tp_doc = ( + "DType class corresponding to the scalar type and dtype of " + "the same name.\n\n" + "Please see `numpy.dtype` for the typical way to create\n" + "dtype instances and :ref:`arrays.dtypes` for additional\n" + "information."), },}, .flags = NPY_DT_LEGACY, /* Further fields are not common between DTypes */ }; memcpy(dtype_class, &prototype, sizeof(PyArray_DTypeMeta)); /* Fix name of the Type*/ - ((PyTypeObject *)dtype_class)->tp_name = tp_name; + ((PyTypeObject *)dtype_class)->tp_name = name; dtype_class->dt_slots = dt_slots; /* Let python finish the initialization (probably unnecessary) */ @@ -809,6 +838,7 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr) dt_slots->common_dtype = default_builtin_common_dtype; dt_slots->common_instance = NULL; dt_slots->ensure_canonical = ensure_native_byteorder; + dt_slots->get_fill_zero_loop = NULL; if (PyTypeNum_ISSIGNED(dtype_class->type_num)) { /* Convert our scalars (raise on too large unsigned and NaN, etc.) 
*/ @@ -820,6 +850,8 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr) } else if (descr->type_num == NPY_OBJECT) { dt_slots->common_dtype = object_common_dtype; + dt_slots->get_fill_zero_loop = npy_object_get_fill_zero_loop; + dt_slots->get_clear_loop = npy_get_clear_object_strided_loop; } else if (PyTypeNum_ISDATETIME(descr->type_num)) { /* Datetimes are flexible, but were not considered previously */ @@ -841,6 +873,10 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr) void_discover_descr_from_pyobject); dt_slots->common_instance = void_common_instance; dt_slots->ensure_canonical = void_ensure_canonical; + dt_slots->get_fill_zero_loop = + npy_get_zerofill_void_and_legacy_user_dtype_loop; + dt_slots->get_clear_loop = + npy_get_clear_void_and_legacy_user_dtype_loop; } else { dt_slots->default_descr = string_and_unicode_default_descr; @@ -849,9 +885,14 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr) string_discover_descr_from_pyobject); dt_slots->common_dtype = string_unicode_common_dtype; dt_slots->common_instance = string_unicode_common_instance; + ((PyTypeObject*)dtype_class)->tp_new = (newfunc)string_unicode_new; } } + if (PyTypeNum_ISNUMBER(descr->type_num)) { + dtype_class->flags |= NPY_DT_NUMERIC; + } + if (_PyArray_MapPyTypeToDType(dtype_class, descr->typeobj, PyTypeNum_ISUSERDEF(dtype_class->type_num)) < 0) { Py_DECREF(dtype_class); @@ -861,6 +902,21 @@ dtypemeta_wrap_legacy_descriptor(PyArray_Descr *descr) /* Finally, replace the current class of the descr */ Py_SET_TYPE(descr, (PyTypeObject *)dtype_class); + /* Add it to the types submodule if it is a builtin dtype */ + if (!PyTypeNum_ISUSERDEF(descr->type_num)) { + static PyObject *add_dtype_helper = NULL; + npy_cache_import("numpy.dtypes", "_add_dtype_helper", &add_dtype_helper); + if (add_dtype_helper == NULL) { + return -1; + } + + if (PyObject_CallFunction( + add_dtype_helper, + "Os", (PyObject *)dtype_class, alias) == NULL) { + return -1; + } + } + return 0; } @@ -871,22 +927,35 @@ dtypemeta_get_abstract(PyArray_DTypeMeta *self) { } static PyObject * +dtypemeta_get_legacy(PyArray_DTypeMeta *self) { + return PyBool_FromLong(NPY_DT_is_legacy(self)); +} + +static PyObject * dtypemeta_get_parametric(PyArray_DTypeMeta *self) { return PyBool_FromLong(NPY_DT_is_parametric(self)); } +static PyObject * +dtypemeta_get_is_numeric(PyArray_DTypeMeta *self) { + return PyBool_FromLong(NPY_DT_is_numeric(self)); +} + /* * Simple exposed information, defined for each DType (class).
*/ static PyGetSetDef dtypemeta_getset[] = { {"_abstract", (getter)dtypemeta_get_abstract, NULL, NULL, NULL}, + {"_legacy", (getter)dtypemeta_get_legacy, NULL, NULL, NULL}, {"_parametric", (getter)dtypemeta_get_parametric, NULL, NULL, NULL}, + {"_is_numeric", (getter)dtypemeta_get_is_numeric, NULL, NULL, NULL}, {NULL, NULL, NULL, NULL, NULL} }; static PyMemberDef dtypemeta_members[] = { {"type", - T_OBJECT, offsetof(PyArray_DTypeMeta, scalar_type), READONLY, NULL}, + T_OBJECT, offsetof(PyArray_DTypeMeta, scalar_type), READONLY, + "scalar type corresponding to the DType."}, {NULL, 0, 0, 0, NULL}, }; diff --git a/numpy/core/src/multiarray/dtypemeta.h b/numpy/core/src/multiarray/dtypemeta.h index 5df333584..32212228c 100644 --- a/numpy/core/src/multiarray/dtypemeta.h +++ b/numpy/core/src/multiarray/dtypemeta.h @@ -1,44 +1,18 @@ #ifndef NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_ #define NUMPY_CORE_SRC_MULTIARRAY_DTYPEMETA_H_ +#include "array_method.h" +#include "dtype_traversal.h" + #ifdef __cplusplus extern "C" { #endif -/* DType flags, currently private, since we may just expose functions */ -#define NPY_DT_LEGACY 1 << 0 -#define NPY_DT_ABSTRACT 1 << 1 -#define NPY_DT_PARAMETRIC 1 << 2 - - -typedef PyArray_Descr *(discover_descr_from_pyobject_function)( - PyArray_DTypeMeta *cls, PyObject *obj); - -/* - * Before making this public, we should decide whether it should pass - * the type, or allow looking at the object. A possible use-case: - * `np.array(np.array([0]), dtype=np.ndarray)` - * Could consider arrays that are not `dtype=ndarray` "scalars". - */ -typedef int (is_known_scalar_type_function)( - PyArray_DTypeMeta *cls, PyTypeObject *obj); - -typedef PyArray_Descr *(default_descr_function)(PyArray_DTypeMeta *cls); -typedef PyArray_DTypeMeta *(common_dtype_function)( - PyArray_DTypeMeta *dtype1, PyArray_DTypeMeta *dtype2); -typedef PyArray_Descr *(common_instance_function)( - PyArray_Descr *dtype1, PyArray_Descr *dtype2); -typedef PyArray_Descr *(ensure_canonical_function)(PyArray_Descr *dtype); +#include "numpy/_dtype_api.h" -/* - * TODO: These two functions are currently only used for experimental DType - * API support. Their relation should be "reversed": NumPy should - * always use them internally. - * There are open points about "casting safety" though, e.g. setting - * elements is currently always unsafe. - */ -typedef int(setitemfunction)(PyArray_Descr *, PyObject *, char *); -typedef PyObject *(getitemfunction)(PyArray_Descr *, char *); +/* DType flags, currently private, since we may just expose functions + Other publicly visible flags are in _dtype_api.h */ +#define NPY_DT_LEGACY 1 << 0 typedef struct { @@ -55,10 +29,40 @@ typedef struct { setitemfunction *setitem; getitemfunction *getitem; /* + * Either NULL or fetches a clearing function. Clearing means deallocating + * any referenced data and setting it to a safe state. For Python objects + * this means using `Py_CLEAR` which is equivalent to `Py_DECREF` and + * setting the `PyObject *` to NULL. + * After the clear, the data must be fillable via cast/copy and calling + * clear a second time must be safe. + * If the DType class does not implement `get_clear_loop`, setting + * NPY_ITEM_REFCOUNT on its dtype instances is invalid. Note that it is + * acceptable for NPY_ITEM_REFCOUNT to indicate references that are not + * Python objects. + */ + get_traverse_loop_function *get_clear_loop; + /* + Either NULL or a function that sets a function pointer to a traversal + loop that fills an array with zero values appropriate for the dtype.
If + get_fill_zero_loop is undefined or the function pointer set by it is + NULL, the array buffer is allocated with calloc. If this function is + defined and it sets a non-NULL function pointer, the array buffer is + allocated with malloc and the zero-filling loop function pointer is + called to fill the buffer. For the best performance, avoid using this + function if a zero-filled array buffer allocated with calloc makes sense + for the dtype. + + Note that this is currently used only for zero-filling a newly allocated + array. While it can be used to zero-fill an already-filled buffer, that + will not work correctly for arrays holding references. If you need to do + that, clear the array first. + */ + get_traverse_loop_function *get_fill_zero_loop; + /* * The casting implementation (ArrayMethod) to convert between two * instances of this DType, stored explicitly for fast access: */ - PyObject *within_dtype_castingimpl; + PyArrayMethodObject *within_dtype_castingimpl; /* * Dictionary of ArrayMethods representing most possible casts * (structured and object are exceptions). @@ -74,6 +78,13 @@ typedef struct { PyArray_ArrFuncs f; } NPY_DType_Slots; +// This must be updated if new slots before within_dtype_castingimpl +// are added +#define NPY_NUM_DTYPE_SLOTS 10 +#define NPY_NUM_DTYPE_PYARRAY_ARRFUNCS_SLOTS 22 +#define NPY_DT_MAX_ARRFUNCS_SLOT \ + NPY_NUM_DTYPE_PYARRAY_ARRFUNCS_SLOTS + _NPY_DT_ARRFUNCS_OFFSET + #define NPY_DTYPE(descr) ((PyArray_DTypeMeta *)Py_TYPE(descr)) #define NPY_DT_SLOTS(dtype) ((NPY_DType_Slots *)(dtype)->dt_slots) @@ -81,6 +92,8 @@ typedef struct { #define NPY_DT_is_legacy(dtype) (((dtype)->flags & NPY_DT_LEGACY) != 0) #define NPY_DT_is_abstract(dtype) (((dtype)->flags & NPY_DT_ABSTRACT) != 0) #define NPY_DT_is_parametric(dtype) (((dtype)->flags & NPY_DT_PARAMETRIC) != 0) +#define NPY_DT_is_numeric(dtype) (((dtype)->flags & NPY_DT_NUMERIC) != 0) +#define NPY_DT_is_user_defined(dtype) (((dtype)->type_num == -1)) /* * Macros for convenient classmethod calls, since these require @@ -127,7 +140,8 @@ python_builtins_are_known_scalar_types( PyArray_DTypeMeta *cls, PyTypeObject *pytype); NPY_NO_EXPORT int -dtypemeta_wrap_legacy_descriptor(PyArray_Descr *dtypem); +dtypemeta_wrap_legacy_descriptor( + PyArray_Descr *dtypem, const char *name, const char *alias); #ifdef __cplusplus } diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src index cd1a58982..0b3b3fd8c 100644 --- a/numpy/core/src/multiarray/einsum.c.src +++ b/numpy/core/src/multiarray/einsum.c.src @@ -17,6 +17,7 @@ #include <numpy/npy_common.h> #include <numpy/arrayobject.h> #include <npy_pycompat.h> +#include <array_assign.h> //PyArray_AssignRawScalar #include <ctype.h> @@ -321,8 +322,7 @@ get_single_op_view(PyArrayObject *op, char *labels, Py_TYPE(op), PyArray_DESCR(op), ndim_output, new_dims, new_strides, PyArray_DATA(op), PyArray_ISWRITEABLE(op) ? NPY_ARRAY_WRITEABLE : 0, - (PyObject *)op, (PyObject *)op, - 0, 0); + (PyObject *)op, (PyObject *)op, 0); if (*ret == NULL) { return -1; @@ -532,10 +532,17 @@ unbuffered_loop_nop1_ndim2(NpyIter *iter) * Since the iterator wasn't tracking coordinates, the * loop provided by the iterator is in Fortran-order. 
*/ - NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]); + int needs_api = NpyIter_IterationNeedsAPI(iter); + if (!needs_api) { + NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]); + } for (coord = shape[1]; coord > 0; --coord) { sop(1, ptrs[0], strides[0], shape[0]); + if (needs_api && PyErr_Occurred()) { + return -1; + } + ptr = ptrs[1][0] + strides[1][0]; ptrs[0][0] = ptrs[1][0] = ptr; ptr = ptrs[1][1] + strides[1][1]; @@ -586,11 +593,18 @@ unbuffered_loop_nop1_ndim3(NpyIter *iter) * Since the iterator wasn't tracking coordinates, the * loop provided by the iterator is in Fortran-order. */ - NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]); + int needs_api = NpyIter_IterationNeedsAPI(iter); + if (!needs_api) { + NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]); + } for (coords[1] = shape[2]; coords[1] > 0; --coords[1]) { for (coords[0] = shape[1]; coords[0] > 0; --coords[0]) { sop(1, ptrs[0], strides[0], shape[0]); + if (needs_api && PyErr_Occurred()) { + return -1; + } + ptr = ptrs[1][0] + strides[1][0]; ptrs[0][0] = ptrs[1][0] = ptr; ptr = ptrs[1][1] + strides[1][1]; @@ -643,10 +657,17 @@ unbuffered_loop_nop2_ndim2(NpyIter *iter) * Since the iterator wasn't tracking coordinates, the * loop provided by the iterator is in Fortran-order. */ - NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]); + int needs_api = NpyIter_IterationNeedsAPI(iter); + if (!needs_api) { + NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]); + } for (coord = shape[1]; coord > 0; --coord) { sop(2, ptrs[0], strides[0], shape[0]); + if (needs_api && PyErr_Occurred()) { + return -1; + } + ptr = ptrs[1][0] + strides[1][0]; ptrs[0][0] = ptrs[1][0] = ptr; ptr = ptrs[1][1] + strides[1][1]; @@ -699,11 +720,18 @@ unbuffered_loop_nop2_ndim3(NpyIter *iter) * Since the iterator wasn't tracking coordinates, the * loop provided by the iterator is in Fortran-order.
*/ - NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]); + int needs_api = NpyIter_IterationNeedsAPI(iter); + if (!needs_api) { + NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]); + } for (coords[1] = shape[2]; coords[1] > 0; --coords[1]) { for (coords[0] = shape[1]; coords[0] > 0; --coords[0]) { sop(2, ptrs[0], strides[0], shape[0]); + if (needs_api && PyErr_Occurred()) { + return -1; + } + ptr = ptrs[1][0] + strides[1][0]; ptrs[0][0] = ptrs[1][0] = ptr; ptr = ptrs[1][1] + strides[1][1]; @@ -1025,7 +1053,7 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop, goto fail; } - /* Initialize the output to all zeros */ + /* Initialize the output to all zeros or None */ ret = NpyIter_GetOperandArray(iter)[nop]; if (PyArray_AssignZero(ret, NULL) < 0) { goto fail; diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src index e7b2f2c2c..5f2ccf2dc 100644 --- a/numpy/core/src/multiarray/einsum_sumprod.c.src +++ b/numpy/core/src/multiarray/einsum_sumprod.c.src @@ -1026,6 +1026,57 @@ bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr, /**end repeat**/ +/**begin repeat + * #fn_name = + * object_sum_of_products_any, + * object_sum_of_products_one, + * object_sum_of_products_two, + * object_sum_of_products_three, + * object_sum_of_products_contig_any, + * object_sum_of_products_contig_one, + * object_sum_of_products_contig_two, + * object_sum_of_products_contig_three, + * object_sum_of_products_outstride0_any, + * object_sum_of_products_outstride0_one, + * object_sum_of_products_outstride0_two, + * object_sum_of_products_outstride0_three# + */ +static void +@fn_name@(int nop, char **dataptr, + npy_intp const *strides, npy_intp count) +{ + while (count--) { + PyObject *prod = *(PyObject **)dataptr[0]; + if (!prod) { + prod = Py_None; // convention is to treat nulls as None + } + Py_INCREF(prod); + for (int i = 1; i < nop; ++i) { + PyObject *curr = *(PyObject **)dataptr[i]; + if (!curr) { + curr = Py_None; // convention is to treat nulls as None + } + Py_SETREF(prod, PyNumber_Multiply(prod, curr)); + if (!prod) { + return; + } + } + + PyObject *sum = PyNumber_Add(*(PyObject **)dataptr[nop], prod); + Py_DECREF(prod); + if (!sum) { + return; + } + + Py_XDECREF(*(PyObject **)dataptr[nop]); + *(PyObject **)dataptr[nop] = sum; + for (int i = 0; i <= nop; ++i) { + dataptr[i] += strides[i]; + } + } +} +/**end repeat**/ + /* These tables need to match up with the type enum */ static sum_of_products_fn _contig_outstride0_unary_specialization_table[NPY_NTYPES] = { @@ -1116,7 +1167,7 @@ static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = { * 1, 1, * 1, 1, 1, * 1, 1, 1, - * 0, 0, 0, 0, + * 1, 0, 0, 0, * 0, 0, 1# */ #if @use@ @@ -1152,7 +1203,7 @@ static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = { * 1, 1, * 1, 1, 1, * 1, 1, 1, - * 0, 0, 0, 0, + * 1, 0, 0, 0, * 0, 0, 1# */ #if @use@ @@ -1188,7 +1239,7 @@ static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = { * 1, 1, * 1, 1, 1, * 1, 1, 1, - * 0, 0, 0, 0, + * 1, 0, 0, 0, * 0, 0, 1# */ #if @use@ diff --git a/numpy/core/src/multiarray/experimental_public_dtype_api.c b/numpy/core/src/multiarray/experimental_public_dtype_api.c index 84507b481..c0da0a7b7 100644 --- a/numpy/core/src/multiarray/experimental_public_dtype_api.c +++ b/numpy/core/src/multiarray/experimental_public_dtype_api.c @@ -16,19 +16,6 @@ #include "common_dtype.h" -#define EXPERIMENTAL_DTYPE_API_VERSION 5 - - -typedef struct{ - PyTypeObject *typeobj; /* type of python
scalar or NULL */ - int flags; /* flags, including parametric and abstract */ - /* NULL terminated cast definitions. Use NULL for the newly created DType */ - PyArrayMethod_Spec **casts; - PyType_Slot *slots; -} PyArrayDTypeMeta_Spec; - - - static PyArray_DTypeMeta * dtype_does_not_promote( PyArray_DTypeMeta *NPY_UNUSED(self), PyArray_DTypeMeta *NPY_UNUSED(other)) @@ -116,10 +103,6 @@ PyArray_ArrFuncs default_funcs = { }; -/* other slots are in order, so keep only last around: */ -#define NUM_DTYPE_SLOTS 8 - - int PyArrayInitDTypeMeta_FromSpec( PyArray_DTypeMeta *DType, PyArrayDTypeMeta_Spec *spec) @@ -155,10 +138,12 @@ PyArrayInitDTypeMeta_FromSpec( } /* Check and handle flags: */ - if (spec->flags & ~(NPY_DT_PARAMETRIC|NPY_DT_ABSTRACT)) { + int allowed_flags = NPY_DT_PARAMETRIC | NPY_DT_ABSTRACT | NPY_DT_NUMERIC; + if (spec->flags & ~(allowed_flags)) { PyErr_SetString(PyExc_RuntimeError, - "invalid DType flags specified, only parametric and abstract " - "are valid flags for user DTypes."); + "invalid DType flags specified, only NPY_DT_PARAMETRIC, " + "NPY_DT_ABSTRACT, and NPY_DT_NUMERIC are valid flags for " + "user DTypes."); return -1; } @@ -178,6 +163,8 @@ PyArrayInitDTypeMeta_FromSpec( NPY_DT_SLOTS(DType)->common_instance = NULL; NPY_DT_SLOTS(DType)->setitem = NULL; NPY_DT_SLOTS(DType)->getitem = NULL; + NPY_DT_SLOTS(DType)->get_clear_loop = NULL; + NPY_DT_SLOTS(DType)->f = default_funcs; PyType_Slot *spec_slot = spec->slots; while (1) { @@ -187,20 +174,100 @@ PyArrayInitDTypeMeta_FromSpec( if (slot == 0) { break; } - if (slot > NUM_DTYPE_SLOTS || slot < 0) { + if ((slot < 0) || + ((slot > NPY_NUM_DTYPE_SLOTS) && + (slot <= _NPY_DT_ARRFUNCS_OFFSET)) || + (slot > NPY_DT_MAX_ARRFUNCS_SLOT)) { PyErr_Format(PyExc_RuntimeError, "Invalid slot with value %d passed in.", slot); return -1; } /* - * It is up to the user to get this right, and slots are sorted - * exactly like they are stored right now: + * It is up to the user to get this right, the slots in the public API + * are sorted exactly like they are stored in the NPY_DT_Slots struct + * right now: */ - void **current = (void **)(&( - NPY_DT_SLOTS(DType)->discover_descr_from_pyobject)); - current += slot - 1; - *current = pfunc; + if (slot <= NPY_NUM_DTYPE_SLOTS) { + // slot > NPY_NUM_DTYPE_SLOTS are PyArray_ArrFuncs + void **current = (void **)(&( + NPY_DT_SLOTS(DType)->discover_descr_from_pyobject)); + current += slot - 1; + *current = pfunc; + } + else { + int f_slot = slot - _NPY_DT_ARRFUNCS_OFFSET; + if (1 <= f_slot && f_slot <= NPY_NUM_DTYPE_PYARRAY_ARRFUNCS_SLOTS) { + switch (f_slot) { + case 1: + NPY_DT_SLOTS(DType)->f.getitem = pfunc; + break; + case 2: + NPY_DT_SLOTS(DType)->f.setitem = pfunc; + break; + case 3: + NPY_DT_SLOTS(DType)->f.copyswapn = pfunc; + break; + case 4: + NPY_DT_SLOTS(DType)->f.copyswap = pfunc; + break; + case 5: + NPY_DT_SLOTS(DType)->f.compare = pfunc; + break; + case 6: + NPY_DT_SLOTS(DType)->f.argmax = pfunc; + break; + case 7: + NPY_DT_SLOTS(DType)->f.dotfunc = pfunc; + break; + case 8: + NPY_DT_SLOTS(DType)->f.scanfunc = pfunc; + break; + case 9: + NPY_DT_SLOTS(DType)->f.fromstr = pfunc; + break; + case 10: + NPY_DT_SLOTS(DType)->f.nonzero = pfunc; + break; + case 11: + NPY_DT_SLOTS(DType)->f.fill = pfunc; + break; + case 12: + NPY_DT_SLOTS(DType)->f.fillwithscalar = pfunc; + break; + case 13: + *NPY_DT_SLOTS(DType)->f.sort = pfunc; + break; + case 14: + *NPY_DT_SLOTS(DType)->f.argsort = pfunc; + break; + case 15: + case 16: + case 17: + case 18: + case 19: + case 20: + case 21: + PyErr_Format( + 
PyExc_RuntimeError, + "PyArray_ArrFunc casting slot with value %d is disabled.", + f_slot + ); + return -1; + case 22: + NPY_DT_SLOTS(DType)->f.argmin = pfunc; + break; + } + } else { + PyErr_Format( + PyExc_RuntimeError, + "Invalid PyArray_ArrFunc slot with value %d passed in.", + f_slot + ); + return -1; + } + } } + if (NPY_DT_SLOTS(DType)->setitem == NULL || NPY_DT_SLOTS(DType)->getitem == NULL) { PyErr_SetString(PyExc_RuntimeError, @@ -229,7 +296,7 @@ PyArrayInitDTypeMeta_FromSpec( return -1; } } - NPY_DT_SLOTS(DType)->f = default_funcs; + /* invalid type num. Ideally, we get away with it! */ DType->type_num = -1; @@ -443,12 +510,12 @@ _get_experimental_dtype_api(PyObject *NPY_UNUSED(mod), PyObject *arg) if (error_converting(version)) { return NULL; } - if (version != EXPERIMENTAL_DTYPE_API_VERSION) { + if (version != __EXPERIMENTAL_DTYPE_API_VERSION) { PyErr_Format(PyExc_RuntimeError, "Experimental DType API version %d requested, but NumPy " "is exporting version %d. Recompile your DType and/or upgrade " "NumPy to match.", - version, EXPERIMENTAL_DTYPE_API_VERSION); + version, __EXPERIMENTAL_DTYPE_API_VERSION); return NULL; } diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c index ab35548ed..7022eb231 100644 --- a/numpy/core/src/multiarray/getset.c +++ b/numpy/core/src/multiarray/getset.c @@ -797,20 +797,17 @@ array_imag_get(PyArrayObject *self, void *NPY_UNUSED(ignored)) } else { Py_INCREF(PyArray_DESCR(self)); - ret = (PyArrayObject *)PyArray_NewFromDescr(Py_TYPE(self), - PyArray_DESCR(self), - PyArray_NDIM(self), - PyArray_DIMS(self), - NULL, NULL, - PyArray_ISFORTRAN(self), - (PyObject *)self); + ret = (PyArrayObject *)PyArray_NewFromDescr_int( + Py_TYPE(self), + PyArray_DESCR(self), + PyArray_NDIM(self), + PyArray_DIMS(self), + NULL, NULL, + PyArray_ISFORTRAN(self), + (PyObject *)self, NULL, _NPY_ARRAY_ZEROED); if (ret == NULL) { return NULL; } - if (_zerofill(ret) < 0) { - Py_DECREF(ret); - return NULL; - } PyArray_CLEARFLAGS(ret, NPY_ARRAY_WRITEABLE); } return (PyObject *) ret; diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index e5331b2b4..f42ae7c2d 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -17,10 +17,12 @@ #include "multiarraymodule.h" #include "common.h" +#include "dtype_transfer.h" #include "arrayobject.h" #include "ctors.h" #include "lowlevel_strided_loops.h" #include "array_assign.h" +#include "refcount.h" #include "npy_sort.h" #include "npy_partition.h" @@ -39,7 +41,26 @@ npy_fasttake_impl( PyArray_Descr *dtype, int axis) { NPY_BEGIN_THREADS_DEF; - NPY_BEGIN_THREADS_DESCR(dtype); + + NPY_cast_info cast_info; + NPY_ARRAYMETHOD_FLAGS flags; + NPY_cast_info_init(&cast_info); + + if (!needs_refcounting) { + /* if "refcounting" is not needed memcpy is safe for a simple copy */ + NPY_BEGIN_THREADS; + } + else { + if (PyArray_GetDTypeTransferFunction( + 1, itemsize, itemsize, dtype, dtype, 0, + &cast_info, &flags) < 0) { + return -1; + } + if (!(flags & NPY_METH_REQUIRES_PYAPI)) { + NPY_BEGIN_THREADS; + } + } + switch (clipmode) { case NPY_RAISE: for (npy_intp i = 0; i < n; i++) { @@ -47,22 +68,23 @@ npy_fasttake_impl( npy_intp tmp = indices[j]; if (check_and_adjust_index(&tmp, max_item, axis, _save) < 0) { - return -1; + goto fail; } char *tmp_src = src + tmp * chunk; if (needs_refcounting) { - for (npy_intp k = 0; k < nelem; k++) { - PyArray_Item_INCREF(tmp_src, dtype); - PyArray_Item_XDECREF(dest, dtype); - memmove(dest, 
tmp_src, itemsize); - dest += itemsize; - tmp_src += itemsize; + char *data[2] = {tmp_src, dest}; + npy_intp strides[2] = {itemsize, itemsize}; + if (cast_info.func( + &cast_info.context, data, &nelem, strides, + cast_info.auxdata) < 0) { + NPY_END_THREADS; + goto fail; } } else { - memmove(dest, tmp_src, chunk); - dest += chunk; + memcpy(dest, tmp_src, chunk); } + dest += chunk; } src += chunk*max_item; } @@ -83,18 +105,19 @@ npy_fasttake_impl( } char *tmp_src = src + tmp * chunk; if (needs_refcounting) { - for (npy_intp k = 0; k < nelem; k++) { - PyArray_Item_INCREF(tmp_src, dtype); - PyArray_Item_XDECREF(dest, dtype); - memmove(dest, tmp_src, itemsize); - dest += itemsize; - tmp_src += itemsize; + char *data[2] = {tmp_src, dest}; + npy_intp strides[2] = {itemsize, itemsize}; + if (cast_info.func( + &cast_info.context, data, &nelem, strides, + cast_info.auxdata) < 0) { + NPY_END_THREADS; + goto fail; } } else { - memmove(dest, tmp_src, chunk); - dest += chunk; + memcpy(dest, tmp_src, chunk); } + dest += chunk; } src += chunk*max_item; } @@ -111,18 +134,19 @@ npy_fasttake_impl( } char *tmp_src = src + tmp * chunk; if (needs_refcounting) { - for (npy_intp k = 0; k < nelem; k++) { - PyArray_Item_INCREF(tmp_src, dtype); - PyArray_Item_XDECREF(dest, dtype); - memmove(dest, tmp_src, itemsize); - dest += itemsize; - tmp_src += itemsize; + char *data[2] = {tmp_src, dest}; + npy_intp strides[2] = {itemsize, itemsize}; + if (cast_info.func( + &cast_info.context, data, &nelem, strides, + cast_info.auxdata) < 0) { + NPY_END_THREADS; + goto fail; } } else { - memmove(dest, tmp_src, chunk); - dest += chunk; + memcpy(dest, tmp_src, chunk); } + dest += chunk; } src += chunk*max_item; } @@ -130,7 +154,13 @@ npy_fasttake_impl( } NPY_END_THREADS; + NPY_cast_info_xfree(&cast_info); return 0; + + fail: + /* NPY_END_THREADS already ensured. 
*/ + NPY_cast_info_xfree(&cast_info); + return -1; +} @@ -322,11 +352,17 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0, NPY_CLIPMODE clipmode) { PyArrayObject *indices, *values; - npy_intp i, chunk, ni, max_item, nv, tmp; + npy_intp i, itemsize, ni, max_item, nv, tmp; char *src, *dest; int copied = 0; int overlap = 0; + NPY_BEGIN_THREADS_DEF; + NPY_cast_info cast_info; + NPY_ARRAYMETHOD_FLAGS flags; + + NPY_cast_info_init(&cast_info); + indices = NULL; values = NULL; if (!PyArray_Check(self)) { @@ -372,25 +408,51 @@ } max_item = PyArray_SIZE(self); dest = PyArray_DATA(self); - chunk = PyArray_DESCR(self)->elsize; + itemsize = PyArray_DESCR(self)->elsize; + + int has_references = PyDataType_REFCHK(PyArray_DESCR(self)); + + if (!has_references) { + /* if there are no references, memcpy is safe for a simple copy */ + NPY_BEGIN_THREADS_THRESHOLDED(ni); + } + else { + PyArray_Descr *dtype = PyArray_DESCR(self); + if (PyArray_GetDTypeTransferFunction( + PyArray_ISALIGNED(self), itemsize, itemsize, dtype, dtype, 0, + &cast_info, &flags) < 0) { + goto fail; + } + if (!(flags & NPY_METH_REQUIRES_PYAPI)) { + NPY_BEGIN_THREADS_THRESHOLDED(ni); + } + } + + + if (has_references) { + const npy_intp one = 1; + const npy_intp strides[2] = {itemsize, itemsize}; - if (PyDataType_REFCHK(PyArray_DESCR(self))) { switch(clipmode) { case NPY_RAISE: for (i = 0; i < ni; i++) { - src = PyArray_BYTES(values) + chunk*(i % nv); + src = PyArray_BYTES(values) + itemsize*(i % nv); tmp = ((npy_intp *)(PyArray_DATA(indices)))[i]; - if (check_and_adjust_index(&tmp, max_item, 0, NULL) < 0) { + if (check_and_adjust_index(&tmp, max_item, 0, _save) < 0) { + goto fail; + } + char *data[2] = {src, dest + tmp*itemsize}; + if (cast_info.func( + &cast_info.context, data, &one, strides, + cast_info.auxdata) < 0) { + NPY_END_THREADS; goto fail; } - PyArray_Item_INCREF(src, PyArray_DESCR(self)); - PyArray_Item_XDECREF(dest+tmp*chunk, PyArray_DESCR(self)); - memmove(dest + tmp*chunk, src, chunk); } break; case NPY_WRAP: for (i = 0; i < ni; i++) { - src = PyArray_BYTES(values) + chunk * (i % nv); + src = PyArray_BYTES(values) + itemsize * (i % nv); tmp = ((npy_intp *)(PyArray_DATA(indices)))[i]; if (tmp < 0) { while (tmp < 0) { @@ -402,14 +464,18 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0, tmp -= max_item; } } - PyArray_Item_INCREF(src, PyArray_DESCR(self)); - PyArray_Item_XDECREF(dest+tmp*chunk, PyArray_DESCR(self)); - memmove(dest + tmp * chunk, src, chunk); + char *data[2] = {src, dest + tmp*itemsize}; + if (cast_info.func( + &cast_info.context, data, &one, strides, + cast_info.auxdata) < 0) { + NPY_END_THREADS; + goto fail; + } } break; case NPY_CLIP: for (i = 0; i < ni; i++) { - src = PyArray_BYTES(values) + chunk * (i % nv); + src = PyArray_BYTES(values) + itemsize * (i % nv); tmp = ((npy_intp *)(PyArray_DATA(indices)))[i]; if (tmp < 0) { tmp = 0; @@ -417,30 +483,32 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0, else if (tmp >= max_item) { tmp = max_item - 1; } - PyArray_Item_INCREF(src, PyArray_DESCR(self)); - PyArray_Item_XDECREF(dest+tmp*chunk, PyArray_DESCR(self)); - memmove(dest + tmp * chunk, src, chunk); + char *data[2] = {src, dest + tmp*itemsize}; + if (cast_info.func( + &cast_info.context, data, &one, strides, + cast_info.auxdata) < 0) { + NPY_END_THREADS; + goto fail; + } } break; } } else { - NPY_BEGIN_THREADS_DEF; - NPY_BEGIN_THREADS_THRESHOLDED(ni);
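The hunks above replace the old PyArray_Item_INCREF/PyArray_Item_XDECREF + memmove sequence with a dtype transfer loop whenever the dtype holds references. The same pattern recurs in the take hunks above and in putmask, repeat, and the sort rewrites below, so a minimal stand-alone sketch may help; `descr`, `src`, `dst`, and `n` are placeholder names for this illustration, not identifiers from the patch:

    /* Copy n elements between same-dtype buffers, honoring references. */
    NPY_cast_info cast_info;
    NPY_ARRAYMETHOD_FLAGS flags;
    NPY_cast_info_init(&cast_info);
    npy_intp itemsize = descr->elsize;
    /* aligned, same dtype on both sides, no reference "move" (last arg 0) */
    if (PyArray_GetDTypeTransferFunction(
            1, itemsize, itemsize, descr, descr, 0, &cast_info, &flags) < 0) {
        return -1;
    }
    char *data[2] = {src, dst};
    npy_intp strides[2] = {itemsize, itemsize};
    if (cast_info.func(&cast_info.context, data, &n, strides,
                       cast_info.auxdata) < 0) {
        NPY_cast_info_xfree(&cast_info);
        return -1;
    }
    NPY_cast_info_xfree(&cast_info);
    return 0;

As in the hunks above, threading is only enabled when the returned flags do not include NPY_METH_REQUIRES_PYAPI.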
switch(clipmode) { case NPY_RAISE: for (i = 0; i < ni; i++) { - src = PyArray_BYTES(values) + chunk * (i % nv); + src = PyArray_BYTES(values) + itemsize * (i % nv); tmp = ((npy_intp *)(PyArray_DATA(indices)))[i]; if (check_and_adjust_index(&tmp, max_item, 0, _save) < 0) { goto fail; } - memmove(dest + tmp * chunk, src, chunk); + memmove(dest + tmp * itemsize, src, itemsize); } break; case NPY_WRAP: for (i = 0; i < ni; i++) { - src = PyArray_BYTES(values) + chunk * (i % nv); + src = PyArray_BYTES(values) + itemsize * (i % nv); tmp = ((npy_intp *)(PyArray_DATA(indices)))[i]; if (tmp < 0) { while (tmp < 0) { @@ -452,12 +520,12 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0, tmp -= max_item; } } - memmove(dest + tmp * chunk, src, chunk); + memmove(dest + tmp * itemsize, src, itemsize); } break; case NPY_CLIP: for (i = 0; i < ni; i++) { - src = PyArray_BYTES(values) + chunk * (i % nv); + src = PyArray_BYTES(values) + itemsize * (i % nv); tmp = ((npy_intp *)(PyArray_DATA(indices)))[i]; if (tmp < 0) { tmp = 0; @@ -465,14 +533,16 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0, else if (tmp >= max_item) { tmp = max_item - 1; } - memmove(dest + tmp * chunk, src, chunk); + memmove(dest + tmp * itemsize, src, itemsize); } break; } - NPY_END_THREADS; } + NPY_END_THREADS; finish: + NPY_cast_info_xfree(&cast_info); + Py_XDECREF(values); Py_XDECREF(indices); if (copied) { @@ -482,6 +552,8 @@ PyArray_PutTo(PyArrayObject *self, PyObject* values0, PyObject *indices0, Py_RETURN_NONE; fail: + NPY_cast_info_xfree(&cast_info); + Py_XDECREF(indices); Py_XDECREF(values); if (copied) { @@ -562,11 +634,12 @@ PyArray_PutMask(PyArrayObject *self, PyObject* values0, PyObject* mask0) { PyArrayObject *mask, *values; PyArray_Descr *dtype; - npy_intp chunk, ni, nv; + npy_intp itemsize, ni, nv; char *src, *dest; npy_bool *mask_data; int copied = 0; int overlap = 0; + NPY_BEGIN_THREADS_DEF; mask = NULL; values = NULL; @@ -627,31 +700,49 @@ PyArray_PutMask(PyArrayObject *self, PyObject* values0, PyObject* mask0) self = obj; } - chunk = PyArray_DESCR(self)->elsize; + itemsize = PyArray_DESCR(self)->elsize; dest = PyArray_DATA(self); if (PyDataType_REFCHK(PyArray_DESCR(self))) { + NPY_cast_info cast_info; + NPY_ARRAYMETHOD_FLAGS flags; + const npy_intp one = 1; + const npy_intp strides[2] = {itemsize, itemsize}; + + NPY_cast_info_init(&cast_info); + if (PyArray_GetDTypeTransferFunction( + PyArray_ISALIGNED(self), itemsize, itemsize, dtype, dtype, 0, + &cast_info, &flags) < 0) { + goto fail; + } + if (!(flags & NPY_METH_REQUIRES_PYAPI)) { + NPY_BEGIN_THREADS; + } + for (npy_intp i = 0, j = 0; i < ni; i++, j++) { if (j >= nv) { j = 0; } if (mask_data[i]) { - char *src_ptr = src + j*chunk; - char *dest_ptr = dest + i*chunk; - - PyArray_Item_INCREF(src_ptr, PyArray_DESCR(self)); - PyArray_Item_XDECREF(dest_ptr, PyArray_DESCR(self)); - memmove(dest_ptr, src_ptr, chunk); + char *data[2] = {src + j*itemsize, dest + i*itemsize}; + if (cast_info.func( + &cast_info.context, data, &one, strides, + cast_info.auxdata) < 0) { + NPY_END_THREADS; + NPY_cast_info_xfree(&cast_info); + goto fail; + } } } + NPY_cast_info_xfree(&cast_info); } else { - NPY_BEGIN_THREADS_DEF; - NPY_BEGIN_THREADS_DESCR(PyArray_DESCR(self)); - npy_fastputmask(dest, src, mask_data, ni, nv, chunk); - NPY_END_THREADS; + NPY_BEGIN_THREADS; + npy_fastputmask(dest, src, mask_data, ni, nv, itemsize); } + NPY_END_THREADS; + Py_XDECREF(values); Py_XDECREF(mask); if (copied) { @@ -670,6 +761,81 @@ 
PyArray_PutMask(PyArrayObject *self, PyObject* values0, PyObject* mask0) return NULL; } +static NPY_GCC_OPT_3 inline int +npy_fastrepeat_impl( + npy_intp n_outer, npy_intp n, npy_intp nel, npy_intp chunk, + npy_bool broadcast, npy_intp* counts, char* new_data, char* old_data, + npy_intp elsize, NPY_cast_info cast_info, int needs_refcounting) +{ + npy_intp i, j, k; + for (i = 0; i < n_outer; i++) { + for (j = 0; j < n; j++) { + npy_intp tmp = broadcast ? counts[0] : counts[j]; + for (k = 0; k < tmp; k++) { + if (!needs_refcounting) { + memcpy(new_data, old_data, chunk); + } + else { + char *data[2] = {old_data, new_data}; + npy_intp strides[2] = {elsize, elsize}; + if (cast_info.func(&cast_info.context, data, &nel, + strides, cast_info.auxdata) < 0) { + return -1; + } + } + new_data += chunk; + } + old_data += chunk; + } + } + return 0; +} + +static NPY_GCC_OPT_3 int +npy_fastrepeat( + npy_intp n_outer, npy_intp n, npy_intp nel, npy_intp chunk, + npy_bool broadcast, npy_intp* counts, char* new_data, char* old_data, + npy_intp elsize, NPY_cast_info cast_info, int needs_refcounting) +{ + if (!needs_refcounting) { + if (chunk == 1) { + return npy_fastrepeat_impl( + n_outer, n, nel, chunk, broadcast, counts, new_data, old_data, + elsize, cast_info, needs_refcounting); + } + if (chunk == 2) { + return npy_fastrepeat_impl( + n_outer, n, nel, chunk, broadcast, counts, new_data, old_data, + elsize, cast_info, needs_refcounting); + } + if (chunk == 4) { + return npy_fastrepeat_impl( + n_outer, n, nel, chunk, broadcast, counts, new_data, old_data, + elsize, cast_info, needs_refcounting); + } + if (chunk == 8) { + return npy_fastrepeat_impl( + n_outer, n, nel, chunk, broadcast, counts, new_data, old_data, + elsize, cast_info, needs_refcounting); + } + if (chunk == 16) { + return npy_fastrepeat_impl( + n_outer, n, nel, chunk, broadcast, counts, new_data, old_data, + elsize, cast_info, needs_refcounting); + } + if (chunk == 32) { + return npy_fastrepeat_impl( + n_outer, n, nel, chunk, broadcast, counts, new_data, old_data, + elsize, cast_info, needs_refcounting); + } + } + + return npy_fastrepeat_impl( + n_outer, n, nel, chunk, broadcast, counts, new_data, old_data, elsize, + cast_info, needs_refcounting); +} + + /*NUMPY_API * Repeat the array. 
*/ @@ -677,13 +843,16 @@ NPY_NO_EXPORT PyObject * PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis) { npy_intp *counts; - npy_intp n, n_outer, i, j, k, chunk; + npy_intp i, j, n, n_outer, chunk, elsize, nel; npy_intp total = 0; npy_bool broadcast = NPY_FALSE; PyArrayObject *repeats = NULL; PyObject *ap = NULL; PyArrayObject *ret = NULL; char *new_data, *old_data; + NPY_cast_info cast_info; + NPY_ARRAYMETHOD_FLAGS flags; + int needs_refcounting; repeats = (PyArrayObject *)PyArray_ContiguousFromAny(op, NPY_INTP, 0, 1); if (repeats == NULL) { @@ -707,6 +876,8 @@ PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis) aop = (PyArrayObject *)ap; n = PyArray_DIM(aop, axis); + NPY_cast_info_init(&cast_info); + needs_refcounting = PyDataType_REFCHK(PyArray_DESCR(aop)); if (!broadcast && PyArray_SIZE(repeats) != n) { PyErr_Format(PyExc_ValueError, @@ -744,35 +915,41 @@ PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis) new_data = PyArray_DATA(ret); old_data = PyArray_DATA(aop); - chunk = PyArray_DESCR(aop)->elsize; + nel = 1; + elsize = PyArray_DESCR(aop)->elsize; for(i = axis + 1; i < PyArray_NDIM(aop); i++) { - chunk *= PyArray_DIMS(aop)[i]; + nel *= PyArray_DIMS(aop)[i]; } + chunk = nel*elsize; n_outer = 1; for (i = 0; i < axis; i++) { n_outer *= PyArray_DIMS(aop)[i]; } - for (i = 0; i < n_outer; i++) { - for (j = 0; j < n; j++) { - npy_intp tmp = broadcast ? counts[0] : counts[j]; - for (k = 0; k < tmp; k++) { - memcpy(new_data, old_data, chunk); - new_data += chunk; - } - old_data += chunk; + + if (needs_refcounting) { + if (PyArray_GetDTypeTransferFunction( + 1, elsize, elsize, PyArray_DESCR(aop), PyArray_DESCR(aop), 0, + &cast_info, &flags) < 0) { + goto fail; } } + if (npy_fastrepeat(n_outer, n, nel, chunk, broadcast, counts, new_data, + old_data, elsize, cast_info, needs_refcounting) < 0) { + goto fail; + } + Py_DECREF(repeats); - PyArray_INCREF(ret); Py_XDECREF(aop); + NPY_cast_info_xfree(&cast_info); return (PyObject *)ret; fail: Py_DECREF(repeats); Py_XDECREF(aop); Py_XDECREF(ret); + NPY_cast_info_xfree(&cast_info); return NULL; } @@ -945,10 +1122,10 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort, npy_intp elsize = (npy_intp)PyArray_ITEMSIZE(op); npy_intp astride = PyArray_STRIDE(op, axis); int swap = PyArray_ISBYTESWAPPED(op); - int needcopy = !IsAligned(op) || swap || astride != elsize; - int hasrefs = PyDataType_REFCHK(PyArray_DESCR(op)); + int is_aligned = IsAligned(op); + int needcopy = !is_aligned || swap || astride != elsize; + int needs_api = PyDataType_FLAGCHK(PyArray_DESCR(op), NPY_NEEDS_PYAPI); - PyArray_CopySwapNFunc *copyswapn = PyArray_DESCR(op)->f->copyswapn; char *buffer = NULL; PyArrayIterObject *it; @@ -956,6 +1133,12 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort, int ret = 0; + PyArray_Descr *descr = PyArray_DESCR(op); + PyArray_Descr *odescr = NULL; + + NPY_cast_info to_cast_info = {.func = NULL}; + NPY_cast_info from_cast_info = {.func = NULL}; + NPY_BEGIN_THREADS_DEF; /* Check if there is any sorting to do */ @@ -980,32 +1163,48 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort, ret = -1; goto fail; } + if (PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT)) { + memset(buffer, 0, N * elsize); + } + + if (swap) { + odescr = PyArray_DescrNewByteorder(descr, NPY_SWAP); + } + else { + odescr = descr; + Py_INCREF(odescr); + } + + NPY_ARRAYMETHOD_FLAGS to_transfer_flags; + + if (PyArray_GetDTypeTransferFunction( + is_aligned, astride, elsize, descr, odescr, 0, &to_cast_info, + &to_transfer_flags) 
!= NPY_SUCCEED) { + goto fail; + } + + NPY_ARRAYMETHOD_FLAGS from_transfer_flags; + + if (PyArray_GetDTypeTransferFunction( + is_aligned, elsize, astride, odescr, descr, 0, &from_cast_info, + &from_transfer_flags) != NPY_SUCCEED) { + goto fail; + } } - NPY_BEGIN_THREADS_DESCR(PyArray_DESCR(op)); + NPY_BEGIN_THREADS_DESCR(descr); while (size--) { char *bufptr = it->dataptr; if (needcopy) { - if (hasrefs) { - /* - * For dtype's with objects, copyswapn Py_XINCREF's src - * and Py_XDECREF's dst. This would crash if called on - * an uninitialized buffer, or leak a reference to each - * object if initialized. - * - * So, first do the copy with no refcounting... - */ - _unaligned_strided_byte_copy(buffer, elsize, - it->dataptr, astride, N, elsize); - /* ...then swap in-place if needed */ - if (swap) { - copyswapn(buffer, elsize, NULL, 0, N, swap, op); - } - } - else { - copyswapn(buffer, elsize, it->dataptr, astride, N, swap, op); + char *args[2] = {it->dataptr, buffer}; + npy_intp strides[2] = {astride, elsize}; + + if (NPY_UNLIKELY(to_cast_info.func( + &to_cast_info.context, args, &N, strides, + to_cast_info.auxdata) < 0)) { + goto fail; } bufptr = buffer; } @@ -1019,7 +1218,7 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort, if (part == NULL) { ret = sort(bufptr, N, op); - if (hasrefs && PyErr_Occurred()) { + if (needs_api && PyErr_Occurred()) { ret = -1; } if (ret < 0) { @@ -1032,7 +1231,7 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort, npy_intp i; for (i = 0; i < nkth; ++i) { ret = part(bufptr, N, kth[i], pivots, &npiv, op); - if (hasrefs && PyErr_Occurred()) { + if (needs_api && PyErr_Occurred()) { ret = -1; } if (ret < 0) { @@ -1042,15 +1241,13 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort, } if (needcopy) { - if (hasrefs) { - if (swap) { - copyswapn(buffer, elsize, NULL, 0, N, swap, op); - } - _unaligned_strided_byte_copy(it->dataptr, astride, - buffer, elsize, N, elsize); - } - else { - copyswapn(it->dataptr, astride, buffer, elsize, N, swap, op); + char *args[2] = {buffer, it->dataptr}; + npy_intp strides[2] = {elsize, astride}; + + if (NPY_UNLIKELY(from_cast_info.func( + &from_cast_info.context, args, &N, strides, + from_cast_info.auxdata) < 0)) { + goto fail; } } @@ -1058,15 +1255,21 @@ _new_sortlike(PyArrayObject *op, int axis, PyArray_SortFunc *sort, } fail: - NPY_END_THREADS_DESCR(PyArray_DESCR(op)); + NPY_END_THREADS_DESCR(descr); /* cleanup internal buffer */ - PyDataMem_UserFREE(buffer, N * elsize, mem_handler); + if (needcopy) { + PyArray_ClearBuffer(odescr, buffer, elsize, N, 1); + PyDataMem_UserFREE(buffer, N * elsize, mem_handler); + Py_DECREF(odescr); + } if (ret < 0 && !PyErr_Occurred()) { /* Out of memory during sorting or buffer creation */ PyErr_NoMemory(); } Py_DECREF(it); Py_DECREF(mem_handler); + NPY_cast_info_xfree(&to_cast_info); + NPY_cast_info_xfree(&from_cast_info); return ret; } @@ -1080,11 +1283,11 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort, npy_intp elsize = (npy_intp)PyArray_ITEMSIZE(op); npy_intp astride = PyArray_STRIDE(op, axis); int swap = PyArray_ISBYTESWAPPED(op); - int needcopy = !IsAligned(op) || swap || astride != elsize; - int hasrefs = PyDataType_REFCHK(PyArray_DESCR(op)); + int is_aligned = IsAligned(op); + int needcopy = !is_aligned || swap || astride != elsize; + int needs_api = PyDataType_FLAGCHK(PyArray_DESCR(op), NPY_NEEDS_PYAPI); int needidxbuffer; - PyArray_CopySwapNFunc *copyswapn = PyArray_DESCR(op)->f->copyswapn; char *valbuffer = NULL; 
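The _new_sortlike rewrite above drops copyswapn in favor of a pair of dtype transfer loops (array to scratch buffer, buffer back to array), and _new_argsortlike below follows the same scheme with a single direction. A condensed sketch of the buffer preparation it performs; `descr`, `buffer`, `N`, `elsize`, `astride`, `swap`, and `is_aligned` stand in for the surrounding locals:

    /* Clearing loops may run on the buffer, so it must start zeroed. */
    if (PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT)) {
        memset(buffer, 0, N * elsize);
    }
    /* Sort native-byte-order data: byte-swapped input goes through a
     * swapped descriptor, so the transfer loop performs the swap. */
    PyArray_Descr *odescr;
    if (swap) {
        odescr = PyArray_DescrNewByteorder(descr, NPY_SWAP);
    }
    else {
        odescr = descr;
        Py_INCREF(odescr);
    }
    /* strided array elements -> contiguous buffer elements */
    NPY_cast_info to_cast_info = {.func = NULL};
    NPY_ARRAYMETHOD_FLAGS to_flags;
    if (PyArray_GetDTypeTransferFunction(
            is_aligned, astride, elsize, descr, odescr, 0,
            &to_cast_info, &to_flags) != NPY_SUCCEED) {
        return -1;
    }

The mirror-image function (buffer back to the array) is fetched the same way with the strides and descriptors exchanged, and the buffer is cleared with PyArray_ClearBuffer before being freed.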
npy_intp *idxbuffer = NULL; @@ -1096,6 +1299,12 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort, int ret = 0; + PyArray_Descr *descr = PyArray_DESCR(op); + PyArray_Descr *odescr = NULL; + + NPY_ARRAYMETHOD_FLAGS transfer_flags; + NPY_cast_info cast_info = {.func = NULL}; + NPY_BEGIN_THREADS_DEF; PyObject *mem_handler = PyDataMem_GetHandler(); @@ -1134,6 +1343,23 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort, ret = -1; goto fail; } + if (PyDataType_FLAGCHK(descr, NPY_NEEDS_INIT)) { + memset(valbuffer, 0, N * elsize); + } + + if (swap) { + odescr = PyArray_DescrNewByteorder(descr, NPY_SWAP); + } + else { + odescr = descr; + Py_INCREF(odescr); + } + + if (PyArray_GetDTypeTransferFunction( + is_aligned, astride, elsize, descr, odescr, 0, &cast_info, + &transfer_flags) != NPY_SUCCEED) { + goto fail; + } } if (needidxbuffer) { @@ -1145,7 +1371,7 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort, } } - NPY_BEGIN_THREADS_DESCR(PyArray_DESCR(op)); + NPY_BEGIN_THREADS_DESCR(descr); while (size--) { char *valptr = it->dataptr; @@ -1153,25 +1379,13 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort, npy_intp *iptr, i; if (needcopy) { - if (hasrefs) { - /* - * For dtype's with objects, copyswapn Py_XINCREF's src - * and Py_XDECREF's dst. This would crash if called on - * an uninitialized valbuffer, or leak a reference to - * each object item if initialized. - * - * So, first do the copy with no refcounting... - */ - _unaligned_strided_byte_copy(valbuffer, elsize, - it->dataptr, astride, N, elsize); - /* ...then swap in-place if needed */ - if (swap) { - copyswapn(valbuffer, elsize, NULL, 0, N, swap, op); - } - } - else { - copyswapn(valbuffer, elsize, - it->dataptr, astride, N, swap, op); + char *args[2] = {it->dataptr, valbuffer}; + npy_intp strides[2] = {astride, elsize}; + + if (NPY_UNLIKELY(cast_info.func( + &cast_info.context, args, &N, strides, + cast_info.auxdata) < 0)) { + goto fail; } valptr = valbuffer; } @@ -1188,7 +1402,7 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort, if (argpart == NULL) { ret = argsort(valptr, idxptr, N, op); /* Object comparisons may raise an exception in Python 3 */ - if (hasrefs && PyErr_Occurred()) { + if (needs_api && PyErr_Occurred()) { ret = -1; } if (ret < 0) { @@ -1202,7 +1416,7 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort, for (i = 0; i < nkth; ++i) { ret = argpart(valptr, idxptr, N, kth[i], pivots, &npiv, op); /* Object comparisons may raise an exception in Python 3 */ - if (hasrefs && PyErr_Occurred()) { + if (needs_api && PyErr_Occurred()) { ret = -1; } if (ret < 0) { @@ -1226,9 +1440,13 @@ _new_argsortlike(PyArrayObject *op, int axis, PyArray_ArgSortFunc *argsort, } fail: - NPY_END_THREADS_DESCR(PyArray_DESCR(op)); + NPY_END_THREADS_DESCR(descr); /* cleanup internal buffers */ - PyDataMem_UserFREE(valbuffer, N * elsize, mem_handler); + if (needcopy) { + PyArray_ClearBuffer(odescr, valbuffer, elsize, N, 1); + PyDataMem_UserFREE(valbuffer, N * elsize, mem_handler); + Py_DECREF(odescr); + } PyDataMem_UserFREE(idxbuffer, N * sizeof(npy_intp), mem_handler); if (ret < 0) { if (!PyErr_Occurred()) { @@ -1241,6 +1459,7 @@ fail: Py_XDECREF(it); Py_XDECREF(rit); Py_DECREF(mem_handler); + NPY_cast_info_xfree(&cast_info); return (PyObject *)rop; } @@ -2642,7 +2861,7 @@ PyArray_Nonzero(PyArrayObject *self) } } /* - * Fallback to a branchless strategy to avoid branch misprediction + * Fallback 
to a branchless strategy to avoid branch misprediction * stalls that are very expensive on most modern processors. */ else { diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c index 3dd488220..b7fcf2ee2 100644 --- a/numpy/core/src/multiarray/mapping.c +++ b/numpy/core/src/multiarray/mapping.c @@ -889,13 +889,13 @@ get_view_from_index(PyArrayObject *self, PyArrayObject **view, /* Create the new view and set the base array */ Py_INCREF(PyArray_DESCR(self)); - *view = (PyArrayObject *)PyArray_NewFromDescrAndBase( + *view = (PyArrayObject *)PyArray_NewFromDescr_int( ensure_array ? &PyArray_Type : Py_TYPE(self), PyArray_DESCR(self), new_dim, new_shape, new_strides, data_ptr, PyArray_FLAGS(self), ensure_array ? NULL : (PyObject *)self, - (PyObject *)self); + (PyObject *)self, _NPY_ARRAY_ENSURE_DTYPE_IDENTITY); if (*view == NULL) { return -1; } @@ -1361,7 +1361,8 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view) PyArray_BYTES(arr) + offset, PyArray_FLAGS(arr), (PyObject *)arr, (PyObject *)arr, - 0, 1); + /* We do not preserve the dtype for a subarray one, only str */ + _NPY_ARRAY_ALLOW_EMPTY_STRING); if (*view == NULL) { return 0; } @@ -1415,7 +1416,8 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view) PyArray_DATA(arr), PyArray_FLAGS(arr), (PyObject *)arr, (PyObject *)arr, - 0, 1); + /* We do not preserve the dtype for a subarray one, only str */ + _NPY_ARRAY_ALLOW_EMPTY_STRING); if (*view == NULL) { return 0; diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c index fff772769..bc3efc295 100644 --- a/numpy/core/src/multiarray/methods.c +++ b/numpy/core/src/multiarray/methods.c @@ -21,12 +21,14 @@ #include "ctors.h" #include "calculation.h" #include "convert_datatype.h" +#include "descriptor.h" #include "item_selection.h" #include "conversion_utils.h" #include "shape.h" #include "strfuncs.h" #include "array_assign.h" #include "npy_dlpack.h" +#include "multiarraymodule.h" #include "methods.h" #include "alloc.h" @@ -435,7 +437,7 @@ PyArray_GetField(PyArrayObject *self, PyArray_Descr *typed, int offset) PyArray_BYTES(self) + offset, PyArray_FLAGS(self) & ~NPY_ARRAY_F_CONTIGUOUS, (PyObject *)self, (PyObject *)self, - 0, 1); + _NPY_ARRAY_ALLOW_EMPTY_STRING); return ret; } @@ -851,29 +853,35 @@ static PyObject * array_astype(PyArrayObject *self, PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) { - PyArray_Descr *dtype = NULL; /* * TODO: UNSAFE default for compatibility, I think * switching to SAME_KIND by default would be good. 
*/ + npy_dtype_info dt_info; NPY_CASTING casting = NPY_UNSAFE_CASTING; NPY_ORDER order = NPY_KEEPORDER; _PyArray_CopyMode forcecopy = 1; int subok = 1; + NPY_PREPARE_ARGPARSER; if (npy_parse_arguments("astype", args, len_args, kwnames, - "dtype", &PyArray_DescrConverter, &dtype, + "dtype", &PyArray_DTypeOrDescrConverterRequired, &dt_info, "|order", &PyArray_OrderConverter, &order, "|casting", &PyArray_CastingConverter, &casting, "|subok", &PyArray_PythonPyIntFromInt, &subok, "|copy", &PyArray_CopyConverter, &forcecopy, NULL, NULL, NULL) < 0) { - Py_XDECREF(dtype); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return NULL; } /* If it is not a concrete dtype instance find the best one for the array */ - Py_SETREF(dtype, PyArray_AdaptDescriptorToArray(self, (PyObject *)dtype)); + PyArray_Descr *dtype; + + dtype = PyArray_AdaptDescriptorToArray(self, dt_info.dtype, dt_info.descr); + Py_XDECREF(dt_info.descr); + Py_DECREF(dt_info.dtype); if (dtype == NULL) { return NULL; } @@ -916,29 +924,28 @@ array_astype(PyArrayObject *self, PyArrayObject *ret; - /* This steals the reference to dtype, so no DECREF of dtype */ + /* This steals the reference to dtype */ + Py_INCREF(dtype); ret = (PyArrayObject *)PyArray_NewLikeArray( self, order, dtype, subok); if (ret == NULL) { + Py_DECREF(dtype); return NULL; } - /* NumPy 1.20, 2020-10-01 */ - if ((PyArray_NDIM(self) != PyArray_NDIM(ret)) && - DEPRECATE_FUTUREWARNING( - "casting an array to a subarray dtype " - "will not use broadcasting in the future, but cast each " - "element to the new dtype and then append the dtype's shape " - "to the new array. You can opt-in to the new behaviour, by " - "additional field to the cast: " - "`arr.astype(np.dtype([('f', dtype)]))['f']`.\n" - "This may lead to a different result or to current failures " - "succeeding. 
" - "(FutureWarning since NumPy 1.20)") < 0) { - Py_DECREF(ret); - return NULL; - } - if (PyArray_CopyInto(ret, self) < 0) { + /* Decrease the number of dimensions removing subarray ones again */ + int out_ndim = PyArray_NDIM(ret); + PyArray_Descr *out_descr = PyArray_DESCR(ret); + ((PyArrayObject_fields *)ret)->nd = PyArray_NDIM(self); + ((PyArrayObject_fields *)ret)->descr = dtype; + + int success = PyArray_CopyInto(ret, self); + + Py_DECREF(dtype); + ((PyArrayObject_fields *)ret)->nd = out_ndim; + ((PyArrayObject_fields *)ret)->descr = out_descr; + + if (success < 0) { Py_DECREF(ret); return NULL; } @@ -1095,7 +1102,7 @@ any_array_ufunc_overrides(PyObject *args, PyObject *kwds) int nin, nout; PyObject *out_kwd_obj; PyObject *fast; - PyObject **in_objs, **out_objs; + PyObject **in_objs, **out_objs, *where_obj; /* check inputs */ nin = PyTuple_Size(args); @@ -1126,6 +1133,17 @@ any_array_ufunc_overrides(PyObject *args, PyObject *kwds) } } Py_DECREF(out_kwd_obj); + /* check where if it exists */ + where_obj = PyDict_GetItemWithError(kwds, npy_ma_str_where); + if (where_obj == NULL) { + if (PyErr_Occurred()) { + return -1; + } + } else { + if (PyUFunc_HasOverride(where_obj)){ + return 1; + } + } return 0; } @@ -2785,9 +2803,7 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args)) PyArray_Descr *dtype; PyObject *c; - if (PyArray_SIZE(self) != 1) { - PyErr_SetString(PyExc_TypeError, - "only length-1 arrays can be converted to Python scalars"); + if (check_is_convertible_to_scalar(self) < 0) { return NULL; } @@ -2832,25 +2848,15 @@ array_complex(PyArrayObject *self, PyObject *NPY_UNUSED(args)) static PyObject * array_class_getitem(PyObject *cls, PyObject *args) { - PyObject *generic_alias; - -#ifdef Py_GENERICALIASOBJECT_H - Py_ssize_t args_len; + const Py_ssize_t args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1; - args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1; if ((args_len > 2) || (args_len == 0)) { return PyErr_Format(PyExc_TypeError, "Too %s arguments for %s", args_len > 2 ? "many" : "few", ((PyTypeObject *)cls)->tp_name); } - generic_alias = Py_GenericAlias(cls, args); -#else - PyErr_SetString(PyExc_TypeError, - "Type subscription requires python >= 3.9"); - generic_alias = NULL; -#endif - return generic_alias; + return Py_GenericAlias(cls, args); } NPY_NO_EXPORT PyMethodDef array_methods[] = { diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 5da3d66df..d7d19493b 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -66,6 +66,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; #include "compiled_base.h" #include "mem_overlap.h" #include "typeinfo.h" +#include "convert.h" /* for PyArray_AssignZero */ #include "get_attr_string.h" #include "experimental_public_dtype_api.h" /* _get_experimental_dtype_api */ @@ -98,6 +99,12 @@ _umath_strings_richcompare( * future. */ int npy_legacy_print_mode = INT_MAX; +/* + * Global variable to check whether NumPy 2.0 behavior is opted in. + * This flag is considered a runtime constant. 
+ */ +int npy_numpy2_behavior = NPY_FALSE; + static PyObject * set_legacy_print_mode(PyObject *NPY_UNUSED(self), PyObject *args) @@ -467,7 +474,7 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis, /* Get the priority subtype for the array */ PyTypeObject *subtype = PyArray_GetSubType(narrays, arrays); PyArray_Descr *descr = PyArray_FindConcatenationDescriptor( - narrays, arrays, (PyObject *)dtype); + narrays, arrays, dtype); if (descr == NULL) { return NULL; } @@ -488,7 +495,7 @@ PyArray_ConcatenateArrays(int narrays, PyArrayObject **arrays, int axis, /* Allocate the array for the result. This steals the 'dtype' reference. */ ret = (PyArrayObject *)PyArray_NewFromDescr_int( subtype, descr, ndim, shape, strides, NULL, 0, NULL, - NULL, 0, 1); + NULL, _NPY_ARRAY_ALLOW_EMPTY_STRING); if (ret == NULL) { return NULL; } @@ -583,7 +590,7 @@ PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays, PyTypeObject *subtype = PyArray_GetSubType(narrays, arrays); PyArray_Descr *descr = PyArray_FindConcatenationDescriptor( - narrays, arrays, (PyObject *)dtype); + narrays, arrays, dtype); if (descr == NULL) { return NULL; } @@ -593,7 +600,7 @@ PyArray_ConcatenateFlattenedArrays(int narrays, PyArrayObject **arrays, /* Allocate the array for the result. This steals the 'dtype' reference. */ ret = (PyArrayObject *)PyArray_NewFromDescr_int( subtype, descr, 1, &shape, &stride, NULL, 0, NULL, - NULL, 0, 1); + NULL, _NPY_ARRAY_ALLOW_EMPTY_STRING); if (ret == NULL) { return NULL; } @@ -1036,12 +1043,11 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out) #endif if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) { - result = (PyArray_NDIM(ap1) == 0 ? ap1 : ap2); - result = (PyArrayObject *)Py_TYPE(result)->tp_as_number->nb_multiply( - (PyObject *)ap1, (PyObject *)ap2); + PyObject *mul_res = PyObject_CallFunctionObjArgs( + n_ops.multiply, ap1, ap2, out, NULL); Py_DECREF(ap1); Py_DECREF(ap2); - return (PyObject *)result; + return mul_res; } l = PyArray_DIMS(ap1)[PyArray_NDIM(ap1) - 1]; if (PyArray_NDIM(ap2) > 1) { @@ -1079,7 +1085,9 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out) } /* Ensure that multiarray.dot(<Nx0>,<0xM>) -> zeros((N,M)) */ if (PyArray_SIZE(ap1) == 0 && PyArray_SIZE(ap2) == 0) { - memset(PyArray_DATA(out_buf), 0, PyArray_NBYTES(out_buf)); + if (PyArray_AssignZero(out_buf, NULL) < 0) { + goto fail; + } } dot = PyArray_DESCR(out_buf)->f->dotfunc; @@ -1492,19 +1500,26 @@ fail: return NULL; } - static PyObject * -array_putmask(PyObject *NPY_UNUSED(module), PyObject *args, PyObject *kwds) +array_putmask(PyObject *NPY_UNUSED(module), PyObject *const *args, + Py_ssize_t len_args, PyObject *kwnames ) { PyObject *mask, *values; PyObject *array; - static char *kwlist[] = {"arr", "mask", "values", NULL}; - - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!OO:putmask", kwlist, - &PyArray_Type, &array, &mask, &values)) { + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("putmask", args, len_args, kwnames, + "", NULL, &array, + "mask", NULL, &mask, + "values", NULL, &values, + NULL, NULL, NULL) < 0) { return NULL; } + if (!PyArray_Check(array)) { + PyErr_SetString(PyExc_TypeError, + "argument a of putmask must be a numpy array"); + } + return PyArray_PutMask((PyArrayObject *)array, values, mask); } @@ -1622,25 +1637,35 @@ _prepend_ones(PyArrayObject *arr, int nd, int ndmin, NPY_ORDER order) ((order) == NPY_CORDER && PyArray_IS_C_CONTIGUOUS(op)) || \ ((order) == NPY_FORTRANORDER && PyArray_IS_F_CONTIGUOUS(op))) + static 
inline PyObject * _array_fromobject_generic( - PyObject *op, PyArray_Descr *type, _PyArray_CopyMode copy, NPY_ORDER order, - npy_bool subok, int ndmin) + PyObject *op, PyArray_Descr *in_descr, PyArray_DTypeMeta *in_DType, + _PyArray_CopyMode copy, NPY_ORDER order, npy_bool subok, int ndmin) { PyArrayObject *oparr = NULL, *ret = NULL; PyArray_Descr *oldtype = NULL; int nd, flags = 0; + /* Hold on to `in_descr` as `dtype`, since we may also set it below. */ + Py_XINCREF(in_descr); + PyArray_Descr *dtype = in_descr; + if (ndmin > NPY_MAXDIMS) { PyErr_Format(PyExc_ValueError, "ndmin bigger than allowable number of dimensions " "NPY_MAXDIMS (=%d)", NPY_MAXDIMS); - return NULL; + goto finish; } /* fast exit if simple call */ if (PyArray_CheckExact(op) || (subok && PyArray_Check(op))) { oparr = (PyArrayObject *)op; - if (type == NULL) { + + if (dtype == NULL && in_DType == NULL) { + /* + * User did not ask for a specific dtype instance or class. So + * we can return either self or a copy. + */ if (copy != NPY_COPY_ALWAYS && STRIDING_OK(oparr, order)) { ret = oparr; Py_INCREF(ret); @@ -1650,17 +1675,30 @@ _array_fromobject_generic( if (copy == NPY_COPY_NEVER) { PyErr_SetString(PyExc_ValueError, "Unable to avoid copy while creating a new array."); - return NULL; + goto finish; } ret = (PyArrayObject *)PyArray_NewCopy(oparr, order); goto finish; } } + else if (dtype == NULL) { + /* + * If the user passed a DType class but not a dtype instance, + * we must use `PyArray_AdaptDescriptorToArray` to find the + * correct dtype instance. + * Even if the fast-path doesn't work we will use this. + */ + dtype = PyArray_AdaptDescriptorToArray(oparr, in_DType, NULL); + if (dtype == NULL) { + goto finish; + } + } + /* One more chance for faster exit if user specified the dtype. */ oldtype = PyArray_DESCR(oparr); - if (PyArray_EquivTypes(oldtype, type)) { + if (PyArray_EquivTypes(oldtype, dtype)) { if (copy != NPY_COPY_ALWAYS && STRIDING_OK(oparr, order)) { - if (oldtype == type) { + if (oldtype == dtype) { Py_INCREF(op); ret = oparr; } @@ -1668,10 +1706,10 @@ _array_fromobject_generic( /* Create a new PyArrayObject from the caller's * PyArray_Descr. Use the reference `op` as the base * object. 
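Holding `in_descr` as an owned `dtype` reference up front is what makes the shared `finish` exit below safe. A reduced sketch of the same ownership discipline, detached from the dtype machinery (the fallback object stands in for descriptor adaptation):

#include <Python.h>

/* Take a strong reference to the (possibly NULL) borrowed input right
 * away; every early exit can then `goto finish` and XDECREF blindly,
 * whether or not `descr` was later replaced by an adapted instance. */
static PyObject *
with_single_exit(PyObject *maybe_descr)
{
    PyObject *result = NULL;
    Py_XINCREF(maybe_descr);
    PyObject *descr = maybe_descr;

    if (descr == NULL) {
        descr = PyUnicode_FromString("fallback");  /* stand-in for adaptation */
        if (descr == NULL) {
            goto finish;
        }
    }
    result = PyTuple_Pack(1, descr);  /* takes its own reference */
finish:
    Py_XDECREF(descr);
    return result;
}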
*/ - Py_INCREF(type); + Py_INCREF(dtype); ret = (PyArrayObject *)PyArray_NewFromDescrAndBase( Py_TYPE(op), - type, + dtype, PyArray_NDIM(oparr), PyArray_DIMS(oparr), PyArray_STRIDES(oparr), @@ -1687,10 +1725,10 @@ _array_fromobject_generic( if (copy == NPY_COPY_NEVER) { PyErr_SetString(PyExc_ValueError, "Unable to avoid copy while creating a new array."); - return NULL; + goto finish; } ret = (PyArrayObject *)PyArray_NewCopy(oparr, order); - if (oldtype == type || ret == NULL) { + if (oldtype == dtype || ret == NULL) { goto finish; } Py_INCREF(oldtype); @@ -1721,11 +1759,13 @@ _array_fromobject_generic( } flags |= NPY_ARRAY_FORCECAST; - Py_XINCREF(type); - ret = (PyArrayObject *)PyArray_CheckFromAny(op, type, - 0, 0, flags, NULL); + + ret = (PyArrayObject *)PyArray_CheckFromAny_int( + op, dtype, in_DType, 0, 0, flags, NULL); finish: + Py_XDECREF(dtype); + if (ret == NULL) { return NULL; } @@ -1752,7 +1792,7 @@ array_array(PyObject *NPY_UNUSED(ignored), npy_bool subok = NPY_FALSE; _PyArray_CopyMode copy = NPY_COPY_ALWAYS; int ndmin = 0; - PyArray_Descr *type = NULL; + npy_dtype_info dt_info = {NULL, NULL}; NPY_ORDER order = NPY_KEEPORDER; PyObject *like = Py_None; NPY_PREPARE_ARGPARSER; @@ -1760,21 +1800,23 @@ array_array(PyObject *NPY_UNUSED(ignored), if (len_args != 1 || (kwnames != NULL)) { if (npy_parse_arguments("array", args, len_args, kwnames, "object", NULL, &op, - "|dtype", &PyArray_DescrConverter2, &type, + "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info, "$copy", &PyArray_CopyConverter, ©, "$order", &PyArray_OrderConverter, &order, "$subok", &PyArray_BoolConverter, &subok, "$ndmin", &PyArray_PythonPyIntFromInt, &ndmin, "$like", NULL, &like, NULL, NULL, NULL) < 0) { - Py_XDECREF(type); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return NULL; } if (like != Py_None) { PyObject *deferred = array_implement_c_array_function_creation( "array", like, NULL, NULL, args, len_args, kwnames); if (deferred != Py_NotImplemented) { - Py_XDECREF(type); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return deferred; } } @@ -1785,8 +1827,9 @@ array_array(PyObject *NPY_UNUSED(ignored), } PyObject *res = _array_fromobject_generic( - op, type, copy, order, subok, ndmin); - Py_XDECREF(type); + op, dt_info.descr, dt_info.dtype, copy, order, subok, ndmin); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return res; } @@ -1795,7 +1838,7 @@ array_asarray(PyObject *NPY_UNUSED(ignored), PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) { PyObject *op; - PyArray_Descr *type = NULL; + npy_dtype_info dt_info = {NULL, NULL}; NPY_ORDER order = NPY_KEEPORDER; PyObject *like = Py_None; NPY_PREPARE_ARGPARSER; @@ -1803,18 +1846,20 @@ array_asarray(PyObject *NPY_UNUSED(ignored), if (len_args != 1 || (kwnames != NULL)) { if (npy_parse_arguments("asarray", args, len_args, kwnames, "a", NULL, &op, - "|dtype", &PyArray_DescrConverter2, &type, + "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info, "|order", &PyArray_OrderConverter, &order, "$like", NULL, &like, NULL, NULL, NULL) < 0) { - Py_XDECREF(type); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return NULL; } if (like != Py_None) { PyObject *deferred = array_implement_c_array_function_creation( "asarray", like, NULL, NULL, args, len_args, kwnames); if (deferred != Py_NotImplemented) { - Py_XDECREF(type); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return deferred; } } @@ -1824,8 +1869,9 @@ array_asarray(PyObject *NPY_UNUSED(ignored), } PyObject *res = 
_array_fromobject_generic( - op, type, NPY_FALSE, order, NPY_FALSE, 0); - Py_XDECREF(type); + op, dt_info.descr, dt_info.dtype, NPY_FALSE, order, NPY_FALSE, 0); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return res; } @@ -1834,7 +1880,7 @@ array_asanyarray(PyObject *NPY_UNUSED(ignored), PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) { PyObject *op; - PyArray_Descr *type = NULL; + npy_dtype_info dt_info = {NULL, NULL}; NPY_ORDER order = NPY_KEEPORDER; PyObject *like = Py_None; NPY_PREPARE_ARGPARSER; @@ -1842,18 +1888,20 @@ array_asanyarray(PyObject *NPY_UNUSED(ignored), if (len_args != 1 || (kwnames != NULL)) { if (npy_parse_arguments("asanyarray", args, len_args, kwnames, "a", NULL, &op, - "|dtype", &PyArray_DescrConverter2, &type, + "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info, "|order", &PyArray_OrderConverter, &order, "$like", NULL, &like, NULL, NULL, NULL) < 0) { - Py_XDECREF(type); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return NULL; } if (like != Py_None) { PyObject *deferred = array_implement_c_array_function_creation( "asanyarray", like, NULL, NULL, args, len_args, kwnames); if (deferred != Py_NotImplemented) { - Py_XDECREF(type); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return deferred; } } @@ -1863,8 +1911,9 @@ array_asanyarray(PyObject *NPY_UNUSED(ignored), } PyObject *res = _array_fromobject_generic( - op, type, NPY_FALSE, order, NPY_TRUE, 0); - Py_XDECREF(type); + op, dt_info.descr, dt_info.dtype, NPY_FALSE, order, NPY_TRUE, 0); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return res; } @@ -1874,24 +1923,26 @@ array_ascontiguousarray(PyObject *NPY_UNUSED(ignored), PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) { PyObject *op; - PyArray_Descr *type = NULL; + npy_dtype_info dt_info = {NULL, NULL}; PyObject *like = Py_None; NPY_PREPARE_ARGPARSER; if (len_args != 1 || (kwnames != NULL)) { if (npy_parse_arguments("ascontiguousarray", args, len_args, kwnames, "a", NULL, &op, - "|dtype", &PyArray_DescrConverter2, &type, + "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info, "$like", NULL, &like, NULL, NULL, NULL) < 0) { - Py_XDECREF(type); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return NULL; } if (like != Py_None) { PyObject *deferred = array_implement_c_array_function_creation( "ascontiguousarray", like, NULL, NULL, args, len_args, kwnames); if (deferred != Py_NotImplemented) { - Py_XDECREF(type); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return deferred; } } @@ -1901,8 +1952,10 @@ array_ascontiguousarray(PyObject *NPY_UNUSED(ignored), } PyObject *res = _array_fromobject_generic( - op, type, NPY_FALSE, NPY_CORDER, NPY_FALSE, 1); - Py_XDECREF(type); + op, dt_info.descr, dt_info.dtype, NPY_FALSE, NPY_CORDER, NPY_FALSE, + 1); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return res; } @@ -1912,24 +1965,26 @@ array_asfortranarray(PyObject *NPY_UNUSED(ignored), PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) { PyObject *op; - PyArray_Descr *type = NULL; + npy_dtype_info dt_info = {NULL, NULL}; PyObject *like = Py_None; NPY_PREPARE_ARGPARSER; if (len_args != 1 || (kwnames != NULL)) { if (npy_parse_arguments("asfortranarray", args, len_args, kwnames, "a", NULL, &op, - "|dtype", &PyArray_DescrConverter2, &type, + "|dtype", &PyArray_DTypeOrDescrConverterOptional, &dt_info, "$like", NULL, &like, NULL, NULL, NULL) < 0) { - Py_XDECREF(type); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return NULL; } if 
(like != Py_None) { PyObject *deferred = array_implement_c_array_function_creation( "asfortranarray", like, NULL, NULL, args, len_args, kwnames); if (deferred != Py_NotImplemented) { - Py_XDECREF(type); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return deferred; } } @@ -1939,8 +1994,10 @@ array_asfortranarray(PyObject *NPY_UNUSED(ignored), } PyObject *res = _array_fromobject_generic( - op, type, NPY_FALSE, NPY_FORTRANORDER, NPY_FALSE, 1); - Py_XDECREF(type); + op, dt_info.descr, dt_info.dtype, NPY_FALSE, NPY_FORTRANORDER, + NPY_FALSE, 1); + Py_XDECREF(dt_info.descr); + Py_XDECREF(dt_info.dtype); return res; } @@ -2046,10 +2103,9 @@ fail: } static PyObject * -array_empty_like(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds) +array_empty_like(PyObject *NPY_UNUSED(ignored), + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) { - - static char *kwlist[] = {"prototype", "dtype", "order", "subok", "shape", NULL}; PyArrayObject *prototype = NULL; PyArray_Descr *dtype = NULL; NPY_ORDER order = NPY_KEEPORDER; @@ -2058,12 +2114,15 @@ array_empty_like(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds) /* -1 is a special value meaning "not specified" */ PyArray_Dims shape = {NULL, -1}; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&iO&:empty_like", kwlist, - &PyArray_Converter, &prototype, - &PyArray_DescrConverter2, &dtype, - &PyArray_OrderConverter, &order, - &subok, - &PyArray_OptionalIntpConverter, &shape)) { + NPY_PREPARE_ARGPARSER; + + if (npy_parse_arguments("empty_like", args, len_args, kwnames, + "prototype", &PyArray_Converter, &prototype, + "|dtype", &PyArray_DescrConverter2, &dtype, + "|order", &PyArray_OrderConverter, &order, + "|subok", &PyArray_PythonPyIntFromInt, &subok, + "|shape", &PyArray_OptionalIntpConverter, &shape, + NULL, NULL, NULL) < 0) { goto fail; } /* steals the reference to dtype if it's not NULL */ @@ -2249,12 +2308,15 @@ fail: } static PyObject * -array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds) +array_count_nonzero(PyObject *NPY_UNUSED(self), PyObject *const *args, Py_ssize_t len_args) { PyArrayObject *array; npy_intp count; - if (!PyArg_ParseTuple(args, "O&:count_nonzero", PyArray_Converter, &array)) { + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("count_nonzero", args, len_args, NULL, + "", PyArray_Converter, &array, + NULL, NULL, NULL) < 0) { return NULL; } @@ -2460,7 +2522,8 @@ array_frombuffer(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *keywds } static PyObject * -array_concatenate(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) +array_concatenate(PyObject *NPY_UNUSED(dummy), + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) { PyObject *a0; PyObject *out = NULL; @@ -2469,10 +2532,15 @@ array_concatenate(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) PyObject *casting_obj = NULL; PyObject *res; int axis = 0; - static char *kwlist[] = {"seq", "axis", "out", "dtype", "casting", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&O$O&O:concatenate", kwlist, - &a0, PyArray_AxisConverter, &axis, &out, - PyArray_DescrConverter2, &dtype, &casting_obj)) { + + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("concatenate", args, len_args, kwnames, + "seq", NULL, &a0, + "|axis", &PyArray_AxisConverter, &axis, + "|out", NULL, &out, + "$dtype", &PyArray_DescrConverter2, &dtype, + "$casting", NULL, &casting_obj, + NULL, NULL, NULL) < 0) { return NULL; } int casting_not_passed = 0; @@ -2504,25 +2572,34 @@ 
array_concatenate(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) } static PyObject * -array_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *args) +array_innerproduct(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len_args) { PyObject *b0, *a0; - if (!PyArg_ParseTuple(args, "OO:innerproduct", &a0, &b0)) { - return NULL; + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("innerproduct", args, len_args, NULL, + "", NULL, &a0, + "", NULL, &b0, + NULL, NULL, NULL) < 0) { + return NULL; } + return PyArray_Return((PyArrayObject *)PyArray_InnerProduct(a0, b0)); } static PyObject * -array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds) +array_matrixproduct(PyObject *NPY_UNUSED(dummy), + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) { PyObject *v, *a, *o = NULL; PyArrayObject *ret; - static char* kwlist[] = {"a", "b", "out", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|O:matrixproduct", - kwlist, &a, &v, &o)) { + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("dot", args, len_args, kwnames, + "a", NULL, &a, + "b", NULL, &v, + "|out", NULL, &o, + NULL, NULL, NULL) < 0) { return NULL; } if (o != NULL) { @@ -2540,7 +2617,7 @@ array_matrixproduct(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject* kwds) static PyObject * -array_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args) +array_vdot(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len_args) { int typenum; char *ip1, *ip2, *op; @@ -2553,7 +2630,11 @@ array_vdot(PyObject *NPY_UNUSED(dummy), PyObject *args) PyArray_DotFunc *vdot; NPY_BEGIN_THREADS_DEF; - if (!PyArg_ParseTuple(args, "OO:vdot", &op1, &op2)) { + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("vdot", args, len_args, NULL, + "", NULL, &op1, + "", NULL, &op2, + NULL, NULL, NULL) < 0) { return NULL; } @@ -3130,13 +3211,9 @@ PyArray_GetNDArrayCFeatureVersion(void) } static PyObject * -array__get_ndarray_c_version(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) +array__get_ndarray_c_version( + PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(arg)) { - static char *kwlist[] = {NULL}; - - if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist )) { - return NULL; - } return PyLong_FromLong( (long) PyArray_GetNDArrayCVersion() ); } @@ -3431,32 +3508,42 @@ fail: #undef INNER_WHERE_LOOP static PyObject * -array_where(PyObject *NPY_UNUSED(ignored), PyObject *args) +array_where(PyObject *NPY_UNUSED(ignored), PyObject *const *args, Py_ssize_t len_args) { PyObject *obj = NULL, *x = NULL, *y = NULL; - if (!PyArg_ParseTuple(args, "O|OO:where", &obj, &x, &y)) { + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("where", args, len_args, NULL, + "", NULL, &obj, + "|x", NULL, &x, + "|y", NULL, &y, + NULL, NULL, NULL) < 0) { return NULL; } + return PyArray_Where(obj, x, y); } static PyObject * -array_lexsort(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds) +array_lexsort(PyObject *NPY_UNUSED(ignored), PyObject *const *args, Py_ssize_t len_args, + PyObject *kwnames) { int axis = -1; PyObject *obj; - static char *kwlist[] = {"keys", "axis", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|i:lexsort", kwlist, &obj, &axis)) { + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("lexsort", args, len_args, kwnames, + "keys", NULL, &obj, + "|axis", PyArray_PythonPyIntFromInt, &axis, + NULL, NULL, NULL) < 0) { return NULL; } return PyArray_Return((PyArrayObject *)PyArray_LexSort(obj, axis)); } static PyObject * -array_can_cast_safely(PyObject *NPY_UNUSED(self), PyObject 
*args, - PyObject *kwds) +array_can_cast_safely(PyObject *NPY_UNUSED(self), + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) { PyObject *from_obj = NULL; PyArray_Descr *d1 = NULL; @@ -3464,12 +3551,13 @@ array_can_cast_safely(PyObject *NPY_UNUSED(self), PyObject *args, int ret; PyObject *retobj = NULL; NPY_CASTING casting = NPY_SAFE_CASTING; - static char *kwlist[] = {"from_", "to", "casting", NULL}; - if(!PyArg_ParseTupleAndKeywords(args, kwds, "OO&|O&:can_cast", kwlist, - &from_obj, - PyArray_DescrConverter2, &d2, - PyArray_CastingConverter, &casting)) { + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("can_cast", args, len_args, kwnames, + "from_", NULL, &from_obj, + "to", &PyArray_DescrConverter2, &d2, + "|casting", &PyArray_CastingConverter, &casting, + NULL, NULL, NULL) < 0) { goto finish; } if (d2 == NULL) { @@ -3512,13 +3600,17 @@ array_can_cast_safely(PyObject *NPY_UNUSED(self), PyObject *args, } static PyObject * -array_promote_types(PyObject *NPY_UNUSED(dummy), PyObject *args) +array_promote_types(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len_args) { PyArray_Descr *d1 = NULL; PyArray_Descr *d2 = NULL; PyObject *ret = NULL; - if (!PyArg_ParseTuple(args, "O&O&:promote_types", - PyArray_DescrConverter2, &d1, PyArray_DescrConverter2, &d2)) { + + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("promote_types", args, len_args, NULL, + "", PyArray_DescrConverter2, &d1, + "", PyArray_DescrConverter2, &d2, + NULL, NULL, NULL) < 0) { goto finish; } @@ -3558,14 +3650,13 @@ array_min_scalar_type(PyObject *NPY_UNUSED(dummy), PyObject *args) } static PyObject * -array_result_type(PyObject *NPY_UNUSED(dummy), PyObject *args) +array_result_type(PyObject *NPY_UNUSED(dummy), PyObject *const *args, Py_ssize_t len) { - npy_intp i, len, narr = 0, ndtypes = 0; + npy_intp i, narr = 0, ndtypes = 0; PyArrayObject **arr = NULL; PyArray_Descr **dtypes = NULL; PyObject *ret = NULL; - len = PyTuple_GET_SIZE(args); if (len == 0) { PyErr_SetString(PyExc_ValueError, "at least one array or dtype is required"); @@ -3579,7 +3670,7 @@ array_result_type(PyObject *NPY_UNUSED(dummy), PyObject *args) dtypes = (PyArray_Descr**)&arr[len]; for (i = 0; i < len; ++i) { - PyObject *obj = PyTuple_GET_ITEM(args, i); + PyObject *obj = args[i]; if (PyArray_Check(obj)) { Py_INCREF(obj); arr[narr] = (PyArrayObject *)obj; @@ -3785,6 +3876,34 @@ format_longfloat(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) TrimMode_LeaveOneZero, -1, -1); } +/* + * returns 1 if array is a user-defined string dtype, sets an error and + * returns 0 otherwise + */ +static int _is_user_defined_string_array(PyArrayObject* array) +{ + if (NPY_DT_is_user_defined(PyArray_DESCR(array))) { + PyTypeObject* scalar_type = NPY_DTYPE(PyArray_DESCR(array))->scalar_type; + if (PyType_IsSubtype(scalar_type, &PyBytes_Type) || + PyType_IsSubtype(scalar_type, &PyUnicode_Type)) { + return 1; + } + else { + PyErr_SetString( + PyExc_TypeError, + "string comparisons are only allowed for dtypes with a " + "scalar type that is a subtype of str or bytes."); + return 0; + } + } + else { + PyErr_SetString( + PyExc_TypeError, + "string operation on non-string array"); + return 0; + } +} + /* * The only purpose of this function is that it allows the "rstrip". 
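The METH_VARARGS to METH_FASTCALL conversions throughout this file all share one shape: the handler receives a C array of arguments plus a count rather than a packed tuple, and every failure path must still set an exception and return NULL. A minimal sketch using only the public CPython fastcall protocol (npy_parse_arguments is NumPy-internal):

#include <Python.h>

/* add(a, b) as a METH_FASTCALL handler: `args` points directly into the
 * caller's stack, so no argument tuple is allocated. */
static PyObject *
add_fastcall(PyObject *self, PyObject *const *args, Py_ssize_t nargs)
{
    (void)self;
    if (nargs != 2) {
        PyErr_SetString(PyExc_TypeError, "add() takes exactly 2 arguments");
        return NULL;   /* setting the error alone is not enough */
    }
    return PyNumber_Add(args[0], args[1]);
}

static PyMethodDef methods[] = {
    {"add", (PyCFunction)(void (*)(void))add_fastcall, METH_FASTCALL, NULL},
    {NULL, NULL, 0, NULL}
};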
@@ -3861,6 +3980,9 @@ compare_chararrays(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) else { PyErr_SetString(PyExc_TypeError, "comparison of non-string arrays"); + Py_DECREF(newarr); + Py_DECREF(newoth); + return NULL; } Py_DECREF(newarr); Py_DECREF(newoth); @@ -4061,10 +4183,15 @@ _vec_string(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUSED(kw method = PyObject_GetAttr((PyObject *)&PyUnicode_Type, method_name); } else { - PyErr_SetString(PyExc_TypeError, - "string operation on non-string array"); - Py_DECREF(type); - goto err; + if (_is_user_defined_string_array(char_array)) { + PyTypeObject* scalar_type = + NPY_DTYPE(PyArray_DESCR(char_array))->scalar_type; + method = PyObject_GetAttr((PyObject*)scalar_type, method_name); + } + else { + Py_DECREF(type); + goto err; + } } if (method == NULL) { Py_DECREF(type); @@ -4339,13 +4466,22 @@ _reload_guard(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args)) { Py_RETURN_NONE; } + +static PyObject * +_using_numpy2_behavior( + PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args)) +{ + return PyBool_FromLong(npy_numpy2_behavior); +} + + static struct PyMethodDef array_module_methods[] = { {"_get_implementing_args", (PyCFunction)array__get_implementing_args, METH_VARARGS, NULL}, {"_get_ndarray_c_version", (PyCFunction)array__get_ndarray_c_version, - METH_VARARGS|METH_KEYWORDS, NULL}, + METH_NOARGS, NULL}, {"_reconstruct", (PyCFunction)array__reconstruct, METH_VARARGS, NULL}, @@ -4390,25 +4526,25 @@ static struct PyMethodDef array_module_methods[] = { METH_FASTCALL | METH_KEYWORDS, NULL}, {"count_nonzero", (PyCFunction)array_count_nonzero, - METH_VARARGS|METH_KEYWORDS, NULL}, + METH_FASTCALL, NULL}, {"empty", (PyCFunction)array_empty, METH_FASTCALL | METH_KEYWORDS, NULL}, {"empty_like", (PyCFunction)array_empty_like, - METH_VARARGS|METH_KEYWORDS, NULL}, + METH_FASTCALL|METH_KEYWORDS, NULL}, {"scalar", (PyCFunction)array_scalar, METH_VARARGS|METH_KEYWORDS, NULL}, {"where", (PyCFunction)array_where, - METH_VARARGS, NULL}, + METH_FASTCALL, NULL}, {"lexsort", (PyCFunction)array_lexsort, - METH_VARARGS | METH_KEYWORDS, NULL}, + METH_FASTCALL | METH_KEYWORDS, NULL}, {"putmask", (PyCFunction)array_putmask, - METH_VARARGS | METH_KEYWORDS, NULL}, + METH_FASTCALL | METH_KEYWORDS, NULL}, {"fromstring", (PyCFunction)array_fromstring, METH_VARARGS|METH_KEYWORDS, NULL}, @@ -4417,16 +4553,16 @@ static struct PyMethodDef array_module_methods[] = { METH_VARARGS|METH_KEYWORDS, NULL}, {"concatenate", (PyCFunction)array_concatenate, - METH_VARARGS|METH_KEYWORDS, NULL}, + METH_FASTCALL|METH_KEYWORDS, NULL}, {"inner", (PyCFunction)array_innerproduct, - METH_VARARGS, NULL}, + METH_FASTCALL, NULL}, {"dot", (PyCFunction)array_matrixproduct, - METH_VARARGS | METH_KEYWORDS, NULL}, + METH_FASTCALL | METH_KEYWORDS, NULL}, {"vdot", (PyCFunction)array_vdot, - METH_VARARGS | METH_KEYWORDS, NULL}, + METH_FASTCALL, NULL}, {"c_einsum", (PyCFunction)array_einsum, METH_VARARGS|METH_KEYWORDS, NULL}, @@ -4447,16 +4583,16 @@ static struct PyMethodDef array_module_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"can_cast", (PyCFunction)array_can_cast_safely, - METH_VARARGS | METH_KEYWORDS, NULL}, + METH_FASTCALL | METH_KEYWORDS, NULL}, {"promote_types", (PyCFunction)array_promote_types, - METH_VARARGS, NULL}, + METH_FASTCALL, NULL}, {"min_scalar_type", (PyCFunction)array_min_scalar_type, METH_VARARGS, NULL}, {"result_type", (PyCFunction)array_result_type, - METH_VARARGS, NULL}, + METH_FASTCALL, NULL}, {"shares_memory", 
(PyCFunction)array_shares_memory, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -4495,27 +4631,24 @@ static struct PyMethodDef array_module_methods[] = { {"_vec_string", (PyCFunction)_vec_string, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_insert", (PyCFunction)arr_insert, + {"_place", (PyCFunction)arr_place, METH_VARARGS | METH_KEYWORDS, "Insert vals sequentially into equivalent 1-d positions " "indicated by mask."}, {"bincount", (PyCFunction)arr_bincount, - METH_VARARGS | METH_KEYWORDS, NULL}, + METH_FASTCALL | METH_KEYWORDS, NULL}, {"_monotonicity", (PyCFunction)arr__monotonicity, METH_VARARGS | METH_KEYWORDS, NULL}, - {"implement_array_function", - (PyCFunction)array_implement_array_function, - METH_VARARGS, NULL}, {"interp", (PyCFunction)arr_interp, - METH_VARARGS | METH_KEYWORDS, NULL}, + METH_FASTCALL | METH_KEYWORDS, NULL}, {"interp_complex", (PyCFunction)arr_interp_complex, - METH_VARARGS | METH_KEYWORDS, NULL}, + METH_FASTCALL | METH_KEYWORDS, NULL}, {"ravel_multi_index", (PyCFunction)arr_ravel_multi_index, METH_VARARGS | METH_KEYWORDS, NULL}, {"unravel_index", (PyCFunction)arr_unravel_index, METH_VARARGS | METH_KEYWORDS, NULL}, {"add_docstring", (PyCFunction)arr_add_docstring, - METH_VARARGS, NULL}, + METH_FASTCALL, NULL}, {"packbits", (PyCFunction)io_pack, METH_VARARGS | METH_KEYWORDS, NULL}, {"unpackbits", (PyCFunction)io_unpack, @@ -4525,7 +4658,7 @@ static struct PyMethodDef array_module_methods[] = { {"set_legacy_print_mode", (PyCFunction)set_legacy_print_mode, METH_VARARGS, NULL}, {"_discover_array_parameters", (PyCFunction)_discover_array_parameters, - METH_VARARGS | METH_KEYWORDS, NULL}, + METH_FASTCALL | METH_KEYWORDS, NULL}, {"_get_castingimpl", (PyCFunction)_get_castingimpl, METH_VARARGS | METH_KEYWORDS, NULL}, {"_get_experimental_dtype_api", (PyCFunction)_get_experimental_dtype_api, @@ -4538,10 +4671,10 @@ static struct PyMethodDef array_module_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"seterrobj", (PyCFunction) ufunc_seterr, - METH_VARARGS, NULL}, + METH_O, NULL}, {"geterrobj", (PyCFunction) ufunc_geterr, - METH_VARARGS, NULL}, + METH_NOARGS, NULL}, {"get_handler_name", (PyCFunction) get_handler_name, METH_VARARGS, NULL}, @@ -4567,6 +4700,8 @@ static struct PyMethodDef array_module_methods[] = { {"_reload_guard", (PyCFunction)_reload_guard, METH_NOARGS, "Give a warning on reload and big warning in sub-interpreters."}, + {"_using_numpy2_behavior", (PyCFunction)_using_numpy2_behavior, + METH_NOARGS, NULL}, {"from_dlpack", (PyCFunction)from_dlpack, METH_O, NULL}, {NULL, NULL, 0, NULL} /* sentinel */ @@ -4752,6 +4887,7 @@ NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_axis1 = NULL; NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_axis2 = NULL; NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_like = NULL; NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_numpy = NULL; +NPY_VISIBILITY_HIDDEN PyObject * npy_ma_str_where = NULL; static int intern_strings(void) @@ -4808,6 +4944,10 @@ intern_strings(void) if (npy_ma_str_numpy == NULL) { return -1; } + npy_ma_str_where = PyUnicode_InternFromString("where"); + if (npy_ma_str_where == NULL) { + return -1; + } return 0; } @@ -4839,6 +4979,12 @@ initialize_static_globals(void) return -1; } + /* Initialize from certain environment variabels: */ + char *env = getenv("NPY_NUMPY_2_BEHAVIOR"); + if (env != NULL && strcmp(env, "0") != 0) { + npy_numpy2_behavior = NPY_TRUE; + } + return 0; } @@ -5076,6 +5222,12 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) { if (set_typeinfo(d) != 0) { goto err; } + if (PyType_Ready(&PyArrayFunctionDispatcher_Type) < 
0) { + goto err; + } + PyDict_SetItemString( + d, "_ArrayFunctionDispatcher", + (PyObject *)&PyArrayFunctionDispatcher_Type); if (PyType_Ready(&PyArrayMethod_Type) < 0) { goto err; } diff --git a/numpy/core/src/multiarray/multiarraymodule.h b/numpy/core/src/multiarray/multiarraymodule.h index 809736cd2..9ba2a1831 100644 --- a/numpy/core/src/multiarray/multiarraymodule.h +++ b/numpy/core/src/multiarray/multiarraymodule.h @@ -1,6 +1,8 @@ #ifndef NUMPY_CORE_SRC_MULTIARRAY_MULTIARRAYMODULE_H_ #define NUMPY_CORE_SRC_MULTIARRAY_MULTIARRAYMODULE_H_ +NPY_VISIBILITY_HIDDEN extern int npy_numpy2_behavior; + NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_current_allocator; NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_array; NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_array_function; @@ -14,5 +16,6 @@ NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_axis1; NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_axis2; NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_like; NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_numpy; +NPY_VISIBILITY_HIDDEN extern PyObject * npy_ma_str_where; #endif /* NUMPY_CORE_SRC_MULTIARRAY_MULTIARRAYMODULE_H_ */ diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c index 37e8acd84..28b7bf6e6 100644 --- a/numpy/core/src/multiarray/nditer_api.c +++ b/numpy/core/src/multiarray/nditer_api.c @@ -17,6 +17,7 @@ #include "nditer_impl.h" #include "templ_common.h" #include "ctors.h" +#include "refcount.h" /* Internal helper functions private to this file */ static npy_intp @@ -1945,8 +1946,7 @@ npyiter_copy_from_buffers(NpyIter *iter) * only copy back when this flag is on. */ if ((transferinfo[iop].write.func != NULL) && - (op_itflags[iop]&(NPY_OP_ITFLAG_WRITE|NPY_OP_ITFLAG_USINGBUFFER)) - == (NPY_OP_ITFLAG_WRITE|NPY_OP_ITFLAG_USINGBUFFER)) { + (op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) { npy_intp op_transfersize; npy_intp src_stride, *dst_strides, *dst_coords, *dst_shape; @@ -2059,27 +2059,18 @@ npyiter_copy_from_buffers(NpyIter *iter) * The flag USINGBUFFER is set when the buffer was used, so * only decrement refs when this flag is on. */ - else if (transferinfo[iop].write.func != NULL && - (op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER) != 0) { - NPY_IT_DBG_PRINT1("Iterator: Freeing refs and zeroing buffer " - "of operand %d\n", (int)iop); - /* Decrement refs */ + else if (transferinfo[iop].clear.func != NULL && + (op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) { + NPY_IT_DBG_PRINT1( + "Iterator: clearing refs of operand %d\n", (int)iop); npy_intp buf_stride = dtypes[iop]->elsize; - if (transferinfo[iop].write.func( - &transferinfo[iop].write.context, - &buffer, &transfersize, &buf_stride, - transferinfo[iop].write.auxdata) < 0) { + if (transferinfo[iop].clear.func( + NULL, transferinfo[iop].clear.descr, buffer, transfersize, + buf_stride, transferinfo[iop].clear.auxdata) < 0) { /* Since this should only decrement, it should never error */ assert(0); return -1; } - /* - * Zero out the memory for safety. For instance, - * if during iteration some Python code copied an - * array pointing into the buffer, it will get None - * values for its references after this. 
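Replacing the write-back decref with `transferinfo[iop].clear.func` means the buffer is left NULLed rather than merely zeroed afterwards. A sketch of what such a clear traversal does for an object buffer, assuming a plain strided layout:

#include <Python.h>
#include <string.h>

/* Release every object reference in a strided buffer and NULL the slot,
 * so that running the clear a second time is harmless. memcpy is used
 * because buffered storage may be unaligned. */
static int
clear_object_buffer(char *data, Py_ssize_t size, Py_ssize_t stride)
{
    for (Py_ssize_t i = 0; i < size; i++, data += stride) {
        PyObject *item;
        memcpy(&item, data, sizeof(item));
        Py_XDECREF(item);
        item = NULL;
        memcpy(data, &item, sizeof(item));
    }
    return 0;
}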
- */ - memset(buffer, 0, dtypes[iop]->elsize*transfersize); } } @@ -2626,54 +2617,36 @@ npyiter_clear_buffers(NpyIter *iter) return; } - if (!(NIT_ITFLAGS(iter) & NPY_ITFLAG_NEEDSAPI)) { - /* Buffers do not require clearing, but should not be copied back */ - NBF_SIZE(bufferdata) = 0; - return; - } - /* - * The iterator may be using a dtype with references, which always - * requires the API. In that case, further cleanup may be necessary. - * - * TODO: At this time, we assume that a dtype having references - * implies the need to hold the GIL at all times. In theory - * we could broaden this definition for a new - * `PyArray_Item_XDECREF` API and the assumption may become - * incorrect. + * The iterator may be using a dtype with references (as of writing this + * means Python objects, but does not need to stay that way). + * In that case, further cleanup may be necessary and we clear buffers + * explicitly. */ PyObject *type, *value, *traceback; PyErr_Fetch(&type, &value, &traceback); /* Cleanup any buffers with references */ char **buffers = NBF_BUFFERS(bufferdata); + + NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata); PyArray_Descr **dtypes = NIT_DTYPES(iter); npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter); for (int iop = 0; iop < nop; ++iop, ++buffers) { - /* - * We may want to find a better way to do this, on the other hand, - * this cleanup seems rare and fairly special. A dtype using - * references (right now only us) must always keep the buffer in - * a well defined state (either NULL or owning the reference). - * Only we implement cleanup - */ - if (!PyDataType_REFCHK(dtypes[iop]) || - !(op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) { + if (transferinfo[iop].clear.func == NULL || + !(op_itflags[iop]&NPY_OP_ITFLAG_USINGBUFFER)) { continue; } if (*buffers == 0) { continue; } int itemsize = dtypes[iop]->elsize; - for (npy_intp i = 0; i < NBF_SIZE(bufferdata); i++) { - /* - * See above comment, if this API is expanded the GIL assumption - * could become incorrect. 
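npyiter_clear_buffers() can run while an exception is live, hence the fetch/restore bracket around the cleanup. The idiom in isolation:

#include <Python.h>

/* Preserve a live exception across cleanup that may itself touch the
 * error indicator, then restore the original state afterwards. */
static void
cleanup_preserving_error(void (*cleanup)(void))
{
    PyObject *exc_type, *exc_value, *exc_tb;
    PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
    cleanup();
    PyErr_Restore(exc_type, exc_value, exc_tb);
}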
- */ - PyArray_Item_XDECREF(*buffers + (itemsize * i), dtypes[iop]); + if (transferinfo[iop].clear.func(NULL, + dtypes[iop], *buffers, NBF_SIZE(bufferdata), itemsize, + transferinfo[iop].clear.auxdata) < 0) { + /* This should never fail; if it does write it out */ + PyErr_WriteUnraisable(NULL); } - /* Clear out the buffer just to be sure */ - memset(*buffers, 0, NBF_SIZE(bufferdata) * itemsize); } /* Signal that the buffers are empty */ NBF_SIZE(bufferdata) = 0; diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c index b969c9f1d..dfa84c51c 100644 --- a/numpy/core/src/multiarray/nditer_constr.c +++ b/numpy/core/src/multiarray/nditer_constr.c @@ -19,6 +19,8 @@ #include "array_coercion.h" #include "templ_common.h" #include "array_assign.h" +#include "dtype_traversal.h" + /* Internal helper functions private to this file */ static int @@ -630,6 +632,18 @@ NpyIter_Copy(NpyIter *iter) } } } + + if (transferinfo[iop].clear.func != NULL) { + if (out_of_memory) { + transferinfo[iop].clear.func = NULL; /* No cleanup */ + } + else { + if (NPY_traverse_info_copy(&transferinfo[iop].clear, + &transferinfo[iop].clear) < 0) { + out_of_memory = 1; + } + } + } } /* Initialize the buffers to the current iterindex */ @@ -706,6 +720,7 @@ NpyIter_Deallocate(NpyIter *iter) for (iop = 0; iop < nop; ++iop, ++transferinfo) { NPY_cast_info_xfree(&transferinfo->read); NPY_cast_info_xfree(&transferinfo->write); + NPY_traverse_info_xfree(&transferinfo->clear); } } @@ -1106,7 +1121,8 @@ npyiter_prepare_one_operand(PyArrayObject **op, NPY_ITEM_IS_POINTER))) != 0)) { PyErr_SetString(PyExc_TypeError, "Iterator operand or requested dtype holds " - "references, but the REFS_OK flag was not enabled"); + "references, but the NPY_ITER_REFS_OK flag was not " + "enabled"); return 0; } } @@ -1118,7 +1134,7 @@ npyiter_prepare_one_operand(PyArrayObject **op, if (op_request_dtype != NULL) { /* We just have a borrowed reference to op_request_dtype */ Py_SETREF(*op_dtype, PyArray_AdaptDescriptorToArray( - *op, (PyObject *)op_request_dtype)); + *op, NULL, op_request_dtype)); if (*op_dtype == NULL) { return 0; } @@ -1133,7 +1149,7 @@ npyiter_prepare_one_operand(PyArrayObject **op, PyArray_DescrNewByteorder(*op_dtype, NPY_NATIVE)); if (*op_dtype == NULL) { return 0; - } + } NPY_IT_DBG_PRINT("Iterator: Setting NPY_OP_ITFLAG_CAST " "because of NPY_ITER_NBO\n"); /* Indicate that byte order or alignment needs fixing */ @@ -3186,31 +3202,33 @@ npyiter_allocate_transfer_functions(NpyIter *iter) cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags); } } - /* If no write back but there are references make a decref fn */ - else if (PyDataType_REFCHK(op_dtype[iop])) { + else { + transferinfo[iop].write.func = NULL; + } + + /* Get the decref function, used for no-writeback and on error */ + if (PyDataType_REFCHK(op_dtype[iop])) { /* * By passing NULL to dst_type and setting move_references * to 1, we get back a function that just decrements the * src references. 
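NpyIter_Copy applies the same policy to the new `clear` slot as to `read` and `write`: copy it, or on allocation failure disable it so a later deallocation pass cannot chase a stale pointer. A reduced sketch with a hypothetical two-field layout (the real NPY_traverse_info also owns a descriptor reference and auxdata that NPY_traverse_info_copy() must deep-copy):

typedef struct {
    int (*func)(void *ctx);
    void *auxdata;
} traverse_info;

/* Copy the clear slot, or disable it under memory pressure so that
 * deallocation simply skips it. */
static void
copy_or_disable(traverse_info *dst, const traverse_info *src,
                int out_of_memory)
{
    if (src->func == NULL || out_of_memory) {
        dst->func = NULL;
        dst->auxdata = NULL;
        return;
    }
    *dst = *src;
}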
*/ - if (PyArray_GetDTypeTransferFunction( + if (PyArray_GetClearFunction( (flags & NPY_OP_ITFLAG_ALIGNED) != 0, - op_dtype[iop]->elsize, 0, - op_dtype[iop], NULL, - 1, - &transferinfo[iop].write, - &nc_flags) != NPY_SUCCEED) { + op_dtype[iop]->elsize, op_dtype[iop], + &transferinfo[iop].clear, &nc_flags) < 0) { goto fail; } cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags); } else { - transferinfo[iop].write.func = NULL; + transferinfo[iop].clear.func = NULL; } } else { transferinfo[iop].read.func = NULL; transferinfo[iop].write.func = NULL; + transferinfo[iop].clear.func = NULL; } } diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h index 5bf47fabd..a33e4f90f 100644 --- a/numpy/core/src/multiarray/nditer_impl.h +++ b/numpy/core/src/multiarray/nditer_impl.h @@ -23,6 +23,8 @@ #include "lowlevel_strided_loops.h" #include "dtype_transfer.h" +#include "dtype_traversal.h" + /********** ITERATOR CONSTRUCTION TIMING **************/ #define NPY_IT_CONSTRUCTION_TIMING 0 @@ -242,6 +244,7 @@ typedef npy_int16 npyiter_opitflags; struct NpyIter_TransferInfo_tag { NPY_cast_info read; NPY_cast_info write; + NPY_traverse_info clear; /* Probably unnecessary, but make sure what follows is intp aligned: */ npy_intp _unused_ensure_alignment[]; }; diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c index df814a796..35e4af79c 100644 --- a/numpy/core/src/multiarray/number.c +++ b/numpy/core/src/multiarray/number.c @@ -53,6 +53,8 @@ static PyObject * array_inplace_remainder(PyArrayObject *m1, PyObject *m2); static PyObject * array_inplace_power(PyArrayObject *a1, PyObject *o2, PyObject *NPY_UNUSED(modulo)); +static PyObject * +array_inplace_matrix_multiply(PyArrayObject *m1, PyObject *m2); /* * Dictionary can contain any of the numeric operations, by name. @@ -339,7 +341,6 @@ array_divmod(PyObject *m1, PyObject *m2) return PyArray_GenericBinaryFunction(m1, m2, n_ops.divmod); } -/* Need this to be version dependent on account of the slot check */ static PyObject * array_matrix_multiply(PyObject *m1, PyObject *m2) { @@ -348,13 +349,70 @@ array_matrix_multiply(PyObject *m1, PyObject *m2) } static PyObject * -array_inplace_matrix_multiply( - PyArrayObject *NPY_UNUSED(m1), PyObject *NPY_UNUSED(m2)) +array_inplace_matrix_multiply(PyArrayObject *self, PyObject *other) { - PyErr_SetString(PyExc_TypeError, - "In-place matrix multiplication is not (yet) supported. " - "Use 'a = a @ b' instead of 'a @= b'."); - return NULL; + static PyObject *AxisError_cls = NULL; + npy_cache_import("numpy.exceptions", "AxisError", &AxisError_cls); + if (AxisError_cls == NULL) { + return NULL; + } + + INPLACE_GIVE_UP_IF_NEEDED(self, other, + nb_inplace_matrix_multiply, array_inplace_matrix_multiply); + + /* + * Unlike `matmul(a, b, out=a)` we ensure that the result is not broadcast + * if the result without `out` would have less dimensions than `a`. + * Since the signature of matmul is '(n?,k),(k,m?)->(n?,m?)' this is the + * case exactly when the second operand has both core dimensions. + * + * The error here will be confusing, but for now, we enforce this by + * passing the correct `axes=`. 
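The axes mapping referred to above pins every operand's core dimensions, which is what turns an over-broadcasting `a @= b` into an error instead of a silently resized result. A sketch of the cached 2-D kwargs object (Py_BuildValue treats ',' and ':' interchangeably inside format strings):

#include <Python.h>

/* {'axes': [(-2, -1), (-2, -1), (-2, -1)]} -- both inputs and the output
 * use their last two dimensions as core dimensions, so the result can
 * never broadcast beyond the in-place operand. */
static PyObject *
build_2d_inplace_axes(void)
{
    return Py_BuildValue("{s:[(i,i),(i,i),(i,i)]}",
                         "axes", -2, -1, -2, -1, -2, -1);
}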
+ */ + static PyObject *axes_1d_obj_kwargs = NULL; + static PyObject *axes_2d_obj_kwargs = NULL; + if (NPY_UNLIKELY(axes_1d_obj_kwargs == NULL)) { + axes_1d_obj_kwargs = Py_BuildValue( + "{s, [(i), (i, i), (i)]}", "axes", -1, -2, -1, -1); + if (axes_1d_obj_kwargs == NULL) { + return NULL; + } + } + if (NPY_UNLIKELY(axes_2d_obj_kwargs == NULL)) { + axes_2d_obj_kwargs = Py_BuildValue( + "{s, [(i, i), (i, i), (i, i)]}", "axes", -2, -1, -2, -1, -2, -1); + if (axes_2d_obj_kwargs == NULL) { + return NULL; + } + } + + PyObject *args = PyTuple_Pack(3, self, other, self); + if (args == NULL) { + return NULL; + } + PyObject *kwargs; + if (PyArray_NDIM(self) == 1) { + kwargs = axes_1d_obj_kwargs; + } + else { + kwargs = axes_2d_obj_kwargs; + } + PyObject *res = PyObject_Call(n_ops.matmul, args, kwargs); + Py_DECREF(args); + + if (res == NULL) { + /* + * AxisError should indicate that the axes argument didn't work out + * which should mean the second operand not being 2 dimensional. + */ + if (PyErr_ExceptionMatches(AxisError_cls)) { + PyErr_SetString(PyExc_ValueError, + "inplace matrix multiplication requires the first operand to " + "have at least one and the second at least two dimensions."); + } + } + + return res; } /* @@ -539,47 +597,10 @@ array_power(PyObject *a1, PyObject *o2, PyObject *modulo) static PyObject * array_positive(PyArrayObject *m1) { - /* - * For backwards compatibility, where + just implied a copy, - * we cannot just call n_ops.positive. Instead, we do the following - * 1. Try n_ops.positive - * 2. If we get an exception, check whether __array_ufunc__ is - * overridden; if so, we live in the future and we allow the - * TypeError to be passed on. - * 3. If not, give a deprecation warning and return a copy. - */ - PyObject *value; if (can_elide_temp_unary(m1)) { - value = PyArray_GenericInplaceUnaryFunction(m1, n_ops.positive); - } - else { - value = PyArray_GenericUnaryFunction(m1, n_ops.positive); + return PyArray_GenericInplaceUnaryFunction(m1, n_ops.positive); } - if (value == NULL) { - /* - * We first fetch the error, as it needs to be clear to check - * for the override. When the deprecation is removed, - * this whole stanza can be deleted. - */ - PyObject *exc, *val, *tb; - PyErr_Fetch(&exc, &val, &tb); - if (PyUFunc_HasOverride((PyObject *)m1)) { - PyErr_Restore(exc, val, tb); - return NULL; - } - Py_XDECREF(exc); - Py_XDECREF(val); - Py_XDECREF(tb); - - /* 2018-06-28, 1.16.0 */ - if (DEPRECATE("Applying '+' to a non-numerical array is " - "ill-defined. Returning a copy, but in the future " - "this will error.") < 0) { - return NULL; - } - value = PyArray_Return((PyArrayObject *)PyArray_Copy(m1)); - } - return value; + return PyArray_GenericUnaryFunction(m1, n_ops.positive); } static PyObject * @@ -848,13 +869,11 @@ array_scalar_forward(PyArrayObject *v, PyObject *(*builtin_func)(PyObject *), const char *where) { - PyObject *scalar; - if (PyArray_SIZE(v) != 1) { - PyErr_SetString(PyExc_TypeError, "only size-1 arrays can be"\ - " converted to Python scalars"); + if (check_is_convertible_to_scalar(v) < 0) { return NULL; } + PyObject *scalar; scalar = PyArray_GETITEM(v, PyArray_DATA(v)); if (scalar == NULL) { return NULL; diff --git a/numpy/core/src/multiarray/refcount.c b/numpy/core/src/multiarray/refcount.c index a1c310700..876bb53e1 100644 --- a/numpy/core/src/multiarray/refcount.c +++ b/numpy/core/src/multiarray/refcount.c @@ -2,6 +2,10 @@ * This module corresponds to the `Special functions for NPY_OBJECT` * section in the numpy reference for C-API. 
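With the FutureWarning fallback removed, array_positive() reduces to a two-path unary dispatch. Sketched here with a hypothetical elision predicate; ufuncs accept the output array as an extra positional argument:

#include <Python.h>

/* When the operand is a disposable temporary, pass it as the output
 * array so the ufunc writes in place; otherwise let the ufunc allocate
 * the result. */
static PyObject *
unary_forward(PyObject *ufunc, PyObject *operand, int can_elide_temp)
{
    if (can_elide_temp) {
        return PyObject_CallFunctionObjArgs(ufunc, operand, operand, NULL);
    }
    return PyObject_CallFunctionObjArgs(ufunc, operand, NULL);
}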
*/ +#include "array_method.h" +#include "dtype_traversal.h" +#include "lowlevel_strided_loops.h" +#include "pyerrors.h" #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE @@ -12,13 +16,101 @@ #include "numpy/arrayobject.h" #include "numpy/arrayscalars.h" #include "iterators.h" +#include "dtypemeta.h" +#include "refcount.h" #include "npy_config.h" #include "npy_pycompat.h" -static void -_fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype); +/* + * Helper function to clear a strided memory (normally or always contiguous) + * from all Python (or other) references. The function does nothing if the + * array dtype does not indicate holding references. + * + * It is safe to call this function more than once, failing here is usually + * critical (during cleanup) and should be set up to minimize the risk or + * avoid it fully. + */ +NPY_NO_EXPORT int +PyArray_ClearBuffer( + PyArray_Descr *descr, char *data, + npy_intp stride, npy_intp size, int aligned) +{ + if (!PyDataType_REFCHK(descr)) { + return 0; + } + + NPY_traverse_info clear_info; + /* Flags unused: float errors do not matter and we do not release GIL */ + NPY_ARRAYMETHOD_FLAGS flags_unused; + if (PyArray_GetClearFunction( + aligned, stride, descr, &clear_info, &flags_unused) < 0) { + return -1; + } + + int res = clear_info.func( + NULL, clear_info.descr, data, size, stride, clear_info.auxdata); + NPY_traverse_info_xfree(&clear_info); + return res; +} + + +/* + * Helper function to clear whole array. It seems plausible that we should + * be able to get away with assuming the the array is contiguous. + * + * Must only be called on arrays which own their data (and asserts this). + */ + NPY_NO_EXPORT int + PyArray_ClearArray(PyArrayObject *arr) + { + assert(PyArray_FLAGS(arr) & NPY_ARRAY_OWNDATA); + + PyArray_Descr *descr = PyArray_DESCR(arr); + if (!PyDataType_REFCHK(descr)) { + return 0; + } + /* + * The contiguous path should cover practically all important cases since + * it is difficult to create a non-contiguous array which owns its memory + * and only arrays which own their memory should clear it. + */ + int aligned = PyArray_ISALIGNED(arr); + if (PyArray_ISCONTIGUOUS(arr)) { + return PyArray_ClearBuffer( + descr, PyArray_BYTES(arr), descr->elsize, + PyArray_SIZE(arr), aligned); + } + int idim, ndim; + npy_intp shape_it[NPY_MAXDIMS], strides_it[NPY_MAXDIMS]; + npy_intp coord[NPY_MAXDIMS]; + char *data_it; + if (PyArray_PrepareOneRawArrayIter( + PyArray_NDIM(arr), PyArray_DIMS(arr), + PyArray_BYTES(arr), PyArray_STRIDES(arr), + &ndim, shape_it, &data_it, strides_it) < 0) { + return -1; + } + npy_intp inner_stride = strides_it[0]; + npy_intp inner_shape = shape_it[0]; + NPY_traverse_info clear_info; + /* Flags unused: float errors do not matter and we do not release GIL */ + NPY_ARRAYMETHOD_FLAGS flags_unused; + if (PyArray_GetClearFunction( + aligned, inner_stride, descr, &clear_info, &flags_unused) < 0) { + return -1; + } + NPY_RAW_ITER_START(idim, ndim, coord, shape_it) { + /* Process the innermost dimension */ + if (clear_info.func(NULL, clear_info.descr, + data_it, inner_shape, inner_stride, clear_info.auxdata) < 0) { + return -1; + } + } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, + shape_it, data_it, strides_it); + return 0; +} /*NUMPY_API @@ -204,6 +296,10 @@ PyArray_INCREF(PyArrayObject *mp) /*NUMPY_API Decrement all internal references for object arrays. 
(or arrays with object fields) + + The use of this function is strongly discouraged, within NumPy + use PyArray_Clear, which DECREF's and sets everything to NULL and can + work with any dtype. */ NPY_NO_EXPORT int PyArray_XDECREF(PyArrayObject *mp) @@ -254,15 +350,28 @@ PyArray_XDECREF(PyArrayObject *mp) return 0; } + +static void +_fillobject(char *optr, PyObject *obj, PyArray_Descr *dtype); + + /*NUMPY_API * Assumes contiguous */ NPY_NO_EXPORT void PyArray_FillObjectArray(PyArrayObject *arr, PyObject *obj) { + PyArray_Descr* descr = PyArray_DESCR(arr); + + // non-legacy dtypes are responsible for initializing + // their own internal references + if (!NPY_DT_is_legacy(NPY_DTYPE(descr))) { + return; + } + npy_intp i,n; n = PyArray_SIZE(arr); - if (PyArray_DESCR(arr)->type_num == NPY_OBJECT) { + if (descr->type_num == NPY_OBJECT) { PyObject **optr; optr = (PyObject **)(PyArray_DATA(arr)); n = PyArray_SIZE(arr); @@ -282,8 +391,8 @@ PyArray_FillObjectArray(PyArrayObject *arr, PyObject *obj) char *optr; optr = PyArray_DATA(arr); for (i = 0; i < n; i++) { - _fillobject(optr, obj, PyArray_DESCR(arr)); - optr += PyArray_DESCR(arr)->elsize; + _fillobject(optr, obj, descr); + optr += descr->elsize; } } } diff --git a/numpy/core/src/multiarray/refcount.h b/numpy/core/src/multiarray/refcount.h index 959eef5ba..16d34e292 100644 --- a/numpy/core/src/multiarray/refcount.h +++ b/numpy/core/src/multiarray/refcount.h @@ -1,6 +1,14 @@ #ifndef NUMPY_CORE_SRC_MULTIARRAY_REFCOUNT_H_ #define NUMPY_CORE_SRC_MULTIARRAY_REFCOUNT_H_ +NPY_NO_EXPORT int +PyArray_ClearBuffer( + PyArray_Descr *descr, char *data, + npy_intp stride, npy_intp size, int aligned); + +NPY_NO_EXPORT int +PyArray_ClearArray(PyArrayObject *arr); + NPY_NO_EXPORT void PyArray_Item_INCREF(char *data, PyArray_Descr *descr); diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src index bb5d36ba1..ff30737b5 100644 --- a/numpy/core/src/multiarray/scalartypes.c.src +++ b/numpy/core/src/multiarray/scalartypes.c.src @@ -15,6 +15,7 @@ #include "npy_pycompat.h" +#include "arraytypes.h" #include "npy_config.h" #include "mapping.h" #include "ctors.h" @@ -278,7 +279,43 @@ static PyObject * genint_type_str(PyObject *self) { PyObject *item, *item_str; - item = gentype_generic_method(self, NULL, NULL, "item"); + PyArray_Descr *descr = PyArray_DescrFromTypeObject((PyObject *)Py_TYPE(self)); + void *val = scalar_value(self, descr); + switch (descr->type_num) { + case NPY_BYTE: + item = PyLong_FromLong(*(int8_t *)val); + break; + case NPY_UBYTE: + item = PyLong_FromUnsignedLong(*(uint8_t *)val); + break; + case NPY_SHORT: + item = PyLong_FromLong(*(int16_t *)val); + break; + case NPY_USHORT: + item = PyLong_FromUnsignedLong(*(uint16_t *)val); + break; + case NPY_INT: + item = PyLong_FromLong(*(int32_t *)val); + break; + case NPY_UINT: + item = PyLong_FromUnsignedLong(*(uint32_t *)val); + break; + case NPY_LONG: + item = PyLong_FromLong(*(int64_t *)val); + break; + case NPY_ULONG: + item = PyLong_FromUnsignedLong(*(uint64_t *)val); + break; + case NPY_LONGLONG: + item = PyLong_FromLongLong(*(long long *)val); + break; + case NPY_ULONGLONG: + item = PyLong_FromUnsignedLongLong(*(unsigned long long *)val); + break; + default: + item = gentype_generic_method(self, NULL, NULL, "item"); + break; + } if (item == NULL) { return NULL; } @@ -896,15 +933,21 @@ static PyObject * @name@type_@kind@_either(npy_@name@ val, TrimMode trim_pos, TrimMode trim_sci, npy_bool sign) { - npy_@name@ absval; if (npy_legacy_print_mode <= 113) { 
return legacy_@name@_format@kind@(val); } - absval = val < 0 ? -val : val; + int use_positional; + if (npy_isnan(val) || val == 0) { + use_positional = 1; + } + else { + npy_@name@ absval = val < 0 ? -val : val; + use_positional = absval < 1.e16L && absval >= 1.e-4L; + } - if (absval == 0 || (absval < 1.e16L && absval >= 1.e-4L) ) { + if (use_positional) { return format_@name@(val, 0, -1, sign, trim_pos, -1, -1, -1); } return format_@name@(val, 1, -1, sign, trim_sci, -1, -1, -1); @@ -1072,6 +1115,8 @@ gentype_richcompare(PyObject *self, PyObject *other, int cmp_op) } } + RICHCMP_GIVE_UP_IF_NEEDED(self, other); + arr = PyArray_FromScalar(self, NULL); if (arr == NULL) { return NULL; @@ -1844,10 +1889,7 @@ gentype_setflags(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args), static PyObject * numbertype_class_getitem_abc(PyObject *cls, PyObject *args) { - PyObject *generic_alias; - -#ifdef Py_GENERICALIASOBJECT_H - Py_ssize_t args_len; + const Py_ssize_t args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1; int args_len_expected; /* complexfloating should take 2 parameters, all others take 1 */ @@ -1859,20 +1901,13 @@ numbertype_class_getitem_abc(PyObject *cls, PyObject *args) args_len_expected = 1; } - args_len = PyTuple_Check(args) ? PyTuple_Size(args) : 1; if ((args_len > args_len_expected) || (args_len == 0)) { return PyErr_Format(PyExc_TypeError, "Too %s arguments for %s", args_len > args_len_expected ? "many" : "few", ((PyTypeObject *)cls)->tp_name); } - generic_alias = Py_GenericAlias(cls, args); -#else - PyErr_SetString(PyExc_TypeError, - "Type subscription requires python >= 3.9"); - generic_alias = NULL; -#endif - return generic_alias; + return Py_GenericAlias(cls, args); } /* @@ -1883,14 +1918,9 @@ numbertype_class_getitem_abc(PyObject *cls, PyObject *args) static PyObject * numbertype_class_getitem(PyObject *cls, PyObject *args) { -#ifdef Py_GENERICALIASOBJECT_H PyErr_Format(PyExc_TypeError, "There are no type variables left in %s", ((PyTypeObject *)cls)->tp_name); -#else - PyErr_SetString(PyExc_TypeError, - "Type subscription requires python >= 3.9"); -#endif return NULL; } @@ -2278,7 +2308,7 @@ static PyGetSetDef inttype_getsets[] = { }; static PyMethodDef numbertype_methods[] = { - /* for typing; requires python >= 3.9 */ + /* for typing */ {"__class_getitem__", (PyCFunction)numbertype_class_getitem_abc, METH_CLASS | METH_O, NULL}, @@ -2292,7 +2322,7 @@ static PyMethodDef @name@type_methods[] = { {"__complex__", (PyCFunction)@name@_complex, METH_VARARGS | METH_KEYWORDS, NULL}, - /* for typing; requires python >= 3.9 */ + /* for typing */ {"__class_getitem__", (PyCFunction)numbertype_class_getitem, METH_CLASS | METH_O, NULL}, @@ -2333,7 +2363,7 @@ static PyMethodDef @name@type_methods[] = { {"is_integer", (PyCFunction)@name@_is_integer, METH_NOARGS, NULL}, - /* for typing; requires python >= 3.9 */ + /* for typing */ {"__class_getitem__", (PyCFunction)numbertype_class_getitem, METH_CLASS | METH_O, NULL}, @@ -2345,7 +2375,7 @@ static PyMethodDef @name@type_methods[] = { * #name = timedelta, cdouble# */ static PyMethodDef @name@type_methods[] = { - /* for typing; requires python >= 3.9 */ + /* for typing */ {"__class_getitem__", (PyCFunction)numbertype_class_getitem, METH_CLASS | METH_O, NULL}, @@ -2358,7 +2388,7 @@ static PyMethodDef @name@type_methods[] = { * long, ulong, longlong, ulonglong# */ static PyMethodDef @name@type_methods[] = { - /* for typing; requires python >= 3.9 */ + /* for typing */ {"__class_getitem__", (PyCFunction)numbertype_class_getitem, METH_CLASS | 
METH_O, NULL}, @@ -3588,7 +3618,7 @@ object_arrtype_call(PyObjectScalarObject *obj, PyObject *args, PyObject *kwds) NPY_NO_EXPORT PyTypeObject PyObjectArrType_Type = { PyVarObject_HEAD_INIT(NULL, 0) - .tp_name = "numpy.object_", + .tp_name = "numpy." NPY_OBJECT_name, .tp_basicsize = sizeof(PyObjectScalarObject), .tp_dealloc = (destructor)object_arrtype_dealloc, .tp_as_sequence = &object_arrtype_as_sequence, @@ -3624,60 +3654,29 @@ gen_arrtype_subscript(PyObject *self, PyObject *key) } -#define NAME_bool "bool" -#define NAME_void "void" -#define NAME_string "bytes" -#define NAME_unicode "str" - /**begin repeat - * #name = bool, string, unicode, void# - * #NAME = Bool, String, Unicode, Void# - * #ex = _,_,_,# + * #Name = Bool, + * Byte, Short, Int, Long, LongLong, + * UByte, UShort, UInt, ULong, ULongLong, + * Half, Float, Double, LongDouble, + * CFloat, CDouble, CLongDouble, + * String, Unicode, Void, + * Datetime, Timedelta# + * #NAME = BOOL, + * BYTE, SHORT, INT, LONG, LONGLONG, + * UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * HALF, FLOAT, DOUBLE, LONGDOUBLE, + * CFLOAT, CDOUBLE, CLONGDOUBLE, + * STRING, UNICODE, VOID, + * DATETIME, TIMEDELTA# */ -NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = { - PyVarObject_HEAD_INIT(NULL, 0) - .tp_name = "numpy." NAME_@name@ "@ex@", - .tp_basicsize = sizeof(Py@NAME@ScalarObject), -}; -/**end repeat**/ - -#undef NAME_bool -#undef NAME_void -#undef NAME_string -#undef NAME_unicode -/**begin repeat - * #NAME = Byte, Short, Int, Long, LongLong, UByte, UShort, UInt, ULong, - * ULongLong, Half, Float, Double, LongDouble, Datetime, Timedelta# - * #name = int*5, uint*5, float*4, datetime, timedelta# - * #CNAME = (CHAR, SHORT, INT, LONG, LONGLONG)*2, HALF, FLOAT, DOUBLE, - * LONGDOUBLE, DATETIME, TIMEDELTA# - */ -#if NPY_BITSOF_@CNAME@ == 8 -#define _THIS_SIZE "8" -#elif NPY_BITSOF_@CNAME@ == 16 -#define _THIS_SIZE "16" -#elif NPY_BITSOF_@CNAME@ == 32 -#define _THIS_SIZE "32" -#elif NPY_BITSOF_@CNAME@ == 64 -#define _THIS_SIZE "64" -#elif NPY_BITSOF_@CNAME@ == 80 -#define _THIS_SIZE "80" -#elif NPY_BITSOF_@CNAME@ == 96 -#define _THIS_SIZE "96" -#elif NPY_BITSOF_@CNAME@ == 128 -#define _THIS_SIZE "128" -#elif NPY_BITSOF_@CNAME@ == 256 -#define _THIS_SIZE "256" -#endif -NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = { +NPY_NO_EXPORT PyTypeObject Py@Name@ArrType_Type = { PyVarObject_HEAD_INIT(NULL, 0) - .tp_name = "numpy.@name@" _THIS_SIZE, - .tp_basicsize = sizeof(Py@NAME@ScalarObject), + .tp_name = "numpy." NPY_@NAME@_name, + .tp_basicsize = sizeof(Py@Name@ScalarObject), }; - -#undef _THIS_SIZE /**end repeat**/ @@ -3686,37 +3685,6 @@ static PyMappingMethods gentype_as_mapping = { }; -/**begin repeat - * #NAME = CFloat, CDouble, CLongDouble# - * #name = complex*3# - * #CNAME = FLOAT, DOUBLE, LONGDOUBLE# - */ -#if NPY_BITSOF_@CNAME@ == 16 -#define _THIS_SIZE "32" -#elif NPY_BITSOF_@CNAME@ == 32 -#define _THIS_SIZE "64" -#elif NPY_BITSOF_@CNAME@ == 64 -#define _THIS_SIZE "128" -#elif NPY_BITSOF_@CNAME@ == 80 -#define _THIS_SIZE "160" -#elif NPY_BITSOF_@CNAME@ == 96 -#define _THIS_SIZE "192" -#elif NPY_BITSOF_@CNAME@ == 128 -#define _THIS_SIZE "256" -#elif NPY_BITSOF_@CNAME@ == 256 -#define _THIS_SIZE "512" -#endif - -NPY_NO_EXPORT PyTypeObject Py@NAME@ArrType_Type = { - PyVarObject_HEAD_INIT(0, 0) - .tp_name = "numpy.@name@" _THIS_SIZE, - .tp_basicsize = sizeof(Py@NAME@ScalarObject), - .tp_flags = Py_TPFLAGS_DEFAULT, -}; -#undef _THIS_SIZE - -/**end repeat**/ - /* * This table maps the built-in type numbers to their scalar * type numbers. 
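The consolidated repeat block relies on per-type NPY_<NAME>_name macros that already encode any size aliasing, replacing the removed #if NPY_BITSOF_* ladders. An illustrative (not authoritative) expansion on a typical LP64 platform:

/* Illustrative only -- the real macros are generated, and their values
 * depend on the platform's type sizes (the gh-9799 naming rules). */
#define NPY_LONG_name   "int64"
#define NPY_ULONG_name  "uint64"
#define NPY_OBJECT_name "object_"

/* every scalar type object can then be declared uniformly: */
/* .tp_name = "numpy." NPY_LONG_name  ->  "numpy.int64" */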
Note that signed integers are mapped to INTNEG_SCALAR, @@ -4105,36 +4073,6 @@ initialize_numeric_types(void) PyArrayIter_Type.tp_iter = PyObject_SelfIter; PyArrayMapIter_Type.tp_iter = PyObject_SelfIter; - - /* - * Give types different names when they are the same size (gh-9799). - * `np.intX` always refers to the first int of that size in the sequence - * `['LONG', 'LONGLONG', 'INT', 'SHORT', 'BYTE']`. - */ -#if (NPY_SIZEOF_BYTE == NPY_SIZEOF_SHORT) - PyByteArrType_Type.tp_name = "numpy.byte"; - PyUByteArrType_Type.tp_name = "numpy.ubyte"; -#endif -#if (NPY_SIZEOF_SHORT == NPY_SIZEOF_INT) - PyShortArrType_Type.tp_name = "numpy.short"; - PyUShortArrType_Type.tp_name = "numpy.ushort"; -#endif -#if (NPY_SIZEOF_INT == NPY_SIZEOF_LONG) - PyIntArrType_Type.tp_name = "numpy.intc"; - PyUIntArrType_Type.tp_name = "numpy.uintc"; -#endif -#if (NPY_SIZEOF_LONGLONG == NPY_SIZEOF_LONG) - PyLongLongArrType_Type.tp_name = "numpy.longlong"; - PyULongLongArrType_Type.tp_name = "numpy.ulonglong"; -#endif - - /* - Do the same for longdouble - */ -#if (NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE) - PyLongDoubleArrType_Type.tp_name = "numpy.longdouble"; - PyCLongDoubleArrType_Type.tp_name = "numpy.clongdouble"; -#endif } typedef struct { diff --git a/numpy/core/src/multiarray/shape.c b/numpy/core/src/multiarray/shape.c index 25af43bbc..15b166423 100644 --- a/numpy/core/src/multiarray/shape.c +++ b/numpy/core/src/multiarray/shape.c @@ -285,7 +285,7 @@ PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims, Py_TYPE(self), PyArray_DESCR(self), ndim, dimensions, strides, PyArray_DATA(self), flags, (PyObject *)self, (PyObject *)self, - 0, 1); + _NPY_ARRAY_ENSURE_DTYPE_IDENTITY); Py_DECREF(self); return (PyObject *)ret; } diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index 9c490b3f7..b8a6943dc 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -318,10 +318,19 @@ read_rows(stream *s, } if (!usecols && (actual_num_fields != current_num_fields)) { - PyErr_Format(PyExc_ValueError, + if (homogeneous) { + PyErr_Format(PyExc_ValueError, "the number of columns changed from %zd to %zd at row %zd; " "use `usecols` to select a subset and avoid this error", actual_num_fields, current_num_fields, row_count+1); + } + else { + PyErr_Format(PyExc_ValueError, + "the dtype passed requires %zd columns but %zd were found " + "at row %zd; " + "use `usecols` to select a subset and avoid this error", + actual_num_fields, current_num_fields, row_count+1); + } goto error; } diff --git a/numpy/core/src/multiarray/textreading/tokenize.cpp b/numpy/core/src/multiarray/textreading/tokenize.cpp index 02c64263e..e0ddc393d 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.cpp +++ b/numpy/core/src/multiarray/textreading/tokenize.cpp @@ -45,7 +45,8 @@ copy_to_field_buffer(tokenizer_state *ts, const UCS *chunk_start, const UCS *chunk_end) { npy_intp chunk_length = chunk_end - chunk_start; - npy_intp size = chunk_length + ts->field_buffer_pos + 2; + /* Space for length +1 termination, +2 additional padding for add_field */ + npy_intp size = chunk_length + ts->field_buffer_pos + 3; if (NPY_UNLIKELY(ts->field_buffer_length < size)) { npy_intp alloc_size = grow_size_and_multiply(&size, 32, sizeof(Py_UCS4)); @@ -104,6 +105,7 @@ add_field(tokenizer_state *ts) ts->num_fields += 1; /* Ensure this (currently empty) word is NUL terminated. 
*/ ts->field_buffer[ts->field_buffer_pos] = '\0'; + assert(ts->field_buffer_length > ts->field_buffer_pos); return 0; } @@ -221,7 +223,8 @@ tokenizer_core(tokenizer_state *ts, parser_config *const config) } else { /* continue parsing as if unquoted */ - ts->state = TOKENIZE_UNQUOTED; + /* Set to TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE */ + ts->state = ts->unquoted_state; } break; @@ -287,7 +290,7 @@ NPY_NO_EXPORT int npy_tokenize(stream *s, tokenizer_state *ts, parser_config *const config) { assert(ts->fields_size >= 2); - assert(ts->field_buffer_length >= 2*(ssize_t)sizeof(Py_UCS4)); + assert(ts->field_buffer_length >= 2*(Py_ssize_t)sizeof(Py_UCS4)); int finished_reading_file = 0; @@ -446,6 +449,8 @@ npy_tokenizer_init(tokenizer_state *ts, parser_config *config) ts->fields = (field_info *)PyMem_Malloc(4 * sizeof(*ts->fields)); if (ts->fields == nullptr) { + PyMem_Free(ts->field_buffer); + ts->field_buffer = nullptr; PyErr_NoMemory(); return -1; } diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h index d0ea46383..53e97760f 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.h +++ b/numpy/core/src/multiarray/textreading/tokenize.h @@ -46,8 +46,9 @@ typedef struct { char *pos; char *end; /* - * Space to copy words into. The buffer must always be at least two NUL - * entries longer (8 bytes) than the actual word (including initially). + * Space to copy words into. Due to `add_field` not growing the buffer + * but writing a \0 termination, the buffer must always be two larger + * (add_field can be called twice if a row ends in a delimiter: "123,"). * The first byte beyond the current word is always NUL'ed on write, the * second byte is there to allow easy appending of an additional empty * word at the end (this word is also NUL terminated). diff --git a/numpy/core/src/multiarray/usertypes.c b/numpy/core/src/multiarray/usertypes.c index a338d712d..5e36c914b 100644 --- a/numpy/core/src/multiarray/usertypes.c +++ b/numpy/core/src/multiarray/usertypes.c @@ -41,6 +41,7 @@ maintainer email: oliphant.travis@ieee.org #include "scalartypes.h" #include "array_method.h" #include "convert_datatype.h" +#include "dtype_traversal.h" #include "legacy_dtype_implementation.h" @@ -223,6 +224,8 @@ PyArray_RegisterDataType(PyArray_Descr *descr) PyErr_SetString(PyExc_ValueError, "missing typeobject"); return -1; } + + int use_void_clearimpl = 0; if (descr->flags & (NPY_ITEM_IS_POINTER | NPY_ITEM_REFCOUNT)) { /* * User dtype can't actually do reference counting, however, there @@ -231,6 +234,8 @@ PyArray_RegisterDataType(PyArray_Descr *descr) * so we have to support this. But such a structure must be constant * (i.e. fixed at registration time, this is the case for `xpress`). */ + use_void_clearimpl = 1; + if (descr->names == NULL || descr->fields == NULL || !PyDict_CheckExact(descr->fields)) { PyErr_Format(PyExc_ValueError, @@ -256,14 +261,53 @@ PyArray_RegisterDataType(PyArray_Descr *descr) return -1; } + /* + * Legacy user DTypes classes cannot have a name, since the user never + * defined one. So we create a name for them here. These DTypes are + * effectively static types. + * + * Note: we have no intention of freeing the memory again since this + * behaves identically to static type definition. 
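+ *
+ * For example, a scalar type whose tp_name is "mymod.rational" yields
+ * the DType name "numpy.dtype[rational]" via the snprintf below
+ * ("mymod.rational" is only an illustrative name).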
+ */
+
+    const char *scalar_name = descr->typeobj->tp_name;
+    /*
+     * We have to take only the name, and ignore the module to get
+     * a reasonable __name__, since static types are limited in this regard
+     * (this is not ideal, but not a big issue in practice).
+     * This is what Python does to print __name__ for static types.
+     */
+    const char *dot = strrchr(scalar_name, '.');
+    if (dot) {
+        scalar_name = dot + 1;
+    }
+    Py_ssize_t name_length = strlen(scalar_name) + 14;
+
+    char *name = PyMem_Malloc(name_length);
+    if (name == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    snprintf(name, name_length, "numpy.dtype[%s]", scalar_name);
+
     userdescrs[NPY_NUMUSERTYPES++] = descr;

     descr->type_num = typenum;
-    if (dtypemeta_wrap_legacy_descriptor(descr) < 0) {
+    if (dtypemeta_wrap_legacy_descriptor(descr, name, NULL) < 0) {
         descr->type_num = -1;
         NPY_NUMUSERTYPES--;
+        PyMem_Free(name);  /* free the name only on failure */
         return -1;
     }
+    if (use_void_clearimpl) {
+        /* See comment where use_void_clearimpl is set... */
+        NPY_DT_SLOTS(NPY_DTYPE(descr))->get_clear_loop = (
+                &npy_get_clear_void_and_legacy_user_dtype_loop);
+        /* Also use the void zerofill since there may be objects */
+        NPY_DT_SLOTS(NPY_DTYPE(descr))->get_fill_zero_loop = (
+                &npy_get_zerofill_void_and_legacy_user_dtype_loop);
+    }

     return typenum;
 }

@@ -597,7 +641,7 @@ PyArray_AddLegacyWrapping_CastingImpl(
     if (from == to) {
         spec.flags = NPY_METH_REQUIRES_PYAPI | NPY_METH_SUPPORTS_UNALIGNED;
         PyType_Slot slots[] = {
-            {NPY_METH_get_loop, &legacy_cast_get_strided_loop},
+            {_NPY_METH_get_loop, &legacy_cast_get_strided_loop},
             {NPY_METH_resolve_descriptors, &legacy_same_dtype_resolve_descriptors},
             {0, NULL}};
         spec.slots = slots;
@@ -606,7 +650,7 @@
     else {
         spec.flags = NPY_METH_REQUIRES_PYAPI;
         PyType_Slot slots[] = {
-            {NPY_METH_get_loop, &legacy_cast_get_strided_loop},
+            {_NPY_METH_get_loop, &legacy_cast_get_strided_loop},
             {NPY_METH_resolve_descriptors, &simple_cast_resolve_descriptors},
             {0, NULL}};
         spec.slots = slots;
diff --git a/numpy/core/src/npymath/arm64_exports.c b/numpy/core/src/npymath/arm64_exports.c
index d65bb4f6d..dfa8054d7 100644
--- a/numpy/core/src/npymath/arm64_exports.c
+++ b/numpy/core/src/npymath/arm64_exports.c
@@ -1,12 +1,14 @@
 #if defined(__arm64__) && defined(__APPLE__)
 #include <math.h>
-/*
+/*
  * Export these for scipy, since the SciPy build for macos arm64
  * downloads the macos x86_64 NumPy, and does not error when the
  * linker fails to use npymathlib.a. Importing numpy will expose
  * these external functions
  * See https://github.com/numpy/numpy/issues/22673#issuecomment-1327520055
+ *
+ * This file is actually compiled as part of the main module.
  */

 double npy_asinh(double x) {
@@ -20,4 +22,9 @@ double npy_copysign(double y, double x) {
 double npy_log1p(double x) {
     return log1p(x);
 }
+
+double npy_nextafter(double x, double y) {
+    return nextafter(x, y);
+}
+
 #endif
diff --git a/numpy/core/src/npymath/halffloat.c b/numpy/core/src/npymath/halffloat.c
deleted file mode 100644
index 51948c736..000000000
--- a/numpy/core/src/npymath/halffloat.c
+++ /dev/null
@@ -1,555 +0,0 @@
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#include "numpy/halffloat.h"
-
-/*
- * This chooses between 'ties to even' and 'ties away from zero'.
- */
-#define NPY_HALF_ROUND_TIES_TO_EVEN 1
-/*
- * If these are 1, the conversions try to trigger underflow,
- * overflow, and invalid exceptions in the FP system when needed.
- */ -#define NPY_HALF_GENERATE_OVERFLOW 1 -#define NPY_HALF_GENERATE_UNDERFLOW 1 -#define NPY_HALF_GENERATE_INVALID 1 - -/* - ******************************************************************** - * HALF-PRECISION ROUTINES * - ******************************************************************** - */ - -float npy_half_to_float(npy_half h) -{ - union { float ret; npy_uint32 retbits; } conv; - conv.retbits = npy_halfbits_to_floatbits(h); - return conv.ret; -} - -double npy_half_to_double(npy_half h) -{ - union { double ret; npy_uint64 retbits; } conv; - conv.retbits = npy_halfbits_to_doublebits(h); - return conv.ret; -} - -npy_half npy_float_to_half(float f) -{ - union { float f; npy_uint32 fbits; } conv; - conv.f = f; - return npy_floatbits_to_halfbits(conv.fbits); -} - -npy_half npy_double_to_half(double d) -{ - union { double d; npy_uint64 dbits; } conv; - conv.d = d; - return npy_doublebits_to_halfbits(conv.dbits); -} - -int npy_half_iszero(npy_half h) -{ - return (h&0x7fff) == 0; -} - -int npy_half_isnan(npy_half h) -{ - return ((h&0x7c00u) == 0x7c00u) && ((h&0x03ffu) != 0x0000u); -} - -int npy_half_isinf(npy_half h) -{ - return ((h&0x7fffu) == 0x7c00u); -} - -int npy_half_isfinite(npy_half h) -{ - return ((h&0x7c00u) != 0x7c00u); -} - -int npy_half_signbit(npy_half h) -{ - return (h&0x8000u) != 0; -} - -npy_half npy_half_spacing(npy_half h) -{ - npy_half ret; - npy_uint16 h_exp = h&0x7c00u; - npy_uint16 h_sig = h&0x03ffu; - if (h_exp == 0x7c00u) { -#if NPY_HALF_GENERATE_INVALID - npy_set_floatstatus_invalid(); -#endif - ret = NPY_HALF_NAN; - } else if (h == 0x7bffu) { -#if NPY_HALF_GENERATE_OVERFLOW - npy_set_floatstatus_overflow(); -#endif - ret = NPY_HALF_PINF; - } else if ((h&0x8000u) && h_sig == 0) { /* Negative boundary case */ - if (h_exp > 0x2c00u) { /* If result is normalized */ - ret = h_exp - 0x2c00u; - } else if(h_exp > 0x0400u) { /* The result is a subnormal, but not the smallest */ - ret = 1 << ((h_exp >> 10) - 2); - } else { - ret = 0x0001u; /* Smallest subnormal half */ - } - } else if (h_exp > 0x2800u) { /* If result is still normalized */ - ret = h_exp - 0x2800u; - } else if (h_exp > 0x0400u) { /* The result is a subnormal, but not the smallest */ - ret = 1 << ((h_exp >> 10) - 1); - } else { - ret = 0x0001u; - } - - return ret; -} - -npy_half npy_half_copysign(npy_half x, npy_half y) -{ - return (x&0x7fffu) | (y&0x8000u); -} - -npy_half npy_half_nextafter(npy_half x, npy_half y) -{ - npy_half ret; - - if (npy_half_isnan(x) || npy_half_isnan(y)) { - ret = NPY_HALF_NAN; - } else if (npy_half_eq_nonan(x, y)) { - ret = x; - } else if (npy_half_iszero(x)) { - ret = (y&0x8000u) + 1; /* Smallest subnormal half */ - } else if (!(x&0x8000u)) { /* x > 0 */ - if ((npy_int16)x > (npy_int16)y) { /* x > y */ - ret = x-1; - } else { - ret = x+1; - } - } else { - if (!(y&0x8000u) || (x&0x7fffu) > (y&0x7fffu)) { /* x < y */ - ret = x-1; - } else { - ret = x+1; - } - } -#if NPY_HALF_GENERATE_OVERFLOW - if (npy_half_isinf(ret) && npy_half_isfinite(x)) { - npy_set_floatstatus_overflow(); - } -#endif - - return ret; -} - -int npy_half_eq_nonan(npy_half h1, npy_half h2) -{ - return (h1 == h2 || ((h1 | h2) & 0x7fff) == 0); -} - -int npy_half_eq(npy_half h1, npy_half h2) -{ - /* - * The equality cases are as follows: - * - If either value is NaN, never equal. - * - If the values are equal, equal. - * - If the values are both signed zeros, equal. 
- */ - return (!npy_half_isnan(h1) && !npy_half_isnan(h2)) && - (h1 == h2 || ((h1 | h2) & 0x7fff) == 0); -} - -int npy_half_ne(npy_half h1, npy_half h2) -{ - return !npy_half_eq(h1, h2); -} - -int npy_half_lt_nonan(npy_half h1, npy_half h2) -{ - if (h1&0x8000u) { - if (h2&0x8000u) { - return (h1&0x7fffu) > (h2&0x7fffu); - } else { - /* Signed zeros are equal, have to check for it */ - return (h1 != 0x8000u) || (h2 != 0x0000u); - } - } else { - if (h2&0x8000u) { - return 0; - } else { - return (h1&0x7fffu) < (h2&0x7fffu); - } - } -} - -int npy_half_lt(npy_half h1, npy_half h2) -{ - return (!npy_half_isnan(h1) && !npy_half_isnan(h2)) && npy_half_lt_nonan(h1, h2); -} - -int npy_half_gt(npy_half h1, npy_half h2) -{ - return npy_half_lt(h2, h1); -} - -int npy_half_le_nonan(npy_half h1, npy_half h2) -{ - if (h1&0x8000u) { - if (h2&0x8000u) { - return (h1&0x7fffu) >= (h2&0x7fffu); - } else { - return 1; - } - } else { - if (h2&0x8000u) { - /* Signed zeros are equal, have to check for it */ - return (h1 == 0x0000u) && (h2 == 0x8000u); - } else { - return (h1&0x7fffu) <= (h2&0x7fffu); - } - } -} - -int npy_half_le(npy_half h1, npy_half h2) -{ - return (!npy_half_isnan(h1) && !npy_half_isnan(h2)) && npy_half_le_nonan(h1, h2); -} - -int npy_half_ge(npy_half h1, npy_half h2) -{ - return npy_half_le(h2, h1); -} - -npy_half npy_half_divmod(npy_half h1, npy_half h2, npy_half *modulus) -{ - float fh1 = npy_half_to_float(h1); - float fh2 = npy_half_to_float(h2); - float div, mod; - - div = npy_divmodf(fh1, fh2, &mod); - *modulus = npy_float_to_half(mod); - return npy_float_to_half(div); -} - - - -/* - ******************************************************************** - * BIT-LEVEL CONVERSIONS * - ******************************************************************** - */ - -npy_uint16 npy_floatbits_to_halfbits(npy_uint32 f) -{ - npy_uint32 f_exp, f_sig; - npy_uint16 h_sgn, h_exp, h_sig; - - h_sgn = (npy_uint16) ((f&0x80000000u) >> 16); - f_exp = (f&0x7f800000u); - - /* Exponent overflow/NaN converts to signed inf/NaN */ - if (f_exp >= 0x47800000u) { - if (f_exp == 0x7f800000u) { - /* Inf or NaN */ - f_sig = (f&0x007fffffu); - if (f_sig != 0) { - /* NaN - propagate the flag in the significand... */ - npy_uint16 ret = (npy_uint16) (0x7c00u + (f_sig >> 13)); - /* ...but make sure it stays a NaN */ - if (ret == 0x7c00u) { - ret++; - } - return h_sgn + ret; - } else { - /* signed inf */ - return (npy_uint16) (h_sgn + 0x7c00u); - } - } else { - /* overflow to signed inf */ -#if NPY_HALF_GENERATE_OVERFLOW - npy_set_floatstatus_overflow(); -#endif - return (npy_uint16) (h_sgn + 0x7c00u); - } - } - - /* Exponent underflow converts to a subnormal half or signed zero */ - if (f_exp <= 0x38000000u) { - /* - * Signed zeros, subnormal floats, and floats with small - * exponents all convert to signed zero half-floats. - */ - if (f_exp < 0x33000000u) { -#if NPY_HALF_GENERATE_UNDERFLOW - /* If f != 0, it underflowed to 0 */ - if ((f&0x7fffffff) != 0) { - npy_set_floatstatus_underflow(); - } -#endif - return h_sgn; - } - /* Make the subnormal significand */ - f_exp >>= 23; - f_sig = (0x00800000u + (f&0x007fffffu)); -#if NPY_HALF_GENERATE_UNDERFLOW - /* If it's not exactly represented, it underflowed */ - if ((f_sig&(((npy_uint32)1 << (126 - f_exp)) - 1)) != 0) { - npy_set_floatstatus_underflow(); - } -#endif - /* - * Usually the significand is shifted by 13. For subnormals an - * additional shift needs to occur. 
This shift is one for the largest - * exponent giving a subnormal `f_exp = 0x38000000 >> 23 = 112`, which - * offsets the new first bit. At most the shift can be 1+10 bits. - */ - f_sig >>= (113 - f_exp); - /* Handle rounding by adding 1 to the bit beyond half precision */ -#if NPY_HALF_ROUND_TIES_TO_EVEN - /* - * If the last bit in the half significand is 0 (already even), and - * the remaining bit pattern is 1000...0, then we do not add one - * to the bit after the half significand. However, the (113 - f_exp) - * shift can lose up to 11 bits, so the || checks them in the original. - * In all other cases, we can just add one. - */ - if (((f_sig&0x00003fffu) != 0x00001000u) || (f&0x000007ffu)) { - f_sig += 0x00001000u; - } -#else - f_sig += 0x00001000u; -#endif - h_sig = (npy_uint16) (f_sig >> 13); - /* - * If the rounding causes a bit to spill into h_exp, it will - * increment h_exp from zero to one and h_sig will be zero. - * This is the correct result. - */ - return (npy_uint16) (h_sgn + h_sig); - } - - /* Regular case with no overflow or underflow */ - h_exp = (npy_uint16) ((f_exp - 0x38000000u) >> 13); - /* Handle rounding by adding 1 to the bit beyond half precision */ - f_sig = (f&0x007fffffu); -#if NPY_HALF_ROUND_TIES_TO_EVEN - /* - * If the last bit in the half significand is 0 (already even), and - * the remaining bit pattern is 1000...0, then we do not add one - * to the bit after the half significand. In all other cases, we do. - */ - if ((f_sig&0x00003fffu) != 0x00001000u) { - f_sig += 0x00001000u; - } -#else - f_sig += 0x00001000u; -#endif - h_sig = (npy_uint16) (f_sig >> 13); - /* - * If the rounding causes a bit to spill into h_exp, it will - * increment h_exp by one and h_sig will be zero. This is the - * correct result. h_exp may increment to 15, at greatest, in - * which case the result overflows to a signed inf. - */ -#if NPY_HALF_GENERATE_OVERFLOW - h_sig += h_exp; - if (h_sig == 0x7c00u) { - npy_set_floatstatus_overflow(); - } - return h_sgn + h_sig; -#else - return h_sgn + h_exp + h_sig; -#endif -} - -npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d) -{ - npy_uint64 d_exp, d_sig; - npy_uint16 h_sgn, h_exp, h_sig; - - h_sgn = (d&0x8000000000000000ULL) >> 48; - d_exp = (d&0x7ff0000000000000ULL); - - /* Exponent overflow/NaN converts to signed inf/NaN */ - if (d_exp >= 0x40f0000000000000ULL) { - if (d_exp == 0x7ff0000000000000ULL) { - /* Inf or NaN */ - d_sig = (d&0x000fffffffffffffULL); - if (d_sig != 0) { - /* NaN - propagate the flag in the significand... */ - npy_uint16 ret = (npy_uint16) (0x7c00u + (d_sig >> 42)); - /* ...but make sure it stays a NaN */ - if (ret == 0x7c00u) { - ret++; - } - return h_sgn + ret; - } else { - /* signed inf */ - return h_sgn + 0x7c00u; - } - } else { - /* overflow to signed inf */ -#if NPY_HALF_GENERATE_OVERFLOW - npy_set_floatstatus_overflow(); -#endif - return h_sgn + 0x7c00u; - } - } - - /* Exponent underflow converts to subnormal half or signed zero */ - if (d_exp <= 0x3f00000000000000ULL) { - /* - * Signed zeros, subnormal floats, and floats with small - * exponents all convert to signed zero half-floats. 
- */ - if (d_exp < 0x3e60000000000000ULL) { -#if NPY_HALF_GENERATE_UNDERFLOW - /* If d != 0, it underflowed to 0 */ - if ((d&0x7fffffffffffffffULL) != 0) { - npy_set_floatstatus_underflow(); - } -#endif - return h_sgn; - } - /* Make the subnormal significand */ - d_exp >>= 52; - d_sig = (0x0010000000000000ULL + (d&0x000fffffffffffffULL)); -#if NPY_HALF_GENERATE_UNDERFLOW - /* If it's not exactly represented, it underflowed */ - if ((d_sig&(((npy_uint64)1 << (1051 - d_exp)) - 1)) != 0) { - npy_set_floatstatus_underflow(); - } -#endif - /* - * Unlike floats, doubles have enough room to shift left to align - * the subnormal significand leading to no loss of the last bits. - * The smallest possible exponent giving a subnormal is: - * `d_exp = 0x3e60000000000000 >> 52 = 998`. All larger subnormals are - * shifted with respect to it. This adds a shift of 10+1 bits the final - * right shift when comparing it to the one in the normal branch. - */ - assert(d_exp - 998 >= 0); - d_sig <<= (d_exp - 998); - /* Handle rounding by adding 1 to the bit beyond half precision */ -#if NPY_HALF_ROUND_TIES_TO_EVEN - /* - * If the last bit in the half significand is 0 (already even), and - * the remaining bit pattern is 1000...0, then we do not add one - * to the bit after the half significand. In all other cases, we do. - */ - if ((d_sig&0x003fffffffffffffULL) != 0x0010000000000000ULL) { - d_sig += 0x0010000000000000ULL; - } -#else - d_sig += 0x0010000000000000ULL; -#endif - h_sig = (npy_uint16) (d_sig >> 53); - /* - * If the rounding causes a bit to spill into h_exp, it will - * increment h_exp from zero to one and h_sig will be zero. - * This is the correct result. - */ - return h_sgn + h_sig; - } - - /* Regular case with no overflow or underflow */ - h_exp = (npy_uint16) ((d_exp - 0x3f00000000000000ULL) >> 42); - /* Handle rounding by adding 1 to the bit beyond half precision */ - d_sig = (d&0x000fffffffffffffULL); -#if NPY_HALF_ROUND_TIES_TO_EVEN - /* - * If the last bit in the half significand is 0 (already even), and - * the remaining bit pattern is 1000...0, then we do not add one - * to the bit after the half significand. In all other cases, we do. - */ - if ((d_sig&0x000007ffffffffffULL) != 0x0000020000000000ULL) { - d_sig += 0x0000020000000000ULL; - } -#else - d_sig += 0x0000020000000000ULL; -#endif - h_sig = (npy_uint16) (d_sig >> 42); - - /* - * If the rounding causes a bit to spill into h_exp, it will - * increment h_exp by one and h_sig will be zero. This is the - * correct result. h_exp may increment to 15, at greatest, in - * which case the result overflows to a signed inf. 
- */ -#if NPY_HALF_GENERATE_OVERFLOW - h_sig += h_exp; - if (h_sig == 0x7c00u) { - npy_set_floatstatus_overflow(); - } - return h_sgn + h_sig; -#else - return h_sgn + h_exp + h_sig; -#endif -} - -npy_uint32 npy_halfbits_to_floatbits(npy_uint16 h) -{ - npy_uint16 h_exp, h_sig; - npy_uint32 f_sgn, f_exp, f_sig; - - h_exp = (h&0x7c00u); - f_sgn = ((npy_uint32)h&0x8000u) << 16; - switch (h_exp) { - case 0x0000u: /* 0 or subnormal */ - h_sig = (h&0x03ffu); - /* Signed zero */ - if (h_sig == 0) { - return f_sgn; - } - /* Subnormal */ - h_sig <<= 1; - while ((h_sig&0x0400u) == 0) { - h_sig <<= 1; - h_exp++; - } - f_exp = ((npy_uint32)(127 - 15 - h_exp)) << 23; - f_sig = ((npy_uint32)(h_sig&0x03ffu)) << 13; - return f_sgn + f_exp + f_sig; - case 0x7c00u: /* inf or NaN */ - /* All-ones exponent and a copy of the significand */ - return f_sgn + 0x7f800000u + (((npy_uint32)(h&0x03ffu)) << 13); - default: /* normalized */ - /* Just need to adjust the exponent and shift */ - return f_sgn + (((npy_uint32)(h&0x7fffu) + 0x1c000u) << 13); - } -} - -npy_uint64 npy_halfbits_to_doublebits(npy_uint16 h) -{ - npy_uint16 h_exp, h_sig; - npy_uint64 d_sgn, d_exp, d_sig; - - h_exp = (h&0x7c00u); - d_sgn = ((npy_uint64)h&0x8000u) << 48; - switch (h_exp) { - case 0x0000u: /* 0 or subnormal */ - h_sig = (h&0x03ffu); - /* Signed zero */ - if (h_sig == 0) { - return d_sgn; - } - /* Subnormal */ - h_sig <<= 1; - while ((h_sig&0x0400u) == 0) { - h_sig <<= 1; - h_exp++; - } - d_exp = ((npy_uint64)(1023 - 15 - h_exp)) << 52; - d_sig = ((npy_uint64)(h_sig&0x03ffu)) << 42; - return d_sgn + d_exp + d_sig; - case 0x7c00u: /* inf or NaN */ - /* All-ones exponent and a copy of the significand */ - return d_sgn + 0x7ff0000000000000ULL + - (((npy_uint64)(h&0x03ffu)) << 42); - default: /* normalized */ - /* Just need to adjust the exponent and shift */ - return d_sgn + (((npy_uint64)(h&0x7fffu) + 0xfc000u) << 42); - } -} diff --git a/numpy/core/src/npymath/halffloat.cpp b/numpy/core/src/npymath/halffloat.cpp new file mode 100644 index 000000000..aa582c1b9 --- /dev/null +++ b/numpy/core/src/npymath/halffloat.cpp @@ -0,0 +1,238 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +/* + * If these are 1, the conversions try to trigger underflow, + * overflow, and invalid exceptions in the FP system when needed. 
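+ * (Only overflow and invalid are enabled here; underflow generation is
+ * expected to happen inside the half_private conversion helpers that the
+ * bit-level functions below delegate to.)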
+ */ +#define NPY_HALF_GENERATE_OVERFLOW 1 +#define NPY_HALF_GENERATE_INVALID 1 + +#include "numpy/halffloat.h" + +#include "common.hpp" +/* + ******************************************************************** + * HALF-PRECISION ROUTINES * + ******************************************************************** + */ +using namespace np; + +float npy_half_to_float(npy_half h) +{ + return static_cast<float>(Half::FromBits(h)); +} + +double npy_half_to_double(npy_half h) +{ + return static_cast<double>(Half::FromBits(h)); +} + +npy_half npy_float_to_half(float f) +{ + return Half(f).Bits(); +} + +npy_half npy_double_to_half(double d) +{ + return Half(d).Bits(); +} + +int npy_half_iszero(npy_half h) +{ + return (h&0x7fff) == 0; +} + +int npy_half_isnan(npy_half h) +{ + return Half::FromBits(h).IsNaN(); +} + +int npy_half_isinf(npy_half h) +{ + return ((h&0x7fffu) == 0x7c00u); +} + +int npy_half_isfinite(npy_half h) +{ + return ((h&0x7c00u) != 0x7c00u); +} + +int npy_half_signbit(npy_half h) +{ + return (h&0x8000u) != 0; +} + +npy_half npy_half_spacing(npy_half h) +{ + npy_half ret; + npy_uint16 h_exp = h&0x7c00u; + npy_uint16 h_sig = h&0x03ffu; + if (h_exp == 0x7c00u) { +#if NPY_HALF_GENERATE_INVALID + npy_set_floatstatus_invalid(); +#endif + ret = NPY_HALF_NAN; + } else if (h == 0x7bffu) { +#if NPY_HALF_GENERATE_OVERFLOW + npy_set_floatstatus_overflow(); +#endif + ret = NPY_HALF_PINF; + } else if ((h&0x8000u) && h_sig == 0) { /* Negative boundary case */ + if (h_exp > 0x2c00u) { /* If result is normalized */ + ret = h_exp - 0x2c00u; + } else if(h_exp > 0x0400u) { /* The result is a subnormal, but not the smallest */ + ret = 1 << ((h_exp >> 10) - 2); + } else { + ret = 0x0001u; /* Smallest subnormal half */ + } + } else if (h_exp > 0x2800u) { /* If result is still normalized */ + ret = h_exp - 0x2800u; + } else if (h_exp > 0x0400u) { /* The result is a subnormal, but not the smallest */ + ret = 1 << ((h_exp >> 10) - 1); + } else { + ret = 0x0001u; + } + + return ret; +} + +npy_half npy_half_copysign(npy_half x, npy_half y) +{ + return (x&0x7fffu) | (y&0x8000u); +} + +npy_half npy_half_nextafter(npy_half x, npy_half y) +{ + npy_half ret; + + if (npy_half_isnan(x) || npy_half_isnan(y)) { + ret = NPY_HALF_NAN; + } else if (npy_half_eq_nonan(x, y)) { + ret = x; + } else if (npy_half_iszero(x)) { + ret = (y&0x8000u) + 1; /* Smallest subnormal half */ + } else if (!(x&0x8000u)) { /* x > 0 */ + if ((npy_int16)x > (npy_int16)y) { /* x > y */ + ret = x-1; + } else { + ret = x+1; + } + } else { + if (!(y&0x8000u) || (x&0x7fffu) > (y&0x7fffu)) { /* x < y */ + ret = x-1; + } else { + ret = x+1; + } + } +#if NPY_HALF_GENERATE_OVERFLOW + if (npy_half_isinf(ret) && npy_half_isfinite(x)) { + npy_set_floatstatus_overflow(); + } +#endif + + return ret; +} + +int npy_half_eq_nonan(npy_half h1, npy_half h2) +{ + return Half::FromBits(h1).Equal(Half::FromBits(h2)); +} + +int npy_half_eq(npy_half h1, npy_half h2) +{ + return Half::FromBits(h1) == Half::FromBits(h2); +} + +int npy_half_ne(npy_half h1, npy_half h2) +{ + return Half::FromBits(h1) != Half::FromBits(h2); +} + +int npy_half_lt_nonan(npy_half h1, npy_half h2) +{ + return Half::FromBits(h1).Less(Half::FromBits(h2)); +} + +int npy_half_lt(npy_half h1, npy_half h2) +{ + return Half::FromBits(h1) < Half::FromBits(h2); +} + +int npy_half_gt(npy_half h1, npy_half h2) +{ + return npy_half_lt(h2, h1); +} + +int npy_half_le_nonan(npy_half h1, npy_half h2) +{ + return Half::FromBits(h1).LessEqual(Half::FromBits(h2)); +} + +int npy_half_le(npy_half h1, npy_half h2) 
+{ + return Half::FromBits(h1) <= Half::FromBits(h2); +} + +int npy_half_ge(npy_half h1, npy_half h2) +{ + return npy_half_le(h2, h1); +} + +npy_half npy_half_divmod(npy_half h1, npy_half h2, npy_half *modulus) +{ + float fh1 = npy_half_to_float(h1); + float fh2 = npy_half_to_float(h2); + float div, mod; + + div = npy_divmodf(fh1, fh2, &mod); + *modulus = npy_float_to_half(mod); + return npy_float_to_half(div); +} + + +/* + ******************************************************************** + * BIT-LEVEL CONVERSIONS * + ******************************************************************** + */ + +npy_uint16 npy_floatbits_to_halfbits(npy_uint32 f) +{ + if constexpr (Half::kNativeConversion<float>) { + return BitCast<uint16_t>(Half(BitCast<float>(f))); + } + else { + return half_private::FromFloatBits(f); + } +} + +npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d) +{ + if constexpr (Half::kNativeConversion<double>) { + return BitCast<uint16_t>(Half(BitCast<double>(d))); + } + else { + return half_private::FromDoubleBits(d); + } +} + +npy_uint32 npy_halfbits_to_floatbits(npy_uint16 h) +{ + if constexpr (Half::kNativeConversion<float>) { + return BitCast<uint32_t>(static_cast<float>(Half::FromBits(h))); + } + else { + return half_private::ToFloatBits(h); + } +} + +npy_uint64 npy_halfbits_to_doublebits(npy_uint16 h) +{ + if constexpr (Half::kNativeConversion<double>) { + return BitCast<uint64_t>(static_cast<double>(Half::FromBits(h))); + } + else { + return half_private::ToDoubleBits(h); + } +} + diff --git a/numpy/core/src/npymath/npy_math_complex.c.src b/numpy/core/src/npymath/npy_math_complex.c.src index eff7f8e13..eb9d810b7 100644 --- a/numpy/core/src/npymath/npy_math_complex.c.src +++ b/numpy/core/src/npymath/npy_math_complex.c.src @@ -535,6 +535,443 @@ npy_cpow@c@ (@ctype@ a, @ctype@ b) #endif } + +#ifndef HAVE_CCOS@C@ +@ctype@ +npy_ccos@c@(@ctype@ z) +{ + /* ccos(z) = ccosh(I * z) */ + return npy_ccosh@c@(npy_cpack@c@(-npy_cimag@c@(z), npy_creal@c@(z))); +} +#endif + +#ifndef HAVE_CSIN@C@ +@ctype@ +npy_csin@c@(@ctype@ z) +{ + /* csin(z) = -I * csinh(I * z) */ + z = npy_csinh@c@(npy_cpack@c@(-npy_cimag@c@(z), npy_creal@c@(z))); + return npy_cpack@c@(npy_cimag@c@(z), -npy_creal@c@(z)); +} +#endif + +#ifndef HAVE_CTAN@C@ +@ctype@ +npy_ctan@c@(@ctype@ z) +{ + /* ctan(z) = -I * ctanh(I * z) */ + z = npy_ctanh@c@(npy_cpack@c@(-npy_cimag@c@(z), npy_creal@c@(z))); + return (npy_cpack@c@(npy_cimag@c@(z), -npy_creal@c@(z))); +} +#endif + +#ifndef HAVE_CCOSH@C@ +/* + * Taken from the msun library in FreeBSD, rev 226599. + * + * Hyperbolic cosine of a complex argument z = x + i y. + * + * cosh(z) = cosh(x+iy) + * = cosh(x) cos(y) + i sinh(x) sin(y). + * + * Exceptional values are noted in the comments within the source code. + * These values and the return value were taken from n1124.pdf. 
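+ * (n1124.pdf refers to WG14 N1124, a draft of the C99 standard; its
+ * Annex G specifies these special values for complex arithmetic.)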
+ * + * CCOSH_BIG is chosen such that + * spacing(0.5 * exp(CCOSH_BIG)) > 0.5*exp(-CCOSH_BIG) + * although the exact value assigned to CCOSH_BIG is not so important + */ +@ctype@ +npy_ccosh@c@(@ctype@ z) +{ +#if @precision@ == 1 + const npy_float CCOSH_BIG = 9.0f; + const npy_float CCOSH_HUGE = 1.70141183e+38f; +#endif +#if @precision@ == 2 + const npy_double CCOSH_BIG = 22.0; + const npy_double CCOSH_HUGE = 8.9884656743115795e+307; +#endif +#if @precision@ >= 3 +#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE + const npy_longdouble CCOSH_BIG = 22.0L; + const npy_longdouble CCOSH_HUGE = 8.9884656743115795e+307L; +#else + const npy_longdouble CCOSH_BIG = 24.0L; + const npy_longdouble CCOSH_HUGE = 5.94865747678615882543e+4931L; +#endif +#endif + + @type@ x, y, h, absx; + npy_int xfinite, yfinite; + + x = npy_creal@c@(z); + y = npy_cimag@c@(z); + + xfinite = npy_isfinite(x); + yfinite = npy_isfinite(y); + + /* Handle the nearly-non-exceptional cases where x and y are finite. */ + if (xfinite && yfinite) { + if (y == 0) { + return npy_cpack@c@(npy_cosh@c@(x), x * y); + } + absx = npy_fabs@c@(x); + if (absx < CCOSH_BIG) { + /* small x: normal case */ + return npy_cpack@c@(npy_cosh@c@(x) * npy_cos@c@(y), + npy_sinh@c@(x) * npy_sin@c@(y)); + } + + /* |x| >= 22, so cosh(x) ~= exp(|x|) */ + if (absx < SCALED_CEXP_LOWER@C@) { + /* x < 710: exp(|x|) won't overflow */ + h = npy_exp@c@(absx) * 0.5@c@; + return npy_cpack@c@(h * npy_cos@c@(y), + npy_copysign@c@(h, x) * npy_sin@c@(y)); + } + else if (absx < SCALED_CEXP_UPPER@C@) { + /* x < 1455: scale to avoid overflow */ + z = _npy_scaled_cexp@c@(absx, y, -1); + return npy_cpack@c@(npy_creal@c@(z), + npy_cimag@c@(z) * npy_copysign@c@(1, x)); + } + else { + /* x >= 1455: the result always overflows */ + h = CCOSH_HUGE * x; + return npy_cpack@c@(h * h * npy_cos@c@(y), h * npy_sin@c@(y)); + } + } + + /* + * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as dNaN. Raise the invalid floating-point exception. + * + * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0. + * The sign of 0 in the result is unspecified. Choice = normally + * the same as d(NaN). + */ + if (x == 0 && !yfinite) { + return npy_cpack@c@(y - y, npy_copysign@c@(0, x * (y - y))); + } + + /* + * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0. + * + * cosh(NaN +- I 0) = d(NaN) + I sign(d(NaN, +-0))0. + * The sign of 0 in the result is unspecified. + */ + if (y == 0 && !xfinite) { + return npy_cpack@c@(x * x, npy_copysign@c@(0, x) * y); + } + + /* + * cosh(x +- I Inf) = dNaN + I dNaN. + * Raise the invalid floating-point exception for finite nonzero x. + * + * cosh(x + I NaN) = d(NaN) + I d(NaN). + * Optionally raises the invalid floating-point exception for finite + * nonzero x. Choice = don't raise (except for signaling NaNs). + */ + if (xfinite && !yfinite) { + return npy_cpack@c@(y - y, x * (y - y)); + } + + /* + * cosh(+-Inf + I NaN) = +Inf + I d(NaN). + * + * cosh(+-Inf +- I Inf) = +Inf + I dNaN. + * The sign of Inf in the result is unspecified. Choice = always +. + * Raise the invalid floating-point exception. + * + * cosh(+-Inf + I y) = +Inf cos(y) +- I Inf sin(y) + */ + if (npy_isinf(x)) { + if (!yfinite) { + return npy_cpack@c@(x * x, x * (y - y)); + } + return npy_cpack@c@((x * x) * npy_cos@c@(y), x * npy_sin@c@(y)); + } + + /* + * cosh(NaN + I NaN) = d(NaN) + I d(NaN). + * + * cosh(NaN +- I Inf) = d(NaN) + I d(NaN). + * Optionally raises the invalid floating-point exception. + * Choice = raise. 
+ *
+ * cosh(NaN + I y) = d(NaN) + I d(NaN).
+ * Optionally raises the invalid floating-point exception for finite
+ * nonzero y. Choice = don't raise (except for signaling NaNs).
+ */
+    return npy_cpack@c@((x * x) * (y - y), (x + x) * (y - y));
+}
+#undef CCOSH_BIG
+#undef CCOSH_HUGE
+#endif
+
+#ifndef HAVE_CSINH@C@
+/*
+ * Taken from the msun library in FreeBSD, rev 226599.
+ *
+ * Hyperbolic sine of a complex argument z = x + i y.
+ *
+ * sinh(z) = sinh(x+iy)
+ *         = sinh(x) cos(y) + i cosh(x) sin(y).
+ *
+ * Exceptional values are noted in the comments within the source code.
+ * These values and the return value were taken from n1124.pdf.
+ */
+@ctype@
+npy_csinh@c@(@ctype@ z)
+{
+#if @precision@ == 1
+    const npy_float CSINH_BIG = 9.0f;
+    const npy_float CSINH_HUGE = 1.70141183e+38f;
+#endif
+#if @precision@ == 2
+    const npy_double CSINH_BIG = 22.0;
+    const npy_double CSINH_HUGE = 8.9884656743115795e+307;
+#endif
+#if @precision@ >= 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble CSINH_BIG = 22.0L;
+    const npy_longdouble CSINH_HUGE = 8.9884656743115795e+307L;
+#else
+    const npy_longdouble CSINH_BIG = 24.0L;
+    const npy_longdouble CSINH_HUGE = 5.94865747678615882543e+4931L;
+#endif
+#endif
+
+    @type@ x, y, h, absx;
+    npy_int xfinite, yfinite;
+
+    x = npy_creal@c@(z);
+    y = npy_cimag@c@(z);
+
+    xfinite = npy_isfinite(x);
+    yfinite = npy_isfinite(y);
+
+    /* Handle the nearly-non-exceptional cases where x and y are finite. */
+    if (xfinite && yfinite) {
+        if (y == 0) {
+            return npy_cpack@c@(npy_sinh@c@(x), y);
+        }
+        absx = npy_fabs@c@(x);
+        if (absx < CSINH_BIG) {
+            /* small x: normal case */
+            return npy_cpack@c@(npy_sinh@c@(x) * npy_cos@c@(y),
+                                npy_cosh@c@(x) * npy_sin@c@(y));
+        }
+
+        /* |x| >= 22, so cosh(x) ~= exp(|x|) */
+        if (absx < SCALED_CEXP_LOWER@C@) {
+            /* x < 710: exp(|x|) won't overflow */
+            h = npy_exp@c@(npy_fabs@c@(x)) * 0.5@c@;
+            return npy_cpack@c@(npy_copysign@c@(h, x) * npy_cos@c@(y),
+                                h * npy_sin@c@(y));
+        }
+        else if (absx < SCALED_CEXP_UPPER@C@) {
+            /* x < 1455: scale to avoid overflow */
+            z = _npy_scaled_cexp@c@(absx, y, -1);
+            return npy_cpack@c@(npy_creal@c@(z) * npy_copysign@c@(1, x),
+                                npy_cimag@c@(z));
+        }
+        else {
+            /* x >= 1455: the result always overflows */
+            h = CSINH_HUGE * x;
+            return npy_cpack@c@(h * npy_cos@c@(y), h * h * npy_sin@c@(y));
+        }
+    }
+
+    /*
+     * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
+     * The sign of 0 in the result is unspecified. Choice = normally
+     * the same as dNaN. Raise the invalid floating-point exception.
+     *
+     * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN).
+     * The sign of 0 in the result is unspecified. Choice = normally
+     * the same as d(NaN).
+     */
+    if (x == 0 && !yfinite) {
+        return npy_cpack@c@(npy_copysign@c@(0, x * (y - y)), y - y);
+    }
+
+    /*
+     * sinh(+-Inf +- I 0) = +-Inf + I +-0.
+     *
+     * sinh(NaN +- I 0) = d(NaN) + I +-0.
+     */
+    if (y == 0 && !xfinite) {
+        if (npy_isnan(x)) {
+            return z;
+        }
+        return npy_cpack@c@(x, npy_copysign@c@(0, y));
+    }
+
+    /*
+     * sinh(x +- I Inf) = dNaN + I dNaN.
+     * Raise the invalid floating-point exception for finite nonzero x.
+     *
+     * sinh(x + I NaN) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero x. Choice = don't raise (except for signaling NaNs).
+     */
+    if (xfinite && !yfinite) {
+        return npy_cpack@c@(y - y, x * (y - y));
+    }
+
+    /*
+     * sinh(+-Inf + I NaN) = +-Inf + I d(NaN).
+     * The sign of Inf in the result is unspecified. Choice = normally
+     * the same as d(NaN).
+ * + * sinh(+-Inf +- I Inf) = +Inf + I dNaN. + * The sign of Inf in the result is unspecified. Choice = always +. + * Raise the invalid floating-point exception. + * + * sinh(+-Inf + I y) = +-Inf cos(y) + I Inf sin(y) + */ + if (!xfinite && !npy_isnan(x)) { + if (!yfinite) { + return npy_cpack@c@(x * x, x * (y - y)); + } + return npy_cpack@c@(x * npy_cos@c@(y), + NPY_INFINITY@C@ * npy_sin@c@(y)); + } + + /* + * sinh(NaN + I NaN) = d(NaN) + I d(NaN). + * + * sinh(NaN +- I Inf) = d(NaN) + I d(NaN). + * Optionally raises the invalid floating-point exception. + * Choice = raise. + * + * sinh(NaN + I y) = d(NaN) + I d(NaN). + * Optionally raises the invalid floating-point exception for finite + * nonzero y. Choice = don't raise (except for signaling NaNs). + */ + return npy_cpack@c@((x * x) * (y - y), (x + x) * (y - y)); +} +#undef CSINH_BIG +#undef CSINH_HUGE +#endif + +#ifndef HAVE_CTANH@C@ +/* + * Taken from the msun library in FreeBSD, rev 226600. + * + * Hyperbolic tangent of a complex argument z = x + i y. + * + * The algorithm is from: + * + * W. Kahan. Branch Cuts for Complex Elementary Functions or Much + * Ado About Nothing's Sign Bit. In The State of the Art in + * Numerical Analysis, pp. 165 ff. Iserles and Powell, eds., 1987. + * + * Method: + * + * Let t = tan(x) + * beta = 1/cos^2(y) + * s = sinh(x) + * rho = cosh(x) + * + * We have: + * + * tanh(z) = sinh(z) / cosh(z) + * + * sinh(x) cos(y) + i cosh(x) sin(y) + * = --------------------------------- + * cosh(x) cos(y) + i sinh(x) sin(y) + * + * cosh(x) sinh(x) / cos^2(y) + i tan(y) + * = ------------------------------------- + * 1 + sinh^2(x) / cos^2(y) + * + * beta rho s + i t + * = ---------------- + * 1 + beta s^2 + * + * Modifications: + * + * I omitted the original algorithm's handling of overflow in tan(x) after + * verifying with nearpi.c that this can't happen in IEEE single or double + * precision. I also handle large x differently. + */ + +#define TANH_HUGE 22.0 +#define TANHF_HUGE 11.0F +#define TANHL_HUGE 42.0L + +@ctype@ +npy_ctanh@c@(@ctype@ z) +{ + @type@ x, y; + @type@ t, beta, s, rho, denom; + + x = npy_creal@c@(z); + y = npy_cimag@c@(z); + + /* + * ctanh(NaN + i 0) = NaN + i 0 + * + * ctanh(NaN + i y) = NaN + i NaN for y != 0 + * + * The imaginary part has the sign of x*sin(2*y), but there's no + * special effort to get this right. + * + * ctanh(+-Inf +- i Inf) = +-1 +- 0 + * + * ctanh(+-Inf + i y) = +-1 + 0 sin(2y) for y finite + * + * The imaginary part of the sign is unspecified. This special + * case is only needed to avoid a spurious invalid exception when + * y is infinite. + */ + if (!npy_isfinite(x)) { + if (npy_isnan(x)) { + return npy_cpack@c@(x, (y == 0 ? y : x * y)); + } + return npy_cpack@c@(npy_copysign@c@(1,x), + npy_copysign@c@(0, + npy_isinf(y) ? + y : npy_sin@c@(y) * npy_cos@c@(y))); + } + + /* + * ctanh(x + i NAN) = NaN + i NaN + * ctanh(x +- i Inf) = NaN + i NaN + */ + if (!npy_isfinite(y)) { + return (npy_cpack@c@(y - y, y - y)); + } + + /* + * ctanh(+-huge + i +-y) ~= +-1 +- i 2sin(2y)/exp(2x), using the + * approximation sinh^2(huge) ~= exp(2*huge) / 4. + * We use a modified formula to avoid spurious overflow. 
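+ * Concretely, with exp_mx = exp(-|x|) the imaginary part
+ * 2 sin(2y) / exp(2|x|) is evaluated as 4 sin(y) cos(y) * exp_mx * exp_mx,
+ * so exp(2|x|) is never formed and cannot overflow.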
+ */
+    if (npy_fabs@c@(x) >= TANH@C@_HUGE) {
+        @type@ exp_mx = npy_exp@c@(-npy_fabs@c@(x));
+        return npy_cpack@c@(npy_copysign@c@(1, x),
+                            4 * npy_sin@c@(y) * npy_cos@c@(y) *
+                            exp_mx * exp_mx);
+    }
+
+    /* Kahan's algorithm */
+    t = npy_tan@c@(y);
+    beta = 1 + t * t;  /* = 1 / cos^2(y) */
+    s = npy_sinh@c@(x);
+    rho = npy_sqrt@c@(1 + s * s);  /* = cosh(x) */
+    denom = 1 + beta * s * s;
+    return (npy_cpack@c@((beta * rho * s) / denom, t / denom));
+}
+#undef TANH_HUGE
+#undef TANHF_HUGE
+#undef TANHL_HUGE
+#endif
+
 #if !defined (HAVE_CACOS@C@) || !defined (HAVE_CASINH@C@)
 /*
  * Complex inverse trig functions taken from the msun library in FreeBSD
@@ -1327,8 +1764,10 @@ npy_@kind@@c@(@ctype@ z)
 /**end repeat1**/

 /**begin repeat1
- * #kind = cexp,clog,csqrt,cacos,casin,catan,cacosh,casinh,catanh#
- * #KIND = CEXP,CLOG,CSQRT,CACOS,CASIN,CATAN,CACOSH,CASINH,CATANH#
+ * #kind = cexp,clog,csqrt,ccos,csin,ctan,ccosh,csinh,ctanh,
+ *         cacos,casin,catan,cacosh,casinh,catanh#
+ * #KIND = CEXP,CLOG,CSQRT,CCOS,CSIN,CTAN,CCOSH,CSINH,CTANH,
+ *         CACOS,CASIN,CATAN,CACOSH,CASINH,CATANH#
  */
 #ifdef HAVE_@KIND@@C@
 @ctype@
@@ -1343,20 +1782,5 @@ npy_@kind@@c@(@ctype@ z)
 #endif
 /**end repeat1**/

-/* Mandatory complex functions */
-
-/**begin repeat1
- * #kind = csin,csinh,ccos,ccosh,ctan,ctanh#
- */
-@ctype@
-npy_@kind@@c@(@ctype@ z)
-{
-    __@ctype@_to_c99_cast z1;
-    __@ctype@_to_c99_cast ret;
-    z1.npy_z = z;
-    ret.c99_z = @kind@@c@(z1.c99_z);
-    return ret.npy_z;
-}
-/**end repeat1**/

 /**end repeat**/
diff --git a/numpy/core/src/npymath/npy_math_private.h b/numpy/core/src/npymath/npy_math_private.h
index a474b3de3..20c94f98a 100644
--- a/numpy/core/src/npymath/npy_math_private.h
+++ b/numpy/core/src/npymath/npy_math_private.h
@@ -21,6 +21,7 @@
 #include <Python.h>
 #ifdef __cplusplus
 #include <cmath>
+#include <complex>
 using std::isgreater;
 using std::isless;
 #else
@@ -494,8 +495,9 @@ do {                                                          \
  * Microsoft C defines _MSC_VER
  * Intel compiler does not use MSVC complex types, but defines _MSC_VER by
  * default.
+ * As of C++17, MSVC no longer supports them in C++ mode.
  */
-#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#if !defined(__cplusplus) && defined(_MSC_VER) && !defined(__INTEL_COMPILER)
 typedef union {
         npy_cdouble npy_z;
         _Dcomplex c99_z;
diff --git a/numpy/core/src/npysort/heapsort.cpp b/numpy/core/src/npysort/heapsort.cpp
index de39367c2..77a4bda74 100644
--- a/numpy/core/src/npysort/heapsort.cpp
+++ b/numpy/core/src/npysort/heapsort.cpp
@@ -21,7 +21,7 @@
  *
  * The merge sort is *stable*, meaning that equal components
  * are unmoved from their entry versions, so it can be used to
- * implement lexigraphic sorting on multiple keys.
+ * implement lexicographic sorting on multiple keys.
  *
  * The heap sort is included for completeness.
  */
@@ -55,6 +55,9 @@ npy_heapsort(void *start, npy_intp num, void *varr)
     PyArrayObject *arr = (PyArrayObject *)varr;
     npy_intp elsize = PyArray_ITEMSIZE(arr);
     PyArray_CompareFunc *cmp = PyArray_DESCR(arr)->f->compare;
+    if (elsize == 0) {
+        return 0;  /* no need for sorting elements of no size */
+    }
     char *tmp = (char *)malloc(elsize);
     char *a = (char *)start - elsize;
     npy_intp i, j, l;
diff --git a/numpy/core/src/npysort/mergesort.cpp b/numpy/core/src/npysort/mergesort.cpp
index f892dd185..f53feb320 100644
--- a/numpy/core/src/npysort/mergesort.cpp
+++ b/numpy/core/src/npysort/mergesort.cpp
@@ -21,7 +21,7 @@
  *
  * The merge sort is *stable*, meaning that equal components
  * are unmoved from their entry versions, so it can be used to
- * implement lexigraphic sorting on multiple keys.
+ * implement lexicographic sorting on multiple keys. * * The heap sort is included for completeness. */ @@ -171,6 +171,7 @@ amergesort_(type *v, npy_intp *tosort, npy_intp num) } /* + ***************************************************************************** ** STRING SORTS ** ***************************************************************************** diff --git a/numpy/core/src/npysort/npysort_heapsort.h b/numpy/core/src/npysort/npysort_heapsort.h index 442320094..16750b817 100644 --- a/numpy/core/src/npysort/npysort_heapsort.h +++ b/numpy/core/src/npysort/npysort_heapsort.h @@ -128,6 +128,10 @@ int string_heapsort_(type *start, npy_intp n, void *varr) { PyArrayObject *arr = (PyArrayObject *)varr; size_t len = PyArray_ITEMSIZE(arr) / sizeof(type); + if (len == 0) { + return 0; /* no need for sorting if strings are empty */ + } + type *tmp = (type *)malloc(PyArray_ITEMSIZE(arr)); type *a = (type *)start - len; npy_intp i, j, l; diff --git a/numpy/core/src/npysort/quicksort.cpp b/numpy/core/src/npysort/quicksort.cpp index 3e351dd84..7497ebaa3 100644 --- a/numpy/core/src/npysort/quicksort.cpp +++ b/numpy/core/src/npysort/quicksort.cpp @@ -54,15 +54,11 @@ #include "npysort_common.h" #include "npysort_heapsort.h" #include "numpy_tag.h" +#include "simd_qsort.hpp" -#include "x86-qsort.h" #include <cstdlib> #include <utility> -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort.dispatch.h" -#endif - #define NOT_USED NPY_UNUSED(unused) /* * pushing largest partition has upper bound of log2(n) space @@ -73,70 +69,50 @@ #define SMALL_MERGESORT 20 #define SMALL_STRING 16 +// Temporarily disable AVX512 sorting on WIN32 and CYGWIN until we can figure +// out why it has test failures +#if defined(_MSC_VER) || defined(__CYGWIN__) +template<typename T> +inline bool quicksort_dispatch(T*, npy_intp) +{ + return false; +} +#else +template<typename T> +inline bool quicksort_dispatch(T *start, npy_intp num) +{ + using TF = typename np::meta::FixedWidth<T>::Type; + void (*dispfunc)(TF*, intptr_t) = nullptr; + if (sizeof(T) == sizeof(uint16_t)) { + #ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_qsort_16bit.dispatch.h" + #endif + NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>); + } + else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) { + #ifndef NPY_DISABLE_OPTIMIZATION + #include "simd_qsort.dispatch.h" + #endif + NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>); + } + if (dispfunc) { + (*dispfunc)(reinterpret_cast<TF*>(start), static_cast<intptr_t>(num)); + return true; + } + return false; +} +#endif // _MSC_VER || CYGWIN + /* ***************************************************************************** ** NUMERIC SORTS ** ***************************************************************************** */ -namespace { - -template <typename Tag> -struct x86_dispatch { - static bool quicksort(typename Tag::type *, npy_intp) { return false; } -}; - -template <> -struct x86_dispatch<npy::int_tag> { - static bool quicksort(npy_int *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_int); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -template <> -struct x86_dispatch<npy::uint_tag> { - static bool quicksort(npy_uint *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_uint); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return 
false; - } -}; - -template <> -struct x86_dispatch<npy::float_tag> { - static bool quicksort(npy_float *start, npy_intp num) - { - void (*dispfunc)(void *, npy_intp) = nullptr; - NPY_CPU_DISPATCH_CALL_XB(dispfunc = x86_quicksort_float); - if (dispfunc) { - (*dispfunc)(start, num); - return true; - } - return false; - } -}; - -} // namespace - template <typename Tag, typename type> static int quicksort_(type *start, npy_intp num) { - if (x86_dispatch<Tag>::quicksort(start, num)) - return 0; - type vp; type *pl = start; type *pr = pl + num - 1; @@ -729,56 +705,89 @@ quicksort_ubyte(void *start, npy_intp n, void *NPY_UNUSED(varr)) NPY_NO_EXPORT int quicksort_short(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_short *)start, n)) { + return 0; + } return quicksort_<npy::short_tag>((npy_short *)start, n); } NPY_NO_EXPORT int quicksort_ushort(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_ushort *)start, n)) { + return 0; + } return quicksort_<npy::ushort_tag>((npy_ushort *)start, n); } NPY_NO_EXPORT int quicksort_int(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_int *)start, n)) { + return 0; + } return quicksort_<npy::int_tag>((npy_int *)start, n); } NPY_NO_EXPORT int quicksort_uint(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_uint *)start, n)) { + return 0; + } return quicksort_<npy::uint_tag>((npy_uint *)start, n); } NPY_NO_EXPORT int quicksort_long(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_long *)start, n)) { + return 0; + } return quicksort_<npy::long_tag>((npy_long *)start, n); } NPY_NO_EXPORT int quicksort_ulong(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_ulong *)start, n)) { + return 0; + } return quicksort_<npy::ulong_tag>((npy_ulong *)start, n); } NPY_NO_EXPORT int quicksort_longlong(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_longlong *)start, n)) { + return 0; + } return quicksort_<npy::longlong_tag>((npy_longlong *)start, n); } NPY_NO_EXPORT int quicksort_ulonglong(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_ulonglong *)start, n)) { + return 0; + } return quicksort_<npy::ulonglong_tag>((npy_ulonglong *)start, n); } NPY_NO_EXPORT int quicksort_half(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((np::Half *)start, n)) { + return 0; + } return quicksort_<npy::half_tag>((npy_half *)start, n); } NPY_NO_EXPORT int quicksort_float(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_float *)start, n)) { + return 0; + } return quicksort_<npy::float_tag>((npy_float *)start, n); } NPY_NO_EXPORT int quicksort_double(void *start, npy_intp n, void *NPY_UNUSED(varr)) { + if (quicksort_dispatch((npy_double *)start, n)) { + return 0; + } return quicksort_<npy::double_tag>((npy_double *)start, n); } NPY_NO_EXPORT int diff --git a/numpy/core/src/npysort/selection.cpp b/numpy/core/src/npysort/selection.cpp index ebe188b71..450858df9 100644 --- a/numpy/core/src/npysort/selection.cpp +++ b/numpy/core/src/npysort/selection.cpp @@ -423,8 +423,8 @@ static constexpr std::array<arg_map, sizeof...(Tags)> make_partition_map(npy::taglist<Tags...>) { return std::array<arg_map, sizeof...(Tags)>{ - arg_map{Tags::type_value, &introselect_noarg<Tags>, - &introselect_arg<Tags>}...}; + arg_map{Tags::type_value, {&introselect_noarg<Tags>}, + {&introselect_arg<Tags>}}...}; } 
 struct partition_t {
diff --git a/numpy/core/src/npysort/simd_qsort.dispatch.cpp b/numpy/core/src/npysort/simd_qsort.dispatch.cpp
new file mode 100644
index 000000000..101bb3dcc
--- /dev/null
+++ b/numpy/core/src/npysort/simd_qsort.dispatch.cpp
@@ -0,0 +1,44 @@
+/*@targets
+ * $maxopt $keep_baseline avx512_skx
+ */
+// The $keep_baseline policy is used to avoid skipping the build of
+// avx512_skx when it is part of the baseline features (--cpu-baseline),
+// since the 'baseline' option isn't specified within the targets.
+
+#include "simd_qsort.hpp"
+
+#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER)
+    #include "x86-simd-sort/src/avx512-32bit-qsort.hpp"
+    #include "x86-simd-sort/src/avx512-64bit-qsort.hpp"
+#endif
+
+namespace np { namespace qsort_simd {
+
+#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER)
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size)
+{
+    avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size)
+{
+    avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size)
+{
+    avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size)
+{
+    avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size)
+{
+    avx512_qsort(arr, size);
+}
+template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
+{
+    avx512_qsort(arr, size);
+}
+#endif // NPY_HAVE_AVX512_SKX
+
+}} // namespace np::qsort_simd
diff --git a/numpy/core/src/npysort/simd_qsort.hpp b/numpy/core/src/npysort/simd_qsort.hpp
new file mode 100644
index 000000000..7cdee774d
--- /dev/null
+++ b/numpy/core/src/npysort/simd_qsort.hpp
@@ -0,0 +1,19 @@
+#ifndef NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
+#define NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
+
+#include "common.hpp"
+
+namespace np { namespace qsort_simd {
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "simd_qsort.dispatch.h"
+#endif
+NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t size))
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "simd_qsort_16bit.dispatch.h"
+#endif
+NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t size))
+
+} } // np::qsort_simd
+#endif // NUMPY_SRC_COMMON_NPYSORT_SIMD_QSORT_HPP
diff --git a/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp
new file mode 100644
index 000000000..3f5099758
--- /dev/null
+++ b/numpy/core/src/npysort/simd_qsort_16bit.dispatch.cpp
@@ -0,0 +1,39 @@
+/*@targets
+ * $maxopt $keep_baseline avx512_icl avx512_spr
+ */
+// The $keep_baseline policy is used to avoid skipping the build of
+// avx512_icl/avx512_spr when they are part of the baseline features
+// (--cpu-baseline), since the 'baseline' option isn't specified within
+// the targets.
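+// Of the two targets, avx512_spr sorts native _Float16 directly, while
+// avx512_icl falls back to the bit-pattern based avx512_qsort_fp16 on
+// uint16_t (see the QSort(Half *) specialization below).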
+ +#include "simd_qsort.hpp" + +#if defined(NPY_HAVE_AVX512_SPR) && !defined(_MSC_VER) + #include "x86-simd-sort/src/avx512fp16-16bit-qsort.hpp" +#elif defined(NPY_HAVE_AVX512_ICL) && !defined(_MSC_VER) + #include "x86-simd-sort/src/avx512-16bit-qsort.hpp" +#endif + +namespace np { namespace qsort_simd { + +#if !defined(_MSC_VER) +#if defined(NPY_HAVE_AVX512_ICL) || defined(NPY_HAVE_AVX512_SPR) +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(Half *arr, intptr_t size) +{ +#if defined(NPY_HAVE_AVX512_SPR) + avx512_qsort(reinterpret_cast<_Float16*>(arr), size); +#else + avx512_qsort_fp16(reinterpret_cast<uint16_t*>(arr), size); +#endif +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint16_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int16_t *arr, intptr_t size) +{ + avx512_qsort(arr, size); +} +#endif // NPY_HAVE_AVX512_ICL || SPR +#endif // _MSC_VER + +}} // namespace np::qsort_simd diff --git a/numpy/core/src/npysort/timsort.cpp b/numpy/core/src/npysort/timsort.cpp index 27294af0c..9438fb293 100644 --- a/numpy/core/src/npysort/timsort.cpp +++ b/numpy/core/src/npysort/timsort.cpp @@ -21,7 +21,7 @@ * * The merge sort is *stable*, meaning that equal components * are unmoved from their entry versions, so it can be used to - * implement lexigraphic sorting on multiple keys. + * implement lexicographic sorting on multiple keys. * * The heap sort is included for completeness. */ diff --git a/numpy/core/src/npysort/x86-qsort.dispatch.cpp b/numpy/core/src/npysort/x86-qsort.dispatch.cpp deleted file mode 100644 index 8e88cc667..000000000 --- a/numpy/core/src/npysort/x86-qsort.dispatch.cpp +++ /dev/null @@ -1,835 +0,0 @@ -/*@targets - * $maxopt $keep_baseline avx512_skx - */ -// policy $keep_baseline is used to avoid skip building avx512_skx -// when its part of baseline features (--cpu-baseline), since -// 'baseline' option isn't specified within targets. - -#include "x86-qsort.h" -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#ifdef NPY_HAVE_AVX512_SKX -#include "numpy/npy_math.h" - -#include "npy_sort.h" -#include "numpy_tag.h" - -#include "simd/simd.h" -#include <immintrin.h> - -template <typename Tag, typename type> -NPY_NO_EXPORT int -heapsort_(type *start, npy_intp n); - -/* - * Quicksort using AVX-512 for int, uint32 and float. The ideas and code are - * based on these two research papers: - * (1) Fast and Robust Vectorized In-Place Sorting of Primitive Types - * https://drops.dagstuhl.de/opus/volltexte/2021/13775/ - * (2) A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel - * Skylake https://arxiv.org/pdf/1704.08579.pdf - * - * High level idea: Vectorize the quicksort partitioning using AVX-512 - * compressstore instructions. The algorithm to pick the pivot is to use median - * of 72 elements picked at random. If the array size is < 128, then use - * Bitonic sorting network. Good resource for bitonic sorting network: - * http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030 - * - * Refer to https://github.com/numpy/numpy/pull/20133#issuecomment-958110340 - * for potential problems when converting this code to universal intrinsics - * framework. - */ - -/* - * Constants used in sorting 16 elements in a ZMM registers. 
Based on Bitonic - * sorting network (see - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg) - */ -#define NETWORK1 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 -#define NETWORK2 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 -#define NETWORK3 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 -#define NETWORK4 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 -#define NETWORK5 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -#define NETWORK6 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 -#define NETWORK7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 -#define ZMM_MAX_FLOAT _mm512_set1_ps(NPY_INFINITYF) -#define ZMM_MAX_UINT _mm512_set1_epi32(NPY_MAX_UINT32) -#define ZMM_MAX_INT _mm512_set1_epi32(NPY_MAX_INT32) -#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d -#define SHUFFLE_ps(ZMM, MASK) _mm512_shuffle_ps(zmm, zmm, MASK) -#define SHUFFLE_epi32(ZMM, MASK) _mm512_shuffle_epi32(zmm, MASK) - -#define MAX(x, y) (((x) > (y)) ? (x) : (y)) -#define MIN(x, y) (((x) < (y)) ? (x) : (y)) - -/* - * Vectorized random number generator xoroshiro128+. Broken into 2 parts: - * (1) vnext generates 2 64-bit random integers - * (2) rnd_epu32 converts this to 4 32-bit random integers and bounds it to - * the length of the array - */ -#define VROTL(x, k) /* rotate each uint64_t value in vector */ \ - _mm256_or_si256(_mm256_slli_epi64((x), (k)), \ - _mm256_srli_epi64((x), 64 - (k))) - -static inline __m256i -vnext(__m256i *s0, __m256i *s1) -{ - *s1 = _mm256_xor_si256(*s0, *s1); /* modify vectors s1 and s0 */ - *s0 = _mm256_xor_si256(_mm256_xor_si256(VROTL(*s0, 24), *s1), - _mm256_slli_epi64(*s1, 16)); - *s1 = VROTL(*s1, 37); - return _mm256_add_epi64(*s0, *s1); /* return random vector */ -} - -/* transform random numbers to the range between 0 and bound - 1 */ -static inline __m256i -rnd_epu32(__m256i rnd_vec, __m256i bound) -{ - __m256i even = _mm256_srli_epi64(_mm256_mul_epu32(rnd_vec, bound), 32); - __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(rnd_vec, 32), bound); - return _mm256_blend_epi32(odd, even, 0b01010101); -} - -template <typename type> -struct vector; - -template <> -struct vector<npy_int> { - using tag = npy::int_tag; - using type_t = npy_int; - using zmm_t = __m512i; - using ymm_t = __m256i; - - static type_t type_max() { return NPY_MAX_INT32; } - static type_t type_min() { return NPY_MIN_INT32; } - static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); } - - static __mmask16 ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_NLT); - } - template <int scale> - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_epi32(index, base, scale); - } - static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epi32(x, y); } - static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem) - { - return _mm512_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y) - { - return _mm512_mask_mov_epi32(x, mask, y); - } - static void mask_storeu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epi32(x, y); } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi32(idx, zmm); - } - static 
type_t reducemax(zmm_t v) { return npyv_reduce_max_s32(v); } - static type_t reducemin(zmm_t v) { return npyv_reduce_min_s32(v); } - static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); } - template<__mmask16 mask> - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epi32(x, y); } - static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epi32(x, y); } -}; -template <> -struct vector<npy_uint> { - using tag = npy::uint_tag; - using type_t = npy_uint; - using zmm_t = __m512i; - using ymm_t = __m256i; - - static type_t type_max() { return NPY_MAX_UINT32; } - static type_t type_min() { return 0; } - static zmm_t zmm_max() { return _mm512_set1_epi32(type_max()); } - - template<int scale> - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_epi32(index, base, scale); - } - static __mmask16 ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_epu32_mask(x, y, _MM_CMPINT_NLT); - } - static zmm_t loadu(void const *mem) { return _mm512_loadu_si512(mem); } - static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_epu32(x, y); } - static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_compressstoreu_epi32(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem) - { - return _mm512_mask_loadu_epi32(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y) - { - return _mm512_mask_mov_epi32(x, mask, y); - } - static void mask_storeu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_storeu_epi32(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_epu32(x, y); } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_epi32(idx, zmm); - } - static type_t reducemax(zmm_t v) { return npyv_reduce_max_u32(v); } - static type_t reducemin(zmm_t v) { return npyv_reduce_min_u32(v); } - static zmm_t set1(type_t v) { return _mm512_set1_epi32(v); } - template<__mmask16 mask> - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) - { - return _mm512_storeu_si512(mem, x); - } - - static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_epu32(x, y); } - static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_epu32(x, y); } -}; -template <> -struct vector<npy_float> { - using tag = npy::float_tag; - using type_t = npy_float; - using zmm_t = __m512; - using ymm_t = __m256; - - static type_t type_max() { return NPY_INFINITYF; } - static type_t type_min() { return -NPY_INFINITYF; } - static zmm_t zmm_max() { return _mm512_set1_ps(type_max()); } - - static __mmask16 ge(zmm_t x, zmm_t y) - { - return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); - } - template<int scale> - static ymm_t i64gather(__m512i index, void const *base) - { - return _mm512_i64gather_ps(index, base, scale); - } - static zmm_t loadu(void const *mem) { return _mm512_loadu_ps(mem); } - static zmm_t max(zmm_t x, zmm_t y) { return _mm512_max_ps(x, y); } - static void mask_compressstoreu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_compressstoreu_ps(mem, mask, x); - } - static zmm_t mask_loadu(zmm_t x, __mmask16 mask, void const *mem) - { - return _mm512_mask_loadu_ps(x, mask, mem); - } - static zmm_t mask_mov(zmm_t x, __mmask16 mask, zmm_t y) - { - return _mm512_mask_mov_ps(x, mask, y); - } - static 
void mask_storeu(void *mem, __mmask16 mask, zmm_t x) - { - return _mm512_mask_storeu_ps(mem, mask, x); - } - static zmm_t min(zmm_t x, zmm_t y) { return _mm512_min_ps(x, y); } - static zmm_t permutexvar(__m512i idx, zmm_t zmm) - { - return _mm512_permutexvar_ps(idx, zmm); - } - static type_t reducemax(zmm_t v) { return npyv_reduce_max_f32(v); } - static type_t reducemin(zmm_t v) { return npyv_reduce_min_f32(v); } - static zmm_t set1(type_t v) { return _mm512_set1_ps(v); } - template<__mmask16 mask> - static zmm_t shuffle(zmm_t zmm) - { - return _mm512_shuffle_ps(zmm, zmm, (_MM_PERM_ENUM)mask); - } - static void storeu(void *mem, zmm_t x) { return _mm512_storeu_ps(mem, x); } - - static ymm_t max(ymm_t x, ymm_t y) { return _mm256_max_ps(x, y); } - static ymm_t min(ymm_t x, ymm_t y) { return _mm256_min_ps(x, y); } -}; - -/* - * COEX == Compare and Exchange two registers by swapping min and max values - */ -template <typename vtype, typename mm_t> -void -COEX(mm_t &a, mm_t &b) -{ - mm_t temp = a; - a = vtype::min(a, b); - b = vtype::max(temp, b); -} - -template <typename vtype, typename zmm_t = typename vtype::zmm_t> -static inline zmm_t -cmp_merge(zmm_t in1, zmm_t in2, __mmask16 mask) -{ - zmm_t min = vtype::min(in2, in1); - zmm_t max = vtype::max(in2, in1); - return vtype::mask_mov(min, mask, max); // 0 -> min, 1 -> max -} - -/* - * Assumes zmm is random and performs a full sorting network defined in - * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg - */ -template <typename vtype, typename zmm_t = typename vtype::zmm_t> -static inline zmm_t -sort_zmm(zmm_t zmm) -{ - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), - 0xAAAA); - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(0, 1, 2, 3)>(zmm), - 0xCCCC); - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), - 0xAAAA); - zmm = cmp_merge<vtype>( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK3), zmm), 0xF0F0); - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm), - 0xCCCC); - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), - 0xAAAA); - zmm = cmp_merge<vtype>( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm), 0xFF00); - zmm = cmp_merge<vtype>( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK6), zmm), 0xF0F0); - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm), - 0xCCCC); - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), - 0xAAAA); - return zmm; -} - -// Assumes zmm is bitonic and performs a recursive half cleaner -template <typename vtype, typename zmm_t = typename vtype::zmm_t> -static inline zmm_t -bitonic_merge_zmm(zmm_t zmm) -{ - // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc .. - zmm = cmp_merge<vtype>( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK7), zmm), 0xFF00); - // 2) half_cleaner[8]: compare 1-5, 2-6, 3-7 etc .. 
- zmm = cmp_merge<vtype>( - zmm, vtype::permutexvar(_mm512_set_epi32(NETWORK6), zmm), 0xF0F0); - // 3) half_cleaner[4] - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm), - 0xCCCC); - // 3) half_cleaner[1] - zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), - 0xAAAA); - return zmm; -} - -// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner -template <typename vtype, typename zmm_t = typename vtype::zmm_t> -static inline void -bitonic_merge_two_zmm(zmm_t *zmm1, zmm_t *zmm2) -{ - // 1) First step of a merging network: coex of zmm1 and zmm2 reversed - *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), *zmm2); - zmm_t zmm3 = vtype::min(*zmm1, *zmm2); - zmm_t zmm4 = vtype::max(*zmm1, *zmm2); - // 2) Recursive half cleaner for each - *zmm1 = bitonic_merge_zmm<vtype>(zmm3); - *zmm2 = bitonic_merge_zmm<vtype>(zmm4); -} - -// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive -// half cleaner -template <typename vtype, typename zmm_t = typename vtype::zmm_t> -static inline void -bitonic_merge_four_zmm(zmm_t *zmm) -{ - zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[2]); - zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[3]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm3r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm2r); - zmm_t zmm_t3 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[1], zmm2r)); - zmm_t zmm_t4 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[0], zmm3r)); - zmm_t zmm0 = vtype::min(zmm_t1, zmm_t2); - zmm_t zmm1 = vtype::max(zmm_t1, zmm_t2); - zmm_t zmm2 = vtype::min(zmm_t3, zmm_t4); - zmm_t zmm3 = vtype::max(zmm_t3, zmm_t4); - zmm[0] = bitonic_merge_zmm<vtype>(zmm0); - zmm[1] = bitonic_merge_zmm<vtype>(zmm1); - zmm[2] = bitonic_merge_zmm<vtype>(zmm2); - zmm[3] = bitonic_merge_zmm<vtype>(zmm3); -} - -template <typename vtype, typename zmm_t = typename vtype::zmm_t> -static inline void -bitonic_merge_eight_zmm(zmm_t *zmm) -{ - zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[4]); - zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[5]); - zmm_t zmm6r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[6]); - zmm_t zmm7r = vtype::permutexvar(_mm512_set_epi32(NETWORK5), zmm[7]); - zmm_t zmm_t1 = vtype::min(zmm[0], zmm7r); - zmm_t zmm_t2 = vtype::min(zmm[1], zmm6r); - zmm_t zmm_t3 = vtype::min(zmm[2], zmm5r); - zmm_t zmm_t4 = vtype::min(zmm[3], zmm4r); - zmm_t zmm_t5 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[3], zmm4r)); - zmm_t zmm_t6 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[2], zmm5r)); - zmm_t zmm_t7 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[1], zmm6r)); - zmm_t zmm_t8 = vtype::permutexvar(_mm512_set_epi32(NETWORK5), - vtype::max(zmm[0], zmm7r)); - COEX<vtype>(zmm_t1, zmm_t3); - COEX<vtype>(zmm_t2, zmm_t4); - COEX<vtype>(zmm_t5, zmm_t7); - COEX<vtype>(zmm_t6, zmm_t8); - COEX<vtype>(zmm_t1, zmm_t2); - COEX<vtype>(zmm_t3, zmm_t4); - COEX<vtype>(zmm_t5, zmm_t6); - COEX<vtype>(zmm_t7, zmm_t8); - zmm[0] = bitonic_merge_zmm<vtype>(zmm_t1); - zmm[1] = bitonic_merge_zmm<vtype>(zmm_t2); - zmm[2] = bitonic_merge_zmm<vtype>(zmm_t3); - zmm[3] = bitonic_merge_zmm<vtype>(zmm_t4); - zmm[4] = bitonic_merge_zmm<vtype>(zmm_t5); - zmm[5] = bitonic_merge_zmm<vtype>(zmm_t6); - zmm[6] = bitonic_merge_zmm<vtype>(zmm_t7); - zmm[7] = bitonic_merge_zmm<vtype>(zmm_t8); -} - -template <typename vtype, typename type_t> -static inline void 
-sort_16(type_t *arr, npy_int N) -{ - __mmask16 load_mask = (0x0001 << N) - 0x0001; - typename vtype::zmm_t zmm = - vtype::mask_loadu(vtype::zmm_max(), load_mask, arr); - vtype::mask_storeu(arr, load_mask, sort_zmm<vtype>(zmm)); -} - -template <typename vtype, typename type_t> -static inline void -sort_32(type_t *arr, npy_int N) -{ - if (N <= 16) { - sort_16<vtype>(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm1 = vtype::loadu(arr); - __mmask16 load_mask = (0x0001 << (N - 16)) - 0x0001; - zmm_t zmm2 = vtype::mask_loadu(vtype::zmm_max(), load_mask, arr + 16); - zmm1 = sort_zmm<vtype>(zmm1); - zmm2 = sort_zmm<vtype>(zmm2); - bitonic_merge_two_zmm<vtype>(&zmm1, &zmm2); - vtype::storeu(arr, zmm1); - vtype::mask_storeu(arr + 16, load_mask, zmm2); -} - -template <typename vtype, typename type_t> -static inline void -sort_64(type_t *arr, npy_int N) -{ - if (N <= 32) { - sort_32<vtype>(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm[4]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 16); - __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; - if (N < 48) { - load_mask1 = (0x0001 << (N - 32)) - 0x0001; - load_mask2 = 0x0000; - } - else if (N < 64) { - load_mask2 = (0x0001 << (N - 48)) - 0x0001; - } - zmm[2] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 32); - zmm[3] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 48); - zmm[0] = sort_zmm<vtype>(zmm[0]); - zmm[1] = sort_zmm<vtype>(zmm[1]); - zmm[2] = sort_zmm<vtype>(zmm[2]); - zmm[3] = sort_zmm<vtype>(zmm[3]); - bitonic_merge_two_zmm<vtype>(&zmm[0], &zmm[1]); - bitonic_merge_two_zmm<vtype>(&zmm[2], &zmm[3]); - bitonic_merge_four_zmm<vtype>(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 16, zmm[1]); - vtype::mask_storeu(arr + 32, load_mask1, zmm[2]); - vtype::mask_storeu(arr + 48, load_mask2, zmm[3]); -} - -template <typename vtype, typename type_t> -static inline void -sort_128(type_t *arr, npy_int N) -{ - if (N <= 64) { - sort_64<vtype>(arr, N); - return; - } - using zmm_t = typename vtype::zmm_t; - zmm_t zmm[8]; - zmm[0] = vtype::loadu(arr); - zmm[1] = vtype::loadu(arr + 16); - zmm[2] = vtype::loadu(arr + 32); - zmm[3] = vtype::loadu(arr + 48); - zmm[0] = sort_zmm<vtype>(zmm[0]); - zmm[1] = sort_zmm<vtype>(zmm[1]); - zmm[2] = sort_zmm<vtype>(zmm[2]); - zmm[3] = sort_zmm<vtype>(zmm[3]); - __mmask16 load_mask1 = 0xFFFF, load_mask2 = 0xFFFF; - __mmask16 load_mask3 = 0xFFFF, load_mask4 = 0xFFFF; - if (N < 80) { - load_mask1 = (0x0001 << (N - 64)) - 0x0001; - load_mask2 = 0x0000; - load_mask3 = 0x0000; - load_mask4 = 0x0000; - } - else if (N < 96) { - load_mask2 = (0x0001 << (N - 80)) - 0x0001; - load_mask3 = 0x0000; - load_mask4 = 0x0000; - } - else if (N < 112) { - load_mask3 = (0x0001 << (N - 96)) - 0x0001; - load_mask4 = 0x0000; - } - else { - load_mask4 = (0x0001 << (N - 112)) - 0x0001; - } - zmm[4] = vtype::mask_loadu(vtype::zmm_max(), load_mask1, arr + 64); - zmm[5] = vtype::mask_loadu(vtype::zmm_max(), load_mask2, arr + 80); - zmm[6] = vtype::mask_loadu(vtype::zmm_max(), load_mask3, arr + 96); - zmm[7] = vtype::mask_loadu(vtype::zmm_max(), load_mask4, arr + 112); - zmm[4] = sort_zmm<vtype>(zmm[4]); - zmm[5] = sort_zmm<vtype>(zmm[5]); - zmm[6] = sort_zmm<vtype>(zmm[6]); - zmm[7] = sort_zmm<vtype>(zmm[7]); - bitonic_merge_two_zmm<vtype>(&zmm[0], &zmm[1]); - bitonic_merge_two_zmm<vtype>(&zmm[2], &zmm[3]); - bitonic_merge_two_zmm<vtype>(&zmm[4], &zmm[5]); - bitonic_merge_two_zmm<vtype>(&zmm[6], &zmm[7]); - bitonic_merge_four_zmm<vtype>(zmm); - 
bitonic_merge_four_zmm<vtype>(zmm + 4); - bitonic_merge_eight_zmm<vtype>(zmm); - vtype::storeu(arr, zmm[0]); - vtype::storeu(arr + 16, zmm[1]); - vtype::storeu(arr + 32, zmm[2]); - vtype::storeu(arr + 48, zmm[3]); - vtype::mask_storeu(arr + 64, load_mask1, zmm[4]); - vtype::mask_storeu(arr + 80, load_mask2, zmm[5]); - vtype::mask_storeu(arr + 96, load_mask3, zmm[6]); - vtype::mask_storeu(arr + 112, load_mask4, zmm[7]); -} - -template <typename type_t> -static inline void -swap(type_t *arr, npy_intp ii, npy_intp jj) -{ - type_t temp = arr[ii]; - arr[ii] = arr[jj]; - arr[jj] = temp; -} - -// Median of 3 strategy -// template<typename type_t> -// static inline -// npy_intp get_pivot_index(type_t *arr, const npy_intp left, const npy_intp -// right) { -// return (rand() % (right + 1 - left)) + left; -// //npy_intp middle = ((right-left)/2) + left; -// //type_t a = arr[left], b = arr[middle], c = arr[right]; -// //if ((b >= a && b <= c) || (b <= a && b >= c)) -// // return middle; -// //if ((a >= b && a <= c) || (a <= b && a >= c)) -// // return left; -// //else -// // return right; -//} - -/* - * Picking the pivot: Median of 72 array elements chosen at random. - */ - -template <typename vtype, typename type_t> -static inline type_t -get_pivot(type_t *arr, const npy_intp left, const npy_intp right) -{ - /* seeds for vectorized random number generator */ - __m256i s0 = _mm256_setr_epi64x(8265987198341093849, 3762817312854612374, - 1324281658759788278, 6214952190349879213); - __m256i s1 = _mm256_setr_epi64x(2874178529384792648, 1257248936691237653, - 7874578921548791257, 1998265912745817298); - s0 = _mm256_add_epi64(s0, _mm256_set1_epi64x(left)); - s1 = _mm256_sub_epi64(s1, _mm256_set1_epi64x(right)); - - npy_intp arrsize = right - left + 1; - __m256i bound = - _mm256_set1_epi32(arrsize > INT32_MAX ? INT32_MAX : arrsize); - __m512i left_vec = _mm512_set1_epi64(left); - __m512i right_vec = _mm512_set1_epi64(right); - using ymm_t = typename vtype::ymm_t; - ymm_t v[9]; - /* fill 9 vectors with random numbers */ - for (npy_int i = 0; i < 9; ++i) { - __m256i rand_64 = vnext(&s0, &s1); /* vector with 4 random uint64_t */ - __m512i rand_32 = _mm512_cvtepi32_epi64(rnd_epu32( - rand_64, bound)); /* random numbers between 0 and bound - 1 */ - __m512i indices; - if (i < 5) - indices = - _mm512_add_epi64(left_vec, rand_32); /* indices for arr */ - else - indices = - _mm512_sub_epi64(right_vec, rand_32); /* indices for arr */ - - v[i] = vtype::template i64gather<sizeof(type_t)>(indices, arr); - } - - /* median network for 9 elements */ - COEX<vtype>(v[0], v[1]); - COEX<vtype>(v[2], v[3]); - COEX<vtype>(v[4], v[5]); - COEX<vtype>(v[6], v[7]); - COEX<vtype>(v[0], v[2]); - COEX<vtype>(v[1], v[3]); - COEX<vtype>(v[4], v[6]); - COEX<vtype>(v[5], v[7]); - COEX<vtype>(v[0], v[4]); - COEX<vtype>(v[1], v[2]); - COEX<vtype>(v[5], v[6]); - COEX<vtype>(v[3], v[7]); - COEX<vtype>(v[1], v[5]); - COEX<vtype>(v[2], v[6]); - COEX<vtype>(v[3], v[5]); - COEX<vtype>(v[2], v[4]); - COEX<vtype>(v[3], v[4]); - COEX<vtype>(v[3], v[8]); - COEX<vtype>(v[4], v[8]); - - // technically v[4] needs to be sorted before we pick the correct median, - // picking the 4th element works just as well for performance - type_t *temp = (type_t *)&v[4]; - - return temp[4]; -} - -/* - * Partition one ZMM register based on the pivot and returns the index of the - * last element that is less than equal to the pivot. 
- */ -template <typename vtype, typename type_t, typename zmm_t> -static inline npy_int -partition_vec(type_t *arr, npy_intp left, npy_intp right, const zmm_t curr_vec, - const zmm_t pivot_vec, zmm_t *smallest_vec, zmm_t *biggest_vec) -{ - /* which elements are larger than the pivot */ - __mmask16 gt_mask = vtype::ge(curr_vec, pivot_vec); - npy_int amount_gt_pivot = _mm_popcnt_u32((npy_int)gt_mask); - vtype::mask_compressstoreu(arr + left, _mm512_knot(gt_mask), curr_vec); - vtype::mask_compressstoreu(arr + right - amount_gt_pivot, gt_mask, - curr_vec); - *smallest_vec = vtype::min(curr_vec, *smallest_vec); - *biggest_vec = vtype::max(curr_vec, *biggest_vec); - return amount_gt_pivot; -} - -/* - * Partition an array based on the pivot and returns the index of the - * last element that is less than equal to the pivot. - */ -template <typename vtype, typename type_t> -static inline npy_intp -partition_avx512(type_t *arr, npy_intp left, npy_intp right, type_t pivot, - type_t *smallest, type_t *biggest) -{ - /* make array length divisible by 16 , shortening the array */ - for (npy_int i = (right - left) % 16; i > 0; --i) { - *smallest = MIN(*smallest, arr[left]); - *biggest = MAX(*biggest, arr[left]); - if (arr[left] > pivot) { - swap(arr, left, --right); - } - else { - ++left; - } - } - - if (left == right) - return left; /* less than 16 elements in the array */ - - using zmm_t = typename vtype::zmm_t; - zmm_t pivot_vec = vtype::set1(pivot); - zmm_t min_vec = vtype::set1(*smallest); - zmm_t max_vec = vtype::set1(*biggest); - - if (right - left == 16) { - zmm_t vec = vtype::loadu(arr + left); - npy_int amount_gt_pivot = partition_vec<vtype>( - arr, left, left + 16, vec, pivot_vec, &min_vec, &max_vec); - *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); - return left + (16 - amount_gt_pivot); - } - - // first and last 16 values are partitioned at the end - zmm_t vec_left = vtype::loadu(arr + left); - zmm_t vec_right = vtype::loadu(arr + (right - 16)); - // store points of the vectors - npy_intp r_store = right - 16; - npy_intp l_store = left; - // indices for loading the elements - left += 16; - right -= 16; - while (right - left != 0) { - zmm_t curr_vec; - /* - * if fewer elements are stored on the right side of the array, - * then next elements are loaded from the right side, - * otherwise from the left side - */ - if ((r_store + 16) - right < left - l_store) { - right -= 16; - curr_vec = vtype::loadu(arr + right); - } - else { - curr_vec = vtype::loadu(arr + left); - left += 16; - } - // partition the current vector and save it on both sides of the array - npy_int amount_gt_pivot = - partition_vec<vtype>(arr, l_store, r_store + 16, curr_vec, - pivot_vec, &min_vec, &max_vec); - ; - r_store -= amount_gt_pivot; - l_store += (16 - amount_gt_pivot); - } - - /* partition and save vec_left and vec_right */ - npy_int amount_gt_pivot = - partition_vec<vtype>(arr, l_store, r_store + 16, vec_left, - pivot_vec, &min_vec, &max_vec); - l_store += (16 - amount_gt_pivot); - amount_gt_pivot = - partition_vec<vtype>(arr, l_store, l_store + 16, vec_right, - pivot_vec, &min_vec, &max_vec); - l_store += (16 - amount_gt_pivot); - *smallest = vtype::reducemin(min_vec); - *biggest = vtype::reducemax(max_vec); - return l_store; -} - -template <typename vtype, typename type_t> -static inline void -qsort_(type_t *arr, npy_intp left, npy_intp right, npy_int max_iters) -{ - /* - * Resort to heapsort if quicksort isn't making any progress - */ - if (max_iters <= 0) { - heapsort_<typename 
vtype::tag>(arr + left, right + 1 - left); - return; - } - /* - * Base case: use bitonic networks to sort arrays <= 128 - */ - if (right + 1 - left <= 128) { - sort_128<vtype>(arr + left, (npy_int)(right + 1 - left)); - return; - } - - type_t pivot = get_pivot<vtype>(arr, left, right); - type_t smallest = vtype::type_max(); - type_t biggest = vtype::type_min(); - npy_intp pivot_index = partition_avx512<vtype>(arr, left, right + 1, pivot, - &smallest, &biggest); - if (pivot != smallest) - qsort_<vtype>(arr, left, pivot_index - 1, max_iters - 1); - if (pivot != biggest) - qsort_<vtype>(arr, pivot_index, right, max_iters - 1); -} - -static inline npy_intp -replace_nan_with_inf(npy_float *arr, npy_intp arrsize) -{ - npy_intp nan_count = 0; - __mmask16 loadmask = 0xFFFF; - while (arrsize > 0) { - if (arrsize < 16) { - loadmask = (0x0001 << arrsize) - 0x0001; - } - __m512 in_zmm = _mm512_maskz_loadu_ps(loadmask, arr); - __mmask16 nanmask = _mm512_cmp_ps_mask(in_zmm, in_zmm, _CMP_NEQ_UQ); - nan_count += _mm_popcnt_u32((npy_int)nanmask); - _mm512_mask_storeu_ps(arr, nanmask, ZMM_MAX_FLOAT); - arr += 16; - arrsize -= 16; - } - return nan_count; -} - -static inline void -replace_inf_with_nan(npy_float *arr, npy_intp arrsize, npy_intp nan_count) -{ - for (npy_intp ii = arrsize - 1; nan_count > 0; --ii) { - arr[ii] = NPY_NANF; - nan_count -= 1; - } -} - -/*************************************** - * C > C++ dispatch - ***************************************/ - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_int)(void *arr, npy_intp arrsize) -{ - if (arrsize > 1) { - qsort_<vector<npy_int>, npy_int>((npy_int *)arr, 0, arrsize - 1, - 2 * (npy_int)log2(arrsize)); - } -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_uint)(void *arr, npy_intp arrsize) -{ - if (arrsize > 1) { - qsort_<vector<npy_uint>, npy_uint>((npy_uint *)arr, 0, arrsize - 1, - 2 * (npy_int)log2(arrsize)); - } -} - -NPY_NO_EXPORT void -NPY_CPU_DISPATCH_CURFX(x86_quicksort_float)(void *arr, npy_intp arrsize) -{ - if (arrsize > 1) { - npy_intp nan_count = replace_nan_with_inf((npy_float *)arr, arrsize); - qsort_<vector<npy_float>, npy_float>((npy_float *)arr, 0, arrsize - 1, - 2 * (npy_int)log2(arrsize)); - replace_inf_with_nan((npy_float *)arr, arrsize, nan_count); - } -} - -#endif // NPY_HAVE_AVX512_SKX diff --git a/numpy/core/src/npysort/x86-qsort.h b/numpy/core/src/npysort/x86-qsort.h deleted file mode 100644 index 6340e2bc7..000000000 --- a/numpy/core/src/npysort/x86-qsort.h +++ /dev/null @@ -1,28 +0,0 @@ -#include "numpy/npy_common.h" - -#include "npy_cpu_dispatch.h" - -#ifndef NPY_NO_EXPORT -#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN -#endif - -#ifndef NPY_DISABLE_OPTIMIZATION -#include "x86-qsort.dispatch.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_int, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_uint, - (void *start, npy_intp num)) - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void x86_quicksort_float, - (void *start, npy_intp num)) - -#ifdef __cplusplus -} -#endif diff --git a/numpy/core/src/npysort/x86-simd-sort b/numpy/core/src/npysort/x86-simd-sort new file mode 160000 +Subproject ac6c10cd71a8aaa93996860f33dd81e15447f58 diff --git a/numpy/core/src/umath/_scaled_float_dtype.c b/numpy/core/src/umath/_scaled_float_dtype.c index 2d6ba3574..c26ace9f1 100644 --- a/numpy/core/src/umath/_scaled_float_dtype.c +++ b/numpy/core/src/umath/_scaled_float_dtype.c @@ -26,6 +26,14 @@ #include 
"dispatching.h" +/* TODO: from wrapping_array_method.c, use proper public header eventually */ +NPY_NO_EXPORT int +PyUFunc_AddWrappingLoop(PyObject *ufunc_obj, + PyArray_DTypeMeta *new_dtypes[], PyArray_DTypeMeta *wrapped_dtypes[], + translate_given_descrs_func *translate_given_descrs, + translate_loop_descrs_func *translate_loop_descrs); + + typedef struct { PyArray_Descr base; double scaling; @@ -240,7 +248,7 @@ static PyArray_DTypeMeta PyArray_SFloatDType = {{{ }}, .type_num = -1, .scalar_type = NULL, - .flags = NPY_DT_PARAMETRIC, + .flags = NPY_DT_PARAMETRIC | NPY_DT_NUMERIC, .dt_slots = &sfloat_slots, }; @@ -439,7 +447,7 @@ sfloat_to_bool_resolve_descriptors( static int -init_casts(void) +sfloat_init_casts(void) { PyArray_DTypeMeta *dtypes[2] = {&PyArray_SFloatDType, &PyArray_SFloatDType}; PyType_Slot slots[4] = {{0, NULL}}; @@ -645,13 +653,55 @@ add_sfloats_resolve_descriptors( } +/* + * We define the hypot loop using the "PyUFunc_AddWrappingLoop" API. + * We use this very narrowly for mapping to the double hypot loop currently. + */ static int -add_loop(const char *ufunc_name, - PyArray_DTypeMeta *dtypes[3], PyObject *meth_or_promoter) +translate_given_descrs_to_double( + int nin, int nout, PyArray_DTypeMeta *wrapped_dtypes[], + PyArray_Descr *given_descrs[], PyArray_Descr *new_descrs[]) +{ + assert(nin == 2 && nout == 1); + for (int i = 0; i < 3; i++) { + if (given_descrs[i] == NULL) { + new_descrs[i] = NULL; + } + else { + new_descrs[i] = PyArray_DescrFromType(NPY_DOUBLE); + } + } + return 0; +} + + +static int +translate_loop_descrs( + int nin, int nout, PyArray_DTypeMeta *new_dtypes[], + PyArray_Descr *given_descrs[], + PyArray_Descr *NPY_UNUSED(original_descrs[]), + PyArray_Descr *loop_descrs[]) +{ + assert(nin == 2 && nout == 1); + loop_descrs[0] = sfloat_common_instance( + given_descrs[0], given_descrs[1]); + if (loop_descrs[0] == 0) { + return -1; + } + Py_INCREF(loop_descrs[0]); + loop_descrs[1] = loop_descrs[0]; + Py_INCREF(loop_descrs[0]); + loop_descrs[2] = loop_descrs[0]; + return 0; +} + + +static PyObject * +sfloat_get_ufunc(const char *ufunc_name) { PyObject *mod = PyImport_ImportModule("numpy"); if (mod == NULL) { - return -1; + return NULL; } PyObject *ufunc = PyObject_GetAttrString(mod, ufunc_name); Py_DECREF(mod); @@ -659,6 +709,18 @@ add_loop(const char *ufunc_name, Py_DECREF(ufunc); PyErr_Format(PyExc_TypeError, "numpy.%s was not a ufunc!", ufunc_name); + return NULL; + } + return ufunc; +} + + +static int +sfloat_add_loop(const char *ufunc_name, + PyArray_DTypeMeta *dtypes[3], PyObject *meth_or_promoter) +{ + PyObject *ufunc = sfloat_get_ufunc(ufunc_name); + if (ufunc == NULL) { return -1; } PyObject *dtype_tup = PyArray_TupleFromItems(3, (PyObject **)dtypes, 1); @@ -679,6 +741,24 @@ add_loop(const char *ufunc_name, } +static int +sfloat_add_wrapping_loop(const char *ufunc_name, PyArray_DTypeMeta *dtypes[3]) +{ + PyObject *ufunc = sfloat_get_ufunc(ufunc_name); + if (ufunc == NULL) { + return -1; + } + PyArray_DTypeMeta *double_dt = PyArray_DTypeFromTypeNum(NPY_DOUBLE); + PyArray_DTypeMeta *wrapped_dtypes[3] = {double_dt, double_dt, double_dt}; + int res = PyUFunc_AddWrappingLoop( + ufunc, dtypes, wrapped_dtypes, &translate_given_descrs_to_double, + &translate_loop_descrs); + Py_DECREF(ufunc); + Py_DECREF(double_dt); + + return res; +} + /* * We add some very basic promoters to allow multiplying normal and scaled @@ -706,7 +786,7 @@ promote_to_sfloat(PyUFuncObject *NPY_UNUSED(ufunc), * get less so with the introduction of public API). 
*/ static int -init_ufuncs(void) { +sfloat_init_ufuncs(void) { PyArray_DTypeMeta *dtypes[3] = { &PyArray_SFloatDType, &PyArray_SFloatDType, &PyArray_SFloatDType}; PyType_Slot slots[3] = {{0, NULL}}; @@ -727,7 +807,7 @@ init_ufuncs(void) { if (bmeth == NULL) { return -1; } - int res = add_loop("multiply", + int res = sfloat_add_loop("multiply", bmeth->dtypes, (PyObject *)bmeth->method); Py_DECREF(bmeth); if (res < 0) { @@ -745,13 +825,18 @@ init_ufuncs(void) { if (bmeth == NULL) { return -1; } - res = add_loop("add", + res = sfloat_add_loop("add", bmeth->dtypes, (PyObject *)bmeth->method); Py_DECREF(bmeth); if (res < 0) { return -1; } + /* N.B.: Wrapping isn't actually correct if scaling can be negative */ + if (sfloat_add_wrapping_loop("hypot", dtypes) < 0) { + return -1; + } + /* * Add a promoter for both directions of multiply with double. */ @@ -766,14 +851,14 @@ init_ufuncs(void) { if (promoter == NULL) { return -1; } - res = add_loop("multiply", promoter_dtypes, promoter); + res = sfloat_add_loop("multiply", promoter_dtypes, promoter); if (res < 0) { Py_DECREF(promoter); return -1; } promoter_dtypes[0] = double_DType; promoter_dtypes[1] = &PyArray_SFloatDType; - res = add_loop("multiply", promoter_dtypes, promoter); + res = sfloat_add_loop("multiply", promoter_dtypes, promoter); Py_DECREF(promoter); if (res < 0) { return -1; @@ -814,11 +899,11 @@ get_sfloat_dtype(PyObject *NPY_UNUSED(mod), PyObject *NPY_UNUSED(args)) return NULL; } - if (init_casts() < 0) { + if (sfloat_init_casts() < 0) { return NULL; } - if (init_ufuncs() < 0) { + if (sfloat_init_ufuncs() < 0) { return NULL; } diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src index 1bf459ce6..b427991e5 100644 --- a/numpy/core/src/umath/_umath_tests.c.src +++ b/numpy/core/src/umath/_umath_tests.c.src @@ -9,6 +9,9 @@ #include <Python.h> #define NPY_NO_DEPRECATED_API NPY_API_VERSION +#if defined(NPY_INTERNAL_BUILD) +#undef NPY_INTERNAL_BUILD +#endif #include "numpy/arrayobject.h" #include "numpy/ufuncobject.h" #include "numpy/npy_math.h" @@ -19,6 +22,9 @@ #include "npy_cpu_features.h" #include "npy_cpu_dispatch.h" #include "numpy/npy_cpu.h" +#include "npy_import.h" +#include "numpy/experimental_dtype_api.h" + /* ***************************************************************************** @@ -300,7 +306,7 @@ static void ptr_this += stride_d; ptr_that += stride_d; } - *(@typ@ *)data_out = npy_@sqrt_func@(out); + *(@typ@ *)data_out = @sqrt_func@(out); data_that += stride_n; data_out += stride_p; } @@ -343,6 +349,50 @@ static void /**end repeat**/ +static int +INT32_negative(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + npy_intp di = dimensions[0]; + npy_intp i; + npy_intp is=steps[0], os=steps[1]; + char *ip=args[0], *op=args[1]; + for (i = 0; i < di; i++, ip += is, op += os) { + if (i == 3) { + *(int32_t *)op = - 100; + } else { + *(int32_t *)op = - *(int32_t *)ip; + } + } + return 0; +} + + +static int +INT32_negative_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char * const*args, npy_intp const *dimensions, + npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + npy_intp is1 = steps[0], isindex = steps[1]; + npy_intp n = dimensions[0]; + npy_intp i; + int32_t *indexed; + for(i = 0; i < n; i++, indx += isindex) { + indexed = (int32_t *)(ip1 + is1 * *(npy_intp *)indx); + if (i == 3) { + *indexed = -200; + } else { + *indexed = - 
*indexed; + } + } + return 0; +} + + + /* The following lines were generated using a slightly modified version of code_generators/generate_umath.py and adding these lines to defdict: @@ -671,6 +721,43 @@ err: return NULL; } +static int +add_INT32_negative_indexed(PyObject *module, PyObject *dict) { + if (import_experimental_dtype_api(__EXPERIMENTAL_DTYPE_API_VERSION) < 0) { + return -1; + } + + PyObject * negative = PyUFunc_FromFuncAndData(NULL, NULL, NULL, 0, 1, 1, + PyUFunc_Zero, "indexed_negative", NULL, 0); + if (negative == NULL) { + return -1; + } + PyArray_DTypeMeta *dtypes[] = {&PyArray_Int32DType, &PyArray_Int32DType}; + + PyType_Slot slots[] = { + {NPY_METH_contiguous_indexed_loop, INT32_negative_indexed}, + {NPY_METH_strided_loop, INT32_negative}, + {0, NULL} + }; + + PyArrayMethod_Spec spec = { + .name = "negative_indexed_loop", + .nin = 1, + .nout = 1, + .dtypes = dtypes, + .slots = slots, + .flags = NPY_METH_NO_FLOATINGPOINT_ERRORS + }; + + if (PyUFunc_AddLoopFromSpec(negative, &spec) < 0) { + Py_DECREF(negative); + return -1; + } + PyDict_SetItemString(dict, "indexed_negative", negative); + Py_DECREF(negative); + return 0; +} + static PyMethodDef UMath_TestsMethods[] = { {"test_signature", UMath_Tests_test_signature, METH_VARARGS, "Test signature parsing of ufunc. \n" @@ -733,5 +820,13 @@ PyMODINIT_FUNC PyInit__umath_tests(void) { "cannot load _umath_tests module."); return NULL; } + + if (add_INT32_negative_indexed(m, d) < 0) { + Py_DECREF(m); + PyErr_Print(); + PyErr_SetString(PyExc_RuntimeError, + "cannot load _umath_tests module."); + return NULL; + } return m; } diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c index 6d6c481fb..fee77b2d6 100644 --- a/numpy/core/src/umath/dispatching.c +++ b/numpy/core/src/umath/dispatching.c @@ -914,8 +914,8 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc, int nin = ufunc->nin, nargs = ufunc->nargs; /* - * Get the actual DTypes we operate with by mixing the operand array - * ones with the passed signature. + * Get the actual DTypes we operate with by setting op_dtypes[i] from + * signature[i]. */ for (int i = 0; i < nargs; i++) { if (signature[i] != NULL) { @@ -1240,3 +1240,40 @@ install_logical_ufunc_promoter(PyObject *ufunc) return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0); } + +/* + * Return the PyArrayMethodObject or PyCapsule that matches a registered + * tuple of identical dtypes. Return a borrowed ref of the first match. 
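+ * + * Illustrative call (hypothetical, not part of this patch): for a binary + * ufunc there are three dtypes (nin = 2, nout = 1), so an exact-match lookup + * could read `info = get_info_no_cast(ufunc, dtype_meta, 3);`, where a + * Py_None result means no matching loop is registered.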
+ */ +NPY_NO_EXPORT PyObject * +get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype, + int ndtypes) +{ + PyObject *t_dtypes = PyTuple_New(ndtypes); + if (t_dtypes == NULL) { + return NULL; + } + for (int i=0; i < ndtypes; i++) { + PyTuple_SetItem(t_dtypes, i, (PyObject *)op_dtype); + } + PyObject *loops = ufunc->_loops; + Py_ssize_t length = PyList_Size(loops); + for (Py_ssize_t i = 0; i < length; i++) { + PyObject *item = PyList_GetItem(loops, i); + PyObject *cur_DType_tuple = PyTuple_GetItem(item, 0); + int cmp = PyObject_RichCompareBool(cur_DType_tuple, + t_dtypes, Py_EQ); + if (cmp < 0) { + Py_DECREF(t_dtypes); + return NULL; + } + if (cmp == 0) { + continue; + } + /* Got the match */ + Py_DECREF(t_dtypes); + return PyTuple_GetItem(item, 1); + } + Py_DECREF(t_dtypes); + Py_RETURN_NONE; +} diff --git a/numpy/core/src/umath/fast_loop_macros.h b/numpy/core/src/umath/fast_loop_macros.h index 5274957f7..b8c1926b2 100644 --- a/numpy/core/src/umath/fast_loop_macros.h +++ b/numpy/core/src/umath/fast_loop_macros.h @@ -12,6 +12,19 @@ #include <assert.h> +#include "simd/simd.h" + +/* + * largest simd vector size in bytes numpy supports + * it is currently an extremely large value as it is only used for memory + * overlap checks + */ +#if NPY_SIMD > 0 + // Enough for compiler unroll + #define AUTOVEC_OVERLAP_SIZE NPY_SIMD_WIDTH*4 +#else + #define AUTOVEC_OVERLAP_SIZE 1024 +#endif /* * MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc. * Very large step size can be as slow as processing it using scalar. The @@ -219,11 +232,11 @@ abs_ptrdiff(char *a, char *b) /* condition allows compiler to optimize the generic macro */ \ if (IS_BINARY_CONT(tin, tout)) { \ if (abs_ptrdiff(args[2], args[0]) == 0 && \ - abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \ + abs_ptrdiff(args[2], args[1]) >= AUTOVEC_OVERLAP_SIZE) { \ BASE_BINARY_LOOP_INP(tin, tout, op) \ } \ else if (abs_ptrdiff(args[2], args[1]) == 0 && \ - abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \ + abs_ptrdiff(args[2], args[0]) >= AUTOVEC_OVERLAP_SIZE) { \ BASE_BINARY_LOOP_INP(tin, tout, op) \ } \ else { \ @@ -378,19 +391,6 @@ abs_ptrdiff(char *a, char *b) #undef abs_ptrdiff -#define IS_BLOCKABLE_BINARY_BOOL(esize, vsize) \ - (steps[0] == (esize) && steps[0] == steps[1] && steps[2] == (1) && \ - npy_is_aligned(args[1], (esize)) && \ - npy_is_aligned(args[0], (esize))) - -#define IS_BLOCKABLE_BINARY_SCALAR1_BOOL(esize, vsize) \ - (steps[0] == 0 && steps[1] == (esize) && steps[2] == (1) && \ - npy_is_aligned(args[1], (esize))) - -#define IS_BLOCKABLE_BINARY_SCALAR2_BOOL(esize, vsize) \ - (steps[0] == (esize) && steps[1] == 0 && steps[2] == (1) && \ - npy_is_aligned(args[0], (esize))) - /* align var to alignment */ #define LOOP_BLOCK_ALIGN_VAR(var, type, alignment)\ npy_intp i, peel = npy_aligned_block_offset(var, sizeof(type),\ diff --git a/numpy/core/src/umath/legacy_array_method.c b/numpy/core/src/umath/legacy_array_method.c index 56ac96598..965a0eb83 100644 --- a/numpy/core/src/umath/legacy_array_method.c +++ b/numpy/core/src/umath/legacy_array_method.c @@ -13,10 +13,13 @@ #include "convert_datatype.h" #include "array_method.h" +#include "array_coercion.h" #include "dtype_transfer.h" #include "legacy_array_method.h" #include "dtypemeta.h" +#include "ufunc_object.h" + typedef struct { NpyAuxData base; @@ -91,7 +94,6 @@ generic_wrapped_legacy_loop(PyArrayMethod_Context *NPY_UNUSED(context), return 0; } - /* * Signal that the old type-resolution function must be used to resolve * the
descriptors (mainly/only used for datetimes due to the unit). @@ -234,6 +236,97 @@ get_wrapped_legacy_ufunc_loop(PyArrayMethod_Context *context, } + + +/* + * We can shave off a bit of time by just caching the initial and this is + * trivial for all internal numeric types. (Wrapped ufuncs never use + * byte-swapping.) + */ +static int +copy_cached_initial( + PyArrayMethod_Context *context, npy_bool NPY_UNUSED(reduction_is_empty), + char *initial) +{ + memcpy(initial, context->method->legacy_initial, + context->descriptors[0]->elsize); + return 1; +} + + +/* + * The default `get_reduction_initial` attempts to look up the identity + * from the calling ufunc. This might fail, so we only call it when necessary. + * + * For internal number dtypes, we can easily cache it, so do so after the + * first call by overriding the function with `copy_cached_initial`. + * This path is not publicly available. That could be added, and for a + * custom initial getter it should be static/compile time data anyway. + */ +static int +get_initial_from_ufunc( + PyArrayMethod_Context *context, npy_bool reduction_is_empty, + char *initial) +{ + if (context->caller == NULL + || !PyObject_TypeCheck(context->caller, &PyUFunc_Type)) { + /* Impossible in NumPy 1.24; guard in case it becomes possible. */ + PyErr_SetString(PyExc_ValueError, + "getting initial failed because it can only be done for legacy " + "ufunc loops when the ufunc is provided."); + return -1; + } + npy_bool reorderable; + PyObject *identity_obj = PyUFunc_GetDefaultIdentity( + (PyUFuncObject *)context->caller, &reorderable); + if (identity_obj == NULL) { + return -1; + } + if (identity_obj == Py_None) { + /* UFunc has no identity (should not happen) */ + Py_DECREF(identity_obj); + return 0; + } + if (PyTypeNum_ISUNSIGNED(context->descriptors[1]->type_num) + && PyLong_CheckExact(identity_obj)) { + /* + * This is a bit of a hack until we have truly loop specific + * identities. Python -1 cannot be cast to unsigned so convert + * it to a NumPy scalar, but we use -1 for bitwise functions to + * signal all 1s. + * (A builtin identity would not overflow here, although we may + * unnecessarily convert 0 and 1.) + */ + Py_SETREF(identity_obj, PyObject_CallFunctionObjArgs( + (PyObject *)&PyLongArrType_Type, identity_obj, NULL)); + if (identity_obj == NULL) { + return -1; + } + } + else if (context->descriptors[0]->type_num == NPY_OBJECT + && !reduction_is_empty) { + /* Allows `sum([object()])` to work, but use 0 when empty. */ + Py_DECREF(identity_obj); + return 0; + } + + int res = PyArray_Pack(context->descriptors[0], initial, identity_obj); + Py_DECREF(identity_obj); + if (res < 0) { + return -1; + } + + if (PyTypeNum_ISNUMBER(context->descriptors[0]->type_num)) { + /* For numbers we can cache to avoid going via Python ints */ + memcpy(context->method->legacy_initial, initial, + context->descriptors[0]->elsize); + context->method->get_reduction_initial = &copy_cached_initial; + } + + /* Reduction can use the initial value */ + return 1; +} + + /* * Get the unbound ArrayMethod which wraps the instances of the ufunc.
* Note that this function stores the result on the ufunc and then only @@ -273,6 +366,27 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc, flags = _NPY_METH_FORCE_CAST_INPUTS; } + get_reduction_initial_function *get_reduction_initial = NULL; + if (ufunc->nin == 2 && ufunc->nout == 1) { + npy_bool reorderable = NPY_FALSE; + PyObject *identity_obj = PyUFunc_GetDefaultIdentity( + ufunc, &reorderable); + if (identity_obj == NULL) { + return NULL; + } + /* + * TODO: For object, "reorderable" is needed(?), because otherwise + * we disable multi-axis reductions `arr.sum(0, 1)`. But for + * `arr = array([["a", "b"], ["c", "d"]], dtype="object")` + * it isn't actually reorderable (order changes result). + */ + if (reorderable) { + flags |= NPY_METH_IS_REORDERABLE; + } + if (identity_obj != Py_None) { + get_reduction_initial = &get_initial_from_ufunc; + } + } for (int i = 0; i < ufunc->nin+ufunc->nout; i++) { if (signature[i]->singleton->flags & ( NPY_ITEM_REFCOUNT | NPY_ITEM_IS_POINTER | NPY_NEEDS_PYAPI)) { @@ -283,9 +397,10 @@ PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc, } } - PyType_Slot slots[3] = { - {NPY_METH_get_loop, &get_wrapped_legacy_ufunc_loop}, + PyType_Slot slots[4] = { + {_NPY_METH_get_loop, &get_wrapped_legacy_ufunc_loop}, {NPY_METH_resolve_descriptors, &simple_legacy_resolve_descriptors}, + {NPY_METH_get_reduction_initial, get_reduction_initial}, {0, NULL}, }; if (any_output_flexible) { diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 0b4856847..97a74b425 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -13,6 +13,7 @@ #include "numpy/npy_math.h" #include "numpy/halffloat.h" #include "lowlevel_strided_loops.h" +#include "loops_utils.h" #include "npy_pycompat.h" @@ -31,27 +32,9 @@ */ #define PW_BLOCKSIZE 128 - -/* - * largest simd vector size in bytes numpy supports - * it is currently a extremely large value as it is only used for memory - * overlap checks - */ -#ifndef NPY_MAX_SIMD_SIZE -#define NPY_MAX_SIMD_SIZE 1024 -#endif - /** Provides the various *_LOOP macros */ #include "fast_loop_macros.h" -/* - * include vectorized functions and dispatchers - * this file is safe to include also for generic builds - * platform specific instructions are either masked via the proprocessor or - * runtime detected - */ -#include "simd.inc" - /****************************************************************************** ** GENERIC FLOAT LOOPS ** *****************************************************************************/ @@ -416,98 +399,6 @@ PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, vo ***************************************************************************** */ -/**begin repeat - * #kind = logical_and, logical_or# - * #OP = &&, ||# - * #SC = ==, !=# - * #and = 1, 0# - **/ - -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if(IS_BINARY_REDUCE) { -#ifdef NPY_HAVE_SSE2_INTRINSICS - /* - * stick with our variant for more reliable performance, only known - * platform which outperforms it by ~20% is an i7 with glibc 2.17 - */ - if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) { - return; - } -#else - /* for now only use libc on 32-bit/non-x86 */ - if (steps[1] == 1) { - npy_bool * op = (npy_bool *)args[0]; -#if @and@ - /* np.all(), search for a zero (false) */ - if (*op) { - *op = memchr(args[1], 0, dimensions[0]) == NULL; - } -#else - /* - * np.any(), search for a non-zero
(true) via comparing against - * zero blocks, memcmp is faster than memchr on SSE4 machines - * with glibc >= 2.12 and memchr can only check for equal 1 - */ - static const npy_bool zero[4096]; /* zero by C standard */ - npy_uintp i, n = dimensions[0]; - - for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) { - *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0; - } - if (!*op && n - i > 0) { - *op = memcmp(&args[1][i], zero, n - i) != 0; - } -#endif - return; - } -#endif - else { - BINARY_REDUCE_LOOP(npy_bool) { - const npy_bool in2 = *(npy_bool *)ip2; - io1 = io1 @OP@ in2; - if (io1 @SC@ 0) { - break; - } - } - *((npy_bool *)iop1) = io1; - } - } - else { - if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) { - return; - } - else { - BINARY_LOOP { - const npy_bool in1 = *(npy_bool *)ip1; - const npy_bool in2 = *(npy_bool *)ip2; - *((npy_bool *)op1) = in1 @OP@ in2; - } - } - } -} -/**end repeat**/ - -/**begin repeat - * #kind = absolute, logical_not# - * #OP = !=, ==# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) { - return; - } - else { - UNARY_LOOP { - npy_bool in1 = *(npy_bool *)ip1; - *((npy_bool *)op1) = in1 @OP@ 0; - } - } -} -/**end repeat**/ - NPY_NO_EXPORT void BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { @@ -516,24 +407,6 @@ BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, } } - -/**begin repeat - * #kind = isnan, isinf, isfinite# - * #func = npy_isnan, npy_isinf, npy_isfinite# - * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* - * The (void)in; suppresses an unused variable warning raised by gcc and allows - * us to re-use this macro even though we do not depend on in - */ - UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = @val@); -} - -/**end repeat**/ - /* ***************************************************************************** ** INTEGER LOOPS @@ -552,8 +425,11 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void */ #define @TYPE@_floor_divide @TYPE@_divide +#define @TYPE@_floor_divide_indexed @TYPE@_divide_indexed #define @TYPE@_fmax @TYPE@_maximum +#define @TYPE@_fmax_indexed @TYPE@_maximum_indexed #define @TYPE@_fmin @TYPE@_minimum +#define @TYPE@_fmin_indexed @TYPE@_minimum_indexed NPY_NO_EXPORT void @TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) @@ -562,155 +438,30 @@ NPY_NO_EXPORT void *((@type@ *)op1) = 1; } } - -NPY_NO_EXPORT void -@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = +in); -} - /**begin repeat1 - * #isa = , _avx2# - * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)# - * #ATTR = , NPY_GCC_TARGET_AVX2# - */ - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in * in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT 
NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, npy_bool, *out = !in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = ~in); -} -#endif - -/**begin repeat2 * Arithmetic * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor# * #OP = +, -, *, &, |, ^# */ -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2); - } - else { - BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2); - } -} -#endif - -/**end repeat2**/ - -/* - * Arithmetic bit shift operations. - * - * Intel hardware masks bit shift values, so large shifts wrap around - * and can produce surprising results. The special handling ensures that - * behavior is independent of compiler or hardware. - * TODO: We could implement consistent behavior for negative shifts, - * which is undefined in C. - */ - -#define INT_left_shift_needs_clear_floatstatus -#define UINT_left_shift_needs_clear_floatstatus - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_left_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, - void *NPY_UNUSED(func)) -{ - BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); - -#ifdef @TYPE@_left_shift_needs_clear_floatstatus - // For some reason, our macOS CI sets an "invalid" flag here, but only - // for some types. 
- npy_clear_floatstatus_barrier((char*)dimensions); -#endif -} -#endif - -#undef INT_left_shift_needs_clear_floatstatus -#undef UINT_left_shift_needs_clear_floatstatus - -#if @CHK@ -NPY_NO_EXPORT -#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift -NPY_GCC_OPT_3 -#endif -void -@TYPE@_right_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, - void *NPY_UNUSED(func)) -{ - BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); -} -#endif - -/**begin repeat2 - * #kind = logical_and, logical_or# - * #OP = &&, ||# - */ - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* - * gcc vectorization of this is not good (PR60575) but manual integer - * vectorization is too tedious to be worthwhile - */ - BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2); -} -#endif - -/**end repeat2**/ - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void -@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +NPY_NO_EXPORT NPY_GCC_OPT_3 int +@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) { - BINARY_LOOP { - const int t1 = !!*(@type@ *)ip1; - const int t2 = !!*(@type@ *)ip2; - *((npy_bool *)op1) = (t1 != t2); + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + *indexed = *indexed @OP@ *(@type@ *)value; } + return 0; } -#endif - /**end repeat1**/ NPY_NO_EXPORT void @@ -752,23 +503,6 @@ NPY_NO_EXPORT void *((@type@ *) op1) = out; } } - -/**begin repeat1 - * #kind = isnan, isinf, isfinite# - * #func = npy_isnan, npy_isinf, npy_isfinite# - * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* - * The (void)in; suppresses an unused variable warning raised by gcc and allows - * us to re-use this macro even though we do not depend on in - */ - UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@); -} -/**end repeat1**/ - /**end repeat**/ /**begin repeat @@ -776,19 +510,6 @@ NPY_NO_EXPORT void * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# * #c = ,,,l,ll# */ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in); -} - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? 
-1 : 0)); -} - /**begin repeat1 * #kind = gcd, lcm# **/ @@ -802,7 +523,6 @@ NPY_NO_EXPORT void } } /**end repeat1**/ - /**end repeat**/ /**begin repeat @@ -810,19 +530,6 @@ NPY_NO_EXPORT void * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# * #c = u,u,u,ul,ull# */ - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in); -} - -NPY_NO_EXPORT NPY_GCC_OPT_3 void -@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0); -} - /**begin repeat1 * #kind = gcd, lcm# **/ @@ -836,9 +543,50 @@ NPY_NO_EXPORT void } } /**end repeat1**/ +/**end repeat**/ + + +/* + * NOTE: It may be nice to vectorize these, OTOH, these are still faster + * than the cast we used to do. + */ + +/**begin repeat + * #kind = equal, not_equal, less, less_equal, greater, greater_equal# + * #OP = ==, !=, <, <=, >, >=# + */ +NPY_NO_EXPORT void +LONGLONG_Qq_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + BINARY_LOOP { + const npy_ulonglong in1 = *(npy_ulonglong *)ip1; + const npy_longlong in2 = *(npy_longlong *)ip2; + if (in2 < 0) { + *(npy_bool *)op1 = 0 @OP@ in2; + } + else { + *(npy_bool *)op1 = in1 @OP@ (npy_ulonglong)in2; + } + } +} +NPY_NO_EXPORT void +LONGLONG_qQ_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + BINARY_LOOP { + const npy_longlong in1 = *(npy_longlong *)ip1; + const npy_ulonglong in2 = *(npy_ulonglong *)ip2; + if (in1 < 0) { + *(npy_bool *)op1 = in1 @OP@ 0; + } + else { + *(npy_bool *)op1 = (npy_ulonglong)in1 @OP@ in2; + } + } +} /**end repeat**/ + /* ***************************************************************************** ** DATETIME LOOPS ** @@ -915,12 +663,6 @@ NPY_NO_EXPORT void } NPY_NO_EXPORT void -@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(npy_bool, npy_bool, (void)in; *out = NPY_FALSE); -} - -NPY_NO_EXPORT void @TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { OUTPUT_LOOP { @@ -979,6 +721,7 @@ NPY_NO_EXPORT void } } } + /**end repeat1**/ /**begin repeat1 @@ -1398,6 +1141,8 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const * * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #c = f, , l# * #C = F, , L# + * #fd = 1, 1, 0# + * #VCHK = 1, 1, 0# */ /**begin repeat1 * #kind = logical_and, logical_or# @@ -1434,32 +1179,22 @@ NPY_NO_EXPORT void } } +#if !@fd@ /**begin repeat1 * #kind = isnan, isinf, isfinite, signbit# * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit# **/ - -/**begin repeat2 - * #ISA = , _avx512_skx# - * #isa = simd, avx512_skx# - * #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX512_SKX)# - **/ - -#if @CHK@ NPY_NO_EXPORT void -@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - if (!run_@kind@_@isa@_@TYPE@(args, dimensions, steps)) { - UNARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - *((npy_bool *)op1) = @func@(in1) != 0; - } + UNARY_LOOP { + const @type@ in1 = *(@type@ *)ip1; + *((npy_bool *)op1) = @func@(in1) != 0; } npy_clear_floatstatus_barrier((char*)dimensions); } -#endif -/**end repeat2**/ /**end 
repeat1**/ +#endif NPY_NO_EXPORT void @TYPE@_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) @@ -1500,6 +1235,25 @@ NPY_NO_EXPORT void } } +NPY_NO_EXPORT int +@TYPE@_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + *indexed = npy_floor_divide@c@(*indexed, *(@type@ *)value); + } + return 0; +} + NPY_NO_EXPORT void @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -1634,6 +1388,26 @@ LONGDOUBLE_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps } } } + +NPY_NO_EXPORT int +LONGDOUBLE_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + npy_longdouble *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (npy_longdouble *)(ip1 + is1 * *(npy_intp *)indx); + *indexed = *indexed @OP@ *(npy_longdouble *)value; + } + return 0; +} + /**end repeat**/ /**begin repeat @@ -1739,6 +1513,26 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void } } } + +NPY_NO_EXPORT int +HALF_@kind@_indexed(void *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + npy_half *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx); + const float v = npy_half_to_float(*(npy_half *)value); + *indexed = npy_float_to_half(npy_half_to_float(*indexed) @OP@ v); + } + return 0; +} /**end repeat**/ #define _HALF_LOGICAL_AND(a,b) (!npy_half_iszero(a) && !npy_half_iszero(b)) @@ -1840,6 +1634,27 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void } /* npy_half_isnan will never set floatstatus_invalid, so do not clear */ } + +NPY_NO_EXPORT int +HALF_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + npy_half *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx); + npy_half v = *(npy_half *)value; + *indexed = (@OP@(*indexed, v) || npy_half_isnan(*indexed)) ? *indexed : v; + } + return 0; +} + /**end repeat**/ /**begin repeat @@ -1855,8 +1670,29 @@ HALF_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void const npy_half in2 = *(npy_half *)ip2; *((npy_half *)op1) = (@OP@(in1, in2) || npy_half_isnan(in2)) ? 
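
/*
 * A sketch of the widen/compute/narrow pattern the HALF indexed loops above
 * follow. The converter prototypes below are assumed stand-ins for NumPy's
 * binary16 helpers, and floorf(a / b) only approximates the library's
 * floor-divide, which additionally handles sign and rounding corner cases.
 */
#include <math.h>
#include <stdint.h>

typedef uint16_t half_bits;            /* IEEE binary16 stored as raw bits */

float     half_to_float(half_bits h);  /* assumed conversion helpers */
half_bits float_to_half(float f);

static half_bits
half_floor_divide(half_bits a, half_bits b)
{
    /* no native binary16 arithmetic: widen to float, compute, narrow back */
    float q = floorf(half_to_float(a) / half_to_float(b));
    return float_to_half(q);
}
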
in1 : in2; } - /* npy_half_isnan will never set floatstatus_invalid, so do not clear */ + /* no need to clear floatstatus_invalid */ } + +NPY_NO_EXPORT int +HALF_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + npy_half *indexed; + for (i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx); + npy_half v = *(npy_half *)value; + *indexed = (@OP@(*indexed, v) || npy_half_isnan(v)) ? *indexed: v; + } + return 0; +} + /**end repeat**/ NPY_NO_EXPORT void @@ -1875,6 +1711,27 @@ HALF_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps } } +NPY_NO_EXPORT int +HALF_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), + char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + npy_half *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (npy_half *)(ip1 + is1 * *(npy_intp *)indx); + float v = npy_half_to_float(*(npy_half *)value); + float div = npy_floor_dividef(npy_half_to_float(*indexed), v); + *indexed = npy_float_to_half(div); + } + return 0; +} + NPY_NO_EXPORT void HALF_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -1934,12 +1791,6 @@ HALF_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, v } } -NPY_NO_EXPORT NPY_GCC_OPT_3 void -HALF_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu); -} - NPY_NO_EXPORT void HALF_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -2091,6 +1942,26 @@ NPY_NO_EXPORT void } } } + +NPY_NO_EXPORT int @TYPE@_@kind@_indexed +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @ftype@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx); + const @ftype@ b_r = ((@ftype@ *)value)[0]; + const @ftype@ b_i = ((@ftype@ *)value)[1]; + indexed[0] @OP@= b_r; + indexed[1] @OP@= b_i; + } + return 0; +} /**end repeat1**/ NPY_NO_EXPORT void @@ -2105,6 +1976,28 @@ NPY_NO_EXPORT void ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r; } } + +NPY_NO_EXPORT int @TYPE@_multiply_indexed +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @ftype@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx); + const @ftype@ a_r = indexed[0]; + const @ftype@ a_i = indexed[1]; + const 
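
/*
 * A scalar sketch of the interleaved complex layout the C@TYPE@ indexed
 * loops above rely on: one complex number is two adjacent elements, real
 * at [0] and imaginary at [1]. The helper name is hypothetical.
 */
static void
cmul_inplace(float *a /* {re, im} */, const float *b /* {re, im} */)
{
    const float a_r = a[0], a_i = a[1];
    const float b_r = b[0], b_i = b[1];
    a[0] = a_r * b_r - a_i * b_i;  /* real part */
    a[1] = a_r * b_i + a_i * b_r;  /* imaginary part */
}
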
@ftype@ b_r = ((@ftype@ *)value)[0];
+        const @ftype@ b_i = ((@ftype@ *)value)[1];
+        indexed[0] = a_r*b_r - a_i*b_i;
+        indexed[1] = a_r*b_i + a_i*b_r;
+    }
+    return 0;
+}
 #endif // !SIMD
 
 NPY_NO_EXPORT void
@@ -2216,6 +2109,8 @@ NPY_NO_EXPORT void
 }
 /**end repeat1**/
 
+#if !@SIMD@
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
 NPY_NO_EXPORT void
 @TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
@@ -2226,6 +2121,7 @@ NPY_NO_EXPORT void
         ((@ftype@ *)op1)[1] = in1r*in1i + in1i*in1r;
     }
 }
+#endif
 
 NPY_NO_EXPORT void
 @TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
 {
@@ -2256,6 +2152,8 @@ NPY_NO_EXPORT void
     }
 }
 
+#if !@SIMD@
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
 NPY_NO_EXPORT void
 @TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     UNARY_LOOP {
@@ -2275,20 +2173,6 @@ NPY_NO_EXPORT void
         *((@ftype@ *)op1) = npy_hypot@c@(in1r, in1i);
     }
 }
-
-#if @SIMD@ && defined(HAVE_ATTRIBUTE_TARGET_AVX512F)
-/**begin repeat1
- * arithmetic
- * #kind = conjugate, square, absolute#
- */
-NPY_NO_EXPORT void
-@TYPE@_@kind@_avx512f(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
-{
-    if (!run_unary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
-        @TYPE@_@kind@(args, dimensions, steps, func);
-    }
-}
-/**end repeat1**/
 #endif
 
 NPY_NO_EXPORT void
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index e3a410968..cce73aff8 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -10,24 +10,32 @@
 #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
 #endif
 
-#define BOOL_invert BOOL_logical_not
-#define BOOL_add BOOL_logical_or
-#define BOOL_bitwise_and BOOL_logical_and
-#define BOOL_bitwise_or BOOL_logical_or
-#define BOOL_logical_xor BOOL_not_equal
-#define BOOL_bitwise_xor BOOL_logical_xor
-#define BOOL_multiply BOOL_logical_and
-#define BOOL_maximum BOOL_logical_or
-#define BOOL_minimum BOOL_logical_and
-#define BOOL_fmax BOOL_maximum
-#define BOOL_fmin BOOL_minimum
-
 /*
 *****************************************************************************
 **                             BOOLEAN LOOPS                               **
 *****************************************************************************
 */
+/*
+ * The following functions are defined by the umath generator
+ * to enable runtime dispatching without the need
+ * to redefine them within dispatch-able sources.
+ */ +// #define BOOL_invert BOOL_logical_not +// #define BOOL_add BOOL_logical_or +// #define BOOL_bitwise_and BOOL_logical_and +// #define BOOL_bitwise_or BOOL_logical_or +// #define BOOL_logical_xor BOOL_not_equal +// #define BOOL_bitwise_xor BOOL_logical_xor +// #define BOOL_multiply BOOL_logical_and +// #define BOOL_maximum BOOL_logical_or +// #define BOOL_minimum BOOL_logical_and +// #define BOOL_fmax BOOL_maximum +// #define BOOL_fmin BOOL_minimum + +typedef struct PyArrayMethod_Context_tag PyArrayMethod_Context; +typedef struct NpyAuxData_tag NpyAuxData; + #ifndef NPY_DISABLE_OPTIMIZATION #include "loops_comparison.dispatch.h" #endif @@ -39,11 +47,15 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_logical.dispatch.h" +#endif + /**begin repeat - * #kind = logical_and, logical_or, absolute, logical_not# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + * #kind = logical_and, logical_or, logical_not, absolute# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) /**end repeat**/ NPY_NO_EXPORT void @@ -56,6 +68,17 @@ NPY_NO_EXPORT void BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_autovec.dispatch.h" +#endif +/**begin repeat + * #kind = isnan, isinf, isfinite# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + /* ***************************************************************************** ** INTEGER LOOPS @@ -72,6 +95,11 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +NPY_NO_EXPORT int +@TYPE@_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + +/**end repeat3**/ /**end repeat**/ #ifndef NPY_DISABLE_OPTIMIZATION @@ -106,19 +134,40 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**end repeat1**/ /**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_autovec.dispatch.h" +#endif /**begin repeat - * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + BYTE, SHORT, INT, LONG, LONGLONG# */ +/**begin repeat1 + * #kind = invert, logical_not, conjugate, reciprocal, square, add, + * subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, + * left_shift, right_shift, logical_and, logical_or, + * logical_xor, isnan, isinf, isfinite, + * absolute, sign# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ +/**begin repeat + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + */ /**begin repeat1 * both signed and unsigned integer types * #s = , u# * #S = , U# */ - #define @S@@TYPE@_floor_divide @S@@TYPE@_divide +#define @S@@TYPE@_floor_divide_indexed @S@@TYPE@_divide_indexed #define @S@@TYPE@_fmax 
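
/*
 * A deliberately simplified, hand-rolled analogy (an assumption, not
 * NumPy's actual machinery) for what the NPY_CPU_DISPATCH_DECLARE /
 * *.dispatch.h pairs above achieve: the build compiles one symbol per
 * enabled target, and a resolver picks the best one the running CPU
 * supports. All names below are invented for illustration.
 */
#include <stddef.h>

typedef void (*loop_func)(char **args, const ptrdiff_t *dimensions,
                          const ptrdiff_t *steps, void *data);

void BOOL_logical_and_baseline(char **, const ptrdiff_t *, const ptrdiff_t *, void *);
void BOOL_logical_and_AVX2(char **, const ptrdiff_t *, const ptrdiff_t *, void *);
int  cpu_supports_avx2(void);   /* assumed runtime feature probe */

static loop_func
resolve_BOOL_logical_and(void)
{
    /* resolved once, then cached in the ufunc's inner-loop table */
    return cpu_supports_avx2() ? BOOL_logical_and_AVX2
                               : BOOL_logical_and_baseline;
}
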
@S@@TYPE@_maximum #define @S@@TYPE@_fmin @S@@TYPE@_minimum +#define @S@@TYPE@_fmax_indexed @S@@TYPE@_maximum_indexed +#define @S@@TYPE@_fmin_indexed @S@@TYPE@_minimum_indexed NPY_NO_EXPORT void @S@@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); @@ -127,83 +176,53 @@ NPY_NO_EXPORT void @S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**begin repeat2 - * #isa = , _avx2# - */ - -NPY_NO_EXPORT void -@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); - -NPY_NO_EXPORT void -@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void -@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat3 * Arithmetic * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, * left_shift, right_shift# * #OP = +, -,*, &, |, ^, <<, >># */ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat3**/ - -/**begin repeat3 - * #kind = logical_and, logical_or# - * #OP = &&, ||# - */ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +NPY_NO_EXPORT int +@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, + npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); -/**end repeat3**/ - -NPY_NO_EXPORT void -@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat2**/ /**begin repeat2 * #kind = maximum, minimum# - * #OP = >, <# **/ NPY_NO_EXPORT void @S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); -/**end repeat2**/ -NPY_NO_EXPORT void -@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +NPY_NO_EXPORT int +@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); -NPY_NO_EXPORT void -@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ NPY_NO_EXPORT void -@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void @S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void @S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**begin repeat2 - * #kind = isnan, isinf, isfinite# - **/ -NPY_NO_EXPORT void -@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat2**/ /**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #kind = equal, not_equal, less, less_equal, greater, 
greater_equal# + * #OP = ==, !=, <, <=, >, >=# + */ +NPY_NO_EXPORT void +LONGLONG_Qq_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +LONGLONG_qQ_bool_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); /**end repeat**/ - + #ifndef NPY_DISABLE_OPTIMIZATION #include "loops_unary.dispatch.h" #endif @@ -240,6 +259,20 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**end repeat**/ #ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary_fp_le.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION #include "loops_unary.dispatch.h" #endif /**begin repeat @@ -262,10 +295,13 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**begin repeat1 * Arithmetic * # kind = add, subtract, multiply, divide# - * # OP = +, -, *, /# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +NPY_NO_EXPORT int +@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + /**end repeat1**/ /**end repeat**/ @@ -311,15 +347,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_@func@, /**end repeat**/ /**begin repeat - * #func = sin, cos# - */ - -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@, - (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) - -/**end repeat**/ - -/**begin repeat * #TYPE = FLOAT, DOUBLE# */ /**begin repeat1 @@ -332,27 +359,20 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, /**end repeat1**/ /**end repeat**/ -/**begin repeat - * #TYPE = FLOAT, DOUBLE# - */ -/**begin repeat1 - * #func = maximum, minimum# - */ -NPY_NO_EXPORT void -@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -/**end repeat1**/ -/**end repeat**/ - #ifndef NPY_DISABLE_OPTIMIZATION #include "loops_trigonometric.dispatch.h" #endif + /**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 * #func = sin, cos# */ -NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, ( +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, ( char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) )) +/**end repeat1**/ /**end repeat**/ #ifndef NPY_DISABLE_OPTIMIZATION @@ -391,6 +411,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( * #c = f, f, , l# * #C = F, F, , L# * #half = 1, 0, 0, 0# + * #fd = 0, 1, 1, 0# */ /**begin repeat1 @@ -400,6 +421,11 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( */ NPY_NO_EXPORT void @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT int +@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + +/**end repeat1**/ /**end repeat1**/ /**begin repeat1 @@ -419,35 +445,44 @@ NPY_NO_EXPORT void /**begin repeat1 * #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing# * #func = 
npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing# + * #dispatched = 1, 1, 1, 1, 0, 0, 0# **/ -/**begin repeat2 - * #ISA = , _avx512_skx# - **/ +#if !@fd@ || !@dispatched@ NPY_NO_EXPORT void -@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +#endif /**end repeat2**/ /**end repeat1**/ /**begin repeat1 * #kind = maximum, minimum# - * #OP = >=, <=# **/ NPY_NO_EXPORT void @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT int +@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + /**end repeat1**/ /**begin repeat1 * #kind = fmax, fmin# - * #OP = >=, <=# **/ NPY_NO_EXPORT void @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT int +@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + /**end repeat1**/ NPY_NO_EXPORT void @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +NPY_NO_EXPORT int +@TYPE@_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); + NPY_NO_EXPORT void @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); @@ -504,6 +539,19 @@ NPY_NO_EXPORT void /**end repeat1**/ /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_autovec.dispatch.h" +#endif +/**begin repeat + * #TYPE = HALF# + */ +/**begin repeat1 + * #kind = absolute# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ /* ***************************************************************************** ** COMPLEX LOOPS ** @@ -516,7 +564,21 @@ NPY_NO_EXPORT void * #TYPE = CFLOAT, CDOUBLE# */ /**begin repeat1 - * #kind = add, subtract, multiply# + * #kind = add, subtract, multiply, conjugate, square# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary_complex.dispatch.h" +#endif +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + */ +/**begin repeat1 + * #kind = absolute# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) @@ -543,6 +605,9 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, */ NPY_NO_EXPORT void C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT int +C@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)); /**end repeat1**/ NPY_NO_EXPORT void @@ -585,19 +650,14 @@ C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *step NPY_NO_EXPORT void C@TYPE@__ones_like(char **args, npy_intp const 
*dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
 
-/**begin repeat1
- * #isa = , _avx512f#
- */
-
 NPY_NO_EXPORT void
-C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+C@TYPE@_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
-C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+C@TYPE@_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
-C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
-/**end repeat1**/
+C@TYPE@_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
 
 NPY_NO_EXPORT void
 C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
@@ -658,9 +718,6 @@ NPY_NO_EXPORT void
 NPY_NO_EXPORT void
 @TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
-NPY_NO_EXPORT void
-@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
 #define @TYPE@_isnan @TYPE@_isnat
 
 NPY_NO_EXPORT void
@@ -736,6 +793,21 @@ TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *
 #define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide
 /* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */
 
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_autovec.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = TIMEDELTA, DATETIME#
+ */
+/**begin repeat1
+ * #kind = isinf#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat1**/
+/**end repeat**/
+
 /*
 *****************************************************************************
 **                            OBJECT LOOPS                                 **
diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
index f637ecfe4..3ab5a968d 100644
--- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
+++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src
@@ -1,6 +1,8 @@
 /*@targets
 ** $maxopt baseline
- ** sse2 avx2 avx512f
+ ** sse2 (avx2 fma3)
+ ** neon asimd
+ ** vsx2 vsx3
 ** vx vxe
 **/
 #define _UMATHMODULE
@@ -14,707 +16,392 @@
 // Provides the various *_LOOP macros
 #include "fast_loop_macros.h"
 
-// TODO: replace raw SIMD with NPYV
+/**
+ * TODO:
+ * - Improve the implementation of SIMD complex absolute:
+ *   the current one is fairly slow and can be optimized,
+ *   at a minimum by avoiding the division and keeping the sqrt.
+ * - Vectorize reductions.
+ * - Add support for ASIMD/VCMLA through universal intrinsics.
+ */
 
 //###############################################################################
 //## Real Single/Double precision
 //###############################################################################
 
 /********************************************************************************
- ** Defining the SIMD kernels
+ ** Defining ufunc inner functions
 ********************************************************************************/
-#ifdef NPY_HAVE_SSE2
+
+/*
+ * clang has a bug that's present at -O1 or greater.  When partially loading a
+ * vector register for a divide operation, the remaining elements are set
+ * to 1 to avoid divide-by-zero.  The partial load is paired with a partial
+ * store after the divide operation.
clang notices that the entire register
+ * is not needed for the store and optimizes out the fill of 1 to the remaining
+ * elements.  This causes either a divide-by-zero or 0/0 with invalid exception
+ * that we were trying to avoid by filling.
+ *
+ * Using a dummy variable marked 'volatile' convinces clang not to ignore
+ * the explicit fill of remaining elements.  If `-ftrapping-math` is
+ * supported, then it'll also avoid the bug.  `-ftrapping-math` is supported
+ * on Apple clang v12+ for x86_64.  It is not currently supported for arm64.
+ * `-ftrapping-math` is set by default in NumPy builds by
+ * numpy/distutils/ccompiler.py.
+ *
+ * Note: Apple clang and upstream clang use different, overlapping version
+ * numbering schemes.
+ */
+#if defined(__clang__)
+    #if defined(__apple_build_version__)
+    // Apple Clang
+        #if __apple_build_version__ < 12000000
+        // Apple Clang before v12
+        #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+        #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
+        // Apple Clang after v12, targeting i386 or x86_64
+        #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
+        #else
+        // Apple Clang after v12, not targeting i386 or x86_64
+        #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+        #endif
+    #else
+    // Clang, not Apple Clang
+        #if __clang_major__ < 10
+        // Clang before v10
+        #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+        #elif defined(_MSC_VER)
+        // clang-cl has the same bug
+        #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+        #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)
+        // Clang v10+, targeting i386 or x86_64
+        #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
+        #else
+        // Clang v10+, not targeting i386 or x86_64
+        #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1
+        #endif
+    #endif
+#else
+// Not a Clang compiler
+#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0
+#endif
+
 /**begin repeat
+ * Float types
 * #type = npy_float, npy_double#
 * #TYPE = FLOAT, DOUBLE#
- * #scalarf = npy_sqrtf, npy_sqrt#
+ * #sfx = f32, f64#
 * #c = f, #
- * #vtype = __m128, __m128d#
- * #vtype256 = __m256, __m256d#
- * #vtype512 = __m512, __m512d#
- * #vpre = _mm, _mm#
- * #vpre256 = _mm256, _mm256#
- * #vpre512 = _mm512, _mm512#
- * #vsuf = ps, pd#
- * #vsufs = ss, sd#
- * #nan = NPY_NANF, NPY_NAN#
- * #double = 0, 1#
- * #cast = _mm_castps_si128, _mm_castpd_si128#
+ * #C = F, #
+ * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64#
 */
 /**begin repeat1
-* Arithmetic
-* # kind = add, subtract, multiply, divide#
-* # OP = +, -, *, /#
-* # VOP = add, sub, mul, div#
-*/
-static void
-sse2_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
+ * Arithmetic
+ * # kind = add, subtract, multiply, divide#
+ * # intrin = add, sub, mul, div#
+ * # OP = +, -, *, /#
+ * # PW = 1, 0, 0, 0#
+ * # is_div = 0*3, 1#
+ * # is_mul = 0*2, 1, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
-#ifdef NPY_HAVE_AVX512F
-    const npy_intp vector_size_bytes = 64;
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes)
-        op[i] = ip1[i] @OP@ ip2[i];
-    /* lots of specializations, to squeeze out max performance */
-    if (npy_is_aligned(&ip1[i], vector_size_bytes) && npy_is_aligned(&ip2[i], vector_size_bytes)) {
-        if (ip1 == ip2) {
-            LOOP_BLOCKED(@type@, vector_size_bytes) {
-                @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]);
-                @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a);
-                @vpre512@_store_@vsuf@(&op[i], c);
+    npy_intp len = dimensions[0];
+    char *src0 = args[0], *src1 = args[1], *dst = args[2];
+    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
+    // 
reduce + if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) { + #if @PW@ + *((@type@*)src0) @OP@= @TYPE@_pairwise_sum(src1, len, ssrc1); + #else + @type@ acc = *((@type@*)src0); + if (ssrc1 == sizeof(@type@)) { + for (; len > 0; --len, src1 += sizeof(@type@)) { + acc @OP@= *(@type@ *)src1; } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); + } else { + for (; len > 0; --len, src1 += ssrc1) { + acc @OP@= *(@type@ *)src1; } } + *((@type@*)src0) = acc; + #endif + return; } - else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, a); - @vpre512@_store_@vsuf@(&op[i], c); +#if @VECTOR@ + if (len > npyv_nlanes_@sfx@*2 && + !is_mem_overlap(src0, ssrc0, dst, sdst, len) && + !is_mem_overlap(src1, ssrc1, dst, sdst, len) + ) { + const int vstep = npyv_nlanes_u8; + const int wstep = vstep * 2; + const int hstep = npyv_nlanes_@sfx@; + const int lstep = hstep * 2; + // lots of specializations, to squeeze out max performance + if (ssrc0 == sizeof(@type@) && ssrc0 == ssrc1 && ssrc0 == sdst) { + for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0); + npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep)); + npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1); + npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep)); + npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b0); + npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b1); + npyv_store_@sfx@((@type@*)dst, r0); + npyv_store_@sfx@((@type@*)(dst + vstep), r1); } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); + #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + const int vstop = hstep - 1; + #else + const int vstop = 0; + #endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + for (; len > vstop; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { + #if @is_div@ + npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); + npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); + #else + npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len); + npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len); + #endif + npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); + npyv_store_till_@sfx@((@type@*)dst, len, r); } - } - } -#elif defined NPY_HAVE_AVX2 - const npy_intp vector_size_bytes = 32; - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], 
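
/*
 * A scalar model (an assumption, not the real NPYV API) of the partial-load
 * semantics used above: npyv_load_till_* copies the `nlane` valid elements
 * and pads the remaining lanes with `fill`, so the divide path computes
 * 1.0/1.0 in the padding instead of x/0.0; npyv_store_till_* then writes
 * back only the valid lanes.
 */
static void
load_till_f32_model(float dst[4], const float *src, int nlane, float fill)
{
    for (int lane = 0; lane < 4; lane++) {
        dst[lane] = lane < nlane ? src[lane] : fill;
    }
}
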
vector_size_bytes) && - npy_is_aligned(&ip2[i], vector_size_bytes)) { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); - @vpre256@_store_@vsuf@(&op[i], c); + #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + // last partial iteration for divide and working around clang partial load bug + if(len > 0){ + npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); + volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); + npyv_store_till_@sfx@((@type@*)dst, len, r); } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); + #endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + } + else if (ssrc0 == 0 && ssrc1 == sizeof(@type@) && sdst == ssrc1) { + npyv_@sfx@ a = npyv_setall_@sfx@(*((@type@*)src0)); + for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) { + npyv_@sfx@ b0 = npyv_load_@sfx@((const @type@*)src1); + npyv_@sfx@ b1 = npyv_load_@sfx@((const @type@*)(src1 + vstep)); + npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a, b0); + npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a, b1); + npyv_store_@sfx@((@type@*)dst, r0); + npyv_store_@sfx@((@type@*)(dst + vstep), r1); } - } - } - else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, a); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); + #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + const int vstop = hstep - 1; + #else + const int vstop = 0; + #endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + for (; len > vstop; len -= hstep, src1 += vstep, dst += vstep) { + #if @is_div@ || @is_mul@ + npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); + #else + npyv_@sfx@ b = npyv_load_tillz_@sfx@((const @type@*)src1, len); + #endif + npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); + npyv_store_till_@sfx@((@type@*)dst, len, r); } - } - } -#else - const npy_intp vector_size_bytes = 16; - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[i]; - /* lots of specializations, to squeeze out max performance */ - if (npy_is_aligned(&ip1[i], vector_size_bytes) && - npy_is_aligned(&ip2[i], vector_size_bytes)) { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { 
- LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); + #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + // last partial iteration for multiply / divide and working around clang partial load bug + if(len > 0){ + volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); + npyv_store_till_@sfx@((@type@*)dst, len, r); } - } - } - else if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, a); - @vpre@_store_@vsuf@(&op[i], c); + #endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG + } + else if (ssrc1 == 0 && ssrc0 == sizeof(@type@) && sdst == ssrc0) { + npyv_@sfx@ b = npyv_setall_@sfx@(*((@type@*)src1)); + for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@((const @type@*)src0); + npyv_@sfx@ a1 = npyv_load_@sfx@((const @type@*)(src0 + vstep)); + npyv_@sfx@ r0 = npyv_@intrin@_@sfx@(a0, b); + npyv_@sfx@ r1 = npyv_@intrin@_@sfx@(a1, b); + npyv_store_@sfx@((@type@*)dst, r0); + npyv_store_@sfx@((@type@*)(dst + vstep), r1); } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); + for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) { + #if @is_div@ || @is_mul@ + npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); + #else + npyv_@sfx@ a = npyv_load_tillz_@sfx@((const @type@*)src0, len); + #endif + npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); + npyv_store_till_@sfx@((@type@*)dst, len, r); } + } else { + goto loop_scalar; } + npyv_cleanup(); + return; } +loop_scalar: #endif - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[i]; - } -} - -static void -sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ -#ifdef NPY_HAVE_AVX512F - const npy_intp vector_size_bytes = 64; - const @vtype512@ a = @vpre512@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ b = @vpre512@_load_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ b = @vpre512@_loadu_@vsuf@(&ip2[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - - -#elif defined NPY_HAVE_AVX2 - const npy_intp vector_size_bytes = 32; - const @vtype256@ a = @vpre256@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], 
vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ b = @vpre256@_load_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ b = @vpre256@_loadu_@vsuf@(&ip2[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } -#else - const npy_intp vector_size_bytes = 16; - const @vtype@ a = @vpre@_set1_@vsuf@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[0] @OP@ ip2[i]; - if (npy_is_aligned(&ip2[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ b = @vpre@_load_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ b = @vpre@_loadu_@vsuf@(&ip2[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } -#endif - LOOP_BLOCKED_END { - op[i] = ip1[0] @OP@ ip2[i]; + for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) { + const @type@ a = *((@type@*)src0); + const @type@ b = *((@type@*)src1); + *((@type@*)dst) = a @OP@ b; } } -static void -sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) { -#ifdef NPY_HAVE_AVX512F - const npy_intp vector_size_bytes = 64; - const @vtype512@ b = @vpre512@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_load_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype512@ a = @vpre512@_loadu_@vsuf@(&ip1[i]); - @vtype512@ c = @vpre512@_@VOP@_@vsuf@(a, b); - @vpre512@_store_@vsuf@(&op[i], c); - } - } - -#elif defined NPY_HAVE_AVX2 - const npy_intp vector_size_bytes = 32; - const @vtype256@ b = @vpre256@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_load_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype256@ a = @vpre256@_loadu_@vsuf@(&ip1[i]); - @vtype256@ c = @vpre256@_@VOP@_@vsuf@(a, b); - @vpre256@_store_@vsuf@(&op[i], c); - } - } -#else - const npy_intp vector_size_bytes = 16; - const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, vector_size_bytes) - op[i] = ip1[i] @OP@ ip2[0]; - if (npy_is_aligned(&ip1[i], vector_size_bytes)) { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, vector_size_bytes) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip1[i]); - @vtype@ c = @vpre@_@VOP@_@vsuf@(a, b); - @vpre@_store_@vsuf@(&op[i], c); - } - } -#endif - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[0]; + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], 
isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + *indexed = *indexed @OP@ *(@type@ *)value; } + return 0; } /**end repeat1**/ /**end repeat**/ -#else // NPY_HAVE_SSE2 +#undef WORKAROUND_CLANG_PARTIAL_LOAD_BUG -/**begin repeat - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #sfx = f32, f64# - * #CHK = _F32, _F64# - */ -#if NPY_SIMD@CHK@ -/**begin repeat1 -* Arithmetic -* # kind = add, subtract, multiply, divide# -* # OP = +, -, *, /# -* # VOP = add, sub, mul, div# -*/ - -static void -simd_binary_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[i] @OP@ ip2[i]; - } - /* lots of specializations, to squeeze out max performance */ - if (ip1 == ip2) { - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); - npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, a); - npyv_store_@sfx@(&op[i], c); - } - } - else { - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ a = npyv_load_@sfx@(&ip1[i]); - npyv_@sfx@ b = npyv_load_@sfx@(&ip2[i]); - npyv_@sfx@ c = npyv_@VOP@_@sfx@(a, b); - npyv_store_@sfx@(&op[i], c); - } - } - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[i]; - } -} +//############################################################################### +//## Complex Single/Double precision +//############################################################################### -static void -simd_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) -{ - const npyv_@sfx@ v1 = npyv_setall_@sfx@(ip1[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[0] @OP@ ip2[i]; - } - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i]); - npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); - npyv_store_@sfx@(&op[i], v3); - } - LOOP_BLOCKED_END { - op[i] = ip1[0] @OP@ ip2[i]; - } -} +/******************************************************************************** + ** op intrinics + ********************************************************************************/ -static void -simd_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n) +#if NPY_SIMD_F32 +NPY_FINLINE npyv_f32x2 simd_set2_f32(const float *a) { - const npyv_@sfx@ v2 = npyv_setall_@sfx@(ip2[0]); - LOOP_BLOCK_ALIGN_VAR(op, @type@, NPY_SIMD_WIDTH) { - op[i] = ip1[i] @OP@ ip2[0]; - } - LOOP_BLOCKED(@type@, NPY_SIMD_WIDTH) { - npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i]); - npyv_@sfx@ v3 = npyv_@VOP@_@sfx@(v1, v2); - npyv_store_@sfx@(&op[i], v3); - } - LOOP_BLOCKED_END { - op[i] = ip1[i] @OP@ ip2[0]; - } + npyv_f32 fill = npyv_reinterpret_f32_u64(npyv_setall_u64(*(npy_uint64*)a)); + npyv_f32x2 r; + r.val[0] = fill; + r.val[1] = fill; + return r; } -/**end repeat1**/ -#endif /* NPY_SIMD@CHK@ */ -/**end repeat**/ -#endif // NPY_HAVE_SSE2 -/**begin repeat - * Float types - * #type = npy_float, npy_double, npy_longdouble# - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #vector = 1, 1, 0# - * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64, 0 # - */ -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - */ -static inline int -run_binary_simd_@kind@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) +NPY_FINLINE npyv_f32 +simd_cconjugate_f32(npyv_f32 x) { -#if @vector@ && defined NPY_HAVE_SSE2 - @type@ * ip1 = (@type@ *)args[0]; - @type@ * ip2 = (@type@ *)args[1]; - 
@type@ * op = (@type@ *)args[2]; - npy_intp n = dimensions[0]; -#if defined NPY_HAVE_AVX512F - const npy_uintp vector_size_bytes = 64; -#elif defined NPY_HAVE_AVX2 - const npy_uintp vector_size_bytes = 32; +#if NPY_SIMD_BIGENDIAN + const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x80000000)); #else - const npy_uintp vector_size_bytes = 32; -#endif - /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), vector_size_bytes)) { - sse2_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), vector_size_bytes)) { - sse2_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), vector_size_bytes)) { - sse2_binary_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } -#elif @VECTOR@ - @type@ * ip1 = (@type@ *)args[0]; - @type@ * ip2 = (@type@ *)args[1]; - @type@ * op = (@type@ *)args[2]; - npy_intp n = dimensions[0]; - /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_scalar1_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_scalar2_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } - else if (IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_@kind@_@TYPE@(op, ip1, ip2, n); - return 1; - } + const npyv_f32 mask = npyv_reinterpret_f32_u64(npyv_setall_u64(0x8000000000000000ULL)); #endif - return 0; + return npyv_xor_f32(x, mask); } -/**end repeat1**/ -/**end repeat**/ -/******************************************************************************** - ** Defining ufunc inner functions - ********************************************************************************/ -/**begin repeat - * Float types - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #c = f, # - * #C = F, # - */ -/**begin repeat1 - * Arithmetic - * # kind = add, subtract, multiply, divide# - * # OP = +, -, *, /# - * # PW = 1, 0, 0, 0# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +NPY_FINLINE npyv_f32 +simd_cmul_f32(npyv_f32 a, npyv_f32 b) { - if (IS_BINARY_REDUCE) { -#if @PW@ - @type@ * iop1 = (@type@ *)args[0]; - npy_intp n = dimensions[0]; - - *iop1 @OP@= @TYPE@_pairwise_sum(args[1], n, steps[1]); -#else - BINARY_REDUCE_LOOP(@type@) { - io1 @OP@= *(@type@ *)ip2; - } - *((@type@ *)iop1) = io1; -#endif - } - else if (!run_binary_simd_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = in1 @OP@ in2; - } - } + npyv_f32 b_rev = npyv_permi128_f32(b, 1, 0, 3, 2); + npyv_f32 a_re = npyv_permi128_f32(a, 0, 0, 2, 2); + npyv_f32 a_im = npyv_permi128_f32(a, 1, 1, 3, 3); + // a_im * b_im, a_im * b_re + npyv_f32 ab_iiir = npyv_mul_f32(a_im, b_rev); + return npyv_muladdsub_f32(a_re, b, ab_iiir); } -/**end repeat1**/ -/**end repeat**/ -//############################################################################### -//## Complex Single/Double precision -//############################################################################### -/******************************************************************************** - ** Defining the SIMD kernels - ********************************************************************************/ -#if !defined(_MSC_VER) && 
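
/*
 * A compact sketch of pairwise summation, which the @TYPE@_pairwise_sum
 * reduce paths above rely on. Halving the accumulation depth keeps the
 * rounding error growing like O(log n) rather than the O(n) of one running
 * sum. NumPy's real version also unrolls small blocks; this minimal
 * recursion is an illustration only.
 */
#include <stddef.h>

static double
pairwise_sum(const double *a, size_t n)
{
    if (n <= 8) {               /* small base case: a plain loop suffices */
        double s = 0.0;
        for (size_t i = 0; i < n; i++) {
            s += a[i];
        }
        return s;
    }
    size_t half = n / 2;
    return pairwise_sum(a, half) + pairwise_sum(a + half, n - half);
}
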
defined(NPY_HAVE_AVX512F) - /** - * For somehow MSVC commit aggressive optimization lead - * to raises 'RuntimeWarning: invalid value encountered in multiply' - * - * the issue mainly caused by '_mm512_maskz_loadu_ps', we need to - * investigate about it while moving to NPYV. - */ - #define AVX512F_NOMSVC +NPY_FINLINE npyv_f32 +simd_csquare_f32(npyv_f32 x) +{ return simd_cmul_f32(x, x); } #endif -#ifdef AVX512F_NOMSVC -NPY_FINLINE __mmask16 -avx512_get_full_load_mask_ps(void) -{ - return 0xFFFF; -} - -NPY_FINLINE __mmask8 -avx512_get_full_load_mask_pd(void) -{ - return 0xFF; -} -NPY_FINLINE __m512 -avx512_masked_load_ps(__mmask16 mask, npy_float* addr) -{ - return _mm512_maskz_loadu_ps(mask, (__m512 *)addr); -} +#if NPY_SIMD_F64 -NPY_FINLINE __m512d -avx512_masked_load_pd(__mmask8 mask, npy_double* addr) +NPY_FINLINE npyv_f64x2 simd_set2_f64(const double *a) { - return _mm512_maskz_loadu_pd(mask, (__m512d *)addr); + npyv_f64 r = npyv_setall_f64(a[0]); + npyv_f64 i = npyv_setall_f64(a[1]); + return npyv_zip_f64(r, i); } -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem) +NPY_FINLINE npyv_f64 +simd_cconjugate_f64(npyv_f64 x) { - return (0x0001 << num_elem) - 0x0001; + const npyv_f64 mask = npyv_reinterpret_f64_u64(npyv_set_u64( + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL, + 0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL + )); + return npyv_xor_f64(x, mask); } -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 -avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem) -{ - return (0x01 << num_elem) - 0x01; -} -/**begin repeat - * #vsub = ps, pd# - * #type= npy_float, npy_double# - * #epi_vsub = epi32, epi64# - * #vtype = __m512, __m512d# - * #mask = __mmask16, __mmask8# - * #and_const = 0x7fffffff, 0x7fffffffffffffffLL# - * #neg_mask = 0x80000000, 0x8000000000000000# - * #perm_ = 0xb1, 0x55# - * #cmpx_img_mask = 0xAAAA, 0xAA# - * #cmpx_re_mask = 0x5555, 0x55# - * #INF = NPY_INFINITYF, NPY_INFINITY# - * #NAN = NPY_NANF, NPY_NAN# - */ -NPY_FINLINE @vtype@ -avx512_hadd_@vsub@(const @vtype@ x) +NPY_FINLINE npyv_f64 +simd_cmul_f64(npyv_f64 a, npyv_f64 b) { - return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); + npyv_f64 b_rev = npyv_permi128_f64(b, 1, 0); + npyv_f64 a_re = npyv_permi128_f64(a, 0, 0); + npyv_f64 a_im = npyv_permi128_f64(a, 1, 1); + // a_im * b_im, a_im * b_re + npyv_f64 ab_iiir = npyv_mul_f64(a_im, b_rev); + return npyv_muladdsub_f64(a_re, b, ab_iiir); } -NPY_FINLINE @vtype@ -avx512_hsub_@vsub@(const @vtype@ x) -{ - return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); -} -NPY_FINLINE @vtype@ -avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2) -{ - // x1 = r1, i1 - // x2 = r2, i2 - @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2 - @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2 - @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1 - @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2 - @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2 - return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg); -} -/**end repeat**/ +NPY_FINLINE npyv_f64 
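
/*
 * A scalar model of the sign-mask trick in simd_cconjugate_* above:
 * complex conjugation is a single XOR that flips the IEEE sign bit of the
 * imaginary component, with no floating-point arithmetic at all.
 */
#include <stdint.h>
#include <string.h>

static void
conjugate_f32(float z[2] /* {re, im} */)
{
    uint32_t bits;
    memcpy(&bits, &z[1], sizeof bits);  /* type-pun safely via memcpy */
    bits ^= 0x80000000u;                /* flip the sign bit of im */
    memcpy(&z[1], &bits, sizeof bits);
}
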
+simd_csquare_f64(npyv_f64 x) +{ return simd_cmul_f64(x, x); } #endif /**begin repeat - * #TYPE = CFLOAT, CDOUBLE# * #type = npy_float, npy_double# - * #num_lanes = 16, 8# - * #vsuffix = ps, pd# - * #epi_vsub = epi32, epi64# - * #mask = __mmask16, __mmask8# - * #vtype = __m512, __m512d# - * #scale = 4, 8# - * #vindextype = __m512i, __m256i# - * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256# - * #storemask = 0xFF, 0xF# - * #IS_FLOAT = 1, 0# - */ -/**begin repeat1 - * #func = add, subtract, multiply# - * #vectorf = _mm512_add, _mm512_sub, avx512_cmul# - */ -#if defined AVX512F_NOMSVC -static inline void -AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) -{ - const npy_intp array_size = dimensions[0]; - npy_intp num_remaining_elements = 2*array_size; - @type@* ip1 = (@type@*) args[0]; - @type@* ip2 = (@type@*) args[1]; - @type@* op = (@type@*) args[2]; - - @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < @num_lanes@) { - load_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements, @num_lanes@); - } - @vtype@ x1, x2; - x1 = avx512_masked_load_@vsuffix@(load_mask, ip1); - x2 = avx512_masked_load_@vsuffix@(load_mask, ip2); - - @vtype@ out = @vectorf@_@vsuffix@(x1, x2); - - _mm512_mask_storeu_@vsuffix@(op, load_mask, out); - - ip1 += @num_lanes@; - ip2 += @num_lanes@; - op += @num_lanes@; - num_remaining_elements -= @num_lanes@; - } -} -#endif // AVX512F_NOMSVC -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #TYPE = CFLOAT, CDOUBLE# - * #type= npy_float, npy_double# - * #esize = 8, 16# - */ -/**begin repeat1 - * #func = add, subtract, multiply# + * #sfx = f32, f64# + * #bsfx = b32, b64# + * #usfx = b32, u64# + * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# + * #is_double = 0, 1# + * #c = f, # + * #INF = NPY_INFINITYF, NPY_INFINITY# + * #NAN = NPY_NANF, NPY_NAN# */ -static inline int -run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) +#if @VECTOR@ +NPY_FINLINE npyv_@sfx@ +simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im) { -#if defined AVX512F_NOMSVC - if (IS_BINARY_STRIDE_ONE(@esize@, 64)) { - AVX512F_@func@_@TYPE@(args, dimensions, steps); - return 1; - } - else - return 0; -#endif - return 0; + const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@); + const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@); + + re = npyv_abs_@sfx@(re); + im = npyv_abs_@sfx@(im); + /* + * If real or imag = INF, then convert it to inf + j*inf + * Handles: inf + j*nan, nan + j*inf + */ + npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf); + npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf); + im = npyv_select_@sfx@(re_infmask, inf, im); + re = npyv_select_@sfx@(im_infmask, inf, re); + /* + * If real or imag = NAN, then convert it to nan + j*nan + * Handles: x + j*nan, nan + j*x + */ + npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re); + npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im); + im = npyv_select_@sfx@(re_nnanmask, im, nan); + re = npyv_select_@sfx@(im_nnanmask, re, nan); + + npyv_@sfx@ larger = npyv_max_@sfx@(re, im); + npyv_@sfx@ smaller = npyv_min_@sfx@(im, re); + /* + * Calculate div_mask to prevent 0./0. 
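
/*
 * A scalar sketch of the scaled-hypot algorithm simd_cabsolute_* uses
 * above: dividing the smaller magnitude by the larger keeps the squared
 * term in range, so |z| avoids the overflow/underflow that a naive
 * sqrt(re*re + im*im) would hit. The SIMD version's inf/nan lane fix-ups
 * are omitted here.
 */
#include <math.h>

static double
cabs_scaled(double re, double im)
{
    double larger  = fmax(fabs(re), fabs(im));
    double smaller = fmin(fabs(re), fabs(im));
    if (larger == 0.0) {
        return 0.0;                       /* avoid the 0/0 ratio below */
    }
    double ratio = smaller / larger;      /* in [0, 1] */
    return larger * sqrt(1.0 + ratio * ratio);
}
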
and inf/inf operations in div + */ + npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@()); + npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf); + npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask)); + + npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger); + npyv_@sfx@ hypot = npyv_sqrt_@sfx@( + npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@) + )); + return npyv_mul_@sfx@(hypot, larger); } -/**end repeat1**/ +#endif // VECTOR /**end repeat**/ /******************************************************************************** @@ -724,55 +411,345 @@ run_binary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const * complex types * #TYPE = CFLOAT, CDOUBLE# * #ftype = npy_float, npy_double# + * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# + * #sfx = f32, f64# * #c = f, # * #C = F, # */ /**begin repeat1 * arithmetic - * #kind = add, subtract# - * #OP = +, -# - * #PW = 1, 0# + * #kind = add, subtract, multiply# + * #vectorf = npyv_add, npyv_sub, simd_cmul# + * #OP = +, -, *# + * #PW = 1, 0, 0# + * #is_mul = 0*2, 1# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - // Parenthesis around @PW@ tells clang dead code is intentional - if (IS_BINARY_REDUCE && (@PW@)) { - npy_intp n = dimensions[0]; - @ftype@ * or = ((@ftype@ *)args[0]); - @ftype@ * oi = ((@ftype@ *)args[0]) + 1; + npy_intp len = dimensions[0]; + char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2]; + npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2]; +#if @PW@ + // reduce + if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst && + b_ssrc1 % (sizeof(@ftype@)*2) == 0 + ) { + @ftype@ *rl_im = (@ftype@ *)b_src0; @ftype@ rr, ri; - - @TYPE@_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2); - *or @OP@= rr; - *oi @OP@= ri; + @TYPE@_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2); + rl_im[0] @OP@= rr; + rl_im[1] @OP@= ri; return; } - if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_LOOP { - const @ftype@ in1r = ((@ftype@ *)ip1)[0]; - const @ftype@ in1i = ((@ftype@ *)ip1)[1]; - const @ftype@ in2r = ((@ftype@ *)ip2)[0]; - const @ftype@ in2i = ((@ftype@ *)ip2)[1]; - ((@ftype@ *)op1)[0] = in1r @OP@ in2r; - ((@ftype@ *)op1)[1] = in1i @OP@ in2i; +#endif +#if @VECTOR@ + if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) || + is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) || + b_sdst % sizeof(@ftype@) != 0 || b_sdst == 0 || + b_ssrc0 % sizeof(@ftype@) != 0 || + b_ssrc1 % sizeof(@ftype@) != 0 + ) { + goto loop_scalar; + } + const @ftype@ *src0 = (@ftype@*)b_src0; + const @ftype@ *src1 = (@ftype@*)b_src1; + @ftype@ *dst = (@ftype@*)b_dst; + + const npy_intp ssrc0 = b_ssrc0 / sizeof(@ftype@); + const npy_intp ssrc1 = b_ssrc1 / sizeof(@ftype@); + const npy_intp sdst = b_sdst / sizeof(@ftype@); + + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * 2; + const int hstep = vstep / 2; + + const int loadable0 = npyv_loadable_stride_s64(ssrc0); + const int loadable1 = npyv_loadable_stride_s64(ssrc1); + const int storable = npyv_storable_stride_s64(sdst); + + // lots**lots of specializations, to squeeze out max performance + // contig + if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) { + for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@(src0); + npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep); + npyv_@sfx@ b0 = npyv_load_@sfx@(src1); + npyv_@sfx@ 
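
A note on the lane bookkeeping in these loops: one vector of @sfx@ lanes holds vstep/2 complex numbers, which is why the main loop consumes vstep complex elements from two vectors (wstep lanes) while the tail below steps by hstep and uses the pair-wise partial loads. A scalar model of npyv_load2_tillz as the tail uses it (a sketch; it assumes the nlane argument counts (re, im) pairs, which is what the hstep arithmetic suggests):

    #include <string.h>

    enum { NLANES = 4 };  /* assumed lanes per vector, i.e. 2 complex pairs */

    /* Load up to `npairs` (re, im) pairs and zero-fill the rest, so a
     * partial complex tail can still be processed with full-width ops. */
    static void load2_tillz_model(const double *src, long npairs,
                                  double lanes[NLANES])
    {
        long navail = npairs < NLANES/2 ? npairs : NLANES/2;
        memset(lanes, 0, sizeof(double) * NLANES);
        memcpy(lanes, src, sizeof(double) * 2 * (size_t)navail);
    }
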
b1 = npyv_load_@sfx@(src1 + vstep); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { + npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len); + npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len); + npyv_@sfx@ r = @vectorf@_@sfx@(a, b); + npyv_store2_till_@sfx@(dst, len, r); + } + } + // scalar 0 + else if (ssrc0 == 0) { + npyv_@sfx@x2 a = simd_set2_@sfx@(src0); + // contig + if (ssrc1 == 2 && sdst == ssrc1) { + for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) { + npyv_@sfx@ b0 = npyv_load_@sfx@(src1); + npyv_@sfx@ b1 = npyv_load_@sfx@(src1 + vstep); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) { + #if @is_mul@ + npyv_@sfx@ b = npyv_load2_till_@sfx@(src1, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ b = npyv_load2_tillz_@sfx@(src1, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b); + npyv_store2_till_@sfx@(dst, len, r); + } + } + // non-contig + else if (loadable1 && storable) { + for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) { + npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1); + npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a.val[0], b0); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a.val[1], b1); + npyv_storen2_@sfx@(dst, sdst, r0); + npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); + } + for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) { + #if @is_mul@ + npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a.val[0], b); + npyv_storen2_till_@sfx@(dst, sdst, len, r); + } + } + else { + goto loop_scalar; + } + } + // scalar 1 + else if (ssrc1 == 0) { + npyv_@sfx@x2 b = simd_set2_@sfx@(src1); + if (ssrc0 == 2 && sdst == ssrc0) { + for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@(src0); + npyv_@sfx@ a1 = npyv_load_@sfx@(src0 + vstep); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) { + #if @is_mul@ + npyv_@sfx@ a = npyv_load2_till_@sfx@(src0, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src0, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]); + npyv_store2_till_@sfx@(dst, len, r); + } + } + // non-contig + else if (loadable0 && storable) { + for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) { + npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0); + npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b.val[0]); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b.val[1]); + npyv_storen2_@sfx@(dst, sdst, r0); + npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); + } + for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) { + #if @is_mul@ + npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a, b.val[0]); + npyv_storen2_till_@sfx@(dst, sdst, len, 
r); + } } + else { + goto loop_scalar; + } + } + #if @is_mul@ + // non-contig + else if (loadable0 && loadable1 && storable) { + for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, + src1 += ssrc1*vstep, dst += sdst*vstep + ) { + npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src0, ssrc0); + npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src0 + ssrc0*hstep, ssrc0); + npyv_@sfx@ b0 = npyv_loadn2_@sfx@(src1, ssrc1); + npyv_@sfx@ b1 = npyv_loadn2_@sfx@(src1 + ssrc1*hstep, ssrc1); + npyv_@sfx@ r0 = @vectorf@_@sfx@(a0, b0); + npyv_@sfx@ r1 = @vectorf@_@sfx@(a1, b1); + npyv_storen2_@sfx@(dst, sdst, r0); + npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); + } + for (; len > 0; len -= hstep, src0 += ssrc0*hstep, + src1 += ssrc1*hstep, dst += sdst*hstep + ) { + #if @is_mul@ + npyv_@sfx@ a = npyv_loadn2_till_@sfx@(src0, ssrc0, len, 1.0@c@, 1.0@c@); + npyv_@sfx@ b = npyv_loadn2_till_@sfx@(src1, ssrc1, len, 1.0@c@, 1.0@c@); + #else + npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@(src0, ssrc0, len); + npyv_@sfx@ b = npyv_loadn2_tillz_@sfx@(src1, ssrc1, len); + #endif + npyv_@sfx@ r = @vectorf@_@sfx@(a, b); + npyv_storen2_till_@sfx@(dst, sdst, len, r); + } + } + #endif + else { + goto loop_scalar; } + npyv_cleanup(); + return; +loop_scalar: +#endif + for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) { + const @ftype@ a_r = ((@ftype@ *)b_src0)[0]; + const @ftype@ a_i = ((@ftype@ *)b_src0)[1]; + const @ftype@ b_r = ((@ftype@ *)b_src1)[0]; + const @ftype@ b_i = ((@ftype@ *)b_src1)[1]; + #if @is_mul@ + ((@ftype@ *)b_dst)[0] = a_r*b_r - a_i*b_i; + ((@ftype@ *)b_dst)[1] = a_r*b_i + a_i*b_r; + #else + ((@ftype@ *)b_dst)[0] = a_r @OP@ b_r; + ((@ftype@ *)b_dst)[1] = a_i @OP@ b_i; + #endif + } +} + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @ftype@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@ftype@ *)(ip1 + is1 * *(npy_intp *)indx); + const @ftype@ b_r = ((@ftype@ *)value)[0]; + const @ftype@ b_i = ((@ftype@ *)value)[1]; + #if @is_mul@ + const @ftype@ a_r = indexed[0]; + const @ftype@ a_i = indexed[1]; + indexed[0] = a_r*b_r - a_i*b_i; + indexed[1] = a_r*b_i + a_i*b_r; + #else + indexed[0] @OP@= b_r; + indexed[1] @OP@= b_i; + #endif + } + return 0; } /**end repeat1**/ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_multiply) +/**begin repeat1 + * #kind = conjugate, square# + * #is_square = 0, 1# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { - if (!run_binary_avx512f_multiply_@TYPE@(args, dimensions, steps)) { - BINARY_LOOP { - const @ftype@ in1r = ((@ftype@ *)ip1)[0]; - const @ftype@ in1i = ((@ftype@ *)ip1)[1]; - const @ftype@ in2r = ((@ftype@ *)ip2)[0]; - const @ftype@ in2i = ((@ftype@ *)ip2)[1]; - ((@ftype@ *)op1)[0] = in1r*in2r - in1i*in2i; - ((@ftype@ *)op1)[1] = in1r*in2i + in1i*in2r; + npy_intp len = dimensions[0]; + char *b_src = args[0], *b_dst = args[1]; + npy_intp b_ssrc = steps[0], b_sdst = steps[1]; +#if @VECTOR@ + if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) || + b_sdst % sizeof(@ftype@) != 0 || + b_ssrc % sizeof(@ftype@) != 0 + ) { + goto loop_scalar; + } + const @ftype@ *src = (@ftype@*)b_src; + @ftype@ 
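
For the conjugate/square pair entered here: simd_cconjugate (defined earlier) is a pure bit trick, XOR-ing 0x8000000000000000 into every odd (imaginary) lane, and simd_csquare is simply cmul(x, x), i.e. (a + bi)^2 = a^2 - b^2 + 2abi, matching the scalar fallback at the end of this function. The sign flip in scalar form (sketch only; helper names are illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Toggle the IEEE-754 sign bit, as the vector kernel does with
     * npyv_xor against a sign mask in the imaginary lanes. */
    static double flip_sign(double x)
    {
        uint64_t bits;
        memcpy(&bits, &x, sizeof bits);
        bits ^= 0x8000000000000000ULL;
        memcpy(&x, &bits, sizeof bits);
        return x;
    }

    /* Conjugate n interleaved complex doubles in place. */
    static void cconjugate_scalar(double *z, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            z[2*i + 1] = flip_sign(z[2*i + 1]);
        }
    }
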
*dst = (@ftype@*)b_dst; + const npy_intp ssrc = b_ssrc / sizeof(@ftype@); + const npy_intp sdst = b_sdst / sizeof(@ftype@); + + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * 2; + const int hstep = vstep / 2; + + if (ssrc == 2 && ssrc == sdst) { + for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@(src); + npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep); + npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0); + npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src += vstep, dst += vstep) { + npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len); + npyv_@sfx@ r = simd_c@kind@_@sfx@(a); + npyv_store2_till_@sfx@(dst, len, r); + } + } + else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) { + for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) { + npyv_@sfx@ a0 = npyv_load_@sfx@(src); + npyv_@sfx@ a1 = npyv_load_@sfx@(src + vstep); + npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0); + npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1); + npyv_storen2_@sfx@(dst, sdst, r0); + npyv_storen2_@sfx@(dst + sdst*hstep, sdst, r1); + } + for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) { + npyv_@sfx@ a = npyv_load2_tillz_@sfx@(src, len); + npyv_@sfx@ r = simd_c@kind@_@sfx@(a); + npyv_storen2_till_@sfx@(dst, sdst, len, r); + } + } + else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) { + for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) { + npyv_@sfx@ a0 = npyv_loadn2_@sfx@(src, ssrc); + npyv_@sfx@ a1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc); + npyv_@sfx@ r0 = simd_c@kind@_@sfx@(a0); + npyv_@sfx@ r1 = simd_c@kind@_@sfx@(a1); + npyv_store_@sfx@(dst, r0); + npyv_store_@sfx@(dst + vstep, r1); + } + for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) { + npyv_@sfx@ a = npyv_loadn2_tillz_@sfx@((@ftype@*)src, ssrc, len); + npyv_@sfx@ r = simd_c@kind@_@sfx@(a); + npyv_store2_till_@sfx@(dst, len, r); } } + else { + goto loop_scalar; + } + npyv_cleanup(); + return; +loop_scalar: +#endif + for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) { + const @ftype@ rl = ((@ftype@ *)b_src)[0]; + const @ftype@ im = ((@ftype@ *)b_src)[1]; + #if @is_square@ + ((@ftype@ *)b_dst)[0] = rl*rl - im*im; + ((@ftype@ *)b_dst)[1] = rl*im + im*rl; + #else + ((@ftype@ *)b_dst)[0] = rl; + ((@ftype@ *)b_dst)[1] = -im; + #endif + } } +/**end repeat1**/ /**end repeat**/ diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src index ea8685002..b6f126298 100644 --- a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src @@ -395,6 +395,24 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) } } } + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + *indexed = floor_div_@TYPE@(*indexed, *(@type@ *)value); + } + return 0; +} + /**end repeat**/ /**begin repeat @@ -463,4 +481,28 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) } } } + +NPY_NO_EXPORT int 
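
The *_indexed loops introduced across these files (the integer divide variant follows) all share one shape: args carry the base array, an index array, and a value array, and each step updates base[idx[i]] in place. This is the inner kernel behind ufunc.at-style updates. Stripped of the byte-stride and PyArrayMethod_Context plumbing, the pattern is (hypothetical names, for illustration):

    #include <stddef.h>

    /* In-place indexed update: dst[idx[i]] += val[i].  The real loops walk
     * byte strides and report success through the new int return value. */
    static int add_indexed(double *dst, const ptrdiff_t *idx,
                           const double *val, ptrdiff_t n)
    {
        for (ptrdiff_t i = 0; i < n; i++) {
            dst[idx[i]] += val[i];
        }
        return 0;
    }
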
NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + @type@ in2 = *(@type@ *)value; + if (NPY_UNLIKELY(in2 == 0)) { + npy_set_floatstatus_divbyzero(); + *indexed = 0; + } else { + *indexed = *indexed / in2; + } + } + return 0; +} + /**end repeat**/ diff --git a/numpy/core/src/umath/loops_autovec.dispatch.c.src b/numpy/core/src/umath/loops_autovec.dispatch.c.src new file mode 100644 index 000000000..bdbfa0f86 --- /dev/null +++ b/numpy/core/src/umath/loops_autovec.dispatch.c.src @@ -0,0 +1,287 @@ +/*@targets + ** $maxopt $autovec baseline + ** sse2 avx2 + ** neon + ** vsx2 + ** vx + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/* + ***************************************************************************** + ** INTEGER LOOPS + ***************************************************************************** + */ +/* + * Arithmetic bit shift operations. + * + * Intel hardware masks bit shift values, so large shifts wrap around + * and can produce surprising results. The special handling ensures that + * behavior is independent of compiler or hardware. + * TODO: We could implement consistent behavior for negative shifts, + * which is undefined in C. 
+ */ +#define INT_left_shift_needs_clear_floatstatus +#define UINT_left_shift_needs_clear_floatstatus + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, + * npy_long, npy_ulong, npy_longlong, npy_ulonglong# + * #ftype = npy_float, npy_float, npy_float, npy_float, npy_double, npy_double, + * npy_double, npy_double, npy_double, npy_double# + * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0# + * #c = hh,uhh,h,uh,,u,l,ul,ll,ull# + */ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_positive) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = +in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_square) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in * in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_reciprocal) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in); +} + +/**begin repeat1 + * Arithmetic + * #kind = add, subtract, multiply# + * #OP = +, -, *# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2); + } + else { + BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2); + } +} +/**end repeat1**/ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_left_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, + void *NPY_UNUSED(func)) +{ + BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2)); +#ifdef @TYPE@_left_shift_needs_clear_floatstatus + // For some reason, our macOS CI sets an "invalid" flag here, but only + // for some types. + npy_clear_floatstatus_barrier((char*)dimensions); +#endif +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_right_shift) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift + BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2)); +#else + BINARY_LOOP { + @type@ in1 = *(@type@ *)ip1; + @type@ in2 = *(@type@ *)ip2; + *(@type@ *)op1 = npy_rshift@c@(in1, in2); + } +#endif +} +/**end repeat**/ + +/* + ***************************************************************************** + ** UNSIGNED INTEGER LOOPS + ***************************************************************************** + */ +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# + * #c = u,u,u,ul,ull# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_sign) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 
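
On the shift helpers used above: C leaves shifts by a count at or beyond the bit width undefined, and x86 masks the count modulo the width, so 1 << 32 on a 32-bit int can come back as 1 rather than 0. The npy_lshift/npy_rshift wrappers pin down one behaviour across compilers and hardware. Approximately (a sketch of the intended semantics, assuming arithmetic right shift for signed operands; this is not the npy_math source):

    #include <stdint.h>

    /* Counts >= the width return 0 (logical shifts) or the sign fill
     * (arithmetic right shift) instead of hardware-masked results. */
    static uint32_t lshift_u32(uint32_t a, uint32_t b)
    {
        return b < 32 ? (uint32_t)(a << b) : 0;
    }

    static int32_t rshift_s32(int32_t a, uint32_t b)
    {
        return b < 32 ? (a >> b) : (a < 0 ? -1 : 0);
    }
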
1 : 0); +} + +/**begin repeat1 + * Arithmetic + * #kind = bitwise_and, bitwise_or, bitwise_xor# + * #OP = &, |, ^# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (IS_BINARY_REDUCE) { + BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2); + } + else { + BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2); + } +} +/**end repeat1**/ + +/**begin repeat1 + * #kind = logical_and, logical_or# + * #OP = &&, ||# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + /* + * gcc vectorization of this is not good (PR60575) but manual integer + * vectorization is too tedious to be worthwhile + */ + BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2); +} +/**end repeat1**/ + +NPY_FINLINE npy_bool @TYPE@_logical_xor_(@type@ in1, @type@ in2) +{ return (!!in1) != (!!in2); } + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_xor) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + BINARY_LOOP_FAST(@type@, npy_bool, *out = @TYPE@_logical_xor_(in1, in2)); +} + +/**begin repeat1 + * #kind = isnan, isinf, isfinite# + * #func = npy_isnan, npy_isinf, npy_isfinite# + * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + /* + * The (void)in; suppresses an unused variable warning raised by gcc and allows + * us to re-use this macro even though we do not depend on in + */ + UNARY_LOOP_FAST(@type@, npy_bool, (void)in; *out = @val@); +} +/**end repeat1**/ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_conjugate) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_logical_not) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, npy_bool, *out = !in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_invert) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = ~in); +} +/**end repeat**/ + +/* + ***************************************************************************** + ** SIGNED! INTEGER LOOPS + ***************************************************************************** + */ + +/**begin repeat + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# + * #c = ,,,l,ll# + */ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = (in >= 0) ? in : -in); +} + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_sign) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : (in < 0 ? 
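
The nested ternary keeps integer sign easy for the auto-vectorizer to lower into two compares and selects; an equivalent branchless form makes that explicit (an alternative sketch, not what the loop emits). Note also that the repeat block just below forwards the signed bitwise and logical kinds straight to the unsigned kernels, which is sound because two's-complement bit patterns make those operations identical.

    /* (x > 0) and (x < 0) evaluate to 0 or 1 in C, so the difference
     * is exactly -1, 0, or 1. */
    static int sign_branchless(int x)
    {
        return (x > 0) - (x < 0);
    }
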
-1 : 0)); +} + +/**begin repeat1 + * #kind = conjugate, invert, isnan, isinf, isfinite, + * logical_and, logical_or, logical_xor, logical_not, + * bitwise_and, bitwise_or, bitwise_xor# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) +{ + NPY_CPU_DISPATCH_CURFX(U@TYPE@_@kind@)(args, dimensions, steps, func); +} +/**end repeat1**/ +/**end repeat**/ + +/* + ***************************************************************************** + ** BOOLEAN LOOPS ** + ***************************************************************************** + */ +/**begin repeat + * #kind = isnan, isinf, isfinite# + * #func = npy_isnan, npy_isinf, npy_isfinite# + * #val = NPY_FALSE, NPY_FALSE, NPY_TRUE# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) +{ + NPY_CPU_DISPATCH_CURFX(UBYTE_@kind@)(args, dimensions, steps, func); +} +/**end repeat**/ + +/* + ***************************************************************************** + ** HALF-FLOAT LOOPS ** + ***************************************************************************** + */ + +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu); +} + +/* + ***************************************************************************** + ** DATETIME LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #type = npy_datetime, npy_timedelta# + * #TYPE = DATETIME, TIMEDELTA# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_isinf) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func) +{ + NPY_CPU_DISPATCH_CURFX(ULONGLONG_isinf)(args, dimensions, steps, func); +} +/**end repeat**/ diff --git a/numpy/core/src/umath/loops_comparison.dispatch.c.src b/numpy/core/src/umath/loops_comparison.dispatch.c.src index 2cd76c35e..751080871 100644 --- a/numpy/core/src/umath/loops_comparison.dispatch.c.src +++ b/numpy/core/src/umath/loops_comparison.dispatch.c.src @@ -234,7 +234,7 @@ static void simd_binary_@kind@_b8(char **args, npy_intp len) npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src1), vzero); npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src2), vzero); npyv_b8 c = npyv_@VOP@_b8(a, b); - npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask)); + npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); } for (; len > 0; --len, ++src1, ++src2, ++dst) { @@ -258,7 +258,7 @@ static void simd_binary_scalar1_@kind@_b8(char **args, npy_intp len) for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero); npyv_b8 c = npyv_@VOP@_b8(a, b); - npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask)); + npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); } for (; len > 0; --len, ++src, ++dst) { @@ -281,7 +281,7 @@ static void simd_binary_scalar2_@kind@_b8(char **args, npy_intp len) for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) { npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero); npyv_b8 c = npyv_@VOP@_b8(a, b); - npyv_store_u8(dst, npyv_andc_u8(npyv_cvt_u8_b8(c), truemask)); + npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask)); } for (; len > 0; --len, ++src, ++dst) { @@ -312,19 +312,23 @@ static inline void run_binary_simd_@kind@_@sfx@(char **args, npy_intp const *dimensions, 
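
The guard being added to this dispatcher is the interesting part: the SIMD boolean-output paths are now taken only when neither input aliases the output, because a full-vector store of npy_bool results could overwrite operand bytes that a later lane still needs to read. Conceptually the 1-D test is an interval intersection (sketch; the real is_mem_overlap helper in loops_utils also normalizes negative strides):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Two strided 1-D views overlap iff their byte intervals intersect. */
    static bool may_overlap(uintptr_t a, size_t astep,
                            uintptr_t b, size_t bstep,
                            size_t n, size_t itemsize)
    {
        if (n == 0) {
            return false;
        }
        uintptr_t a_end = a + astep * (n - 1) + itemsize;
        uintptr_t b_end = b + bstep * (n - 1) + itemsize;
        return a < b_end && b < a_end;
    }
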
npy_intp const *steps) { #if @VECTOR@ - /* argument one scalar */ - if (IS_BLOCKABLE_BINARY_SCALAR1_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]); - return; - } - /* argument two scalar */ - else if (IS_BLOCKABLE_BINARY_SCALAR2_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]); - return; - } - else if (IS_BLOCKABLE_BINARY_BOOL(sizeof(@type@), NPY_SIMD_WIDTH)) { - simd_binary_@kind@_@sfx@(args, dimensions[0]); - return; + if (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) && + !is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]) + ) { + /* argument one scalar */ + if (IS_BINARY_CONT_S1(@type@, npy_bool)) { + simd_binary_scalar1_@kind@_@sfx@(args, dimensions[0]); + return; + } + /* argument two scalar */ + else if (IS_BINARY_CONT_S2(@type@, npy_bool)) { + simd_binary_scalar2_@kind@_@sfx@(args, dimensions[0]); + return; + } + else if (IS_BINARY_CONT(@type@, npy_bool)) { + simd_binary_@kind@_@sfx@(args, dimensions[0]); + return; + } } #endif diff --git a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src index 182c57b01..1fac3c150 100644 --- a/numpy/core/src/umath/loops_exponent_log.dispatch.c.src +++ b/numpy/core/src/umath/loops_exponent_log.dispatch.c.src @@ -239,7 +239,7 @@ fma_scalef_ps(__m256 poly, __m256 quadrant) #ifdef SIMD_AVX512F -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 +NPY_FINLINE __mmask16 avx512_get_full_load_mask_ps(void) { return 0xFFFF; diff --git a/numpy/core/src/umath/loops_logical.dispatch.c.src b/numpy/core/src/umath/loops_logical.dispatch.c.src new file mode 100644 index 000000000..c07525be4 --- /dev/null +++ b/numpy/core/src/umath/loops_logical.dispatch.c.src @@ -0,0 +1,377 @@ +/*@targets + ** $maxopt baseline + ** neon asimd + ** sse2 avx2 avx512_skx + ** vsx2 + ** vx + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/******************************************************************************* + ** Defining the SIMD kernels + ******************************************************************************/ + +#if NPY_SIMD +/* + * convert any bit set to boolean true so vectorized and normal operations are + * consistent, should not be required if bool is used correctly everywhere but + * you never know + */ +NPY_FINLINE npyv_u8 byte_to_true(npyv_u8 v) +{ + const npyv_u8 zero = npyv_zero_u8(); + const npyv_u8 truemask = npyv_setall_u8(1 == 1); + // cmpeq(v, 0) turns 0x00 -> 0xff and non-zero -> 0x00 + npyv_u8 tmp = npyv_cvt_u8_b8(npyv_cmpeq_u8(v, zero)); + // tmp is filled with 0xff/0x00, negate and mask to boolean true + return npyv_andc_u8(truemask, tmp); +} +/* + * convert mask vector (0xff/0x00) to boolean true. similar to byte_to_true(), + * but we've already got a mask and can skip negation. + */ +NPY_FINLINE npyv_u8 mask_to_true(npyv_b8 v) +{ + const npyv_u8 truemask = npyv_setall_u8(1 == 1); + return npyv_and_u8(truemask, npyv_cvt_u8_b8(v)); +} +/* + * For logical_and, we have to be careful to handle non-bool inputs where + * bits of each operand might not overlap. Example: a = 0x01, b = 0x80 + * Both evaluate to boolean true, however, a & b is false. Return value + * should be consistent with byte_to_true(). 
+ */ +NPY_FINLINE npyv_u8 simd_logical_and_u8(npyv_u8 a, npyv_u8 b) +{ + const npyv_u8 zero = npyv_zero_u8(); + const npyv_u8 truemask = npyv_setall_u8(1 == 1); + npyv_b8 ma = npyv_cmpeq_u8(a, zero); + npyv_b8 mb = npyv_cmpeq_u8(b, zero); + npyv_u8 r = npyv_cvt_u8_b8(npyv_or_b8(ma, mb)); + return npyv_andc_u8(truemask, r); +} +/* + * We don't really need the following, but it simplifies the templating code + * below since it is paired with simd_logical_and_u8() above. + */ +NPY_FINLINE npyv_u8 simd_logical_or_u8(npyv_u8 a, npyv_u8 b) +{ + npyv_u8 r = npyv_or_u8(a, b); + return byte_to_true(r); +} + + +/**begin repeat + * #kind = logical_and, logical_or# + * #and = 1, 0# + * #scalar_op = &&, ||# + * #intrin = and, or# + * #reduce = min, max# + * #scalar_cmp = ==, !=# + * #anyall = all, any# + */ +static void +simd_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp len) +{ + #define UNROLL 16 + + const int vstep = npyv_nlanes_u8; + const int wstep = vstep * UNROLL; + + // Unrolled vectors loop + for (; len >= wstep; len -= wstep, ip1 += wstep, ip2 += wstep, op += wstep) { + /**begin repeat1 + * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @unroll@ + npyv_u8 a@unroll@ = npyv_load_u8(ip1 + vstep * @unroll@); + npyv_u8 b@unroll@ = npyv_load_u8(ip2 + vstep * @unroll@); + npyv_u8 r@unroll@ = simd_logical_@intrin@_u8(a@unroll@, b@unroll@); + npyv_store_u8(op + vstep * @unroll@, r@unroll@); + #endif + /**end repeat1**/ + } + #undef UNROLL + + // Single vectors loop + for (; len >= vstep; len -= vstep, ip1 += vstep, ip2 += vstep, op += vstep) { + npyv_u8 a = npyv_load_u8(ip1); + npyv_u8 b = npyv_load_u8(ip2); + npyv_u8 r = simd_logical_@intrin@_u8(a, b); + npyv_store_u8(op, r); + } + + // Scalar loop to finish off + for (; len > 0; len--, ip1++, ip2++, op++) { + *op = *ip1 @scalar_op@ *ip2; + } +} + +static void +simd_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len) +{ + #define UNROLL 8 + + const int vstep = npyv_nlanes_u8; + const int wstep = vstep * UNROLL; + + // Unrolled vectors loop + for (; len >= wstep; len -= wstep, ip += wstep) { + #if defined(NPY_HAVE_SSE2) + NPY_PREFETCH(ip + wstep, 0, 3); + #endif + npyv_u8 v0 = npyv_load_u8(ip + vstep * 0); + npyv_u8 v1 = npyv_load_u8(ip + vstep * 1); + npyv_u8 v2 = npyv_load_u8(ip + vstep * 2); + npyv_u8 v3 = npyv_load_u8(ip + vstep * 3); + npyv_u8 v4 = npyv_load_u8(ip + vstep * 4); + npyv_u8 v5 = npyv_load_u8(ip + vstep * 5); + npyv_u8 v6 = npyv_load_u8(ip + vstep * 6); + npyv_u8 v7 = npyv_load_u8(ip + vstep * 7); + + npyv_u8 m01 = npyv_@reduce@_u8(v0, v1); + npyv_u8 m23 = npyv_@reduce@_u8(v2, v3); + npyv_u8 m45 = npyv_@reduce@_u8(v4, v5); + npyv_u8 m67 = npyv_@reduce@_u8(v6, v7); + + npyv_u8 m0123 = npyv_@reduce@_u8(m01, m23); + npyv_u8 m4567 = npyv_@reduce@_u8(m45, m67); + + npyv_u8 mv = npyv_@reduce@_u8(m0123, m4567); + + if(npyv_@anyall@_u8(mv) @scalar_cmp@ 0){ + *op = !@and@; + return; + } + } + + // Single vectors loop + for (; len >= vstep; len -= vstep, ip += vstep) { + npyv_u8 v0 = npyv_load_u8(ip); + if(npyv_@anyall@_u8(v0) @scalar_cmp@ 0){ + *op = !@and@; + return; + } + } + + // Scalar loop to finish off + for (; len > 0; --len, ++ip) { + *op = *op @scalar_op@ *ip; + if (*op @scalar_cmp@ 0) { + return; + } + } +#undef UNROLL +} +/**end repeat**/ + +/**begin repeat + * #kind = logical_not, absolute# + * #op = ==, !=# + * #not = 1, 0# + */ +static void +simd_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len) +{ + #define UNROLL 16 + + const int vstep = 
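
This reduce kernel's early exit rests on a small observation: folding a block of vectors with a byte-wise minimum yields a zero byte iff any input byte was zero, so one npyv_all test per eight-vector block answers np.all; np.any uses max and npyv_any symmetrically. The same idea in scalar form (sketch, hypothetical helper name):

    #include <stdbool.h>
    #include <stddef.h>

    /* Blocked AND-reduction with early exit: min-fold a block, then one
     * "did the fold hit zero?" test per block. */
    static bool all_true(const unsigned char *ip, size_t len)
    {
        enum { BLOCK = 64 };
        size_t i = 0;
        for (; i + BLOCK <= len; i += BLOCK) {
            unsigned char m = 0xff;
            for (size_t j = 0; j < BLOCK; j++) {
                m = ip[i + j] < m ? ip[i + j] : m;
            }
            if (m == 0) {
                return false;
            }
        }
        for (; i < len; i++) {
            if (ip[i] == 0) {
                return false;
            }
        }
        return true;
    }
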
npyv_nlanes_u8; + const int wstep = vstep * UNROLL; + + #if @not@ + const npyv_u8 zero = npyv_zero_u8(); + #endif + + // Unrolled vectors loop + for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) { + /**begin repeat1 + * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @unroll@ + npyv_u8 v@unroll@ = npyv_load_u8(ip + vstep * @unroll@); +#if @not@ + npyv_u8 r@unroll@ = mask_to_true(npyv_cmpeq_u8(v@unroll@, zero)); +#else + npyv_u8 r@unroll@ = byte_to_true(v@unroll@); +#endif + npyv_store_u8(op + vstep * @unroll@, r@unroll@); + #endif + /**end repeat1**/ + } + #undef UNROLL + + // Single vectors loop + for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) { + npyv_u8 v = npyv_load_u8(ip); +#if @not@ + npyv_u8 r = mask_to_true(npyv_cmpeq_u8(v, zero)); +#else + npyv_u8 r = byte_to_true(v); +#endif + npyv_store_u8(op, r); + } + + // Scalar loop to finish off + for (; len > 0; --len, ++ip, ++op) { + *op = (*ip @op@ 0); + } +} +/**end repeat**/ + +#endif // NPY_SIMD + +/******************************************************************************* + ** Defining ufunc inner functions + ******************************************************************************/ + +/**begin repeat + * # kind = logical_or, logical_and# + */ +static NPY_INLINE int +run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ +#if NPY_SIMD + if (sizeof(npy_bool) == 1 && + IS_BLOCKABLE_BINARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) { + simd_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0], + (npy_bool*)args[1], dimensions[0]); + return 1; + } +#endif + return 0; +} + + +static NPY_INLINE int +run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ +#if NPY_SIMD + if (sizeof(npy_bool) == 1 && + IS_BLOCKABLE_REDUCE(sizeof(npy_bool), NPY_SIMD_WIDTH)) { + simd_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1], + dimensions[0]); + return 1; + } +#endif + return 0; +} +/**end repeat**/ + +/**begin repeat + * #kind = logical_not, absolute# + */ +static NPY_INLINE int +run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ +#if NPY_SIMD + if (sizeof(npy_bool) == 1 && + IS_BLOCKABLE_UNARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) { + simd_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]); + return 1; + } +#endif + return 0; +} +/**end repeat**/ + + +/**begin repeat + * #kind = logical_and, logical_or# + * #OP = &&, ||# + * #SC = ==, !=# + * #and = 1, 0# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if(IS_BINARY_REDUCE) { +#if NPY_SIMD + /* + * stick with our variant for more reliable performance, only known + * platform which outperforms it by ~20% is an i7 with glibc 2.17 + */ + if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) { + return; + } +#else + /* for now only use libc on 32-bit/non-x86 */ + if (steps[1] == 1) { + npy_bool * op = (npy_bool *)args[0]; +#if @and@ + /* np.all(), search for a zero (false) */ + if (*op) { + *op = memchr(args[1], 0, dimensions[0]) == NULL; + } +#else + /* + * np.any(), search for a non-zero (true) via comparing against + * zero blocks, memcmp is faster than memchr on SSE4 machines + * with glibc >= 2.12 and memchr can only check for equal 1 + */ + static const npy_bool zero[4096]; /* zero by C standard */ + npy_uintp i, n = dimensions[0]; + + for (i = 0; !*op && i < n - (n % sizeof(zero)); 
i += sizeof(zero)) { + *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0; + } + if (!*op && n - i > 0) { + *op = memcmp(&args[1][i], zero, n - i) != 0; + } +#endif + return; + } +#endif + else { + BINARY_REDUCE_LOOP(npy_bool) { + const npy_bool in2 = *(npy_bool *)ip2; + io1 = io1 @OP@ in2; + if (io1 @SC@ 0) { + break; + } + } + *((npy_bool *)iop1) = io1; + } + } + else { + if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) { + return; + } + else { + BINARY_LOOP { + const npy_bool in1 = *(npy_bool *)ip1; + const npy_bool in2 = *(npy_bool *)ip2; + *((npy_bool *)op1) = in1 @OP@ in2; + } + } + } +} +/**end repeat**/ + +/**begin repeat + * #kind = logical_not, absolute# + * #OP = ==, !=# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) { + return; + } + else { + UNARY_LOOP { + npy_bool in1 = *(npy_bool *)ip1; + *((npy_bool *)op1) = in1 @OP@ 0; + } + } +} +/**end repeat**/ + diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src index 237c8e933..9d8667d38 100644 --- a/numpy/core/src/umath/loops_minmax.dispatch.c.src +++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src @@ -451,6 +451,24 @@ clear_fp: #endif } + +NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) +(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) +{ + char *ip1 = args[0]; + char *indx = args[1]; + char *value = args[2]; + npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2]; + npy_intp n = dimensions[0]; + npy_intp i; + @type@ *indexed; + for(i = 0; i < n; i++, indx += isindex, value += isb) { + indexed = (@type@ *)(ip1 + is1 * *(npy_intp *)indx); + *indexed = SCALAR_OP(*indexed, *(@type@ *)value); + } + return 0; +} + #undef SCALAR_OP #endif // !fp_only || (is_fp && fp_only) diff --git a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src index 78685e807..1b77592c8 100644 --- a/numpy/core/src/umath/loops_trigonometric.dispatch.c.src +++ b/numpy/core/src/umath/loops_trigonometric.dispatch.c.src @@ -9,12 +9,18 @@ #include "simd/simd.h" #include "loops_utils.h" #include "loops.h" +#include "fast_loop_macros.h" /* * TODO: * - use vectorized version of Payne-Hanek style reduction for large elements or * when there's no native FUSED support instead of fallback to libc */ -#if NPY_SIMD_F32 && NPY_SIMD_FMA3 // native support +#if NPY_SIMD_FMA3 // native support +/**begin repeat + * #check = F64, F32# + * #sfx = f64, f32# + */ +#if NPY_SIMD_@check@ /* * Vectorized Cody-Waite range reduction technique * Performs the reduction step x* = x - y*C in three steps: @@ -23,14 +29,189 @@ * 3) x* = x - y*c3 * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision */ -NPY_FINLINE npyv_f32 -simd_range_reduction_f32(npyv_f32 x, npyv_f32 y, npyv_f32 c1, npyv_f32 c2, npyv_f32 c3) +NPY_FINLINE npyv_@sfx@ +simd_range_reduction_@sfx@(npyv_@sfx@ x, npyv_@sfx@ y, npyv_@sfx@ c1, npyv_@sfx@ c2, npyv_@sfx@ c3) { - npyv_f32 reduced_x = npyv_muladd_f32(y, c1, x); - reduced_x = npyv_muladd_f32(y, c2, reduced_x); - reduced_x = npyv_muladd_f32(y, c3, reduced_x); + npyv_@sfx@ reduced_x = npyv_muladd_@sfx@(y, c1, x); + reduced_x = npyv_muladd_@sfx@(y, c2, reduced_x); + reduced_x = npyv_muladd_@sfx@(y, c3, reduced_x); return reduced_x; } +#endif 
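
The Cody-Waite reduction templated above deserves a worked example. Writing C = c1 + c2 + c3 with c1 and c2 exactly representable means each FMA step x - y*ci cancels the leading bits without ever rounding the full product y*C at once, preserving the small remainder. A standalone double-precision sketch using the pi split that simd_range_reduction_pi2 below feeds in:

    #include <math.h>
    #include <stdio.h>

    /* r = x - n*pi in three FMA steps; the constants are the negated
     * three-way split of pi used by simd_range_reduction_pi2. */
    static double range_reduce_pi(double x, double n)
    {
        const double c1 = -0x1.921fb54442d18p+1;   /* -pi (high)  */
        const double c2 = -0x1.1a62633145c06p-53;  /* -pi (mid)   */
        const double c3 = -0x1.c1cd129024e09p-106; /* -pi (low)   */
        double r = fma(n, c1, x);
        r = fma(n, c2, r);
        return fma(n, c3, r);
    }

    int main(void)
    {
        /* e.g. reduce x = 100 with n = rint(100/pi) = 32 into [-pi/2, pi/2] */
        printf("%.17g\n", range_reduce_pi(100.0, 32.0));
        return 0;
    }
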
+/**end repeat**/ + +#if NPY_SIMD_F64 +/**begin repeat + * #op = cos, sin# + */ +#if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN) +NPY_FINLINE npyv_f64 +#else +NPY_NOINLINE npyv_f64 +#endif +simd_@op@_scalar_f64(npyv_f64 out, npy_uint64 cmp_bits) +{ + // MSVC doesn't compile with direct vector access, so we copy it here + // as we have no npyv_get_lane/npyv_set_lane intrinsics + npy_double NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) out_copy[npyv_nlanes_f64]; + npyv_storea_f64(out_copy, out); + + for (unsigned i = 0; i < npyv_nlanes_f64; ++i) { + if (cmp_bits & (1 << i)) { + out_copy[i] = npy_@op@(out_copy[i]); + } + } + + return npyv_loada_f64(out_copy); +} +/**end repeat**/ + +/* + * Approximate sine algorithm for x \in [-pi/2, pi/2] + * worst-case error is 3.5 ulp. + * abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. + */ +NPY_FINLINE npyv_f64 +simd_approx_sine_poly_f64(npyv_f64 r) +{ + const npyv_f64 poly1 = npyv_setall_f64(-0x1.9f4a9c8b21dc9p-41); + const npyv_f64 poly2 = npyv_setall_f64(0x1.60e88a10163f2p-33); + const npyv_f64 poly3 = npyv_setall_f64(-0x1.ae6361b7254e7p-26); + const npyv_f64 poly4 = npyv_setall_f64(0x1.71de382e8d62bp-19); + const npyv_f64 poly5 = npyv_setall_f64(-0x1.a01a019aeb4ffp-13); + const npyv_f64 poly6 = npyv_setall_f64(0x1.111111110b25ep-7); + const npyv_f64 poly7 = npyv_setall_f64(-0x1.55555555554c3p-3); + + npyv_f64 r2 = npyv_mul_f64(r, r); + npyv_f64 y = npyv_muladd_f64(poly1, r2, poly2); + y = npyv_muladd_f64(y, r2, poly3); + y = npyv_muladd_f64(y, r2, poly4); + y = npyv_muladd_f64(y, r2, poly5); + y = npyv_muladd_f64(y, r2, poly6); + y = npyv_muladd_f64(y, r2, poly7); + y = npyv_muladd_f64(npyv_mul_f64(y, r2), r, r); + + return y; +} + +/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ +NPY_FINLINE npyv_f64 +simd_range_reduction_pi2(npyv_f64 r, npyv_f64 n) { + const npyv_f64 pi1 = npyv_setall_f64(-0x1.921fb54442d18p+1); + const npyv_f64 pi2 = npyv_setall_f64(-0x1.1a62633145c06p-53); + const npyv_f64 pi3 = npyv_setall_f64(-0x1.c1cd129024e09p-106); + + return simd_range_reduction_f64(r, n, pi1, pi2, pi3); +} + +NPY_FINLINE npyv_b64 simd_sin_range_check_f64(npyv_u64 ir) { + const npyv_u64 tiny_bound = npyv_setall_u64(0x202); /* top12 (asuint64 (0x1p-509)). */ + const npyv_u64 simd_thresh = npyv_setall_u64(0x214); /* top12 (asuint64 (RangeVal)) - SIMD_TINY_BOUND. */ + + return npyv_cmpge_u64(npyv_sub_u64(npyv_shri_u64(ir, 52), tiny_bound), simd_thresh); +} + +NPY_FINLINE npyv_b64 simd_cos_range_check_f64(npyv_u64 ir) { + const npyv_f64 range_val = npyv_setall_f64(0x1p23); + + return npyv_cmpge_u64(ir, npyv_reinterpret_u64_f64(range_val)); +} + +NPY_FINLINE npyv_f64 +simd_cos_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign) +{ + const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2); + const npyv_f64 half_pi = npyv_setall_f64(0x1.921fb54442d18p+0); + const npyv_f64 shift = npyv_setall_f64(0x1.8p52); + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + npyv_f64 n = npyv_muladd_f64(inv_pi, npyv_add_f64(r, half_pi), shift); + npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63); + n = npyv_sub_f64(n, shift); + n = npyv_sub_f64(n, npyv_setall_f64(0.5)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = simd_range_reduction_pi2(r, n); + + /* sin(r) poly approx. */ + npyv_f64 y = simd_approx_sine_poly_f64(r); + + /* sign. 
*/ + return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), odd)); +} + +NPY_FINLINE npyv_f64 +simd_sin_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign) +{ + const npyv_f64 inv_pi = npyv_setall_f64(0x1.45f306dc9c883p-2); + const npyv_f64 shift = npyv_setall_f64(0x1.8p52); + + /* n = rint(|x|/pi). */ + npyv_f64 n = npyv_muladd_f64(inv_pi, r, shift); + npyv_u64 odd = npyv_shli_u64(npyv_reinterpret_u64_f64(n), 63); + n = npyv_sub_f64(n, shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = simd_range_reduction_pi2(r, n); + + /* sin(r) poly approx. */ + npyv_f64 y = simd_approx_sine_poly_f64(r); + + /* sign. */ + return npyv_reinterpret_f64_u64(npyv_xor_u64(npyv_xor_u64(npyv_reinterpret_u64_f64(y), sign), odd)); +} + +/**begin repeat + * #op = cos, sin# + */ +NPY_FINLINE void +simd_@op@_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len) +{ + const npyv_u64 abs_mask = npyv_setall_u64(0x7fffffffffffffff); + const int vstep = npyv_nlanes_f64; + + npyv_f64 out = npyv_zero_f64(); + npyv_f64 x_in; + + for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + if (ssrc == 1) { + x_in = npyv_load_tillz_f64(src, len); + } else { + x_in = npyv_loadn_tillz_f64(src, ssrc, len); + } + + npyv_u64 ir = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), abs_mask); + npyv_f64 r = npyv_reinterpret_f64_u64(ir); + npyv_u64 sign = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), npyv_not_u64(abs_mask)); + + npyv_b64 cmp = simd_@op@_range_check_f64(ir); + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + scalar loop later. */ + r = npyv_select_f64(cmp, npyv_setall_f64(1.0), r); + + // Some in range, at least one calculation is useful + if (!npyv_all_b64(cmp)) { + out = simd_@op@_poly_f64(r, ir, sign); + } + + if (npyv_any_b64(cmp)) { + out = npyv_select_f64(cmp, x_in, out); + out = simd_@op@_scalar_f64(out, npyv_tobits_b64(cmp)); + } + + if (sdst == 1) { + npyv_store_till_f64(dst, len, out); + } else { + npyv_storen_till_f64(dst, sdst, len, out); + } + } + npyv_cleanup(); +} +/**end repeat**/ +#endif // NPY_SIMD_F64 + +#if NPY_SIMD_F32 /* * Approximate cosine algorithm for x \in [-PI/4, PI/4] * Maximum ULP across all 32-bit floats = 0.875 @@ -124,6 +305,11 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, } else { x_in = npyv_loadn_tillz_f32(src, ssrc, len); } + npyv_b32 nnan_mask = npyv_notnan_f32(x_in); + #if NPY_SIMD_CMPSIGNAL + // Eliminate NaN to avoid FP invalid exception + x_in = npyv_and_f32(x_in, npyv_reinterpret_f32_u32(npyv_cvt_u32_b32(nnan_mask))); + #endif npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody); npy_uint64 simd_maski = npyv_tobits_b32(simd_mask); /* @@ -132,7 +318,6 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, * these numbers */ if (simd_maski != 0) { - npyv_b32 nnan_mask = npyv_notnan_f32(x_in); npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf); npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi); @@ -169,7 +354,7 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npyv_storen_till_f32(dst, sdst, len, cos); } } - if (simd_maski != ((1 << vstep) - 1)) { + if (simd_maski != (npy_uint64)((1 << vstep) - 1)) { float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) ip_fback[npyv_nlanes_f32]; npyv_storea_f32(ip_fback, x_in); @@ -194,24 +379,58 @@ simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp 
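
One subtlety in the f32 hunk above: on targets where vector compares raise FP exceptions on quiet NaNs (NPY_SIMD_CMPSIGNAL), the input is AND-ed with its not-NaN mask first, turning NaN lanes into +0.0f so the |x| <= max_cody compare stays silent; the NaN lanes themselves are still excluded from the polynomial further down via nnan_mask. A bit-level scalar model of the squash (sketch):

    #include <math.h>
    #include <stdint.h>
    #include <string.h>

    /* AND the value with an all-ones/all-zeros not-NaN mask, mirroring
     * npyv_and_f32(x_in, cvt(nnan_mask)): NaN lanes become +0.0f. */
    static float squash_nan(float x)
    {
        uint32_t bits, mask = isnan(x) ? 0u : 0xffffffffu;
        memcpy(&bits, &x, sizeof bits);
        bits &= mask;
        memcpy(&x, &bits, sizeof bits);
        return x;
    }
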
sdst, } npyv_cleanup(); } -#endif // NPY_SIMD_FMA3 +#endif // NPY_SIMD_FP32 +#endif // NYP_SIMD_FMA3 /**begin repeat * #func = cos, sin# - * #enum = SIMD_COMPUTE_COS, SIMD_COMPUTE_SIN# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ +#if NPY_SIMD_F64 && NPY_SIMD_FMA3 + const double *src = (double*)args[0]; + double *dst = (double*)args[1]; + const int lsize = sizeof(src[0]); + const npy_intp ssrc = steps[0] / lsize; + const npy_intp sdst = steps[1] / lsize; + npy_intp len = dimensions[0]; + assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0)); + + if (is_mem_overlap(src, steps[0], dst, steps[1], len) || + !npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst) + ) { + for (; len > 0; --len, src += ssrc, dst += sdst) { + simd_@func@_f64(src, 1, dst, 1, 1); + } + } else { + simd_@func@_f64(src, ssrc, dst, sdst, len); + } +#else + UNARY_LOOP { + const npy_double in1 = *(npy_double *)ip1; + *(npy_double *)op1 = npy_@func@(in1); + } +#endif +} +/**end repeat**/ + +/**begin repeat + * #func = sin, cos# + * #enum = SIMD_COMPUTE_SIN, SIMD_COMPUTE_COS# */ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@) (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { - const float *src = (float*)args[0]; - float *dst = (float*)args[1]; +#if NPY_SIMD_F32 && NPY_SIMD_FMA3 + const npy_float *src = (npy_float*)args[0]; + npy_float *dst = (npy_float*)args[1]; const int lsize = sizeof(src[0]); const npy_intp ssrc = steps[0] / lsize; const npy_intp sdst = steps[1] / lsize; npy_intp len = dimensions[0]; assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0)); -#if NPY_SIMD_F32 && NPY_SIMD_FMA3 if (is_mem_overlap(src, steps[0], dst, steps[1], len) || !npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst) ) { @@ -222,9 +441,9 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_@func@) simd_sincos_f32(src, ssrc, dst, sdst, len, @enum@); } #else - for (; len > 0; --len, src += ssrc, dst += sdst) { - const float src0 = *src; - *dst = npy_@func@f(src0); + UNARY_LOOP { + const npy_float in1 = *(npy_float *)ip1; + *(npy_float *)op1 = npy_@func@f(in1); } #endif } diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src index 46ce51824..89999e879 100644 --- a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src @@ -12,10 +12,12 @@ /**begin repeat * #sfx = f32, f64# * #func_suffix = f16, 8# + * #len = 32, 64# */ /**begin repeat1 * #func = exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh# * #default_val = 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0# + * #fxnan = 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0# */ static void simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc, @@ -37,7 +39,15 @@ simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc, x = npyv_loadn_tillz_@sfx@(src, ssrc, len); } #endif + #if @fxnan@ == @len@ + // Workaround, invalid value encountered when x is set to nan + npyv_b@len@ nnan_mask = npyv_notnan_@sfx@(x); + npyv_@sfx@ x_exnan = npyv_select_@sfx@(nnan_mask, x, npyv_setall_@sfx@(@default_val@)); + npyv_@sfx@ out = __svml_@func@@func_suffix@(x_exnan); + out = npyv_select_@sfx@(nnan_mask, out, x); + #else npyv_@sfx@ out = __svml_@func@@func_suffix@(x); + #endif if (sdst == 1) { npyv_store_till_@sfx@(dst, len, out); } else { @@ 
-50,32 +60,6 @@ simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc, /**end repeat**/ /**begin repeat - * #func = sin, cos# - */ -static void -simd_@func@_f64(const double *src, npy_intp ssrc, - double *dst, npy_intp sdst, npy_intp len) -{ - const int vstep = npyv_nlanes_f64; - for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { - npyv_f64 x; - if (ssrc == 1) { - x = npyv_load_tillz_f64(src, len); - } else { - x = npyv_loadn_tillz_f64(src, ssrc, len); - } - npyv_f64 out = __svml_@func@8(x); - if (sdst == 1) { - npyv_store_till_f64(dst, len, out); - } else { - npyv_storen_till_f64(dst, sdst, len, out); - } - } - npyv_cleanup(); -} -/**end repeat**/ - -/**begin repeat * #sfx = f32, f64# * #func_suffix = f16, 8# */ @@ -267,31 +251,3 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@) } /**end repeat1**/ /**end repeat**/ - -/**begin repeat - * #func = sin, cos# - */ -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) -{ -#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML) - const double *src = (double*)args[0]; - double *dst = (double*)args[1]; - const int lsize = sizeof(src[0]); - const npy_intp ssrc = steps[0] / lsize; - const npy_intp sdst = steps[1] / lsize; - const npy_intp len = dimensions[0]; - assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0)); - if (!is_mem_overlap(src, steps[0], dst, steps[1], len) && - npyv_loadable_stride_f64(ssrc) && - npyv_storable_stride_f64(sdst)) { - simd_@func@_f64(src, ssrc, dst, sdst, len); - return; - } -#endif - UNARY_LOOP { - const npy_double in1 = *(npy_double *)ip1; - *(npy_double *)op1 = npy_@func@(in1); - } -} -/**end repeat**/ diff --git a/numpy/core/src/umath/loops_unary_complex.dispatch.c.src b/numpy/core/src/umath/loops_unary_complex.dispatch.c.src new file mode 100644 index 000000000..052ad464c --- /dev/null +++ b/numpy/core/src/umath/loops_unary_complex.dispatch.c.src @@ -0,0 +1,139 @@ +/*@targets + ** $maxopt baseline + ** sse2 (avx2 fma3) avx512f + ** neon asimd + ** vsx2 vsx3 + ** vx vxe + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/**begin repeat + * #type = npy_float, npy_double# + * #sfx = f32, f64# + * #bsfx = b32, b64# + * #usfx = b32, u64# + * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# + * #is_double = 0, 1# + * #c = f, # + * #INF = NPY_INFINITYF, NPY_INFINITY# + * #NAN = NPY_NANF, NPY_NAN# + */ +#if @VECTOR@ +NPY_FINLINE npyv_@sfx@ +simd_cabsolute_@sfx@(npyv_@sfx@ re, npyv_@sfx@ im) +{ + const npyv_@sfx@ inf = npyv_setall_@sfx@(@INF@); + const npyv_@sfx@ nan = npyv_setall_@sfx@(@NAN@); + + re = npyv_abs_@sfx@(re); + im = npyv_abs_@sfx@(im); + /* + * If real or imag = INF, then convert it to inf + j*inf + * Handles: inf + j*nan, nan + j*inf + */ + npyv_@bsfx@ re_infmask = npyv_cmpeq_@sfx@(re, inf); + npyv_@bsfx@ im_infmask = npyv_cmpeq_@sfx@(im, inf); + im = npyv_select_@sfx@(re_infmask, inf, im); + re = npyv_select_@sfx@(im_infmask, inf, re); + /* + * If real or imag = NAN, then convert it to nan + j*nan + * Handles: x + j*nan, nan + j*x + */ + npyv_@bsfx@ re_nnanmask = npyv_notnan_@sfx@(re); + npyv_@bsfx@ im_nnanmask = npyv_notnan_@sfx@(im); + im = npyv_select_@sfx@(re_nnanmask, im, nan); + re = npyv_select_@sfx@(im_nnanmask, re, 
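
simd_cabsolute, listed here in full, avoids the naive sqrt(re^2 + im^2): squaring overflows for magnitudes near the type's maximum and flushes to zero for tiny inputs, so it computes larger * sqrt(1 + (smaller/larger)^2) instead, with div_mask keeping 0/0 and inf/inf out of the ratio. A scalar rendering of the same scheme (sketch; cabs_scaled is an illustrative name):

    #include <math.h>

    /* Scaled hypot: normalize specials first (inf dominates nan), then
     * scale by the larger magnitude so the square cannot over/underflow. */
    static double cabs_scaled(double re, double im)
    {
        re = fabs(re);
        im = fabs(im);
        if (isinf(re) || isinf(im)) {
            return INFINITY;
        }
        if (isnan(re) || isnan(im)) {
            return NAN;
        }
        double larger  = re > im ? re : im;
        double smaller = re > im ? im : re;
        if (larger == 0.0) {
            return 0.0;
        }
        double ratio = smaller / larger;
        return larger * sqrt(1.0 + ratio * ratio);
    }
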
nan); + + npyv_@sfx@ larger = npyv_max_@sfx@(re, im); + npyv_@sfx@ smaller = npyv_min_@sfx@(im, re); + /* + * Calculate div_mask to prevent 0./0. and inf/inf operations in div + */ + npyv_@bsfx@ zeromask = npyv_cmpeq_@sfx@(larger, npyv_zero_@sfx@()); + npyv_@bsfx@ infmask = npyv_cmpeq_@sfx@(smaller, inf); + npyv_@bsfx@ div_mask = npyv_not_@bsfx@(npyv_or_@bsfx@(zeromask, infmask)); + + npyv_@sfx@ ratio = npyv_ifdivz_@sfx@(div_mask, smaller, larger); + npyv_@sfx@ hypot = npyv_sqrt_@sfx@( + npyv_muladd_@sfx@(ratio, ratio, npyv_setall_@sfx@(1.0@c@) + )); + return npyv_mul_@sfx@(hypot, larger); +} +#endif // VECTOR +/**end repeat**/ + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * complex types + * #TYPE = CFLOAT, CDOUBLE# + * #ftype = npy_float, npy_double# + * #VECTOR = NPY_SIMD_F32, NPY_SIMD_F64# + * #sfx = f32, f64# + * #c = f, # + * #C = F, # + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_absolute) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if @VECTOR@ + npy_intp len = dimensions[0]; + npy_intp ssrc = steps[0] / sizeof(@ftype@); + npy_intp sdst = steps[1] / sizeof(@ftype@); + + if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) && + npyv_loadable_stride_@sfx@(ssrc) && npyv_storable_stride_@sfx@(sdst) + && steps[0] % sizeof(@ftype@) == 0 + && steps[1] % sizeof(@ftype@) == 0 + ) { + const @ftype@ *src = (@ftype@*)args[0]; + @ftype@ *dst = (@ftype@*)args[1]; + + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * 2; + const int hstep = vstep / 2; + + if (ssrc == 2 && sdst == 1) { + for (; len >= vstep; len -= vstep, src += wstep, dst += vstep) { + npyv_@sfx@x2 ab = npyv_load_@sfx@x2(src); + npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]); + npyv_store_@sfx@(dst, r); + } + } + else { + for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + npyv_@sfx@ re_im0 = npyv_loadn2_@sfx@(src, ssrc); + npyv_@sfx@ re_im1 = npyv_loadn2_@sfx@(src + ssrc*hstep, ssrc); + npyv_@sfx@x2 ab = npyv_unzip_@sfx@(re_im0, re_im1); + npyv_@sfx@ r = simd_cabsolute_@sfx@(ab.val[0], ab.val[1]); + npyv_storen_@sfx@(dst, sdst, r); + } + } + for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { + npyv_@sfx@ rl = npyv_loadn_tillz_@sfx@(src, ssrc, len); + npyv_@sfx@ im = npyv_loadn_tillz_@sfx@(src + 1, ssrc, len); + npyv_@sfx@ r = simd_cabsolute_@sfx@(rl, im); + npyv_storen_till_@sfx@(dst, sdst, len, r); + } + npyv_cleanup(); + npy_clear_floatstatus_barrier((char*)&len); + return; + } +#endif + UNARY_LOOP { + const @ftype@ re = ((@ftype@ *)ip1)[0]; + const @ftype@ im = ((@ftype@ *)ip1)[1]; + *((@ftype@ *)op1) = npy_hypot@c@(re, im); + } +} +/**end repeat**/ diff --git a/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src new file mode 100644 index 000000000..ba133dc1e --- /dev/null +++ b/numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src @@ -0,0 +1,565 @@ +/*@targets + ** $maxopt baseline + ** sse2 sse41 + ** vsx2 + ** neon asimd + **/ + +/** + * Force use SSE only on x86, even if AVX2 or AVX512F are enabled + * through the baseline, since scatter(AVX512F) and gather very costly + * to handle non-contiguous memory access comparing with SSE for + * such small operations that this file covers. 
+ */
+#define NPY_SIMD_FORCE_128
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include <float.h>
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/**
+ * This code should really be merged into loops_unary_fp.dispatch.c.src
+ * However there is an issue with enabling the code here for VX and VXE
+ * as the shifts don't behave as expected.
+ * See the code below that references NPY__CPU_TARGET_VX and
+ * NPY_BIG_ENDIAN. Suspect that this is a big endian vector issue.
+ *
+ * Splitting the files out allows us to keep loops_unary_fp.dispatch.c.src
+ * building for VX and VXE so we don't regress performance while adding this
+ * code for other platforms.
+ */
+// TODO(@seiko2plus): add support for big-endian
+#if NPY_SIMD_BIGENDIAN
+    #undef NPY_SIMD
+    #undef NPY_SIMD_F32
+    #undef NPY_SIMD_F64
+    #define NPY_SIMD 0
+    #define NPY_SIMD_F32 0
+    #define NPY_SIMD_F64 0
+#endif
+
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+
+/**
+ * We define intrinsics for isnan, isinf, isfinite, and signbit below. There are
+ * a few flavors of each. We'll use f32 as an example although f64 versions
+ * are also defined.
+ *
+ * npyv_u32 npyv_KIND_f32(npyv_f32 v)
+ *   These are mainly used for the single-vector loops. As such, the result
+ *   should be bool true / false, ready to write back.
+ *
+ * npyv_b32 _npyv_KIND_f32(npyv_f32 v)
+ *   These are used by the general intrinsics above as well as the multi-vector
+ *   packing intrinsics. The multi-vector packing intrinsics are the ones
+ *   utilized in the unrolled vector loops. Results should be vector masks
+ *   of 0x00/0xff.
+ *
+ * npyv_u8 npyv_pack_KIND_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+ *   These are the multi-vector packing intrinsics utilized by the unrolled
+ *   vector loops. They perform the operation on all input vectors and pack the
+ *   results into a single npyv_u8. Assuming NPY_SIMD == 128, that means we
+ *   can pack the results from 4x npyv_f32 or 8x npyv_f64 in a single npyv_u8.
+ *   Result should be bool true / false, ready to write back.
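+ *
+ *   For example, npyv_pack_isnan_f32 below runs npyv_notnan_f32 on each of
+ *   its four inputs, merges the four b32 masks via npyv_pack_b8_b32, and
+ *   inverts the packed mask against a 0x01 truemask, leaving sixteen 0/1
+ *   bytes ready to store into the npy_bool output.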
+ */ + +#if NPY_SIMD_F32 +NPY_FINLINE npyv_u32 +npyv_isnan_f32(npyv_f32 v) +{ + const npyv_u8 truemask = npyv_reinterpret_u8_u32(npyv_setall_u32(1==1)); + npyv_u8 notnan = npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notnan_f32(v))); + return npyv_reinterpret_u32_u8(npyv_andc_u8(truemask, notnan)); +} +NPY_FINLINE npyv_u8 +npyv_pack_isnan_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) +{ + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b32 b0 = npyv_notnan_f32(v0); + npyv_b32 b1 = npyv_notnan_f32(v1); + npyv_b32 b2 = npyv_notnan_f32(v2); + npyv_b32 b3 = npyv_notnan_f32(v3); + npyv_b8 notnan = npyv_pack_b8_b32(b0, b1, b2, b3); + return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan)); +} +#endif +#if NPY_SIMD_F64 +NPY_FINLINE npyv_u64 +npyv_isnan_f64(npyv_f64 v) +{ + const npyv_u8 truemask = npyv_reinterpret_u8_u64(npyv_setall_u64(1==1)); + npyv_u8 notnan = npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notnan_f64(v))); + return npyv_reinterpret_u64_u8(npyv_andc_u8(truemask, notnan)); +} +NPY_FINLINE npyv_u8 +npyv_pack_isnan_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, + npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) +{ + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b64 b0 = npyv_notnan_f64(v0); + npyv_b64 b1 = npyv_notnan_f64(v1); + npyv_b64 b2 = npyv_notnan_f64(v2); + npyv_b64 b3 = npyv_notnan_f64(v3); + npyv_b64 b4 = npyv_notnan_f64(v4); + npyv_b64 b5 = npyv_notnan_f64(v5); + npyv_b64 b6 = npyv_notnan_f64(v6); + npyv_b64 b7 = npyv_notnan_f64(v7); + npyv_b8 notnan = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7); + return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan)); +} +#endif + +#if NPY_SIMD_F32 +NPY_FINLINE npyv_b32 +_npyv_isinf_f32(npyv_f32 v) +{ +#if defined(NPY_HAVE_NEON) + // abs(v) > FLT_MAX + const npyv_f32 fltmax = npyv_setall_f32(FLT_MAX); + return vcagtq_f32(v, fltmax); +#else + // cast out the sign and check if all exponent bits are set. + const npyv_u32 exp_mask = npyv_setall_u32(0xff000000); + npyv_u32 bits = npyv_shli_u32(npyv_reinterpret_u32_f32(v), 1); + return npyv_cmpeq_u32(bits, exp_mask); +#endif +} +NPY_FINLINE npyv_u32 +npyv_isinf_f32(npyv_f32 v) +{ + const npyv_u32 truemask = npyv_setall_u32(1==1); + return npyv_and_u32(truemask, npyv_cvt_u32_b32(_npyv_isinf_f32(v))); +} +NPY_FINLINE npyv_u8 +npyv_pack_isinf_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) +{ + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b32 b0 = _npyv_isinf_f32(v0); + npyv_b32 b1 = _npyv_isinf_f32(v1); + npyv_b32 b2 = _npyv_isinf_f32(v2); + npyv_b32 b3 = _npyv_isinf_f32(v3); + npyv_b8 isinf = npyv_pack_b8_b32(b0, b1, b2, b3); + return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf)); +} +#endif // NPY_SIMD_F32 +#if NPY_SIMD_F64 +NPY_FINLINE npyv_b64 +_npyv_isinf_f64(npyv_f64 v) +{ +#if defined(NPY_HAVE_NEON) + // abs(v) > DBL_MAX + const npyv_f64 fltmax = npyv_setall_f64(DBL_MAX); + return vcagtq_f64(v, fltmax); +#else + // cast out the sign and check if all exponent bits are set. 
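+    // (for an IEEE-754 double, +-inf has all 11 exponent bits set and a zero
+    //  mantissa; shifting the sign bit out leaves exactly 0xffe0000000000000,
+    //  while any NaN keeps at least one mantissa bit set after the shift)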
+    const npyv_u64 exp_mask = npyv_setall_u64(0xffe0000000000000);
+    npyv_u64 bits = npyv_shli_u64(npyv_reinterpret_u64_f64(v), 1);
+    return npyv_cmpeq_u64(bits, exp_mask);
+#endif
+}
+NPY_FINLINE npyv_u64
+npyv_isinf_f64(npyv_f64 v)
+{
+    const npyv_u64 truemask = npyv_setall_u64(1==1);
+    return npyv_and_u64(truemask, npyv_cvt_u64_b64(_npyv_isinf_f64(v)));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isinf_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+                    npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+    const npyv_u8 truemask = npyv_setall_u8(1==1);
+    npyv_b64 b0 = _npyv_isinf_f64(v0);
+    npyv_b64 b1 = _npyv_isinf_f64(v1);
+    npyv_b64 b2 = _npyv_isinf_f64(v2);
+    npyv_b64 b3 = _npyv_isinf_f64(v3);
+    npyv_b64 b4 = _npyv_isinf_f64(v4);
+    npyv_b64 b5 = _npyv_isinf_f64(v5);
+    npyv_b64 b6 = _npyv_isinf_f64(v6);
+    npyv_b64 b7 = _npyv_isinf_f64(v7);
+    npyv_b8 isinf = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7);
+    return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf));
+}
+#endif // NPY_SIMD_F64
+
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_b32
+npyv_notfinite_f32(npyv_f32 v)
+{
+    // cast out the sign and check if all exponent bits are set,
+    // regardless of the mantissa.
+    const npyv_u32 exp_mask = npyv_setall_u32(0x7f800000);
+    npyv_u32 bits = npyv_reinterpret_u32_f32(v);
+    bits = npyv_and_u32(bits, exp_mask);
+    return npyv_cmpeq_u32(bits, exp_mask);
+}
+NPY_FINLINE npyv_u32
+npyv_isfinite_f32(npyv_f32 v)
+{
+    const npyv_u8 truemask = npyv_reinterpret_u8_u32(npyv_setall_u32(1==1));
+    npyv_u8 notfinite = npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notfinite_f32(v)));
+    return npyv_reinterpret_u32_u8(npyv_andc_u8(truemask, notfinite));
+}
+NPY_FINLINE npyv_u8
+npyv_pack_isfinite_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+    // The F32 exponent is 8 bits, which means we can pack multiple into
+    // a single vector. We shift out the sign bit so that we're left
+    // with only the exponent in the high byte. If not all bits are set,
+    // then we've got a finite number.
+    uint8x16x4_t tbl;
+    tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1));
+    tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1));
+    tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1));
+    tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1));
+
+    const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
+    npyv_u8 r = vqtbl4q_u8(tbl, permute);
+
+    const npyv_u8 expmask = npyv_setall_u8(0xff);
+    r = npyv_cmpneq_u8(r, expmask);
+    r = vshrq_n_u8(r, 7);
+    return r;
+#else
+    const npyv_u8 truemask = npyv_setall_u8(1==1);
+    npyv_b32 b0 = npyv_notfinite_f32(v0);
+    npyv_b32 b1 = npyv_notfinite_f32(v1);
+    npyv_b32 b2 = npyv_notfinite_f32(v2);
+    npyv_b32 b3 = npyv_notfinite_f32(v3);
+    npyv_b8 notfinite = npyv_pack_b8_b32(b0, b1, b2, b3);
+    return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite));
+#endif
+}
+#endif // NPY_SIMD_F32
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_b64
+npyv_notfinite_f64(npyv_f64 v)
+{
+    // cast out the sign and check if all exponent bits are set,
+    // regardless of the mantissa.
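+    // (0x7ff0000000000000 selects the 11 exponent bits of an f64; they are
+    //  all ones exactly for +-inf and NaN, i.e. for every non-finite value)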
+ const npyv_u64 exp_mask = npyv_setall_u64(0x7ff0000000000000); + npyv_u64 bits = npyv_reinterpret_u64_f64(v); + bits = npyv_and_u64(bits, exp_mask); + return npyv_cmpeq_u64(bits, exp_mask); +} +NPY_FINLINE npyv_u64 +npyv_isfinite_f64(npyv_f64 v) +{ + const npyv_u8 truemask = npyv_reinterpret_u8_u64(npyv_setall_u64(1==1)); + npyv_u8 notfinite = npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notfinite_f64(v))); + return npyv_reinterpret_u64_u8(npyv_andc_u8(truemask, notfinite)); +} +NPY_FINLINE npyv_u8 +npyv_pack_isfinite_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, + npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7) +{ +#if defined(NPY_HAVE_NEON) && defined(__aarch64__) + // F64 exponent is 11-bits, which means we can pack multiple into + // a single vector. We'll need to use u16 to fit all exponent + // bits. If not all bits are set, then we've got a finite number. + uint8x16x4_t t0123, t4567; + t0123.val[0] = npyv_reinterpret_u8_f64(v0); + t0123.val[1] = npyv_reinterpret_u8_f64(v1); + t0123.val[2] = npyv_reinterpret_u8_f64(v2); + t0123.val[3] = npyv_reinterpret_u8_f64(v3); + t4567.val[0] = npyv_reinterpret_u8_f64(v4); + t4567.val[1] = npyv_reinterpret_u8_f64(v5); + t4567.val[2] = npyv_reinterpret_u8_f64(v6); + t4567.val[3] = npyv_reinterpret_u8_f64(v7); + + const npyv_u8 permute = {6,7,14,15, 22,23,30,31, 38,39,46,47, 54,55,62,63}; + npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute)); + npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute)); + + const npyv_u16 expmask = npyv_setall_u16(0x7ff0); + r0 = npyv_and_u16(r0, expmask); + r0 = npyv_cmpneq_u16(r0, expmask); + r0 = npyv_shri_u16(r0, 15); + r1 = npyv_and_u16(r1, expmask); + r1 = npyv_cmpneq_u16(r1, expmask); + r1 = npyv_shri_u16(r1, 15); + + npyv_u8 r = npyv_pack_b8_b16(r0, r1); + return r; +#else + const npyv_u8 truemask = npyv_setall_u8(1==1); + npyv_b64 b0 = npyv_notfinite_f64(v0); + npyv_b64 b1 = npyv_notfinite_f64(v1); + npyv_b64 b2 = npyv_notfinite_f64(v2); + npyv_b64 b3 = npyv_notfinite_f64(v3); + npyv_b64 b4 = npyv_notfinite_f64(v4); + npyv_b64 b5 = npyv_notfinite_f64(v5); + npyv_b64 b6 = npyv_notfinite_f64(v6); + npyv_b64 b7 = npyv_notfinite_f64(v7); + npyv_b8 notfinite = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7); + return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite)); +#endif +} +#endif // NPY_SIMD_F64 + +#if NPY_SIMD_F32 +NPY_FINLINE npyv_u32 +npyv_signbit_f32(npyv_f32 v) +{ + return npyv_shri_u32(npyv_reinterpret_u32_f32(v), (sizeof(npyv_lanetype_f32)*8)-1); +} +NPY_FINLINE npyv_u8 +npyv_pack_signbit_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) +{ +#if defined(NPY_HAVE_NEON) && defined(__aarch64__) + // We only need high byte for signbit, which means we can pack + // multiple inputs into a single vector. 
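+    // (each permute index 3,7,11,... below selects the top byte of one f32
+    //  lane, so a single vqtbl4q_u8 gathers all 16 sign-carrying bytes)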
+    uint8x16x4_t tbl;
+    tbl.val[0] = npyv_reinterpret_u8_f32(v0);
+    tbl.val[1] = npyv_reinterpret_u8_f32(v1);
+    tbl.val[2] = npyv_reinterpret_u8_f32(v2);
+    tbl.val[3] = npyv_reinterpret_u8_f32(v3);
+
+    const npyv_u8 permute = {3,7,11,15, 19,23,27,31, 35,39,43,47, 51,55,59,63};
+    npyv_u8 r = vqtbl4q_u8(tbl, permute);
+    r = vshrq_n_u8(r, 7);
+    return r;
+#else
+    npyv_b32 b0 = npyv_cvt_b32_u32(npyv_signbit_f32(v0));
+    npyv_b32 b1 = npyv_cvt_b32_u32(npyv_signbit_f32(v1));
+    npyv_b32 b2 = npyv_cvt_b32_u32(npyv_signbit_f32(v2));
+    npyv_b32 b3 = npyv_cvt_b32_u32(npyv_signbit_f32(v3));
+    npyv_b8 signbit = npyv_pack_b8_b32(b0, b1, b2, b3);
+    return npyv_cvt_u8_b8(signbit);
+#endif
+}
+#endif // NPY_SIMD_F32
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_u64
+npyv_signbit_f64(npyv_f64 v)
+{
+    return npyv_shri_u64(npyv_reinterpret_u64_f64(v), (sizeof(npyv_lanetype_f64)*8)-1);
+}
+NPY_FINLINE npyv_u8
+npyv_pack_signbit_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+                      npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+    // We only need high byte for signbit, which means we can pack
+    // multiple inputs into a single vector.
+
+    // vuzp2 faster than vtbl for f64
+    npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1));
+    npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3));
+    npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5));
+    npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7));
+
+    npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23));
+    npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67));
+
+    npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567));
+    r = vshrq_n_u8(r, 7);
+    return r;
+#else
+    npyv_b64 b0 = npyv_cvt_b64_u64(npyv_signbit_f64(v0));
+    npyv_b64 b1 = npyv_cvt_b64_u64(npyv_signbit_f64(v1));
+    npyv_b64 b2 = npyv_cvt_b64_u64(npyv_signbit_f64(v2));
+    npyv_b64 b3 = npyv_cvt_b64_u64(npyv_signbit_f64(v3));
+    npyv_b64 b4 = npyv_cvt_b64_u64(npyv_signbit_f64(v4));
+    npyv_b64 b5 = npyv_cvt_b64_u64(npyv_signbit_f64(v5));
+    npyv_b64 b6 = npyv_cvt_b64_u64(npyv_signbit_f64(v6));
+    npyv_b64 b7 = npyv_cvt_b64_u64(npyv_signbit_f64(v7));
+    npyv_b8 signbit = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7);
+    return npyv_cvt_u8_b8(signbit);
+#endif
+}
+#endif // NPY_SIMD_F64
+
+#endif // NPY_SIMD
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/** Notes:
+ * - avoid the use of libmath to unify fp/domain errors
+ *   for both scalars and vectors among all compilers/architectures.
+ * - use the intrinsics npyv_load_till_* instead of npyv_load_tillz_*
+ *   to fill the remaining lanes with 1.0, avoiding divide-by-zero fp
+ *   exceptions in reciprocal.
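+ *   (the 1.0 fill values only exist in the tail lanes of the vector; the
+ *   matching npyv_store_till_* write-back stores just the valid elements,
+ *   so the dummy results are never visible in the output)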
+ */ +#define CONTIG 0 +#define NCONTIG 1 + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + * #sfx = f32, f64# + * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# + * #ssfx = 32, 64# + */ +#if @VCHK@ +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit# + */ +/**begin repeat2 + * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG# + * #DTYPE = CONTIG, CONTIG, NCONTIG, NCONTIG# + */ +static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@ +(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len) +{ + const npyv_lanetype_@sfx@ *ip = src; + npy_bool *op = dst; + + // How many vectors can be packed into a u8 / bool vector? + #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_@sfx@) + assert(PACK_FACTOR == 4 || PACK_FACTOR == 8); + + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * PACK_FACTOR; + + // unrolled iterations + for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { + // Load vectors + #if @STYPE@ == CONTIG + // contiguous input + npyv_@sfx@ v0 = npyv_load_@sfx@(ip + vstep * 0); + npyv_@sfx@ v1 = npyv_load_@sfx@(ip + vstep * 1); + npyv_@sfx@ v2 = npyv_load_@sfx@(ip + vstep * 2); + npyv_@sfx@ v3 = npyv_load_@sfx@(ip + vstep * 3); + #if PACK_FACTOR == 8 + npyv_@sfx@ v4 = npyv_load_@sfx@(ip + vstep * 4); + npyv_@sfx@ v5 = npyv_load_@sfx@(ip + vstep * 5); + npyv_@sfx@ v6 = npyv_load_@sfx@(ip + vstep * 6); + npyv_@sfx@ v7 = npyv_load_@sfx@(ip + vstep * 7); + #endif + #else + // non-contiguous input + npyv_@sfx@ v0 = npyv_loadn_@sfx@(ip + istride * vstep * 0, istride); + npyv_@sfx@ v1 = npyv_loadn_@sfx@(ip + istride * vstep * 1, istride); + npyv_@sfx@ v2 = npyv_loadn_@sfx@(ip + istride * vstep * 2, istride); + npyv_@sfx@ v3 = npyv_loadn_@sfx@(ip + istride * vstep * 3, istride); + #if PACK_FACTOR == 8 + npyv_@sfx@ v4 = npyv_loadn_@sfx@(ip + istride * vstep * 4, istride); + npyv_@sfx@ v5 = npyv_loadn_@sfx@(ip + istride * vstep * 5, istride); + npyv_@sfx@ v6 = npyv_loadn_@sfx@(ip + istride * vstep * 6, istride); + npyv_@sfx@ v7 = npyv_loadn_@sfx@(ip + istride * vstep * 7, istride); + #endif + #endif + + #if PACK_FACTOR == 4 + npyv_u8 r = npyv_pack_@kind@_@sfx@(v0, v1, v2, v3); + #elif PACK_FACTOR == 8 + npyv_u8 r = npyv_pack_@kind@_@sfx@(v0, v1, v2, v3, v4, v5, v6, v7); + #endif + + #if @DTYPE@ == CONTIG + npyv_store_u8(op, r); + #else // @DTYPE@ == CONTIG + // Results are packed, so we can just loop over them + npy_uint8 lane[npyv_nlanes_u8]; + npyv_store_u8(lane, r); + for (int ln=0; (ln * sizeof(npyv_lanetype_@sfx@)) < npyv_nlanes_u8; ++ln){ + op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_@sfx@)]; + } + #endif // @DTYPE@ == CONTIG + } + + // vector-sized iterations + for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) { + #if @STYPE@ == CONTIG + npyv_@sfx@ v = npyv_load_@sfx@(ip); + #else + npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride); + #endif + + npyv_u@ssfx@ r = npyv_@kind@_@sfx@(v); + + npy_uint8 lane[npyv_nlanes_u8]; + npyv_store_u8(lane, npyv_reinterpret_u8_u@ssfx@(r)); + + op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)]; + op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)]; + #if npyv_nlanes_@sfx@ == 4 + op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)]; + op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)]; + #endif + } + + #undef PACK_FACTOR + + // Scalar loop to finish off + for (; len > 0; --len, ip += istride, op += ostride) { + *op = (npy_@kind@(*ip) != 0); + } + + npyv_cleanup(); +} +/**end repeat2**/ +/**end repeat1**/ + +#endif // @VCHK@ +/**end repeat**/ + 
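+/*
+ * A rough sketch of what the kernels above expand to, assuming 128-bit SIMD
+ * and contiguous f32 data: one unrolled iteration loads 16 floats and stores
+ * 16 boolean bytes, e.g. for isnan
+ *
+ *     npyv_f32 v0 = npyv_load_f32(ip + 0);
+ *     npyv_f32 v1 = npyv_load_f32(ip + 4);
+ *     npyv_f32 v2 = npyv_load_f32(ip + 8);
+ *     npyv_f32 v3 = npyv_load_f32(ip + 12);
+ *     npyv_store_u8(op, npyv_pack_isnan_f32(v0, v1, v2, v3));
+ *
+ * with the strided variants swapping in npyv_loadn_f32 and a byte-wise
+ * scatter of the packed result.
+ */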
+/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + * #sfx = f32, f64# + * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# + */ + +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ +#if @VCHK@ + const char *ip = args[0]; + char *op = args[1]; + const npy_intp istep = steps[0]; + const npy_intp ostep = steps[1]; + npy_intp len = dimensions[0]; + const int ilsize = sizeof(npyv_lanetype_@sfx@); + const int olsize = sizeof(npy_bool); + const npy_intp istride = istep / ilsize; + const npy_intp ostride = ostep / olsize; + assert(len <= 1 || ostep % olsize == 0); + + if ((istep % ilsize == 0) && + !is_mem_overlap(ip, istep, op, ostep, len) && + npyv_loadable_stride_@sfx@(istride) && + npyv_storable_stride_@sfx@(ostride)) + { + if (istride == 1 && ostride == 1) { + simd_unary_@kind@_@TYPE@_CONTIG_CONTIG(ip, 1, op, 1, len); + } + else if (ostride == 1) { + simd_unary_@kind@_@TYPE@_NCONTIG_CONTIG(ip, istride, op, 1, len); + } + else if (istride == 1) { + simd_unary_@kind@_@TYPE@_CONTIG_NCONTIG(ip, 1, op, ostride, len); + } else { + simd_unary_@kind@_@TYPE@_NCONTIG_NCONTIG(ip, istride, op, ostride, len); + } + } else +#endif // @VCHK@ + { + UNARY_LOOP { + const npyv_lanetype_@sfx@ in = *(npyv_lanetype_@sfx@ *)ip1; + *((npy_bool *)op1) = (npy_@kind@(in) != 0); + } + } + + npy_clear_floatstatus_barrier((char*)dimensions); +} +/**end repeat1**/ +/**end repeat**/ diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c index d247c2639..167164163 100644 --- a/numpy/core/src/umath/override.c +++ b/numpy/core/src/umath/override.c @@ -23,18 +23,19 @@ * Returns -1 on failure. */ static int -get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args, +get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj, PyObject **with_override, PyObject **methods) { int i; int num_override_args = 0; - int narg, nout; + int narg, nout, nwhere; narg = (int)PyTuple_GET_SIZE(in_args); /* It is valid for out_args to be NULL: */ nout = (out_args != NULL) ? (int)PyTuple_GET_SIZE(out_args) : 0; + nwhere = (wheremask_obj != NULL) ? 1: 0; - for (i = 0; i < narg + nout; ++i) { + for (i = 0; i < narg + nout + nwhere; ++i) { PyObject *obj; int j; int new_class = 1; @@ -42,9 +43,12 @@ get_array_ufunc_overrides(PyObject *in_args, PyObject *out_args, if (i < narg) { obj = PyTuple_GET_ITEM(in_args, i); } - else { + else if (i < narg + nout){ obj = PyTuple_GET_ITEM(out_args, i - narg); } + else { + obj = wheremask_obj; + } /* * Have we seen this class before? If so, ignore. 
*/ @@ -208,7 +212,7 @@ copy_positional_args_to_kwargs(const char **keywords, */ NPY_NO_EXPORT int PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method, - PyObject *in_args, PyObject *out_args, + PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj, PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames, PyObject **result) { @@ -227,7 +231,7 @@ PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method, * Check inputs for overrides */ num_override_args = get_array_ufunc_overrides( - in_args, out_args, with_override, array_ufunc_methods); + in_args, out_args, wheremask_obj, with_override, array_ufunc_methods); if (num_override_args == -1) { goto fail; } diff --git a/numpy/core/src/umath/override.h b/numpy/core/src/umath/override.h index 4e9a323ca..20621bb19 100644 --- a/numpy/core/src/umath/override.h +++ b/numpy/core/src/umath/override.h @@ -6,7 +6,7 @@ NPY_NO_EXPORT int PyUFunc_CheckOverride(PyUFuncObject *ufunc, char *method, - PyObject *in_args, PyObject *out_args, + PyObject *in_args, PyObject *out_args, PyObject *wheremask_obj, PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames, PyObject **result); diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c index 817f99a04..9416e9a29 100644 --- a/numpy/core/src/umath/reduction.c +++ b/numpy/core/src/umath/reduction.c @@ -17,6 +17,9 @@ #include "numpy/arrayobject.h" #include "npy_pycompat.h" +#include "array_assign.h" +#include "array_coercion.h" +#include "array_method.h" #include "ctors.h" #include "numpy/ufuncobject.h" @@ -148,24 +151,15 @@ PyArray_CopyInitialReduceValues( * context : The ArrayMethod context (with ufunc, method, and descriptors). * operand : The array to be reduced. * out : NULL, or the array into which to place the result. - * wheremask : NOT YET SUPPORTED, but this parameter is placed here - * so that support can be added in the future without breaking - * API compatibility. Pass in NULL. + * wheremask : Reduction mask of valid values used for `where=`. * axis_flags : Flags indicating the reduction axes of 'operand'. - * reorderable : If True, the reduction being done is reorderable, which - * means specifying multiple axes of reduction at once is ok, - * and the reduction code may calculate the reduction in an - * arbitrary order. The calculation may be reordered because - * of cache behavior or multithreading requirements. * keepdims : If true, leaves the reduction dimensions in the result * with size one. * subok : If true, the result uses the subclass of operand, otherwise * it is always a base class ndarray. - * identity : If Py_None, PyArray_CopyInitialReduceValues is used, otherwise - * this value is used to initialize the result to - * the reduction's unit. + * initial : Initial value, if NULL the default is fetched from the + * ArrayMethod (typically as the default from the ufunc). * loop : `reduce_loop` from `ufunc_object.c`. TODO: Refactor - * data : Data which is passed to the inner loop. * buffersize : Buffer size for the iterator. For the default, pass in 0. * funcname : The name of the reduction function, for error messages. 
* errormask : forwarded from _get_bufsize_errmask @@ -182,9 +176,9 @@ PyArray_CopyInitialReduceValues( NPY_NO_EXPORT PyArrayObject * PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, PyArrayObject *operand, PyArrayObject *out, PyArrayObject *wheremask, - npy_bool *axis_flags, int reorderable, int keepdims, - PyObject *identity, PyArray_ReduceLoopFunc *loop, - void *data, npy_intp buffersize, const char *funcname, int errormask) + npy_bool *axis_flags, int keepdims, + PyObject *initial, PyArray_ReduceLoopFunc *loop, + npy_intp buffersize, const char *funcname, int errormask) { assert(loop != NULL); PyArrayObject *result = NULL; @@ -198,38 +192,35 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, /* Loop auxdata (must be freed on error) */ NpyAuxData *auxdata = NULL; - /* More than one axis means multiple orders are possible */ - if (!reorderable && count_axes(PyArray_NDIM(operand), axis_flags) > 1) { - PyErr_Format(PyExc_ValueError, - "reduction operation '%s' is not reorderable, " - "so at most one axis may be specified", - funcname); - return NULL; - } - /* Can only use where with an initial ( from identity or argument) */ - if (wheremask != NULL && identity == Py_None) { - PyErr_Format(PyExc_ValueError, - "reduction operation '%s' does not have an identity, " - "so to use a where mask one has to specify 'initial'", - funcname); - return NULL; - } - - /* Set up the iterator */ op[0] = out; op[1] = operand; op_dtypes[0] = context->descriptors[0]; op_dtypes[1] = context->descriptors[1]; + /* Buffer to use when we need an initial value */ + char *initial_buf = NULL; + + /* More than one axis means multiple orders are possible */ + if (!(context->method->flags & NPY_METH_IS_REORDERABLE) + && count_axes(PyArray_NDIM(operand), axis_flags) > 1) { + PyErr_Format(PyExc_ValueError, + "reduction operation '%s' is not reorderable, " + "so at most one axis may be specified", + funcname); + goto fail; + } + it_flags = NPY_ITER_BUFFERED | NPY_ITER_EXTERNAL_LOOP | NPY_ITER_GROWINNER | - NPY_ITER_DONT_NEGATE_STRIDES | NPY_ITER_ZEROSIZE_OK | NPY_ITER_REFS_OK | NPY_ITER_DELAY_BUFALLOC | NPY_ITER_COPY_IF_OVERLAP; + if (!(context->method->flags & NPY_METH_IS_REORDERABLE)) { + it_flags |= NPY_ITER_DONT_NEGATE_STRIDES; + } op_flags[0] = NPY_ITER_READWRITE | NPY_ITER_ALIGNED | NPY_ITER_ALLOCATE | @@ -297,8 +288,52 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, goto fail; } + npy_bool empty_iteration = NpyIter_GetIterSize(iter) == 0; result = NpyIter_GetOperandArray(iter)[0]; + /* + * Get the initial value (if it exists). If the iteration is empty + * then we assume the reduction is also empty. The reason is that when + * the outer iteration is empty we just won't use the initial value + * in any case. (`np.sum(np.zeros((0, 3)), axis=0)` is a length 3 + * reduction but has an empty result.) + */ + if ((initial == NULL && context->method->get_reduction_initial == NULL) + || initial == Py_None) { + /* There is no initial value, or initial value was explicitly unset */ + } + else { + /* Not all functions will need initialization, but init always: */ + initial_buf = PyMem_Calloc(1, op_dtypes[0]->elsize); + if (initial_buf == NULL) { + PyErr_NoMemory(); + goto fail; + } + if (initial != NULL) { + /* must use user provided initial value */ + if (PyArray_Pack(op_dtypes[0], initial_buf, initial) < 0) { + goto fail; + } + } + else { + /* + * Fetch initial from ArrayMethod, we pretend the reduction is + * empty when the iteration is. 
This may be wrong, but when it is, + * we will not need the identity as the result is also empty. + */ + int has_initial = context->method->get_reduction_initial( + context, empty_iteration, initial_buf); + if (has_initial < 0) { + goto fail; + } + if (!has_initial) { + /* We have no initial value available, free buffer to indicate */ + PyMem_FREE(initial_buf); + initial_buf = NULL; + } + } + } + PyArrayMethod_StridedLoop *strided_loop; NPY_ARRAYMETHOD_FLAGS flags = 0; @@ -313,12 +348,27 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, * Initialize the result to the reduction unit if possible, * otherwise copy the initial values and get a view to the rest. */ - if (identity != Py_None) { - if (PyArray_FillWithScalar(result, identity) < 0) { + if (initial_buf != NULL) { + /* Loop provided an identity or default value, assign to result. */ + int ret = raw_array_assign_scalar( + PyArray_NDIM(result), PyArray_DIMS(result), + PyArray_DESCR(result), + PyArray_BYTES(result), PyArray_STRIDES(result), + op_dtypes[0], initial_buf); + if (ret < 0) { goto fail; } } else { + /* Can only use where with an initial (from identity or argument) */ + if (wheremask != NULL) { + PyErr_Format(PyExc_ValueError, + "reduction operation '%s' does not have an identity, " + "so to use a where mask one has to specify 'initial'", + funcname); + return NULL; + } + /* * For 1-D skip_first_count could be optimized to 0, but no-identity * reductions are not super common. @@ -354,7 +404,7 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, } } - if (NpyIter_GetIterSize(iter) != 0) { + if (!empty_iteration) { NpyIter_IterNextFunc *iternext; char **dataptr; npy_intp *strideptr; @@ -387,6 +437,10 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, } Py_INCREF(result); + if (initial_buf != NULL && PyDataType_REFCHK(PyArray_DESCR(result))) { + PyArray_Item_XDECREF(initial_buf, PyArray_DESCR(result)); + } + PyMem_FREE(initial_buf); NPY_AUXDATA_FREE(auxdata); if (!NpyIter_Deallocate(iter)) { Py_DECREF(result); @@ -395,6 +449,10 @@ PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, return result; fail: + if (initial_buf != NULL && PyDataType_REFCHK(op_dtypes[0])) { + PyArray_Item_XDECREF(initial_buf, op_dtypes[0]); + } + PyMem_FREE(initial_buf); NPY_AUXDATA_FREE(auxdata); if (iter != NULL) { NpyIter_Deallocate(iter); diff --git a/numpy/core/src/umath/reduction.h b/numpy/core/src/umath/reduction.h index 2170e27a7..d2cbe4849 100644 --- a/numpy/core/src/umath/reduction.h +++ b/numpy/core/src/umath/reduction.h @@ -64,8 +64,8 @@ typedef int (PyArray_ReduceLoopFunc)(PyArrayMethod_Context *context, NPY_NO_EXPORT PyArrayObject * PyUFunc_ReduceWrapper(PyArrayMethod_Context *context, PyArrayObject *operand, PyArrayObject *out, PyArrayObject *wheremask, - npy_bool *axis_flags, int reorderable, int keepdims, - PyObject *identity, PyArray_ReduceLoopFunc *loop, - void *data, npy_intp buffersize, const char *funcname, int errormask); + npy_bool *axis_flags, int keepdims, + PyObject *initial, PyArray_ReduceLoopFunc *loop, + npy_intp buffersize, const char *funcname, int errormask); #endif diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src index 70fe963b9..a159fdc12 100644 --- a/numpy/core/src/umath/scalarmath.c.src +++ b/numpy/core/src/umath/scalarmath.c.src @@ -453,7 +453,7 @@ static inline int @name@_ctype_divide(@type@ a, @type@ b, @type@ *out) { char *args[3] = {(char *)&a, (char *)&b, (char *)out}; - npy_intp steps[3]; + npy_intp steps[3] = {0, 0, 0}; npy_intp size = 1; 
@TYPE@_divide(args, &size, steps, NULL); return 0; @@ -806,7 +806,7 @@ typedef enum { */ CONVERT_PYSCALAR, /* - * Other object is an unkown scalar or array-like, we (typically) use + * Other object is an unknown scalar or array-like, we (typically) use * the generic path, which normally ends up in the ufunc machinery. */ OTHER_IS_UNKNOWN_OBJECT, @@ -1004,7 +1004,7 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring) if (overflow) { /* handle as if "unsafe" */ if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) { - return PROMOTION_REQUIRED; + return OTHER_IS_UNKNOWN_OBJECT; } return CONVERT_PYSCALAR; } @@ -1179,6 +1179,11 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring) * (Half, Float, Double, LongDouble, * CFloat, CDouble, CLongDouble)*4, * (Half, Float, Double, LongDouble)*3# + * #NAME = (BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG)*12, + * (HALF, FLOAT, DOUBLE, LONGDOUBLE, + * CFLOAT, CDOUBLE, CLONGDOUBLE)*4, + * (HALF, FLOAT, DOUBLE, LONGDOUBLE)*3# * #type = (npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, * npy_long, npy_ulong, npy_longlong, npy_ulonglong)*12, * (npy_half, npy_float, npy_double, npy_longdouble, @@ -1202,24 +1207,12 @@ convert_to_@name@(PyObject *value, @type@ *result, npy_bool *may_need_deferring) * (npy_half, npy_float, npy_double, npy_longdouble, * npy_cfloat, npy_cdouble, npy_clongdouble)*4, * (npy_half, npy_float, npy_double, npy_longdouble)*3# - * #oname = (byte, ubyte, short, ushort, int, uint, - * long, ulong, longlong, ulonglong)*11, - * double*10, - * (half, float, double, longdouble, - * cfloat, cdouble, clongdouble)*4, - * (half, float, double, longdouble)*3# * #OName = (Byte, UByte, Short, UShort, Int, UInt, * Long, ULong, LongLong, ULongLong)*11, * Double*10, * (Half, Float, Double, LongDouble, * CFloat, CDouble, CLongDouble)*4, * (Half, Float, Double, LongDouble)*3# - * #ONAME = (BYTE, UBYTE, SHORT, USHORT, INT, UINT, - * LONG, ULONG, LONGLONG, ULONGLONG)*11, - * DOUBLE*10, - * (HALF, FLOAT, DOUBLE, LONGDOUBLE, - * CFLOAT, CDOUBLE, CLONGDOUBLE)*4, - * (HALF, FLOAT, DOUBLE, LONGDOUBLE)*3# */ #define IS_@name@ /* drop the "true_" from "true_divide" for floating point warnings: */ @@ -1234,7 +1227,7 @@ static PyObject * @name@_@oper@(PyObject *a, PyObject *b) { PyObject *ret; - @otype@ arg1, arg2, other_val; + @type@ arg1, arg2, other_val; /* * Check if this operation may be considered forward. Note `is_forward` @@ -1263,7 +1256,7 @@ static PyObject * PyObject *other = is_forward ? 
b : a; npy_bool may_need_deferring; - conversion_result res = convert_to_@oname@( + conversion_result res = convert_to_@name@( other, &other_val, &may_need_deferring); if (res == CONVERSION_ERROR) { return NULL; /* an error occurred (should never happen) */ @@ -1305,7 +1298,7 @@ static PyObject * */ return PyGenericArrType_Type.tp_as_number->nb_@oper@(a,b); case CONVERT_PYSCALAR: - if (@ONAME@_setitem(other, (char *)&other_val, NULL) < 0) { + if (@NAME@_setitem(other, (char *)&other_val, NULL) < 0) { return NULL; } break; @@ -1345,7 +1338,7 @@ static PyObject * #if @twoout@ int retstatus = @name@_ctype_@oper@(arg1, arg2, &out, &out2); #else - int retstatus = @oname@_ctype_@oper@(arg1, arg2, &out); + int retstatus = @name@_ctype_@oper@(arg1, arg2, &out); #endif #if @fperr@ diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src deleted file mode 100644 index 6fc1501c9..000000000 --- a/numpy/core/src/umath/simd.inc.src +++ /dev/null @@ -1,1148 +0,0 @@ - - -/* - * This file is for the definitions of simd vectorized operations. - * - * Currently contains sse2 functions that are built on amd64, x32 or - * non-generic builds (CFLAGS=-march=...) - * In future it may contain other instruction sets like AVX or NEON detected - * at runtime in which case it needs to be included indirectly via a file - * compiled with special options (or use gcc target attributes) so the binary - * stays portable. - */ - - -#ifndef __NPY_SIMD_INC -#define __NPY_SIMD_INC - -#include "lowlevel_strided_loops.h" -#include "numpy/npy_common.h" -#include "numpy/npy_math.h" -#include "npy_simd_data.h" -#ifdef NPY_HAVE_SSE2_INTRINSICS -#include <emmintrin.h> -#if !defined(_MSC_VER) || _MSC_VER >= 1600 -#include <immintrin.h> -#else -#undef __AVX2__ -#undef __AVX512F__ -#endif -#endif -#include "loops_utils.h" // nomemoverlap -#include <assert.h> -#include <stdlib.h> -#include <float.h> -#include <string.h> /* for memcpy */ - -#define VECTOR_SIZE_BYTES 16 - -/* - * Dispatcher functions - * decide whether the operation can be vectorized and run it - * if it was run returns true and false if nothing was done - */ - -/* - ***************************************************************************** - ** CMPLX DISPATCHERS - ***************************************************************************** - */ - -/**begin repeat - * #TYPE = CFLOAT, CDOUBLE# - * #type= npy_float, npy_double# - * #esize = 8, 16# - */ - -/**begin repeat1 - * #func = square, absolute, conjugate# - * #outsize = 1, 2, 1# - * #max_stride = 2, 8, 8# - */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static inline NPY_GCC_TARGET_AVX512F void -AVX512F_@func@_@TYPE@(@type@*, @type@*, const npy_intp n, const npy_intp stride); -#endif - -static inline int -run_unary_avx512f_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *steps) -{ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS - if ((IS_OUTPUT_BLOCKABLE_UNARY(@esize@, (npy_uint)(@esize@/@outsize@), 64)) && (labs(steps[0]) < 2*@max_stride@*@esize@)) { - AVX512F_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0], steps[0]); - return 1; - } - else - return 0; -#endif - return 0; -} - -/**end repeat1**/ -/**end repeat**/ - -/* - ***************************************************************************** - ** FLOAT DISPATCHERS - ***************************************************************************** - */ - -/**begin repeat - * #type = npy_float, 
npy_double, npy_longdouble# - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #EXISTS = 1, 1, 0# - */ - -/**begin repeat1 - * #func = isnan, isfinite, isinf, signbit# - */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@ -static inline NPY_GCC_TARGET_AVX512_SKX void -AVX512_SKX_@func@_@TYPE@(npy_bool*, @type@*, const npy_intp n, const npy_intp stride); -#endif - -static inline int -run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@ - if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(@type@), sizeof(npy_bool), 64)) { - AVX512_SKX_@func@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0], steps[0]); - return 1; - } - else { - return 0; - } -#endif - return 0; -} - - -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * Float types - * #type = npy_float, npy_double, npy_longdouble# - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #vector = 1, 1, 0# - * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 # - */ -/**begin repeat1 - * #kind = isnan, isfinite, isinf, signbit# - */ -#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - -static void -sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n); - -#endif - -static inline int -run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - if (steps[0] == sizeof(@type@) && steps[1] == 1 && - npy_is_aligned(args[0], sizeof(@type@))) { - sse2_@kind@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0]); - return 1; - } -#endif - return 0; -} -/**end repeat1**/ -/**end repeat**/ - -/* - ***************************************************************************** - ** BOOL DISPATCHERS - ***************************************************************************** - */ - -/**begin repeat - * # kind = logical_or, logical_and# - */ - -#if defined NPY_HAVE_SSE2_INTRINSICS -static void -sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, - npy_intp n); - -static void -sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n); -#endif - -static inline int -run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && - IS_BLOCKABLE_BINARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { - sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0], - (npy_bool*)args[1], dimensions[0]); - return 1; - } -#endif - return 0; -} - - -static inline int -run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && - IS_BLOCKABLE_REDUCE(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { - sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1], - dimensions[0]); - return 1; - } -#endif - return 0; -} - -/**end repeat**/ - -/**begin repeat - * # kind = absolute, logical_not# - */ - -#if defined NPY_HAVE_SSE2_INTRINSICS -static void -sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n); -#endif - -static inline int -run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && - IS_BLOCKABLE_UNARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { - sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]); - return 1; - } -#endif - return 0; -} - -/**end 
repeat**/ - -#ifdef NPY_HAVE_SSE2_INTRINSICS - -/* - * Vectorized operations - */ -/* - ***************************************************************************** - ** FLOAT LOOPS - ***************************************************************************** - */ - -/**begin repeat -* horizontal reductions on a vector -* # VOP = min, max# -*/ - -NPY_FINLINE npy_float sse2_horizontal_@VOP@___m128(__m128 v) -{ - npy_float r; - __m128 tmp = _mm_movehl_ps(v, v); /* c d ... */ - __m128 m = _mm_@VOP@_ps(v, tmp); /* m(ac) m(bd) ... */ - tmp = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));/* m(bd) m(bd) ... */ - _mm_store_ss(&r, _mm_@VOP@_ps(tmp, m)); /* m(acbd) ... */ - return r; -} - -NPY_FINLINE npy_double sse2_horizontal_@VOP@___m128d(__m128d v) -{ - npy_double r; - __m128d tmp = _mm_unpackhi_pd(v, v); /* b b */ - _mm_store_sd(&r, _mm_@VOP@_pd(tmp, v)); /* m(ab) m(bb) */ - return r; -} -/**end repeat**/ - -/**begin repeat - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #scalarf = npy_sqrtf, npy_sqrt# - * #c = f, # - * #vtype = __m128, __m128d# - * #vtype256 = __m256, __m256d# - * #vtype512 = __m512, __m512d# - * #vpre = _mm, _mm# - * #vpre256 = _mm256, _mm256# - * #vpre512 = _mm512, _mm512# - * #vsuf = ps, pd# - * #vsufs = ss, sd# - * #nan = NPY_NANF, NPY_NAN# - * #double = 0, 1# - * #cast = _mm_castps_si128, _mm_castpd_si128# - */ -/* - * compress 4 vectors to 4/8 bytes in op with filled with 0 or 1 - * the last vector is passed as a pointer as MSVC 2010 is unable to ignore the - * calling convention leading to C2719 on 32 bit, see #4795 - */ -NPY_FINLINE void -sse2_compress4_to_byte_@TYPE@(@vtype@ r1, @vtype@ r2, @vtype@ r3, @vtype@ * r4, - npy_bool * op) -{ - const __m128i mask = @vpre@_set1_epi8(0x1); - __m128i ir1 = @vpre@_packs_epi32(@cast@(r1), @cast@(r2)); - __m128i ir2 = @vpre@_packs_epi32(@cast@(r3), @cast@(*r4)); - __m128i rr = @vpre@_packs_epi16(ir1, ir2); -#if @double@ - rr = @vpre@_packs_epi16(rr, rr); - rr = @vpre@_and_si128(rr, mask); - @vpre@_storel_epi64((__m128i*)op, rr); -#else - rr = @vpre@_and_si128(rr, mask); - @vpre@_storeu_si128((__m128i*)op, rr); -#endif -} - -static void -sse2_signbit_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) { - op[i] = npy_signbit(ip1[i]) != 0; - } - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i]); - int r = @vpre@_movemask_@vsuf@(a); - if (sizeof(@type@) == 8) { - op[i] = r & 1; - op[i + 1] = (r >> 1); - } - else { - op[i] = r & 1; - op[i + 1] = (r >> 1) & 1; - op[i + 2] = (r >> 2) & 1; - op[i + 3] = (r >> 3); - } - } - LOOP_BLOCKED_END { - op[i] = npy_signbit(ip1[i]) != 0; - } -} - -/**begin repeat1 - * #kind = isnan, isfinite, isinf# - * #var = 0, 1, 2# - */ - -static void -sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n) -{ -#if @var@ != 0 /* isinf/isfinite */ - /* signbit mask 0x7FFFFFFF after andnot */ - const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@); - const @vtype@ ones = @vpre@_cmpeq_@vsuf@(@vpre@_setzero_@vsuf@(), - @vpre@_setzero_@vsuf@()); -#if @double@ - const @vtype@ fltmax = @vpre@_set1_@vsuf@(DBL_MAX); -#else - const @vtype@ fltmax = @vpre@_set1_@vsuf@(FLT_MAX); -#endif -#endif - LOOP_BLOCK_ALIGN_VAR(ip1, @type@, VECTOR_SIZE_BYTES) { - op[i] = npy_@kind@(ip1[i]) != 0; - } - LOOP_BLOCKED(@type@, 4 * VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip1[i + 0 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ b = @vpre@_load_@vsuf@(&ip1[i + 1 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ c = 
@vpre@_load_@vsuf@(&ip1[i + 2 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ d = @vpre@_load_@vsuf@(&ip1[i + 3 * VECTOR_SIZE_BYTES / sizeof(@type@)]); - @vtype@ r1, r2, r3, r4; -#if @var@ != 0 /* isinf/isfinite */ - /* fabs via masking of sign bit */ - r1 = @vpre@_andnot_@vsuf@(mask, a); - r2 = @vpre@_andnot_@vsuf@(mask, b); - r3 = @vpre@_andnot_@vsuf@(mask, c); - r4 = @vpre@_andnot_@vsuf@(mask, d); -#if @var@ == 1 /* isfinite */ - /* negative compare against max float, nan is always true */ - r1 = @vpre@_cmpnle_@vsuf@(r1, fltmax); - r2 = @vpre@_cmpnle_@vsuf@(r2, fltmax); - r3 = @vpre@_cmpnle_@vsuf@(r3, fltmax); - r4 = @vpre@_cmpnle_@vsuf@(r4, fltmax); -#else /* isinf */ - r1 = @vpre@_cmpnlt_@vsuf@(fltmax, r1); - r2 = @vpre@_cmpnlt_@vsuf@(fltmax, r2); - r3 = @vpre@_cmpnlt_@vsuf@(fltmax, r3); - r4 = @vpre@_cmpnlt_@vsuf@(fltmax, r4); -#endif - /* flip results to what we want (andnot as there is no sse not) */ - r1 = @vpre@_andnot_@vsuf@(r1, ones); - r2 = @vpre@_andnot_@vsuf@(r2, ones); - r3 = @vpre@_andnot_@vsuf@(r3, ones); - r4 = @vpre@_andnot_@vsuf@(r4, ones); -#endif -#if @var@ == 0 /* isnan */ - r1 = @vpre@_cmpneq_@vsuf@(a, a); - r2 = @vpre@_cmpneq_@vsuf@(b, b); - r3 = @vpre@_cmpneq_@vsuf@(c, c); - r4 = @vpre@_cmpneq_@vsuf@(d, d); -#endif - sse2_compress4_to_byte_@TYPE@(r1, r2, r3, &r4, &op[i]); - } - LOOP_BLOCKED_END { - op[i] = npy_@kind@(ip1[i]) != 0; - } -} - -/**end repeat1**/ -/**end repeat**/ - -/* bunch of helper functions used in ISA_exp/log_FLOAT*/ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_get_full_load_mask_ps(void) -{ - return _mm256_set1_ps(-1.0); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i -fma_get_full_load_mask_pd(void) -{ - return _mm256_castpd_si256(_mm256_set1_pd(-1.0)); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes) -{ - float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0, - 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; - float* addr = maskint + num_lanes - num_elem; - return _mm256_loadu_ps(addr); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i -fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes) -{ - npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1}; - npy_int* addr = maskint + 2*num_lanes - 2*num_elem; - return _mm256_loadu_si256((__m256i*) addr); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_masked_gather_ps(__m256 src, - npy_float* addr, - __m256i vindex, - __m256 mask) -{ - return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d -fma_masked_gather_pd(__m256d src, - npy_double* addr, - __m128i vindex, - __m256d mask) -{ - return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_masked_load_ps(__m256 mask, npy_float* addr) -{ - return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask)); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d -fma_masked_load_pd(__m256i mask, npy_double* addr) -{ - return _mm256_maskload_pd(addr, mask); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask) -{ - return _mm256_blendv_ps(x, val, mask); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256d -fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask) -{ - return _mm256_blendv_pd(x, val, mask); -} - -NPY_FINLINE NPY_GCC_OPT_3 
NPY_GCC_TARGET_FMA __m256 -fma_blend(__m256 x, __m256 y, __m256 ymask) -{ - return _mm256_blendv_ps(x, y, ymask); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256 -fma_invert_mask_ps(__m256 ymask) -{ - return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0)); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA __m256i -fma_invert_mask_pd(__m256i ymask) -{ - return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF)); -} - -/**begin repeat - * #vsub = ps, pd# - * #vtype = __m256, __m256d# - */ -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ -fma_abs_@vsub@(@vtype@ x) -{ - return _mm256_andnot_@vsub@(_mm256_set1_@vsub@(-0.0), x); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ -fma_reciprocal_@vsub@(@vtype@ x) -{ - return _mm256_div_@vsub@(_mm256_set1_@vsub@(1.0f), x); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ -fma_rint_@vsub@(@vtype@ x) -{ - return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEAREST_INT); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ -fma_floor_@vsub@(@vtype@ x) -{ - return _mm256_round_@vsub@(x, _MM_FROUND_TO_NEG_INF); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_FMA @vtype@ -fma_trunc_@vsub@(@vtype@ x) -{ - return _mm256_round_@vsub@(x, _MM_FROUND_TO_ZERO); -} -/**end repeat**/ -#endif - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_get_full_load_mask_ps(void) -{ - return 0xFFFF; -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 -avx512_get_full_load_mask_pd(void) -{ - return 0xFF; -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem) -{ - return (0x0001 << num_elem) - 0x0001; -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 -avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem) -{ - return (0x01 << num_elem) - 0x01; -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_masked_gather_ps(__m512 src, - npy_float* addr, - __m512i vindex, - __mmask16 kmask) -{ - return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d -avx512_masked_gather_pd(__m512d src, - npy_double* addr, - __m256i vindex, - __mmask8 kmask) -{ - return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_masked_load_ps(__mmask16 mask, npy_float* addr) -{ - return _mm512_maskz_loadu_ps(mask, (__m512 *)addr); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d -avx512_masked_load_pd(__mmask8 mask, npy_double* addr) -{ - return _mm512_maskz_loadu_pd(mask, (__m512d *)addr); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask) -{ - return _mm512_mask_blend_ps(mask, x, val); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d -avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask) -{ - return _mm512_mask_blend_pd(mask, x, val); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512 -avx512_blend(__m512 x, __m512 y, __mmask16 ymask) -{ - return _mm512_mask_mov_ps(x, ymask, y); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16 -avx512_invert_mask_ps(__mmask16 ymask) -{ - return _mm512_knot(ymask); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask8 -avx512_invert_mask_pd(__mmask8 ymask) -{ - return _mm512_knot(ymask); -} - -/**begin repeat - * 
#vsub = ps, pd# - * #type= npy_float, npy_double# - * #epi_vsub = epi32, epi64# - * #vtype = __m512, __m512d# - * #mask = __mmask16, __mmask8# - * #and_const = 0x7fffffff, 0x7fffffffffffffffLL# - * #neg_mask = 0x80000000, 0x8000000000000000# - * #perm_ = 0xb1, 0x55# - * #cmpx_img_mask = 0xAAAA, 0xAA# - * #cmpx_re_mask = 0x5555, 0x55# - * #INF = NPY_INFINITYF, NPY_INFINITY# - * #NAN = NPY_NANF, NPY_NAN# - */ -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_abs_@vsub@(@vtype@ x) -{ - return (@vtype@) _mm512_and_@epi_vsub@((__m512i) x, - _mm512_set1_@epi_vsub@ (@and_const@)); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_reciprocal_@vsub@(@vtype@ x) -{ - return _mm512_div_@vsub@(_mm512_set1_@vsub@(1.0f), x); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_rint_@vsub@(@vtype@ x) -{ - return _mm512_roundscale_@vsub@(x, 0x08); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_floor_@vsub@(@vtype@ x) -{ - return _mm512_roundscale_@vsub@(x, 0x09); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_trunc_@vsub@(@vtype@ x) -{ - return _mm512_roundscale_@vsub@(x, 0x0B); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_hadd_@vsub@(const @vtype@ x) -{ - return _mm512_add_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_hsub_@vsub@(const @vtype@ x) -{ - return _mm512_sub_@vsub@(x, _mm512_permute_@vsub@(x, @perm_@)); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_cabsolute_@vsub@(const @vtype@ x1, - const @vtype@ x2, - const __m512i re_indices, - const __m512i im_indices) -{ - @vtype@ inf = _mm512_set1_@vsub@(@INF@); - @vtype@ nan = _mm512_set1_@vsub@(@NAN@); - @vtype@ x1_abs = avx512_abs_@vsub@(x1); - @vtype@ x2_abs = avx512_abs_@vsub@(x2); - @vtype@ re = _mm512_permutex2var_@vsub@(x1_abs, re_indices, x2_abs); - @vtype@ im = _mm512_permutex2var_@vsub@(x1_abs, im_indices , x2_abs); - /* - * If real or imag = INF, then convert it to inf + j*inf - * Handles: inf + j*nan, nan + j*inf - */ - @mask@ re_infmask = _mm512_cmp_@vsub@_mask(re, inf, _CMP_EQ_OQ); - @mask@ im_infmask = _mm512_cmp_@vsub@_mask(im, inf, _CMP_EQ_OQ); - im = _mm512_mask_mov_@vsub@(im, re_infmask, inf); - re = _mm512_mask_mov_@vsub@(re, im_infmask, inf); - - /* - * If real or imag = NAN, then convert it to nan + j*nan - * Handles: x + j*nan, nan + j*x - */ - @mask@ re_nanmask = _mm512_cmp_@vsub@_mask(re, re, _CMP_NEQ_UQ); - @mask@ im_nanmask = _mm512_cmp_@vsub@_mask(im, im, _CMP_NEQ_UQ); - im = _mm512_mask_mov_@vsub@(im, re_nanmask, nan); - re = _mm512_mask_mov_@vsub@(re, im_nanmask, nan); - - @vtype@ larger = _mm512_max_@vsub@(re, im); - @vtype@ smaller = _mm512_min_@vsub@(im, re); - - /* - * Calculate div_mask to prevent 0./0. and inf/inf operations in div - */ - @mask@ zeromask = _mm512_cmp_@vsub@_mask(larger, _mm512_setzero_@vsub@(), _CMP_EQ_OQ); - @mask@ infmask = _mm512_cmp_@vsub@_mask(smaller, inf, _CMP_EQ_OQ); - @mask@ div_mask = _mm512_knot(_mm512_kor(zeromask, infmask)); - @vtype@ ratio = _mm512_maskz_div_@vsub@(div_mask, smaller, larger); - @vtype@ hypot = _mm512_sqrt_@vsub@(_mm512_fmadd_@vsub@( - ratio, ratio, _mm512_set1_@vsub@(1.0f))); - return _mm512_mul_@vsub@(hypot, larger); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_conjugate_@vsub@(const @vtype@ x) -{ - /* - * __mm512_mask_xor_ps/pd requires AVX512DQ. We cast it to __m512i and - * use the xor_epi32/64 uinstruction instead. 
Cast is a zero latency instruction - */ - __m512i cast_x = _mm512_cast@vsub@_si512(x); - __m512i res = _mm512_mask_xor_@epi_vsub@(cast_x, @cmpx_img_mask@, - cast_x, _mm512_set1_@epi_vsub@(@neg_mask@)); - return _mm512_castsi512_@vsub@(res); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_cmul_@vsub@(@vtype@ x1, @vtype@ x2) -{ - // x1 = r1, i1 - // x2 = r2, i2 - @vtype@ x3 = _mm512_permute_@vsub@(x2, @perm_@); // i2, r2 - @vtype@ x12 = _mm512_mul_@vsub@(x1, x2); // r1*r2, i1*i2 - @vtype@ x13 = _mm512_mul_@vsub@(x1, x3); // r1*i2, r2*i1 - @vtype@ outreal = avx512_hsub_@vsub@(x12); // r1*r2 - i1*i2, r1*r2 - i1*i2 - @vtype@ outimg = avx512_hadd_@vsub@(x13); // r1*i2 + i1*r2, r1*i2 + i1*r2 - return _mm512_mask_blend_@vsub@(@cmpx_img_mask@, outreal, outimg); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@ -avx512_csquare_@vsub@(@vtype@ x) -{ - return avx512_cmul_@vsub@(x, x); -} - -/**end repeat**/ -#endif - -/**begin repeat - * #ISA = FMA, AVX512F# - * #isa = fma, avx512# - * #vtype = __m256, __m512# - * #vsize = 256, 512# - * #or = or_ps, kor# - * #vsub = , _mask# - * #mask = __m256, __mmask16# - * #fmadd = _mm256_fmadd_ps, _mm512_fmadd_ps# - * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS# - **/ - -#if defined @CHK@ - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ -@isa@_sqrt_ps(@vtype@ x) -{ - return _mm@vsize@_sqrt_ps(x); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d -@isa@_sqrt_pd(@vtype@d x) -{ - return _mm@vsize@_sqrt_pd(x); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@ -@isa@_square_ps(@vtype@ x) -{ - return _mm@vsize@_mul_ps(x,x); -} - -NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d -@isa@_square_pd(@vtype@d x) -{ - return _mm@vsize@_mul_pd(x,x); -} - -#endif -/**end repeat**/ - -/**begin repeat - * #type = npy_float, npy_double# - * #TYPE = FLOAT, DOUBLE# - * #num_lanes = 16, 8# - * #vsuffix = ps, pd# - * #mask = __mmask16, __mmask8# - * #vtype = __m512, __m512d# - * #scale = 4, 8# - * #vindextype = __m512i, __m256i# - * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256# - * #episize = epi32, epi64# - */ - -/**begin repeat1 - * #func = isnan, isfinite, isinf, signbit# - * #IMM8 = 0x81, 0x99, 0x18, 0x04# - * #is_finite = 0, 1, 0, 0# - * #is_signbit = 0, 0, 0, 1# - */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static inline NPY_GCC_TARGET_AVX512_SKX void -AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps) -{ - const npy_intp stride_ip = steps/(npy_intp)sizeof(@type@); - npy_intp num_remaining_elements = array_size; - - @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); -#if @is_signbit@ - @vtype@ signbit = _mm512_set1_@vsuffix@(-0.0); -#endif - - /* - * Note: while generally indices are npy_intp, we ensure that our maximum - * index will fit in an int32 as a precondition for this function via - * IS_OUTPUT_BLOCKABLE_UNARY - */ - - npy_int32 index_ip[@num_lanes@]; - for (npy_int32 ii = 0; ii < @num_lanes@; ii++) { - index_ip[ii] = ii*stride_ip; - } - @vindextype@ vindex_ip = @vindexload@((@vindextype@*)&index_ip[0]); - @vtype@ zeros_f = _mm512_setzero_@vsuffix@(); - __m512i ones = _mm512_set1_@episize@(1); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < @num_lanes@) { - load_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements, @num_lanes@); - } - @vtype@ x1; - if (stride_ip == 1) { - x1 = 
avx512_masked_load_@vsuffix@(load_mask, ip); - } - else { - x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip, vindex_ip, load_mask); - } -#if @is_signbit@ - x1 = _mm512_and_@vsuffix@(x1,signbit); -#endif - - @mask@ fpclassmask = _mm512_fpclass_@vsuffix@_mask(x1, @IMM8@); -#if @is_finite@ - fpclassmask = _mm512_knot(fpclassmask); -#endif - - __m128i out =_mm512_maskz_cvts@episize@_epi8(fpclassmask, ones); - _mm_mask_storeu_epi8(op, load_mask, out); - - ip += @num_lanes@*stride_ip; - op += @num_lanes@; - num_remaining_elements -= @num_lanes@; - } -} -#endif -/**end repeat1**/ -/**end repeat**/ - -/**begin repeat - * #TYPE = CFLOAT, CDOUBLE# - * #type = npy_float, npy_double# - * #num_lanes = 16, 8# - * #vsuffix = ps, pd# - * #epi_vsub = epi32, epi64# - * #mask = __mmask16, __mmask8# - * #vtype = __m512, __m512d# - * #scale = 4, 8# - * #vindextype = __m512i, __m256i# - * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256# - * #storemask = 0xFF, 0xF# - * #IS_FLOAT = 1, 0# - */ - -/**begin repeat1 - * #func = square, conjugate# - * #vectorf = avx512_csquare, avx512_conjugate# - */ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_GCC_OPT_3 inline NPY_GCC_TARGET_AVX512F void -AVX512F_@func@_@TYPE@(@type@ * op, - @type@ * ip, - const npy_intp array_size, - const npy_intp steps) -{ - npy_intp num_remaining_elements = 2*array_size; - const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2; - - /* - * Note: while generally indices are npy_intp, we ensure that our maximum index - * will fit in an int32 as a precondition for this function via max_stride - */ - npy_int32 index_ip1[16]; - for (npy_int32 ii = 0; ii < @num_lanes@; ii=ii+2) { - index_ip1[ii] = ii*stride_ip1; - index_ip1[ii+1] = ii*stride_ip1 + 1; - } - @vindextype@ vindex = @vindexload@((@vindextype@*)index_ip1); - @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@(); - @vtype@ zeros = _mm512_setzero_@vsuffix@(); - - while (num_remaining_elements > 0) { - if (num_remaining_elements < @num_lanes@) { - load_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements, @num_lanes@); - } - @vtype@ x1; - if (stride_ip1 == 1) { - x1 = avx512_masked_load_@vsuffix@(load_mask, ip); - } - else { - x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex, load_mask); - } - - @vtype@ out = @vectorf@_@vsuffix@(x1); - - _mm512_mask_storeu_@vsuffix@(op, load_mask, out); - op += @num_lanes@; - ip += @num_lanes@*stride_ip1; - num_remaining_elements -= @num_lanes@; - } -} -#endif -/**end repeat1**/ - -#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS -static NPY_GCC_OPT_3 inline NPY_GCC_TARGET_AVX512F void -AVX512F_absolute_@TYPE@(@type@ * op, - @type@ * ip, - const npy_intp array_size, - const npy_intp steps) -{ - npy_intp num_remaining_elements = 2*array_size; - const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2; - - /* - * Note: while generally indices are npy_intp, we ensure that our maximum index - * will fit in an int32 as a precondition for this function via max_stride - */ - npy_int32 index_ip[32]; - for (npy_int32 ii = 0; ii < 2*@num_lanes@; ii=ii+2) { - index_ip[ii] = ii*stride_ip1; - index_ip[ii+1] = ii*stride_ip1 + 1; - } - @vindextype@ vindex1 = @vindexload@((@vindextype@*)index_ip); - @vindextype@ vindex2 = @vindexload@((@vindextype@*)(index_ip+@num_lanes@)); - - @mask@ load_mask1 = avx512_get_full_load_mask_@vsuffix@(); - @mask@ load_mask2 = avx512_get_full_load_mask_@vsuffix@(); - @mask@ store_mask = 
avx512_get_full_load_mask_@vsuffix@(); - @vtype@ zeros = _mm512_setzero_@vsuffix@(); - -#if @IS_FLOAT@ - __m512i re_index = _mm512_set_epi32(30,28,26,24,22,20,18,16,14,12,10,8,6,4,2,0); - __m512i im_index = _mm512_set_epi32(31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1); -#else - __m512i re_index = _mm512_set_epi64(14,12,10,8,6,4,2,0); - __m512i im_index = _mm512_set_epi64(15,13,11,9,7,5,3,1); -#endif - - while (num_remaining_elements > 0) { - if (num_remaining_elements < @num_lanes@) { - load_mask1 = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements, @num_lanes@); - load_mask2 = 0x0000; - store_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements/2, @num_lanes@); - } else if (num_remaining_elements < 2*@num_lanes@) { - load_mask1 = avx512_get_full_load_mask_@vsuffix@(); - load_mask2 = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements - @num_lanes@, @num_lanes@); - store_mask = avx512_get_partial_load_mask_@vsuffix@( - num_remaining_elements/2, @num_lanes@); - } - @vtype@ x1, x2; - if (stride_ip1 == 1) { - x1 = avx512_masked_load_@vsuffix@(load_mask1, ip); - x2 = avx512_masked_load_@vsuffix@(load_mask2, ip+@num_lanes@); - } - else { - x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex1, load_mask1); - x2 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex2, load_mask2); - } - - @vtype@ out = avx512_cabsolute_@vsuffix@(x1, x2, re_index, im_index); - - _mm512_mask_storeu_@vsuffix@(op, store_mask, out); - op += @num_lanes@; - ip += 2*@num_lanes@*stride_ip1; - num_remaining_elements -= 2*@num_lanes@; - } - npy_clear_floatstatus_barrier((char*)&num_remaining_elements); -} - -#endif -/**end repeat**/ - -/* - ***************************************************************************** - ** BOOL LOOPS - ***************************************************************************** - */ - -/**begin repeat - * # kind = logical_or, logical_and# - * # and = 0, 1# - * # op = ||, &&# - * # sc = !=, ==# - * # vpre = _mm*2# - * # vsuf = si128*2# - * # vtype = __m128i*2# - * # type = npy_bool*2# - * # vload = _mm_load_si128*2# - * # vloadu = _mm_loadu_si128*2# - * # vstore = _mm_store_si128*2# - */ - -/* - * convert any bit set to boolean true so vectorized and normal operations are - * consistent, should not be required if bool is used correctly everywhere but - * you never know - */ -#if !@and@ -NPY_FINLINE @vtype@ byte_to_true(@vtype@ v) -{ - const @vtype@ zero = @vpre@_setzero_@vsuf@(); - const @vtype@ truemask = @vpre@_set1_epi8(1 == 1); - /* get 0xFF for zeros */ - @vtype@ tmp = @vpre@_cmpeq_epi8(v, zero); - /* filled with 0xFF/0x00, negate and mask to boolean true */ - return @vpre@_andnot_@vsuf@(tmp, truemask); -} -#endif - -static void -sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) - op[i] = ip1[i] @op@ ip2[i]; - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vloadu@((@vtype@*)&ip1[i]); - @vtype@ b = @vloadu@((@vtype@*)&ip2[i]); -#if @and@ - const @vtype@ zero = @vpre@_setzero_@vsuf@(); - /* get 0xFF for non zeros*/ - @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero); - /* andnot -> 0x00 for zeros xFF for non zeros, & with ip2 */ - tmp = @vpre@_andnot_@vsuf@(tmp, b); -#else - @vtype@ tmp = @vpre@_or_@vsuf@(a, b); -#endif - - @vstore@((@vtype@*)&op[i], byte_to_true(tmp)); - } - LOOP_BLOCKED_END { - op[i] = (ip1[i] @op@ ip2[i]); - } -} - - -static void -sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n) -{ - const @vtype@ zero = 
@vpre@_setzero_@vsuf@();
-    LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, VECTOR_SIZE_BYTES) {
-        *op = *op @op@ ip[i];
-        if (*op @sc@ 0) {
-            return;
-        }
-    }
-    /* unrolled once to replace a slow movmsk with a fast pmaxb */
-    LOOP_BLOCKED(npy_bool, 2 * VECTOR_SIZE_BYTES) {
-        @vtype@ v = @vload@((@vtype@*)&ip[i]);
-        @vtype@ v2 = @vload@((@vtype@*)&ip[i + VECTOR_SIZE_BYTES]);
-        v = @vpre@_cmpeq_epi8(v, zero);
-        v2 = @vpre@_cmpeq_epi8(v2, zero);
-#if @and@
-        if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) {
-            *op = 0;
-#else
-        if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) {
-            *op = 1;
-#endif
-            return;
-        }
-    }
-    LOOP_BLOCKED_END {
-        *op = *op @op@ ip[i];
-        if (*op @sc@ 0) {
-            return;
-        }
-    }
-}
-
-/**end repeat**/
-
-/**begin repeat
- * # kind = absolute, logical_not#
- * # op = !=, ==#
- * # not = 0, 1#
- * # vpre = _mm*2#
- * # vsuf = si128*2#
- * # vtype = __m128i*2#
- * # type = npy_bool*2#
- * # vloadu = _mm_loadu_si128*2#
- * # vstore = _mm_store_si128*2#
- */
-
-static void
-sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
-{
-    LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
-        op[i] = (ip[i] @op@ 0);
-    LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
-        @vtype@ a = @vloadu@((@vtype@*)&ip[i]);
-#if @not@
-        const @vtype@ zero = @vpre@_setzero_@vsuf@();
-        const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
-        /* equivalent to byte_to_true but can skip the negation */
-        a = @vpre@_cmpeq_epi8(a, zero);
-        a = @vpre@_and_@vsuf@(a, truemask);
-#else
-        /* abs is kind of pointless but maybe its used for byte_to_true */
-        a = byte_to_true(a);
-#endif
-        @vstore@((@vtype@*)&op[i], a);
-    }
-    LOOP_BLOCKED_END {
-        op[i] = (ip[i] @op@ 0);
-    }
-}
-
-/**end repeat**/
-
-#undef VECTOR_SIZE_BYTES
-#endif /* NPY_HAVE_SSE2_INTRINSICS */
-#endif
-
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 4e628f59a..94cd73ef6 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -1011,6 +1011,34 @@ convert_ufunc_arguments(PyUFuncObject *ufunc,
          * necessary to propagate the information to the legacy type resolution.
          */
         if (npy_mark_tmp_array_if_pyscalar(obj, out_op[i], &out_op_DTypes[i])) {
+            if (PyArray_FLAGS(out_op[i]) & NPY_ARRAY_WAS_PYTHON_INT
+                    && PyArray_TYPE(out_op[i]) != NPY_LONG) {
+                /*
+                 * When `np.array(integer)` is not the default integer (mainly
+                 * object dtype), this confuses many type resolvers.  Simply
+                 * forcing a default integer array is unfortunately easiest.
+                 * This disables the optional NEP 50 warnings, but in
+                 * practice when this happens we should _usually_ pick the
+                 * default integer loop and that raises an error.
+                 * (An exception is `float64(1.) + 10**100` which silently
+                 * will give a float64 result rather than a Python float.)
+                 *
+                 * TODO: Just like the general dual NEP 50/legacy promotion
+                 * support, this is meant as a temporary hack for NumPy 1.25.
+                 */
+                static PyArrayObject *zero_arr = NULL;
+                if (NPY_UNLIKELY(zero_arr == NULL)) {
+                    zero_arr = (PyArrayObject *)PyArray_ZEROS(
+                            0, NULL, NPY_LONG, NPY_FALSE);
+                    if (zero_arr == NULL) {
+                        goto fail;
+                    }
+                    ((PyArrayObject_fields *)zero_arr)->flags |= (
+                        NPY_ARRAY_WAS_PYTHON_INT|NPY_ARRAY_WAS_INT_AND_REPLACED);
+                }
+                Py_INCREF(zero_arr);
+                Py_SETREF(out_op[i], zero_arr);
+            }
             *promoting_pyscalars = NPY_TRUE;
         }
     }
@@ -2085,12 +2113,16 @@ _get_coredim_sizes(PyUFuncObject *ufunc, PyArrayObject **op,
 }
 
 /*
- * Returns a new reference
+ * Returns a new reference to the ufunc identity.
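+ * (For the builtin cases: PyUFunc_Zero is the Python int 0, as for np.add,
+ * PyUFunc_One is the int 1, as for np.multiply, and PyUFunc_None means the
+ * ufunc has no default identity.)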
+ * Note that this identity is only a default identity value stored on the
+ * ufunc, since the individual ufunc loop (ArrayMethod) is queried for the
+ * actual identity.
+ *
  * TODO: store a reference in the ufunc object itself, rather than
  *       constructing one each time
  */
-static PyObject *
-_get_identity(PyUFuncObject *ufunc, npy_bool *reorderable) {
+NPY_NO_EXPORT PyObject *
+PyUFunc_GetDefaultIdentity(PyUFuncObject *ufunc, npy_bool *reorderable)
+{
     switch(ufunc->identity) {
     case PyUFunc_One:
         *reorderable = 1;
@@ -3006,10 +3038,8 @@ PyUFunc_Reduce(PyUFuncObject *ufunc,
         PyObject *initial, PyArrayObject *wheremask)
 {
     int iaxes, ndim;
-    npy_bool reorderable;
     npy_bool axis_flags[NPY_MAXDIMS];
-    PyArrayObject *result = NULL;
-    PyObject *identity;
+    const char *ufunc_name = ufunc_get_name_cstr(ufunc);
     /* These parameters come from a TLS global */
     int buffersize = 0, errormask = 0;
@@ -3034,9 +3064,6 @@ PyUFunc_Reduce(PyUFuncObject *ufunc,
         return NULL;
     }
 
-    /*
-     * Promote and fetch ufuncimpl (currently needed to fix up the identity).
-     */
     PyArray_Descr *descrs[3];
     PyArrayMethodObject *ufuncimpl = reducelike_promote_and_resolve(ufunc,
             arr, out, signature, NPY_FALSE, descrs, NPY_UNSAFE_CASTING, "reduce");
@@ -3044,62 +3071,19 @@ PyUFunc_Reduce(PyUFuncObject *ufunc,
         return NULL;
     }
 
-    /* Get the identity */
-    /* TODO: Both of these should be provided by the ArrayMethod! */
-    identity = _get_identity(ufunc, &reorderable);
-    if (identity == NULL) {
-        goto finish;
-    }
-
-    /* Get the initial value */
-    if (initial == NULL) {
-        initial = identity;
-
-        /*
-         * The identity for a dynamic dtype like
-         * object arrays can't be used in general
-         */
-        if (initial != Py_None && PyArray_ISOBJECT(arr) && PyArray_SIZE(arr) != 0) {
-            Py_DECREF(initial);
-            initial = Py_None;
-            Py_INCREF(initial);
-        }
-        else if (PyTypeNum_ISUNSIGNED(descrs[2]->type_num)
-                    && PyLong_CheckExact(initial)) {
-            /*
-             * This is a bit of a hack until we have truly loop specific
-             * identities. Python -1 cannot be cast to unsigned so convert
-             * it to a NumPy scalar, but we use -1 for bitwise functions to
-             * signal all 1s.
-             * (A builtin identity would not overflow here, although we may
-             * unnecessary convert 0 and 1.)
- */ - Py_SETREF(initial, PyObject_CallFunctionObjArgs( - (PyObject *)&PyLongArrType_Type, initial, NULL)); - if (initial == NULL) { - goto finish; - } - } - } else { - Py_DECREF(identity); - Py_INCREF(initial); /* match the reference count in the if above */ - } - PyArrayMethod_Context context = { .caller = (PyObject *)ufunc, .method = ufuncimpl, .descriptors = descrs, }; - result = PyUFunc_ReduceWrapper(&context, - arr, out, wheremask, axis_flags, reorderable, keepdims, - initial, reduce_loop, ufunc, buffersize, ufunc_name, errormask); + PyArrayObject *result = PyUFunc_ReduceWrapper(&context, + arr, out, wheremask, axis_flags, keepdims, + initial, reduce_loop, buffersize, ufunc_name, errormask); - finish: for (int i = 0; i < 3; i++) { Py_DECREF(descrs[i]); } - Py_XDECREF(initial); return result; } @@ -4115,7 +4099,7 @@ PyUFunc_GenericReduction(PyUFuncObject *ufunc, /* We now have all the information required to check for Overrides */ PyObject *override = NULL; int errval = PyUFunc_CheckOverride(ufunc, _reduce_type[operation], - full_args.in, full_args.out, args, len_args, kwnames, &override); + full_args.in, full_args.out, wheremask_obj, args, len_args, kwnames, &override); if (errval) { return NULL; } @@ -4887,7 +4871,7 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc, /* We now have all the information required to check for Overrides */ PyObject *override = NULL; errval = PyUFunc_CheckOverride(ufunc, method, - full_args.in, full_args.out, + full_args.in, full_args.out, where_obj, args, len_args, kwnames, &override); if (errval) { goto fail; @@ -4973,9 +4957,13 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc, if (!(orig_flags & NPY_ARRAY_WAS_PYTHON_LITERAL)) { continue; } - /* If the descriptor matches, no need to worry about conversion */ - if (PyArray_EquivTypes( - PyArray_DESCR(operands[i]), operation_descrs[i])) { + /* + * If descriptor matches, no need to convert, but integers may + * have been too large. 
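+         * (NPY_ARRAY_WAS_INT_AND_REPLACED is set by the hack in
+         * `convert_ufunc_arguments` above, which swaps such an operand for
+         * a placeholder zero array, so its data cannot be reused as-is.)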
+ */ + if (!(orig_flags & NPY_ARRAY_WAS_INT_AND_REPLACED) + && PyArray_EquivTypes( + PyArray_DESCR(operands[i]), operation_descrs[i])) { continue; } /* Otherwise, replace the operand with a new array */ @@ -5079,14 +5067,11 @@ ufunc_generic_vectorcall(PyObject *ufunc, NPY_NO_EXPORT PyObject * -ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args) +ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(arg)) { PyObject *thedict; PyObject *res; - if (!PyArg_ParseTuple(args, "")) { - return NULL; - } thedict = PyThreadState_GetDict(); if (thedict == NULL) { thedict = PyEval_GetBuiltins(); @@ -5112,16 +5097,13 @@ ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args) NPY_NO_EXPORT PyObject * -ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *args) +ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *arg) { PyObject *thedict; int res; - PyObject *val; + PyObject *val = arg; static char *msg = "Error object must be a list of length 3"; - if (!PyArg_ParseTuple(args, "O:seterrobj", &val)) { - return NULL; - } if (!PyList_CheckExact(val) || PyList_GET_SIZE(val) != 3) { PyErr_SetString(PyExc_ValueError, msg); return NULL; @@ -5937,6 +5919,7 @@ static inline PyArrayObject * new_array_op(PyArrayObject *op_array, char *data) { npy_intp dims[1] = {1}; + Py_INCREF(PyArray_DESCR(op_array)); /* NewFromDescr steals a reference */ PyObject *r = PyArray_NewFromDescr(&PyArray_Type, PyArray_DESCR(op_array), 1, dims, NULL, data, NPY_ARRAY_WRITEABLE, NULL); @@ -5944,206 +5927,171 @@ new_array_op(PyArrayObject *op_array, char *data) } /* - * Call ufunc only on selected array items and store result in first operand. - * For add ufunc, method call is equivalent to op1[idx] += op2 with no - * buffering of the first operand. - * Arguments: - * op1 - First operand to ufunc - * idx - Indices that are applied to first operand. Equivalent to op1[idx]. - * op2 - Second operand to ufunc (if needed). Must be able to broadcast - * over first operand. 
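 *
 * (In trivial_at_loop below, this operation reaches the inner loop as three
 *  flat buffers: args[0] is the target array data, args[1] the integer
 *  index buffer, and args[2] the second operand, whose steps[2] is set to
 *  0 so that a scalar second operand broadcasts.)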
+ * Use an indexed loop to do the work + * Returns 0 if successful */ -static PyObject * -ufunc_at(PyUFuncObject *ufunc, PyObject *args) +static int +trivial_at_loop(PyArrayMethodObject *ufuncimpl, NPY_ARRAYMETHOD_FLAGS flags, + PyArrayMapIterObject *iter, + PyArrayObject *op1_array, PyArrayObject *op2_array, + PyArrayMethod_Context *context) { - PyObject *op1 = NULL; - PyObject *idx = NULL; - PyObject *op2 = NULL; - PyArrayObject *op1_array = NULL; - PyArrayObject *op2_array = NULL; - PyArrayMapIterObject *iter = NULL; - PyArrayIterObject *iter2 = NULL; - PyArrayObject *operands[3] = {NULL, NULL, NULL}; - PyArrayObject *array_operands[3] = {NULL, NULL, NULL}; - - PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL}; - PyArray_DTypeMeta *operand_DTypes[3] = {NULL, NULL, NULL}; - PyArray_Descr *operation_descrs[3] = {NULL, NULL, NULL}; - - int nop; - - /* override vars */ - int errval; - PyObject *override = NULL; - - NpyIter *iter_buffer; - NpyIter_IterNextFunc *iternext; - npy_uint32 op_flags[NPY_MAXARGS]; - int buffersize; - int errormask = 0; - char * err_msg = NULL; - - PyArrayMethod_StridedLoop *strided_loop; - NpyAuxData *auxdata = NULL; + int buffersize=0, errormask = 0; + int res; + char *args[3]; + npy_intp steps[3]; + args[0] = (char *) iter->baseoffset; + steps[0] = iter->fancy_strides[0]; + if (ufuncimpl->nin == 1) { + args[2] = NULL; + steps[2] = 0; + } else { + args[2] = (char *)PyArray_DATA(op2_array); + if (PyArray_NDIM(op2_array) == 0 + || PyArray_DIM(op2_array, 0) <= 1) { + steps[2] = 0; + } else { + steps[2] = PyArray_STRIDE(op2_array, 0); + } + } - NPY_BEGIN_THREADS_DEF; + npy_intp *inner_size = NpyIter_GetInnerLoopSizePtr(iter->outer); - if (ufunc->core_enabled) { - PyErr_Format(PyExc_TypeError, - "%s.at does not support ufunc with non-trivial signature: %s has signature %s.", - ufunc->name, ufunc->name, ufunc->core_signature); - return NULL; + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + npy_clear_floatstatus_barrier((char *)context); } - if (ufunc->nin > 2) { - PyErr_SetString(PyExc_ValueError, - "Only unary and binary ufuncs supported at this time"); - return NULL; - } + do { + args[1] = (char *) iter->outer_ptrs[0]; + steps[1] = iter->outer_strides[0]; - if (ufunc->nout != 1) { - PyErr_SetString(PyExc_ValueError, - "Only single output ufuncs supported at this time"); - return NULL; - } + res = ufuncimpl->contiguous_indexed_loop( + context, args, inner_size, steps, NULL); + if (args[2] != NULL) { + args[2] += (*inner_size) * steps[2]; + } + } while (res == 0 && iter->outer_next(iter->outer)); - if (!PyArg_ParseTuple(args, "OO|O:at", &op1, &idx, &op2)) { - return NULL; + if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + const char * ufunc_name = + ufunc_get_name_cstr((PyUFuncObject *)context->caller); + if (_get_bufsize_errmask(NULL, ufunc_name, + &buffersize, &errormask) < 0) { + return -1; + } + res = _check_ufunc_fperr(errormask, NULL, ufunc_name); } + return res; +} - if (ufunc->nin == 2 && op2 == NULL) { - PyErr_SetString(PyExc_ValueError, - "second operand needed for ufunc"); - return NULL; - } - errval = PyUFunc_CheckOverride(ufunc, "at", - args, NULL, NULL, 0, NULL, &override); +static int +ufunc_at__fast_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags, + PyArrayMapIterObject *iter, PyArrayIterObject *iter2, + PyArrayObject *op1_array, PyArrayObject *op2_array, + PyArrayMethod_StridedLoop *strided_loop, + PyArrayMethod_Context *context, + NpyAuxData *auxdata + ) +{ + int buffersize; + int errormask = 0; + int res = 0; + 
NPY_BEGIN_THREADS_DEF; - if (errval) { - return NULL; - } - else if (override) { - return override; + if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) { + return -1; } - - if (!PyArray_Check(op1)) { - PyErr_SetString(PyExc_TypeError, - "first operand must be array"); - return NULL; + int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0; + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* Start with the floating-point exception flags cleared */ + npy_clear_floatstatus_barrier((char*)&iter); } - op1_array = (PyArrayObject *)op1; - - /* Create second operand from number array if needed. */ - if (op2 != NULL) { - op2_array = (PyArrayObject *)PyArray_FromAny(op2, NULL, - 0, 0, 0, NULL); - if (op2_array == NULL) { - goto fail; - } + if (!needs_api) { + NPY_BEGIN_THREADS; } - /* Create map iterator */ - iter = (PyArrayMapIterObject *)PyArray_MapIterArrayCopyIfOverlap( - op1_array, idx, 1, op2_array); - if (iter == NULL) { - goto fail; - } - op1_array = iter->array; /* May be updateifcopied on overlap */ + npy_intp strides[3] = {0, 0, 0}; + /* + * Iterate over first and second operands and call ufunc + * for each pair of inputs + */ + for (npy_intp i = iter->size; i > 0; i--) + { + char *dataptr[3]; + /* one element at a time, no stride required but read by innerloop */ + npy_intp count = 1; - if (op2 != NULL) { /* - * May need to swap axes so that second operand is - * iterated over correctly + * Set up data pointers for either one or two input operands. + * The output data pointer points to the first operand data. */ - if ((iter->subspace != NULL) && (iter->consec)) { - PyArray_MapIterSwapAxes(iter, &op2_array, 0); - if (op2_array == NULL) { - goto fail; - } + dataptr[0] = iter->dataptr; + if (iter2 != NULL) { + dataptr[1] = PyArray_ITER_DATA(iter2); + dataptr[2] = iter->dataptr; + } + else { + dataptr[1] = iter->dataptr; + dataptr[2] = NULL; } - /* - * Create array iter object for second operand that - * "matches" the map iter object for the first operand. - * Then we can just iterate over the first and second - * operands at the same time and not have to worry about - * picking the correct elements from each operand to apply - * the ufunc to. - */ - if ((iter2 = (PyArrayIterObject *)\ - PyArray_BroadcastToShape((PyObject *)op2_array, - iter->dimensions, iter->nd))==NULL) { - goto fail; + res = strided_loop(context, dataptr, &count, strides, auxdata); + if (res != 0) { + break; } - } - /* - * Create dtypes array for either one or two input operands. - * Compare to the logic in `convert_ufunc_arguments`. - * TODO: It may be good to review some of this behaviour, since the - * operand array is special (it is written to) similar to reductions. - * Using unsafe-casting as done here, is likely not desirable. 
- */ - operands[0] = op1_array; - operand_DTypes[0] = NPY_DTYPE(PyArray_DESCR(op1_array)); - Py_INCREF(operand_DTypes[0]); - int force_legacy_promotion = 0; - int allow_legacy_promotion = NPY_DT_is_legacy(operand_DTypes[0]); + PyArray_MapIterNext(iter); + if (iter2 != NULL) { + PyArray_ITER_NEXT(iter2); + } + } - if (op2_array != NULL) { - operands[1] = op2_array; - operand_DTypes[1] = NPY_DTYPE(PyArray_DESCR(op2_array)); - Py_INCREF(operand_DTypes[1]); - allow_legacy_promotion &= NPY_DT_is_legacy(operand_DTypes[1]); - operands[2] = operands[0]; - operand_DTypes[2] = operand_DTypes[0]; - Py_INCREF(operand_DTypes[2]); + NPY_END_THREADS; - nop = 3; - if (allow_legacy_promotion && ((PyArray_NDIM(op1_array) == 0) - != (PyArray_NDIM(op2_array) == 0))) { - /* both are legacy and only one is 0-D: force legacy */ - force_legacy_promotion = should_use_min_scalar(2, operands, 0, NULL); - } - } - else { - operands[1] = operands[0]; - operand_DTypes[1] = operand_DTypes[0]; - Py_INCREF(operand_DTypes[1]); - operands[2] = NULL; - nop = 2; + if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* NOTE: We could check float errors even when `res < 0` */ + res = _check_ufunc_fperr(errormask, NULL, "at"); } + return res; +} - PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc, - operands, signature, operand_DTypes, - force_legacy_promotion, allow_legacy_promotion, - NPY_FALSE, NPY_FALSE); - if (ufuncimpl == NULL) { - goto fail; - } +static int +ufunc_at__slow_iter(PyUFuncObject *ufunc, NPY_ARRAYMETHOD_FLAGS flags, + PyArrayMapIterObject *iter, PyArrayIterObject *iter2, + PyArrayObject *op1_array, PyArrayObject *op2_array, + PyArray_Descr *operation_descrs[3], + PyArrayMethod_StridedLoop *strided_loop, + PyArrayMethod_Context *context, + NpyAuxData *auxdata + ) +{ + NpyIter *iter_buffer = NULL; + PyArrayObject *array_operands[3] = {NULL, NULL, NULL}; + int buffersize; + int errormask = 0; + int res = 0; + int nop = 0; + NpyIter_IterNextFunc *iternext; + char * err_msg = NULL; + NPY_BEGIN_THREADS_DEF; - /* Find the correct descriptors for the operation */ - if (resolve_descriptors(nop, ufunc, ufuncimpl, - operands, operation_descrs, signature, NPY_UNSAFE_CASTING) < 0) { - goto fail; + if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) { + return -1; } - - Py_INCREF(PyArray_DESCR(op1_array)); array_operands[0] = new_array_op(op1_array, iter->dataptr); if (iter2 != NULL) { - Py_INCREF(PyArray_DESCR(op2_array)); array_operands[1] = new_array_op(op2_array, PyArray_ITER_DATA(iter2)); - Py_INCREF(PyArray_DESCR(op1_array)); array_operands[2] = new_array_op(op1_array, iter->dataptr); + nop = 3; } else { - Py_INCREF(PyArray_DESCR(op1_array)); array_operands[1] = new_array_op(op1_array, iter->dataptr); array_operands[2] = NULL; + nop = 2; } - /* Set up the flags */ + npy_uint32 op_flags[3]; op_flags[0] = NPY_ITER_READONLY| NPY_ITER_ALIGNED; @@ -6163,11 +6111,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) NPY_ITER_NO_BROADCAST| NPY_ITER_NO_SUBTYPE; } - - if (_get_bufsize_errmask(NULL, ufunc->name, &buffersize, &errormask) < 0) { - goto fail; - } - /* * Create NpyIter object to "iterate" over single element of each input * operand. 
This is an easy way to reuse the NpyIter logic for dealing @@ -6190,33 +6133,23 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) -1, NULL, NULL, buffersize); if (iter_buffer == NULL) { - goto fail; + /* will fail only on memory allocation errors */ + for (int i = 0; i < 3; i++) { + Py_XDECREF(array_operands[i]); + } + return -1; } iternext = NpyIter_GetIterNext(iter_buffer, NULL); if (iternext == NULL) { + /* can not really happen, iter_buffer creation is tightly controlled */ NpyIter_Deallocate(iter_buffer); - goto fail; - } - - PyArrayMethod_Context context = { - .caller = (PyObject *)ufunc, - .method = ufuncimpl, - .descriptors = operation_descrs, - }; - - NPY_ARRAYMETHOD_FLAGS flags; - /* Use contiguous strides; if there is such a loop it may be faster */ - npy_intp strides[3] = { - operation_descrs[0]->elsize, operation_descrs[1]->elsize, 0}; - if (nop == 3) { - strides[2] = operation_descrs[2]->elsize; + for (int i = 0; i < 3; i++) { + Py_XDECREF(array_operands[i]); + } + return -1; } - if (ufuncimpl->get_strided_loop(&context, 1, 0, strides, - &strided_loop, &auxdata, &flags) < 0) { - goto fail; - } int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0; needs_api |= NpyIter_IterationNeedsAPI(iter_buffer); if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { @@ -6224,6 +6157,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) npy_clear_floatstatus_barrier((char*)&iter); } + npy_intp strides[3] = {0, 0, 0}; if (!needs_api) { NPY_BEGIN_THREADS; } @@ -6232,7 +6166,6 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) * Iterate over first and second operands and call ufunc * for each pair of inputs */ - int res = 0; for (npy_intp i = iter->size; i > 0; i--) { char *dataptr[3]; @@ -6263,7 +6196,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) buffer_dataptr = NpyIter_GetDataPtrArray(iter_buffer); - res = strided_loop(&context, buffer_dataptr, &count, strides, auxdata); + res = strided_loop(context, buffer_dataptr, &count, strides, auxdata); if (res != 0) { break; } @@ -6289,18 +6222,284 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args) /* NOTE: We could check float errors even when `res < 0` */ res = _check_ufunc_fperr(errormask, NULL, "at"); } + NpyIter_Deallocate(iter_buffer); + for (int i = 0; i < 3; i++) { + Py_XDECREF(array_operands[i]); + } + return res; +} + +/* + * Call ufunc only on selected array items and store result in first operand. + * For add ufunc, method call is equivalent to op1[idx] += op2 with no + * buffering of the first operand. + * Arguments: + * op1 - First operand to ufunc + * idx - Indices that are applied to first operand. Equivalent to op1[idx]. + * op2 - Second operand to ufunc (if needed). Must be able to broadcast + * over first operand. 
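+ *
+ * Note that repeated entries in idx are applied once per occurrence, e.g.
+ * np.add.at(op1, [0, 0], 1) increments op1[0] twice, precisely because
+ * op1 is not buffered.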
+ */ +static PyObject * +ufunc_at(PyUFuncObject *ufunc, PyObject *args) +{ + PyObject *op1 = NULL; + PyObject *idx = NULL; + PyObject *op2 = NULL; + PyArrayObject *op1_array = NULL; + PyArrayObject *op2_array = NULL; + PyArrayMapIterObject *iter = NULL; + PyArrayIterObject *iter2 = NULL; + PyArray_Descr *operation_descrs[3] = {NULL, NULL, NULL}; + + int nop; + + /* override vars */ + int errval; + PyObject *override = NULL; + int res = -1; /* start with fail condition so "goto fail" will error */ + + PyArrayMethod_StridedLoop *strided_loop; + NpyAuxData *auxdata = NULL; + + if (ufunc->core_enabled) { + PyErr_Format(PyExc_TypeError, + "%s.at does not support ufunc with non-trivial signature: %s has signature %s.", + ufunc->name, ufunc->name, ufunc->core_signature); + return NULL; + } + + if (ufunc->nin > 2) { + PyErr_SetString(PyExc_ValueError, + "Only unary and binary ufuncs supported at this time"); + return NULL; + } + + if (ufunc->nout != 1) { + PyErr_SetString(PyExc_ValueError, + "Only single output ufuncs supported at this time"); + return NULL; + } + + if (!PyArg_ParseTuple(args, "OO|O:at", &op1, &idx, &op2)) { + return NULL; + } + + if (ufunc->nin == 2 && op2 == NULL) { + PyErr_SetString(PyExc_ValueError, + "second operand needed for ufunc"); + return NULL; + } + + if (ufunc->nin == 1 && op2 != NULL) { + PyErr_SetString(PyExc_ValueError, + "second operand provided when ufunc is unary"); + return NULL; + } + errval = PyUFunc_CheckOverride(ufunc, "at", + args, NULL, NULL, NULL, 0, NULL, &override); + + if (errval) { + return NULL; + } + else if (override) { + return override; + } + + if (!PyArray_Check(op1)) { + PyErr_SetString(PyExc_TypeError, + "first operand must be array"); + return NULL; + } + + op1_array = (PyArrayObject *)op1; + + /* Create second operand from number array if needed. */ + if (op2 == NULL) { + nop = 2; + } + else { + nop = 3; + op2_array = (PyArrayObject *)PyArray_FromAny(op2, NULL, + 0, 0, 0, NULL); + if (op2_array == NULL) { + goto fail; + } + } + + PyArrayMethodObject *ufuncimpl = NULL; + { + /* Do all the dtype handling and find the correct ufuncimpl */ + + PyArrayObject *tmp_operands[3] = {NULL, NULL, NULL}; + PyArray_DTypeMeta *signature[3] = {NULL, NULL, NULL}; + PyArray_DTypeMeta *operand_DTypes[3] = {NULL, NULL, NULL}; + /* + * Create dtypes array for either one or two input operands. + * Compare to the logic in `convert_ufunc_arguments`. + * TODO: It may be good to review some of this behaviour, since the + * operand array is special (it is written to) similar to reductions. + * Using unsafe-casting as done here, is likely not desirable. 
+ */ + tmp_operands[0] = op1_array; + operand_DTypes[0] = NPY_DTYPE(PyArray_DESCR(op1_array)); + Py_INCREF(operand_DTypes[0]); + int force_legacy_promotion = 0; + int allow_legacy_promotion = NPY_DT_is_legacy(operand_DTypes[0]); + + if (op2_array != NULL) { + tmp_operands[1] = op2_array; + operand_DTypes[1] = NPY_DTYPE(PyArray_DESCR(op2_array)); + Py_INCREF(operand_DTypes[1]); + allow_legacy_promotion &= NPY_DT_is_legacy(operand_DTypes[1]); + tmp_operands[2] = tmp_operands[0]; + operand_DTypes[2] = operand_DTypes[0]; + Py_INCREF(operand_DTypes[2]); + + if (allow_legacy_promotion && ((PyArray_NDIM(op1_array) == 0) + != (PyArray_NDIM(op2_array) == 0))) { + /* both are legacy and only one is 0-D: force legacy */ + force_legacy_promotion = should_use_min_scalar(2, tmp_operands, 0, NULL); + } + } + else { + tmp_operands[1] = tmp_operands[0]; + operand_DTypes[1] = operand_DTypes[0]; + Py_INCREF(operand_DTypes[1]); + tmp_operands[2] = NULL; + } + + ufuncimpl = promote_and_get_ufuncimpl(ufunc, tmp_operands, signature, + operand_DTypes, force_legacy_promotion, + allow_legacy_promotion, NPY_FALSE, NPY_FALSE); + if (ufuncimpl == NULL) { + for (int i = 0; i < 3; i++) { + Py_XDECREF(signature[i]); + Py_XDECREF(operand_DTypes[i]); + } + goto fail; + } + + /* Find the correct operation_descrs for the operation */ + int resolve_result = resolve_descriptors(nop, ufunc, ufuncimpl, + tmp_operands, operation_descrs, signature, NPY_UNSAFE_CASTING); + for (int i = 0; i < 3; i++) { + Py_XDECREF(signature[i]); + Py_XDECREF(operand_DTypes[i]); + } + if (resolve_result < 0) { + goto fail; + } + } + iter = (PyArrayMapIterObject *)PyArray_MapIterArrayCopyIfOverlap( + op1_array, idx, 1, op2_array); + if (iter == NULL) { + goto fail; + } + op1_array = iter->array; /* May be updateifcopied on overlap */ + + if (op2_array != NULL) { + /* + * May need to swap axes so that second operand is + * iterated over correctly + */ + if ((iter->subspace != NULL) && (iter->consec)) { + PyArray_MapIterSwapAxes(iter, &op2_array, 0); + if (op2_array == NULL) { + /* only on memory allocation failure */ + goto fail; + } + } + + /* + * Create array iter object for second operand that + * "matches" the map iter object for the first operand. + * Then we can just iterate over the first and second + * operands at the same time and not have to worry about + * picking the correct elements from each operand to apply + * the ufunc to. 
+         */
+        if ((iter2 = (PyArrayIterObject *)\
+                PyArray_BroadcastToShape((PyObject *)op2_array,
+                                         iter->dimensions, iter->nd))==NULL) {
+            goto fail;
+        }
+    }
+
+    PyArrayMethod_Context context = {
+            .caller = (PyObject *)ufunc,
+            .method = ufuncimpl,
+            .descriptors = operation_descrs,
+    };
+
+    /* Use contiguous strides; if there is such a loop it may be faster */
+    npy_intp strides[3] = {
+            operation_descrs[0]->elsize, operation_descrs[1]->elsize, 0};
+    if (nop == 3) {
+        strides[2] = operation_descrs[2]->elsize;
+    }
+
+    NPY_ARRAYMETHOD_FLAGS flags;
+    if (ufuncimpl->get_strided_loop(&context, 1, 0, strides,
+            &strided_loop, &auxdata, &flags) < 0) {
+        goto fail;
+    }
+    int fast_path = 1;
+    /* check no casting, alignment */
+    if (PyArray_DESCR(op1_array) != operation_descrs[0]) {
+        fast_path = 0;
+    }
+    if (PyArray_DESCR(op1_array) != operation_descrs[nop - 1]) {
+        /* output casting */
+        fast_path = 0;
+    }
+    if (!PyArray_ISALIGNED(op1_array)) {
+        fast_path = 0;
+    }
+    if (nop > 2) {
+        if (PyArray_DESCR(op2_array) != operation_descrs[1]) {
+            fast_path = 0;
+        }
+        if (!PyArray_ISALIGNED(op2_array)) {
+            fast_path = 0;
+        }
+    }
+    if (fast_path == 1) {
+        /*
+         * Try to use trivial loop (1d, no casting, aligned) if
+         * - the matching info has an indexed loop
+         * - idx must be exactly one integer index array
+         * - all operands are 1d
+         * A future enhancement could loosen the restriction on 1d operands
+         * by adding an iteration loop inside trivial_at_loop
+         */
+        if ((ufuncimpl->contiguous_indexed_loop != NULL) &&
+                (PyArray_NDIM(op1_array) == 1)  &&
+                (op2_array == NULL || PyArray_NDIM(op2_array) <= 1) &&
+                (iter->subspace_iter == NULL) && (iter->numiter == 1)) {
+            res = trivial_at_loop(ufuncimpl, flags, iter, op1_array,
+                        op2_array, &context);
+
+        }
+        else {
+            /* Couldn't use the fastest path, use the faster path */
+            res = ufunc_at__fast_iter(ufunc, flags, iter, iter2, op1_array,
+                        op2_array, strided_loop, &context, auxdata);
+        }
+    } else {
+        res = ufunc_at__slow_iter(ufunc, flags, iter, iter2, op1_array,
+                        op2_array, operation_descrs, strided_loop, &context,
+                        auxdata);
+    }
+
+fail:
     NPY_AUXDATA_FREE(auxdata);
-    NpyIter_Deallocate(iter_buffer);
     Py_XDECREF(op2_array);
     Py_XDECREF(iter);
     Py_XDECREF(iter2);
     for (int i = 0; i < nop; i++) {
-        Py_DECREF(signature[i]);
-        Py_XDECREF(operand_DTypes[i]);
         Py_XDECREF(operation_descrs[i]);
-        Py_XDECREF(array_operands[i]);
     }
 
     /*
@@ -6309,29 +6508,15 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
      * (e.g. `power` released the GIL but manually set an Exception).
*/ if (res != 0 || PyErr_Occurred()) { + /* iter_buffer has already been deallocated, don't use NpyIter_Dealloc */ + if (PyArray_FLAGS(op1_array) & NPY_ARRAY_WRITEBACKIFCOPY) { + PyArray_DiscardWritebackIfCopy(op1_array); + } return NULL; } else { Py_RETURN_NONE; } - -fail: - /* iter_buffer has already been deallocated, don't use NpyIter_Dealloc */ - if (op1_array != (PyArrayObject*)op1) { - PyArray_DiscardWritebackIfCopy(op1_array); - } - Py_XDECREF(op2_array); - Py_XDECREF(iter); - Py_XDECREF(iter2); - for (int i = 0; i < 3; i++) { - Py_XDECREF(signature[i]); - Py_XDECREF(operand_DTypes[i]); - Py_XDECREF(operation_descrs[i]); - Py_XDECREF(array_operands[i]); - } - NPY_AUXDATA_FREE(auxdata); - - return NULL; } @@ -6454,16 +6639,10 @@ py_resolve_dtypes_generic(PyUFuncObject *ufunc, npy_bool return_context, Py_INCREF(descr); dummy_arrays[i] = (PyArrayObject *)PyArray_NewFromDescr_int( &PyArray_Type, descr, 0, NULL, NULL, NULL, - 0, NULL, NULL, 0, 1); + 0, NULL, NULL, _NPY_ARRAY_ENSURE_DTYPE_IDENTITY); if (dummy_arrays[i] == NULL) { goto finish; } - if (PyArray_DESCR(dummy_arrays[i]) != descr) { - PyErr_SetString(PyExc_NotImplementedError, - "dtype was replaced during array creation, the dtype is " - "unsupported currently (a subarray dtype?)."); - goto finish; - } DTypes[i] = NPY_DTYPE(descr); Py_INCREF(DTypes[i]); if (!NPY_DT_is_legacy(DTypes[i])) { @@ -6473,7 +6652,6 @@ py_resolve_dtypes_generic(PyUFuncObject *ufunc, npy_bool return_context, /* Explicitly allow int, float, and complex for the "weak" types. */ else if (descr_obj == (PyObject *)&PyLong_Type) { descr = PyArray_DescrFromType(NPY_LONG); - Py_INCREF(descr); dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0); if (dummy_arrays[i] == NULL) { goto finish; @@ -6485,7 +6663,6 @@ py_resolve_dtypes_generic(PyUFuncObject *ufunc, npy_bool return_context, } else if (descr_obj == (PyObject *)&PyFloat_Type) { descr = PyArray_DescrFromType(NPY_DOUBLE); - Py_INCREF(descr); dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0); if (dummy_arrays[i] == NULL) { goto finish; @@ -6497,7 +6674,6 @@ py_resolve_dtypes_generic(PyUFuncObject *ufunc, npy_bool return_context, } else if (descr_obj == (PyObject *)&PyComplex_Type) { descr = PyArray_DescrFromType(NPY_CDOUBLE); - Py_INCREF(descr); dummy_arrays[i] = (PyArrayObject *)PyArray_Empty(0, NULL, descr, 0); if (dummy_arrays[i] == NULL) { goto finish; @@ -6887,7 +7063,7 @@ static PyObject * ufunc_get_identity(PyUFuncObject *ufunc, void *NPY_UNUSED(ignored)) { npy_bool reorderable; - return _get_identity(ufunc, &reorderable); + return PyUFunc_GetDefaultIdentity(ufunc, &reorderable); } static PyObject * diff --git a/numpy/core/src/umath/ufunc_object.h b/numpy/core/src/umath/ufunc_object.h index 32af6c58e..1b80bb2df 100644 --- a/numpy/core/src/umath/ufunc_object.h +++ b/numpy/core/src/umath/ufunc_object.h @@ -4,14 +4,17 @@ #include <numpy/ufuncobject.h> NPY_NO_EXPORT PyObject * -ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *args); +ufunc_geterr(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(arg)); NPY_NO_EXPORT PyObject * -ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *args); +ufunc_seterr(PyObject *NPY_UNUSED(dummy), PyObject *arg); NPY_NO_EXPORT const char* ufunc_get_name_cstr(PyUFuncObject *ufunc); +NPY_NO_EXPORT PyObject * +PyUFunc_GetDefaultIdentity(PyUFuncObject *ufunc, npy_bool *reorderable); + /* strings from umathmodule.c that are interned on umath import */ NPY_VISIBILITY_HIDDEN extern PyObject *npy_um_str_array_ufunc; NPY_VISIBILITY_HIDDEN extern 
PyObject *npy_um_str_array_prepare; diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c index a0a16a0f9..decd26580 100644 --- a/numpy/core/src/umath/ufunc_type_resolution.c +++ b/numpy/core/src/umath/ufunc_type_resolution.c @@ -362,7 +362,7 @@ PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc, && PyArray_ISDATETIME(operands[1]) && type_num1 != type_num2) { /* - * Reject mixed datetime and timedelta explictly, this was always + * Reject mixed datetime and timedelta explicitly, this was always * implicitly rejected because casting fails (except with * `casting="unsafe"` admittedly). * This is required to ensure that `==` and `!=` can correctly @@ -381,8 +381,28 @@ PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc, if (out_dtypes[0] == NULL) { return -1; } - out_dtypes[1] = out_dtypes[0]; - Py_INCREF(out_dtypes[1]); + if (PyArray_ISINTEGER(operands[0]) + && PyArray_ISINTEGER(operands[1]) + && !PyDataType_ISINTEGER(out_dtypes[0])) { + /* + * NumPy promotion allows uint+int to go to float, avoid it + * (input must have been a mix of signed and unsigned) + */ + if (PyArray_ISSIGNED(operands[0])) { + Py_SETREF(out_dtypes[0], PyArray_DescrFromType(NPY_LONGLONG)); + out_dtypes[1] = PyArray_DescrFromType(NPY_ULONGLONG); + Py_INCREF(out_dtypes[1]); + } + else { + Py_SETREF(out_dtypes[0], PyArray_DescrFromType(NPY_ULONGLONG)); + out_dtypes[1] = PyArray_DescrFromType(NPY_LONGLONG); + Py_INCREF(out_dtypes[1]); + } + } + else { + out_dtypes[1] = out_dtypes[0]; + Py_INCREF(out_dtypes[1]); + } } else { /* Not doing anything will lead to a loop no found error. */ @@ -398,15 +418,8 @@ PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc, operands, type_tup, out_dtypes); } - /* Output type is always boolean */ + /* Output type is always boolean (cannot fail for builtins) */ out_dtypes[2] = PyArray_DescrFromType(NPY_BOOL); - if (out_dtypes[2] == NULL) { - for (i = 0; i < 2; ++i) { - Py_DECREF(out_dtypes[i]); - out_dtypes[i] = NULL; - } - return -1; - } /* Check against the casting rules */ if (PyUFunc_ValidateCasting(ufunc, casting, operands, out_dtypes) < 0) { diff --git a/numpy/core/src/umath/wrapping_array_method.c b/numpy/core/src/umath/wrapping_array_method.c index 9f8f036e8..64fea7aeb 100644 --- a/numpy/core/src/umath/wrapping_array_method.c +++ b/numpy/core/src/umath/wrapping_array_method.c @@ -177,6 +177,39 @@ wrapping_method_get_loop( } +/* + * Wraps the original identity function, needs to translate the descriptors + * back to the original ones and provide an "original" context (identically to + * `get_loop`). + * We assume again that translating the descriptors is quick. 
+ */
+static int
+wrapping_method_get_identity_function(
+        PyArrayMethod_Context *context, npy_bool reduction_is_empty,
+        char *item)
+{
+    /* Copy the context, and replace descriptors: */
+    PyArrayMethod_Context orig_context = *context;
+    PyArray_Descr *orig_descrs[NPY_MAXARGS];
+    orig_context.descriptors = orig_descrs;
+    orig_context.method = context->method->wrapped_meth;
+
+    int nin = context->method->nin, nout = context->method->nout;
+    PyArray_DTypeMeta **dtypes = context->method->wrapped_dtypes;
+
+    if (context->method->translate_given_descrs(
+            nin, nout, dtypes, context->descriptors, orig_descrs) < 0) {
+        return -1;
+    }
+    int res = context->method->wrapped_meth->get_reduction_initial(
+            &orig_context, reduction_is_empty, item);
+    for (int i = 0; i < nin + nout; i++) {
+        Py_DECREF(orig_descrs[i]);
+    }
+    return res;
+}
+
+
 /**
  * Allows creating of a fairly lightweight wrapper around an existing ufunc
  * loop. The idea is mainly for units, as this is currently slightly limited
@@ -235,14 +268,17 @@ PyUFunc_AddWrappingLoop(PyObject *ufunc_obj,
         break;
     }
     if (wrapped_meth == NULL) {
-        PyErr_SetString(PyExc_TypeError,
-                "Did not find the to-be-wrapped loop in the ufunc.");
+        PyErr_Format(PyExc_TypeError,
+                "Did not find the to-be-wrapped loop in the ufunc with given "
+                "DTypes. Received wrapping types: %S", wrapped_dt_tuple);
         goto finish;
     }
 
     PyType_Slot slots[] = {
         {NPY_METH_resolve_descriptors, &wrapping_method_resolve_descriptors},
-        {NPY_METH_get_loop, &wrapping_method_get_loop},
+        {_NPY_METH_get_loop, &wrapping_method_get_loop},
+        {NPY_METH_get_reduction_initial,
+            &wrapping_method_get_identity_function},
         {0, NULL}
     };
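The wrapper above banks on `translate_given_descrs` being cheap; for the
units use case named in the comment it is typically a plain descriptor swap.
A minimal sketch of such a hook follows; the function name and the
hard-coded float64 target are illustrative assumptions, not part of this
patch:

/*
 * Sketch: translate descriptors for a loop wrapped around the float64
 * implementation. Illustrative only; names are not from this patch.
 */
static int
sketch_translate_given_descrs(int nin, int nout,
        PyArray_DTypeMeta *wrapped_dtypes[],
        PyArray_Descr *given_descrs[], PyArray_Descr *new_descrs[])
{
    (void)wrapped_dtypes;  /* unused: every operand maps to float64 here */
    for (int i = 0; i < nin + nout; i++) {
        if (given_descrs[i] == NULL) {
            /* an output descriptor may not be fixed yet */
            new_descrs[i] = NULL;
        }
        else {
            /* a new reference, matching the Py_DECREF loop above */
            new_descrs[i] = PyArray_DescrFromType(NPY_DOUBLE);
        }
    }
    return 0;
}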