diff options
-rw-r--r-- | doc/source/reference/c-api/types-and-structures.rst | 10 | ||||
-rw-r--r-- | numpy/core/include/numpy/ufuncobject.h | 6 | ||||
-rw-r--r-- | numpy/core/setup.py | 2 | ||||
-rw-r--r-- | numpy/core/src/multiarray/array_method.c | 76 | ||||
-rw-r--r-- | numpy/core/src/multiarray/array_method.h | 20 | ||||
-rw-r--r-- | numpy/core/src/multiarray/nditer_constr.c | 5 | ||||
-rw-r--r-- | numpy/core/src/umath/dispatching.c | 688 | ||||
-rw-r--r-- | numpy/core/src/umath/dispatching.h | 22 | ||||
-rw-r--r-- | numpy/core/src/umath/legacy_array_method.c | 257 | ||||
-rw-r--r-- | numpy/core/src/umath/legacy_array_method.h | 33 | ||||
-rw-r--r-- | numpy/core/src/umath/ufunc_object.c | 776 | ||||
-rw-r--r-- | numpy/core/src/umath/ufunc_type_resolution.c | 36 | ||||
-rw-r--r-- | numpy/core/src/umath/ufunc_type_resolution.h | 3 | ||||
-rw-r--r-- | numpy/core/tests/test_scalarmath.py | 4 | ||||
-rw-r--r-- | numpy/core/tests/test_ufunc.py | 20 |
15 files changed, 1604 insertions, 354 deletions
diff --git a/doc/source/reference/c-api/types-and-structures.rst b/doc/source/reference/c-api/types-and-structures.rst index 75a97c20e..54a1e09e1 100644 --- a/doc/source/reference/c-api/types-and-structures.rst +++ b/doc/source/reference/c-api/types-and-structures.rst @@ -818,6 +818,7 @@ PyUFunc_Type and PyUFuncObject npy_intp *core_dim_sizes; npy_uint32 *core_dim_flags; PyObject *identity_value; + /* Further private slots (size depends on the NumPy version) */ } PyUFuncObject; .. c:macro: PyObject_HEAD @@ -957,9 +958,12 @@ PyUFunc_Type and PyUFuncObject .. c:member:: PyUFunc_LegacyInnerLoopSelectionFunc *legacy_inner_loop_selector - A function which returns an inner loop. The ``legacy`` in the name arises - because for NumPy 1.6 a better variant had been planned. This variant - has not yet come about. + .. deprecated:: 1.22 + + Some fallback support for this slot exists, but will be removed + eventually. A univiersal function which relied on this will have + eventually have to be ported. + See ref:`NEP 41 <NEP41>` and ref:`NEP 43 <NEP43>` .. 
c:member:: void *reserved2 diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h index 0f3b8529a..fd7307703 100644 --- a/numpy/core/include/numpy/ufuncobject.h +++ b/numpy/core/include/numpy/ufuncobject.h @@ -211,6 +211,12 @@ typedef struct _tagPyUFuncObject { /* Identity for reduction, when identity == PyUFunc_IdentityValue */ PyObject *identity_value; + /* New in NPY_API_VERSION 0x0000000F and above */ + + /* New private fields related to dispatching */ + void *_dispatch_cache; + /* A PyListObject of `(tuple of DTypes, ArrayMethod/Promoter)` */ + PyObject *_loops; } PyUFuncObject; #include "arrayobject.h" diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 29d309f74..c20320910 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -928,6 +928,8 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'matmul.c.src'), join('src', 'umath', 'clip.h.src'), join('src', 'umath', 'clip.c.src'), + join('src', 'umath', 'dispatching.c'), + join('src', 'umath', 'legacy_array_method.c'), join('src', 'umath', 'ufunc_object.c'), join('src', 'umath', 'extobj.c'), join('src', 'umath', 'scalarmath.c.src'), diff --git a/numpy/core/src/multiarray/array_method.c b/numpy/core/src/multiarray/array_method.c index cc841ee64..c1b6d4e71 100644 --- a/numpy/core/src/multiarray/array_method.c +++ b/numpy/core/src/multiarray/array_method.c @@ -757,9 +757,6 @@ boundarraymethod__simple_strided_call( /* - * TODO: Currently still based on the old ufunc system and not ArrayMethod! - * This requires fixing the ufunc code first. - * * Support for masked inner-strided loops. Masked inner-strided loops are * only used in the ufunc machinery. So this special cases them. 
* In the future it probably makes sense to create an:: @@ -770,8 +767,8 @@ boundarraymethod__simple_strided_call( */ typedef struct { NpyAuxData base; - PyUFuncGenericFunction unmasked_stridedloop; - void *innerloopdata; + PyArrayMethod_StridedLoop *unmasked_stridedloop; + NpyAuxData *unmasked_auxdata; int nargs; char *dataptrs[]; } _masked_stridedloop_data; @@ -781,6 +778,7 @@ static void _masked_stridedloop_data_free(NpyAuxData *auxdata) { _masked_stridedloop_data *data = (_masked_stridedloop_data *)auxdata; + NPY_AUXDATA_FREE(data->unmasked_auxdata); PyMem_Free(data); } @@ -790,15 +788,15 @@ _masked_stridedloop_data_free(NpyAuxData *auxdata) * masked strided-loop, only calling the function for elements * where the mask is True. */ -static void -unmasked_ufunc_loop_as_masked( - char **data, const npy_intp *dimensions, - const npy_intp *strides, void *_auxdata) +static int +generic_masked_strided_loop(PyArrayMethod_Context *context, + char *const *data, const npy_intp *dimensions, + const npy_intp *strides, NpyAuxData *_auxdata) { _masked_stridedloop_data *auxdata = (_masked_stridedloop_data *)_auxdata; int nargs = auxdata->nargs; - PyUFuncGenericFunction strided_loop = auxdata->unmasked_stridedloop; - void *innerloopdata = auxdata->innerloopdata; + PyArrayMethod_StridedLoop *strided_loop = auxdata->unmasked_stridedloop; + NpyAuxData *strided_loop_auxdata = auxdata->unmasked_auxdata; char **dataptrs = auxdata->dataptrs; memcpy(dataptrs, data, nargs * sizeof(char *)); @@ -819,39 +817,37 @@ unmasked_ufunc_loop_as_masked( /* Process unmasked values */ mask = npy_memchr(mask, 0, mask_stride, N, &subloopsize, 0); - strided_loop(dataptrs, &subloopsize, strides, innerloopdata); + int res = strided_loop(context, + dataptrs, &subloopsize, strides, strided_loop_auxdata); + if (res != 0) { + return res; + } for (int i = 0; i < nargs; i++) { dataptrs[i] += subloopsize * strides[i]; } N -= subloopsize; } while (N > 0); + + return 0; } /* - * TODO: This function will be the 
masked equivalent to `get_loop`. - * This function wraps a legacy inner loop so it becomes masked. - * - * Returns 0 on success, -1 on error. + * Identical to the `get_loop` functions and wraps it. This adds support + * to a boolean mask being passed in as a last, additional, operand. + * The wrapped loop will only be called for unmasked elements. + * (Does not support `move_references` or inner dimensions!) */ NPY_NO_EXPORT int -PyUFunc_DefaultMaskedInnerLoopSelector(PyUFuncObject *ufunc, - PyArray_Descr **dtypes, - PyUFuncGenericFunction *out_innerloop, - NpyAuxData **out_innerloopdata, - int *out_needs_api) +PyArrayMethod_GetMaskedStridedLoop( + PyArrayMethod_Context *context, + int aligned, npy_intp *fixed_strides, + PyArrayMethod_StridedLoop **out_loop, + NpyAuxData **out_transferdata, + NPY_ARRAYMETHOD_FLAGS *flags) { - int retcode; _masked_stridedloop_data *data; - int nargs = ufunc->nin + ufunc->nout; - - if (ufunc->legacy_inner_loop_selector == NULL) { - PyErr_SetString(PyExc_RuntimeError, - "the ufunc default masked inner loop selector doesn't " - "yet support wrapping the new inner loop selector, it " - "still only wraps the legacy inner loop selector"); - return -1; - } + int nargs = context->method->nin + context->method->nout; /* Add working memory for the data pointers, to modify them in-place */ data = PyMem_Malloc(sizeof(_masked_stridedloop_data) + @@ -865,18 +861,14 @@ PyUFunc_DefaultMaskedInnerLoopSelector(PyUFuncObject *ufunc, data->unmasked_stridedloop = NULL; data->nargs = nargs; - /* Get the unmasked ufunc inner loop */ - retcode = ufunc->legacy_inner_loop_selector(ufunc, dtypes, - &data->unmasked_stridedloop, &data->innerloopdata, - out_needs_api); - if (retcode < 0) { - PyArray_free(data); - return retcode; + if (context->method->get_strided_loop(context, + aligned, 0, fixed_strides, + &data->unmasked_stridedloop, &data->unmasked_auxdata, flags) < 0) { + PyMem_Free(data); + return -1; } - - /* Return the loop function + aux data */ - 
*out_innerloop = &unmasked_ufunc_loop_as_masked; - *out_innerloopdata = (NpyAuxData *)data; + *out_transferdata = (NpyAuxData *)data; + *out_loop = generic_masked_strided_loop; return 0; } diff --git a/numpy/core/src/multiarray/array_method.h b/numpy/core/src/multiarray/array_method.h index c2122a2da..fc2304889 100644 --- a/numpy/core/src/multiarray/array_method.h +++ b/numpy/core/src/multiarray/array_method.h @@ -17,6 +17,7 @@ typedef enum { * setup/check. No function should set error flags and ignore them * since it would interfere with chaining operations (e.g. casting). */ + /* TODO: Change this into a positive flag */ NPY_METH_NO_FLOATINGPOINT_ERRORS = 1 << 2, /* Whether the method supports unaligned access (not runtime) */ NPY_METH_SUPPORTS_UNALIGNED = 1 << 3, @@ -158,17 +159,16 @@ npy_default_get_strided_loop( PyArrayMethod_StridedLoop **out_loop, NpyAuxData **out_transferdata, NPY_ARRAYMETHOD_FLAGS *flags); -/* - * TODO: This function will not rely on the current ufunc code after the - * ufunc refactor. - */ -#include "numpy/ufuncobject.h" + NPY_NO_EXPORT int -PyUFunc_DefaultMaskedInnerLoopSelector(PyUFuncObject *ufunc, - PyArray_Descr **dtypes, - PyUFuncGenericFunction *out_innerloop, - NpyAuxData **out_innerloopdata, - int *out_needs_api); +PyArrayMethod_GetMaskedStridedLoop( + PyArrayMethod_Context *context, + int aligned, + npy_intp *fixed_strides, + PyArrayMethod_StridedLoop **out_loop, + NpyAuxData **out_transferdata, + NPY_ARRAYMETHOD_FLAGS *flags); + /* * TODO: This function is the internal version, and its error paths may diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c index a0154e474..98d4f5a75 100644 --- a/numpy/core/src/multiarray/nditer_constr.c +++ b/numpy/core/src/multiarray/nditer_constr.c @@ -449,6 +449,11 @@ NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags, /* * If REFS_OK was specified, check whether there are any * reference arrays and flag it if so. 
+ * + * NOTE: This really should be unnecessary, but chances are someone relies + * on it. The iterator itself does not require the API here + * as it only does so for casting/buffering. But in almost all + * use-cases the API will be required for whatever operation is done. */ if (flags & NPY_ITER_REFS_OK) { for (iop = 0; iop < nop; ++iop) { diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c new file mode 100644 index 000000000..e63780458 --- /dev/null +++ b/numpy/core/src/umath/dispatching.c @@ -0,0 +1,688 @@ +/* + * This file implements universal function dispatching and promotion (which + * is necessary to happen before dispatching). + * This is part of the UFunc object. Promotion and dispatching uses the + * following things: + * + * - operand_DTypes: The datatypes as passed in by the user. + * - signature: The DTypes fixed by the user with `dtype=` or `signature=`. + * - ufunc._loops: A list of all ArrayMethods and promoters, it contains + * tuples `(dtypes, ArrayMethod)` or `(dtypes, promoter)`. + * - ufunc._dispatch_cache: A cache to store previous promotion and/or + * dispatching results. + * - The actual arrays are used to support the old code paths where necessary. + * (this includes any value-based casting/promotion logic) + * + * In general, `operand_Dtypes` is always overridden by `signature`. If a + * DType is included in the `signature` it must match precisely. + * + * The process of dispatching and promotion can be summarized in the following + * steps: + * + * 1. Override any `operand_DTypes` from `signature`. + * 2. Check if the new `operand_Dtypes` is cached (if it is, got to 4.) + * 3. Find the best matching "loop". This is done using multiple dispatching + * on all `operand_DTypes` and loop `dtypes`. A matching loop must be + * one whose DTypes are superclasses of the `operand_DTypes` (that are + * defined). The best matching loop must be better than any other matching + * loop. This result is cached. + * 4. 
If the found loop is a promoter: We call the promoter. It can modify + * the `operand_DTypes` currently. Then go back to step 2. + * (The promoter can call arbitrary code, so it could even add the matching + * loop first.) + * 5. The final `ArrayMethod` is found, its registered `dtypes` is copied + * into the `signature` so that it is available to the ufunc loop. + * + */ +#include <Python.h> + +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "numpy/ndarraytypes.h" +#include "common.h" + +#include "dispatching.h" +#include "dtypemeta.h" +#include "npy_hashtable.h" +#include "legacy_array_method.h" +#include "ufunc_object.h" +#include "ufunc_type_resolution.h" + + +/* forward declaration */ +static NPY_INLINE PyObject * +promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc, + PyArrayObject *const ops[], + PyArray_DTypeMeta *signature[], + PyArray_DTypeMeta *op_dtypes[], + npy_bool allow_legacy_promotion, npy_bool cache); + + +/** + * Function to add a new loop to the ufunc. This mainly appends it to the + * list (as it currently is just a list). + * + * @param ufunc The universal function to add the loop to. + * @param info The tuple (dtype_tuple, ArrayMethod/promoter). + * @param ignore_duplicate If 1 and a loop with the same `dtype_tuple` is + * found, the function does nothing. + */ +static int +add_ufunc_loop(PyUFuncObject *ufunc, PyObject *info, int ignore_duplicate) +{ + /* + * Validate the info object, this should likely move to to a different + * entry-point in the future (and is mostly unnecessary currently). 
+ */ + if (!PyTuple_CheckExact(info) || PyTuple_GET_SIZE(info) != 2) { + PyErr_SetString(PyExc_TypeError, + "Info must be a tuple: " + "(tuple of DTypes or None, ArrayMethod or promoter)"); + return -1; + } + PyObject *DType_tuple = PyTuple_GetItem(info, 0); + if (PyTuple_GET_SIZE(DType_tuple) != ufunc->nargs) { + PyErr_SetString(PyExc_TypeError, + "DType tuple length does not match ufunc number of operands"); + return -1; + } + for (Py_ssize_t i = 0; i < PyTuple_GET_SIZE(DType_tuple); i++) { + PyObject *item = PyTuple_GET_ITEM(DType_tuple, i); + if (item != Py_None + && !PyObject_TypeCheck(item, &PyArrayDTypeMeta_Type)) { + PyErr_SetString(PyExc_TypeError, + "DType tuple may only contain None and DType classes"); + return -1; + } + } + if (!PyObject_TypeCheck(PyTuple_GET_ITEM(info, 1), &PyArrayMethod_Type)) { + /* Must also accept promoters in the future. */ + PyErr_SetString(PyExc_TypeError, + "Second argument to info must be an ArrayMethod or promoter"); + return -1; + } + + if (ufunc->_loops == NULL) { + ufunc->_loops = PyList_New(0); + if (ufunc->_loops == NULL) { + return -1; + } + } + + PyObject *loops = ufunc->_loops; + Py_ssize_t length = PyList_Size(loops); + for (Py_ssize_t i = 0; i < length; i++) { + PyObject *item = PyList_GetItem(loops, i); + PyObject *cur_DType_tuple = PyTuple_GetItem(item, 0); + int cmp = PyObject_RichCompareBool(cur_DType_tuple, DType_tuple, Py_EQ); + if (cmp < 0) { + return -1; + } + if (cmp == 0) { + continue; + } + if (ignore_duplicate) { + return 0; + } + PyErr_Format(PyExc_TypeError, + "A loop/promoter has already been registered with '%s' for %R", + ufunc_get_name_cstr(ufunc), DType_tuple); + return -1; + } + + if (PyList_Append(loops, info) < 0) { + return -1; + } + return 0; +} + + +/** + * Resolves the implementation to use, this uses typical multiple dispatching + * methods of finding the best matching implementation or resolver. 
+ * (Based on `isinstance()`, the knowledge that non-abstract DTypes cannot + * be subclassed is used, however.) + * + * @param ufunc + * @param op_dtypes The DTypes that are either passed in (defined by an + * operand) or defined by the `signature` as also passed in as + * `fixed_DTypes`. + * @param out_info Returns the tuple describing the best implementation + * (consisting of dtypes and ArrayMethod or promoter). + * WARNING: Returns a borrowed reference! + * @returns -1 on error 0 on success. Note that the output can be NULL on + * success if nothing is found. + */ +static int +resolve_implementation_info(PyUFuncObject *ufunc, + PyArray_DTypeMeta *op_dtypes[], PyObject **out_info) +{ + int nin = ufunc->nin, nargs = ufunc->nargs; + Py_ssize_t size = PySequence_Length(ufunc->_loops); + PyObject *best_dtypes = NULL; + PyObject *best_resolver_info = NULL; + + for (Py_ssize_t res_idx = 0; res_idx < size; res_idx++) { + /* Test all resolvers */ + PyObject *resolver_info = PySequence_Fast_GET_ITEM( + ufunc->_loops, res_idx); + PyObject *curr_dtypes = PyTuple_GET_ITEM(resolver_info, 0); + /* + * Test if the current resolver matches, it could make sense to + * reorder these checks to avoid the IsSubclass check as much as + * possible. + */ + + npy_bool matches = NPY_TRUE; + /* + * NOTE: We check also the output DType. In principle we do not + * have to strictly match it (unless it is provided by the + * `signature`). This assumes that a (fallback) promoter will + * unset the output DType if no exact match is found. 
+ */ + for (Py_ssize_t i = 0; i < nargs; i++) { + PyArray_DTypeMeta *given_dtype = op_dtypes[i]; + PyArray_DTypeMeta *resolver_dtype = ( + (PyArray_DTypeMeta *)PyTuple_GET_ITEM(curr_dtypes, i)); + assert((PyObject *)given_dtype != Py_None); + if (given_dtype == NULL && i >= nin) { + /* Unspecified out always matches (see below for inputs) */ + continue; + } + if (given_dtype == resolver_dtype) { + continue; + } + if (!resolver_dtype->abstract) { + matches = NPY_FALSE; + break; + } + if (given_dtype == NULL) { + /* + * If an input was not specified, this is a reduce-like + * operation: reductions use `(operand_DType, NULL, out_DType)` + * as they only have a single operand. This allows special + * reduce promotion rules useful for example for sum/product. + * E.g. `np.add.reduce([True, True])` promotes to integer. + * + * Continuing here allows a promoter to handle reduce-like + * promotions explicitly if necessary. + * TODO: The `!resolver_dtype->abstract` currently ensures that + * this is a promoter. If we allow ArrayMethods to use + * abstract DTypes, we may have to reject it here or the + * ArrayMethod has to implement the reduce promotion. + */ + continue; + } + int subclass = PyObject_IsSubclass( + (PyObject *)given_dtype, (PyObject *)resolver_dtype); + if (subclass < 0) { + return -1; + } + if (!subclass) { + matches = NPY_FALSE; + break; + } + /* + * TODO: Could consider allowing reverse subclass relation, i.e. + * the operation DType passed in to be abstract. That + * definitely is OK for outputs (and potentially useful, + * you could enforce e.g. an inexact result). + * It might also be useful for some stranger promoters. 
+ */ + } + if (!matches) { + continue; + } + + /* The resolver matches, but we have to check if it is better */ + if (best_dtypes != NULL) { + int current_best = -1; /* -1 neither, 0 current best, 1 new */ + /* + * If both have concrete and None in the same position and + * they are identical, we will continue searching using the + * first best for comparison, in an attempt to find a better + * one. + * In all cases, we give up resolution, since it would be + * necessary to compare to two "best" cases. + */ + int unambiguously_equally_good = 1; + for (Py_ssize_t i = 0; i < nargs; i++) { + int best; + + PyObject *prev_dtype = PyTuple_GET_ITEM(best_dtypes, i); + PyObject *new_dtype = PyTuple_GET_ITEM(curr_dtypes, i); + + if (prev_dtype == new_dtype) { + /* equivalent, so this entry does not matter */ + continue; + } + /* + * TODO: Even if the input is not specified, if we have + * abstract DTypes and one is a subclass of the other, + * the subclass should be considered a better match + * (subclasses are always more specific). + */ + /* If either is None, the other is strictly more specific */ + if (prev_dtype == Py_None) { + unambiguously_equally_good = 0; + best = 1; + } + else if (new_dtype == Py_None) { + unambiguously_equally_good = 0; + best = 0; + } + /* + * If both are concrete and not identical, this is + * ambiguous. + */ + else if (!((PyArray_DTypeMeta *)prev_dtype)->abstract && + !((PyArray_DTypeMeta *)new_dtype)->abstract) { + /* + * Ambiguous unless the are identical (checked above), + * but since they are concrete it does not matter which + * best to compare. + */ + best = -1; + } + /* + * TODO: Unreachable, but we will need logic for abstract + * DTypes to decide if one is a subclass of the other + * (And their subclass relation is well defined.) 
+ */ + else { + assert(0); + } + + if ((current_best != -1) && (current_best != best)) { + /* + * We need a clear best, this could be tricky, unless + * the signature is identical, we would have to compare + * against both of the found ones until we find a + * better one. + * Instead, only support the case where they are + * identical. + */ + /* TODO: Document the above comment, may need relaxing? */ + current_best = -1; + break; + } + current_best = best; + } + + if (current_best == -1) { + /* + * TODO: It would be nice to have a "diagnostic mode" that + * informs if this happens! (An immediate error currently + * blocks later legacy resolution, but may work in the + * future.) + */ + if (unambiguously_equally_good) { + /* unset the best resolver to indicate this */ + best_resolver_info = NULL; + continue; + } + *out_info = NULL; + return 0; + } + else if (current_best == 0) { + /* The new match is not better, continue looking. */ + continue; + } + } + /* The new match is better (or there was no previous match) */ + best_dtypes = curr_dtypes; + best_resolver_info = resolver_info; + } + if (best_dtypes == NULL) { + /* The non-legacy lookup failed */ + *out_info = NULL; + return 0; + } + + *out_info = best_resolver_info; + return 0; +} + + +/* + * A promoter can currently be either a C-Capsule containing a promoter + * function pointer, or a Python function. Both of these can at this time + * only return new operation DTypes (i.e. mutate the input while leaving + * those defined by the `signature` unmodified). 
+ */ +static PyObject * +call_promoter_and_recurse( + PyUFuncObject *NPY_UNUSED(ufunc), PyObject *NPY_UNUSED(promoter), + PyArray_DTypeMeta *NPY_UNUSED(op_dtypes[]), + PyArray_DTypeMeta *NPY_UNUSED(signature[]), + PyArrayObject *const NPY_UNUSED(operands[])) +{ + PyErr_SetString(PyExc_NotImplementedError, + "Internal NumPy error, promoters are not used/implemented yet."); + return NULL; +} + + +/* + * Convert the DType `signature` into the tuple of descriptors that is used + * by the old ufunc type resolvers in `ufunc_type_resolution.c`. + * + * Note that we do not need to pass the type tuple when we use the legacy path + * for type resolution rather than promotion, since the signature is always + * correct in that case. + */ +static int +_make_new_typetup( + int nop, PyArray_DTypeMeta *signature[], PyObject **out_typetup) { + *out_typetup = PyTuple_New(nop); + if (*out_typetup == NULL) { + return -1; + } + + int none_count = 0; + for (int i = 0; i < nop; i++) { + PyObject *item; + if (signature[i] == NULL) { + item = Py_None; + none_count++; + } + else { + if (!signature[i]->legacy || signature[i]->abstract) { + /* + * The legacy type resolution can't deal with these. + * This path will return `None` or so in the future to + * set an error later if the legacy type resolution is used. + */ + PyErr_SetString(PyExc_RuntimeError, + "Internal NumPy error: new DType in signature not yet " + "supported. (This should be unreachable code!)"); + Py_SETREF(*out_typetup, NULL); + return -1; + } + item = (PyObject *)signature[i]->singleton; + } + Py_INCREF(item); + PyTuple_SET_ITEM(*out_typetup, i, item); + } + if (none_count == nop) { + /* The whole signature was None, simply ignore type tuple */ + Py_DECREF(*out_typetup); + *out_typetup = NULL; + } + return 0; +} + + +/* + * Fills in the operation_DTypes with borrowed references. This may change + * the content, since it will use the legacy type resolution, which can special + * case 0-D arrays (using value-based logic). 
+ */ +static int +legacy_promote_using_legacy_type_resolver(PyUFuncObject *ufunc, + PyArrayObject *const *ops, PyArray_DTypeMeta *signature[], + PyArray_DTypeMeta *operation_DTypes[], int *out_cacheable) +{ + int nargs = ufunc->nargs; + PyArray_Descr *out_descrs[NPY_MAXARGS] = {NULL}; + + PyObject *type_tuple = NULL; + if (_make_new_typetup(nargs, signature, &type_tuple) < 0) { + return -1; + } + + /* + * We use unsafe casting. This is of course not accurate, but that is OK + * here, because for promotion/dispatching the casting safety makes no + * difference. Whether the actual operands can be casts must be checked + * during the type resolution step (which may _also_ calls this!). + */ + if (ufunc->type_resolver(ufunc, + NPY_UNSAFE_CASTING, (PyArrayObject **)ops, type_tuple, + out_descrs) < 0) { + Py_XDECREF(type_tuple); + return -1; + } + Py_XDECREF(type_tuple); + + for (int i = 0; i < nargs; i++) { + Py_XSETREF(operation_DTypes[i], NPY_DTYPE(out_descrs[i])); + Py_INCREF(operation_DTypes[i]); + Py_DECREF(out_descrs[i]); + } + if (ufunc->type_resolver == &PyUFunc_SimpleBinaryComparisonTypeResolver) { + /* + * In this one case, the deprecation means that we actually override + * the signature. + */ + for (int i = 0; i < nargs; i++) { + if (signature[i] != NULL && signature[i] != operation_DTypes[i]) { + Py_INCREF(operation_DTypes[i]); + Py_SETREF(signature[i], operation_DTypes[i]); + *out_cacheable = 0; + } + } + } + return 0; +} + + +/* + * Note, this function returns a BORROWED references to info since it adds + * it to the loops. 
+ */ +NPY_NO_EXPORT PyObject * +add_and_return_legacy_wrapping_ufunc_loop(PyUFuncObject *ufunc, + PyArray_DTypeMeta *operation_dtypes[], int ignore_duplicate) +{ + PyObject *DType_tuple = PyArray_TupleFromItems(ufunc->nargs, + (PyObject **)operation_dtypes, 0); + if (DType_tuple == NULL) { + return NULL; + } + + PyArrayMethodObject *method = PyArray_NewLegacyWrappingArrayMethod( + ufunc, operation_dtypes); + if (method == NULL) { + Py_DECREF(DType_tuple); + return NULL; + } + PyObject *info = PyTuple_Pack(2, DType_tuple, method); + Py_DECREF(DType_tuple); + Py_DECREF(method); + if (info == NULL) { + return NULL; + } + if (add_ufunc_loop(ufunc, info, ignore_duplicate) < 0) { + Py_DECREF(info); + return NULL; + } + + return info; +} + + +/* + * The main implementation to find the correct DType signature and ArrayMethod + * to use for a ufunc. This function may recurse with `do_legacy_fallback` + * set to False. + * + * If value-based promotion is necessary, this is handled ahead of time by + * `promote_and_get_ufuncimpl`. + */ +static NPY_INLINE PyObject * +promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc, + PyArrayObject *const ops[], + PyArray_DTypeMeta *signature[], + PyArray_DTypeMeta *op_dtypes[], + npy_bool allow_legacy_promotion, npy_bool cache) +{ + /* + * Fetch the dispatching info which consists of the implementation and + * the DType signature tuple. There are three steps: + * + * 1. Check the cache. + * 2. Check all registered loops/promoters to find the best match. + * 3. Fall back to the legacy implementation if no match was found. + */ + PyObject *info = PyArrayIdentityHash_GetItem(ufunc->_dispatch_cache, + (PyObject **)op_dtypes); + if (info != NULL && PyObject_TypeCheck( + PyTuple_GET_ITEM(info, 1), &PyArrayMethod_Type)) { + /* Found the ArrayMethod and NOT a promoter: return it */ + return info; + } + + /* + * If `info == NULL`, the caching failed, repeat using the full resolution + * in `resolve_implementation_info`. 
+ */ + if (info == NULL) { + if (resolve_implementation_info(ufunc, op_dtypes, &info) < 0) { + return NULL; + } + if (info != NULL && PyObject_TypeCheck( + PyTuple_GET_ITEM(info, 1), &PyArrayMethod_Type)) { + /* + * Found the ArrayMethod and NOT promoter. Before returning it + * add it to the cache for faster lookup in the future. + */ + if (cache && PyArrayIdentityHash_SetItem(ufunc->_dispatch_cache, + (PyObject **)op_dtypes, info, 0) < 0) { + return NULL; + } + return info; + } + } + + /* + * At this point `info` is NULL if there is no matching loop, or it is + * a promoter that needs to be used/called: + */ + if (info != NULL) { + PyObject *promoter = PyTuple_GET_ITEM(info, 1); + + info = call_promoter_and_recurse(ufunc, + promoter, op_dtypes, signature, ops); + if (info == NULL && PyErr_Occurred()) { + return NULL; + } + else if (info != NULL) { + return info; + } + } + + /* + * Even using promotion no loop was found. + * Using promotion failed, this should normally be an error. + * However, we need to give the legacy implementation a chance here. + * (it will modify `op_dtypes`). + */ + if (!allow_legacy_promotion || ufunc->type_resolver == NULL || + (ufunc->ntypes == 0 && ufunc->userloops == NULL)) { + /* Already tried or not a "legacy" ufunc (no loop found, return) */ + return NULL; + } + + PyArray_DTypeMeta *new_op_dtypes[NPY_MAXARGS] = {NULL}; + int cacheable = 1; /* TODO: only the comparison deprecation needs this */ + if (legacy_promote_using_legacy_type_resolver(ufunc, + ops, signature, new_op_dtypes, &cacheable) < 0) { + return NULL; + } + info = promote_and_get_info_and_ufuncimpl(ufunc, + ops, signature, new_op_dtypes, NPY_FALSE, cacheable); + for (int i = 0; i < ufunc->nargs; i++) { + Py_XDECREF(new_op_dtypes); + } + return info; +} + + +/** + * The central entry-point for the promotion and dispatching machinery. + * + * It currently may work with the operands (although it would be possible to + * only work with DType (classes/types). 
This is because it has to ensure + * that legacy (value-based promotion) is used when necessary. + * + * @param ufunc The ufunc object, used mainly for the fallback. + * @param ops The array operands (used only for the fallback). + * @param signature As input, the DType signature fixed explicitly by the user. + * The signature is *filled* in with the operation signature we end up + * using. + * @param op_dtypes The operand DTypes (without casting) which are specified + * either by the `signature` or by an `operand`. + * (outputs and the second input can be NULL for reductions). + * NOTE: In some cases, the promotion machinery may currently modify + * these. + * @param force_legacy_promotion If set, we have to use the old type resolution + * to implement value-based promotion/casting. + */ +NPY_NO_EXPORT PyArrayMethodObject * +promote_and_get_ufuncimpl(PyUFuncObject *ufunc, + PyArrayObject *const ops[], + PyArray_DTypeMeta *signature[], + PyArray_DTypeMeta *op_dtypes[], + npy_bool force_legacy_promotion, + npy_bool allow_legacy_promotion) +{ + int nargs = ufunc->nargs; + + /* + * Get the actual DTypes we operate with by mixing the operand array + * ones with the passed signature. + */ + for (int i = 0; i < nargs; i++) { + if (signature[i] != NULL) { + /* + * ignore the operand input, we cannot overwrite signature yet + * since it is fixed (cannot be promoted!) + */ + Py_INCREF(signature[i]); + Py_XSETREF(op_dtypes[i], signature[i]); + assert(i >= ufunc->nin || !signature[i]->abstract); + } + } + + if (force_legacy_promotion) { + /* + * We must use legacy promotion for value-based logic. Call the old + * resolver once up-front to get the "actual" loop dtypes. + * After this (additional) promotion, we can even use normal caching. 
+ */ + int cacheable = 1; /* unused, as we modify the original `op_dtypes` */ + if (legacy_promote_using_legacy_type_resolver(ufunc, + ops, signature, op_dtypes, &cacheable) < 0) { + return NULL; + } + } + + PyObject *info = promote_and_get_info_and_ufuncimpl(ufunc, + ops, signature, op_dtypes, allow_legacy_promotion, NPY_TRUE); + + if (info == NULL) { + if (!PyErr_Occurred()) { + raise_no_loop_found_error(ufunc, (PyObject **)op_dtypes); + } + return NULL; + } + + PyArrayMethodObject *method = (PyArrayMethodObject *)PyTuple_GET_ITEM(info, 1); + + /* Fill `signature` with final DTypes used by the ArrayMethod/inner-loop */ + PyObject *all_dtypes = PyTuple_GET_ITEM(info, 0); + for (int i = 0; i < nargs; i++) { + if (signature[i] == NULL) { + signature[i] = (PyArray_DTypeMeta *)PyTuple_GET_ITEM(all_dtypes, i); + Py_INCREF(signature[i]); + } + else { + assert((PyObject *)signature[i] == PyTuple_GET_ITEM(all_dtypes, i)); + } + } + + return method; +} diff --git a/numpy/core/src/umath/dispatching.h b/numpy/core/src/umath/dispatching.h new file mode 100644 index 000000000..cefad691f --- /dev/null +++ b/numpy/core/src/umath/dispatching.h @@ -0,0 +1,22 @@ +#ifndef _NPY_DISPATCHING_H +#define _NPY_DISPATCHING_H + +#define _UMATHMODULE + +#include <numpy/ufuncobject.h> +#include "array_method.h" + + +NPY_NO_EXPORT PyArrayMethodObject * +promote_and_get_ufuncimpl(PyUFuncObject *ufunc, + PyArrayObject *const ops[], + PyArray_DTypeMeta *signature[], + PyArray_DTypeMeta *op_dtypes[], + npy_bool force_legacy_promotion, + npy_bool allow_legacy_promotion); + +NPY_NO_EXPORT PyObject * +add_and_return_legacy_wrapping_ufunc_loop(PyUFuncObject *ufunc, + PyArray_DTypeMeta *operation_dtypes[], int ignore_duplicate); + +#endif /*_NPY_DISPATCHING_H */ diff --git a/numpy/core/src/umath/legacy_array_method.c b/numpy/core/src/umath/legacy_array_method.c new file mode 100644 index 000000000..e5043aa71 --- /dev/null +++ b/numpy/core/src/umath/legacy_array_method.c @@ -0,0 +1,257 @@ +/* + * This 
file defines most of the machinery in order to wrap legacy style + * ufunc loops into new style arraymethods. + */ + +#include <Python.h> + +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#include "numpy/ndarraytypes.h" + +#include "convert_datatype.h" +#include "array_method.h" +#include "dtype_transfer.h" +#include "legacy_array_method.h" + + +typedef struct { + NpyAuxData base; + /* The legacy loop and additional user data: */ + PyUFuncGenericFunction loop; + void *user_data; + /* Whether to check for PyErr_Occurred(), must require GIL if used */ + int pyerr_check; +} legacy_array_method_auxdata; + + +/* Use a free list, since we should normally only need one at a time */ +#define NPY_LOOP_DATA_CACHE_SIZE 5 +static int loop_data_num_cached = 0; +static legacy_array_method_auxdata *loop_data_cache[NPY_LOOP_DATA_CACHE_SIZE]; + + +static void +legacy_array_method_auxdata_free(NpyAuxData *data) +{ + if (loop_data_num_cached < NPY_LOOP_DATA_CACHE_SIZE) { + loop_data_cache[loop_data_num_cached] = ( + (legacy_array_method_auxdata *)data); + loop_data_num_cached++; + } + else { + PyMem_Free(data); + } +} + +#undef NPY_LOOP_DATA_CACHE_SIZE + + +NpyAuxData * +get_new_loop_data( + PyUFuncGenericFunction loop, void *user_data, int pyerr_check) +{ + legacy_array_method_auxdata *data; + if (NPY_LIKELY(loop_data_num_cached > 0)) { + loop_data_num_cached--; + data = loop_data_cache[loop_data_num_cached]; + } + else { + data = PyMem_Malloc(sizeof(legacy_array_method_auxdata)); + if (data == NULL) { + return NULL; + } + data->base.free = legacy_array_method_auxdata_free; + data->base.clone = NULL; /* no need for cloning (at least for now) */ + } + data->loop = loop; + data->user_data = user_data; + data->pyerr_check = pyerr_check; + return (NpyAuxData *)data; +} + + +/* + * This is a thin wrapper around the legacy loop signature. 
+ */ +static int +generic_wrapped_legacy_loop(PyArrayMethod_Context *NPY_UNUSED(context), + char *const *data, const npy_intp *dimensions, const npy_intp *strides, + NpyAuxData *auxdata) +{ + legacy_array_method_auxdata *ldata = (legacy_array_method_auxdata *)auxdata; + + ldata->loop((char **)data, dimensions, strides, ldata->user_data); + if (ldata->pyerr_check && PyErr_Occurred()) { + return -1; + } + return 0; +} + + +/* + * Signal that the old type-resolution function must be used to resolve + * the descriptors (mainly/only used for datetimes due to the unit). + * + * ArrayMethod's are expected to implement this, but it is too tricky + * to support properly. So we simply set an error that should never be seen. + */ +NPY_NO_EXPORT NPY_CASTING +wrapped_legacy_resolve_descriptors(PyArrayMethodObject *NPY_UNUSED(self), + PyArray_DTypeMeta *NPY_UNUSED(dtypes[]), + PyArray_Descr *NPY_UNUSED(given_descrs[]), + PyArray_Descr *NPY_UNUSED(loop_descrs[])) +{ + PyErr_SetString(PyExc_RuntimeError, + "cannot use legacy wrapping ArrayMethod without calling the ufunc " + "itself. If this error is hit, the solution will be to port the " + "legacy ufunc loop implementation to the new API."); + return -1; +} + +/* + * Much the same as the default type resolver, but tries a bit harder to + * preserve metadata. 
+ */ +static NPY_CASTING +simple_legacy_resolve_descriptors( + PyArrayMethodObject *method, + PyArray_DTypeMeta **dtypes, + PyArray_Descr **given_descrs, + PyArray_Descr **output_descrs) +{ + int nin = method->nin; + int nout = method->nout; + + for (int i = 0; i < nin + nout; i++) { + if (given_descrs[i] != NULL) { + output_descrs[i] = ensure_dtype_nbo(given_descrs[i]); + } + else if (dtypes[i] == dtypes[0] && i > 0) { + /* Preserve metadata from the first operand if same dtype */ + Py_INCREF(output_descrs[0]); + output_descrs[i] = output_descrs[0]; + } + else { + output_descrs[i] = dtypes[i]->default_descr(dtypes[i]); + } + if (output_descrs[i] == NULL) { + goto fail; + } + } + + return NPY_SAFE_CASTING; + + fail: + for (int i = 0; i < nin + nout; i++) { + Py_CLEAR(output_descrs[i]); + } + return -1; +} + + +/* + * This function grabs the legacy inner-loop. If this turns out to be slow + * we could probably cache it (with some care). + */ +NPY_NO_EXPORT int +get_wrapped_legacy_ufunc_loop(PyArrayMethod_Context *context, + int aligned, int move_references, + npy_intp *NPY_UNUSED(strides), + PyArrayMethod_StridedLoop **out_loop, + NpyAuxData **out_transferdata, + NPY_ARRAYMETHOD_FLAGS *flags) +{ + assert(aligned); + assert(!move_references); + + if (context->caller == NULL || + !PyObject_TypeCheck(context->caller, &PyUFunc_Type)) { + PyErr_Format(PyExc_RuntimeError, + "cannot call %s without its ufunc as caller context.", + context->method->name); + return -1; + } + + PyUFuncObject *ufunc = (PyUFuncObject *)context->caller; + void *user_data; + int needs_api = 0; + + PyUFuncGenericFunction loop = NULL; + /* Note that `needs_api` is not reliable (it was in fact unused normally) */ + if (ufunc->legacy_inner_loop_selector(ufunc, + context->descriptors, &loop, &user_data, &needs_api) < 0) { + return -1; + } + *flags = context->method->flags & NPY_METH_RUNTIME_FLAGS; + if (needs_api) { + *flags |= NPY_METH_REQUIRES_PYAPI; + } + + *out_loop = &generic_wrapped_legacy_loop; 
+ *out_transferdata = get_new_loop_data( + loop, user_data, (*flags & NPY_METH_REQUIRES_PYAPI) != 0); + return 0; +} + + +/* + * Get the unbound ArrayMethod which wraps the instances of the ufunc. + * Note that this function stores the result on the ufunc and then only + * returns the same one. + */ +NPY_NO_EXPORT PyArrayMethodObject * +PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc, + PyArray_DTypeMeta *signature[]) +{ + char method_name[101]; + const char *name = ufunc->name ? ufunc->name : "<unknown>"; + snprintf(method_name, 100, "legacy_ufunc_wrapper_for_%s", name); + + /* + * Assume that we require the Python API when any of the (legacy) dtypes + * flags it. + */ + int any_output_flexible = 0; + NPY_ARRAYMETHOD_FLAGS flags = 0; + + for (int i = 0; i < ufunc->nin+ufunc->nout; i++) { + if (signature[i]->singleton->flags & ( + NPY_ITEM_REFCOUNT | NPY_ITEM_IS_POINTER | NPY_NEEDS_PYAPI)) { + flags |= NPY_METH_REQUIRES_PYAPI; + } + if (signature[i]->parametric) { + any_output_flexible = 1; + } + } + + PyType_Slot slots[3] = { + {NPY_METH_get_loop, &get_wrapped_legacy_ufunc_loop}, + {NPY_METH_resolve_descriptors, &simple_legacy_resolve_descriptors}, + {0, NULL}, + }; + if (any_output_flexible) { + /* We cannot use the default descriptor resolver. 
*/ + slots[1].pfunc = &wrapped_legacy_resolve_descriptors; + } + + PyArrayMethod_Spec spec = { + .name = method_name, + .nin = ufunc->nin, + .nout = ufunc->nout, + .dtypes = signature, + .flags = flags, + .slots = slots, + .casting = NPY_EQUIV_CASTING, + }; + + PyBoundArrayMethodObject *bound_res = PyArrayMethod_FromSpec_int(&spec, 1); + if (bound_res == NULL) { + return NULL; + } + PyArrayMethodObject *res = bound_res->method; + Py_INCREF(res); + Py_DECREF(bound_res); + return res; +} diff --git a/numpy/core/src/umath/legacy_array_method.h b/numpy/core/src/umath/legacy_array_method.h new file mode 100644 index 000000000..0dec1fb3a --- /dev/null +++ b/numpy/core/src/umath/legacy_array_method.h @@ -0,0 +1,33 @@ +#ifndef _NPY_LEGACY_ARRAY_METHOD_H +#define _NPY_LEGACY_ARRAY_METHOD_H + +#include "numpy/ndarraytypes.h" +#include "numpy/ufuncobject.h" +#include "array_method.h" + + +NPY_NO_EXPORT PyArrayMethodObject * +PyArray_NewLegacyWrappingArrayMethod(PyUFuncObject *ufunc, + PyArray_DTypeMeta *signature[]); + + + +/* + * The following two symbols are in the header so that other places can use + * them to probe for special cases (or whether an ArrayMethod is a "legacy" + * one). 
+ */ +NPY_NO_EXPORT int +get_wrapped_legacy_ufunc_loop(PyArrayMethod_Context *context, + int aligned, int move_references, + npy_intp *NPY_UNUSED(strides), + PyArrayMethod_StridedLoop **out_loop, + NpyAuxData **out_transferdata, + NPY_ARRAYMETHOD_FLAGS *flags); + +NPY_NO_EXPORT NPY_CASTING +wrapped_legacy_resolve_descriptors(PyArrayMethodObject *, + PyArray_DTypeMeta **, PyArray_Descr **, PyArray_Descr **); + + +#endif /*_NPY_LEGACY_ARRAY_METHOD_H */ diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 60a315f6e..5a32ae603 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -41,6 +41,7 @@ #include "ufunc_type_resolution.h" #include "reduction.h" #include "mem_overlap.h" +#include "npy_hashtable.h" #include "ufunc_object.h" #include "override.h" @@ -49,7 +50,10 @@ #include "common.h" #include "dtypemeta.h" #include "numpyos.h" +#include "dispatching.h" #include "convert_datatype.h" +#include "legacy_array_method.h" +#include "abstractdtypes.h" /********** PRINTF DEBUG TRACING **************/ #define NPY_UF_DBG_TRACING 0 @@ -101,6 +105,12 @@ _get_wrap_prepare_args(ufunc_full_args full_args) { static PyObject * prepare_input_arguments_for_outer(PyObject *args, PyUFuncObject *ufunc); +static int +resolve_descriptors(int nop, + PyUFuncObject *ufunc, PyArrayMethodObject *ufuncimpl, + PyArrayObject *operands[], PyArray_Descr *dtypes[], + PyArray_DTypeMeta *signature[], NPY_CASTING casting); + /*UFUNC_API*/ NPY_NO_EXPORT int @@ -911,7 +921,9 @@ _wheremask_converter(PyObject *obj, PyArrayObject **wheremask) */ static int convert_ufunc_arguments(PyUFuncObject *ufunc, - ufunc_full_args full_args, PyArrayObject **out_op, + ufunc_full_args full_args, PyArrayObject *out_op[], + PyArray_DTypeMeta *out_op_DTypes[], + npy_bool *force_legacy_promotion, npy_bool *allow_legacy_promotion, PyObject *order_obj, NPY_ORDER *out_order, PyObject *casting_obj, NPY_CASTING *out_casting, PyObject *subok_obj, 
npy_bool *out_subok, @@ -924,21 +936,55 @@ convert_ufunc_arguments(PyUFuncObject *ufunc, PyObject *obj; /* Convert and fill in input arguments */ + npy_bool all_scalar = NPY_TRUE; + npy_bool any_scalar = NPY_FALSE; + *allow_legacy_promotion = NPY_TRUE; + *force_legacy_promotion = NPY_FALSE; for (int i = 0; i < nin; i++) { obj = PyTuple_GET_ITEM(full_args.in, i); if (PyArray_Check(obj)) { - PyArrayObject *obj_a = (PyArrayObject *)obj; - out_op[i] = (PyArrayObject *)PyArray_FromArray(obj_a, NULL, 0); + out_op[i] = (PyArrayObject *)obj; + Py_INCREF(out_op[i]); } else { - out_op[i] = (PyArrayObject *)PyArray_FromAny(obj, - NULL, 0, 0, 0, NULL); + /* Convert the input to an array and check for special cases */ + out_op[i] = (PyArrayObject *)PyArray_FromAny(obj, NULL, 0, 0, 0, NULL); + if (out_op[i] == NULL) { + goto fail; + } } + out_op_DTypes[i] = NPY_DTYPE(PyArray_DESCR(out_op[i])); + Py_INCREF(out_op_DTypes[i]); - if (out_op[i] == NULL) { - goto fail; + if (!out_op_DTypes[i]->legacy) { + *allow_legacy_promotion = NPY_FALSE; + } + if (PyArray_NDIM(out_op[i]) == 0) { + any_scalar = NPY_TRUE; + } + else { + all_scalar = NPY_FALSE; + continue; } + /* + * TODO: we need to special case scalars here, if the input is a + * Python int, float, or complex, we have to use the "weak" + * DTypes: `PyArray_PyIntAbstractDType`, etc. + * This is to allow e.g. `float32(1.) + 1` to return `float32`. + * The correct array dtype can only be found after promotion for + * such a "weak scalar". We could avoid conversion here, but + * must convert it for use in the legacy promotion. + * There is still a small chance that this logic can instead + * happen inside the Python operators. + */ + } + if (*allow_legacy_promotion && (!all_scalar && any_scalar)) { + *force_legacy_promotion = should_use_min_scalar(nin, out_op, 0, NULL); + /* + * TODO: if this is False, we end up in a "very slow" path that should + * be avoided. This makes `int_arr + 0.` ~40% slower. 
+ */ } /* Convert and fill in output arguments */ @@ -948,6 +994,10 @@ convert_ufunc_arguments(PyUFuncObject *ufunc, if (_set_out_array(obj, out_op + i + nin) < 0) { goto fail; } + if (out_op[i] != NULL) { + out_op_DTypes[i + nin] = NPY_DTYPE(PyArray_DESCR(out_op[i])); + Py_INCREF(out_op_DTypes[i + nin]); + } } } @@ -991,11 +1041,11 @@ fail: * -1 if there is an error. */ static int -check_for_trivial_loop(PyUFuncObject *ufunc, +check_for_trivial_loop(PyArrayMethodObject *ufuncimpl, PyArrayObject **op, PyArray_Descr **dtypes, - npy_intp buffersize) + NPY_CASTING casting, npy_intp buffersize) { - int i, nin = ufunc->nin, nop = nin + ufunc->nout; + int i, nin = ufuncimpl->nin, nop = nin + ufuncimpl->nout; for (i = 0; i < nop; ++i) { /* @@ -1017,6 +1067,10 @@ check_for_trivial_loop(PyUFuncObject *ufunc, if (!(safety & _NPY_CAST_IS_VIEW)) { must_copy = 1; } + + if (PyArray_MinCastSafety(safety, casting) != casting) { + return 0; /* the cast is not safe enough */ + } } if (must_copy) { /* @@ -1132,14 +1186,15 @@ prepare_ufunc_output(PyUFuncObject *ufunc, * * Returns -2 if a trivial loop is not possible, 0 on success and -1 on error. 
*/ -static NPY_INLINE int -try_trivial_single_output_loop(PyUFuncObject *ufunc, - PyArrayObject *op[], PyArray_Descr *dtypes[], - NPY_ORDER order, PyObject *arr_prep[], ufunc_full_args full_args) +static int +try_trivial_single_output_loop(PyArrayMethod_Context *context, + PyArrayObject *op[], NPY_ORDER order, + PyObject *arr_prep[], ufunc_full_args full_args, + int errormask, PyObject *extobj) { - int nin = ufunc->nin; + int nin = context->method->nin; int nop = nin + 1; - assert(ufunc->nout == 1); + assert(context->method->nout == 1); /* The order of all N-D contiguous operands, can be fixed by `order` */ int operation_order = 0; @@ -1204,14 +1259,14 @@ try_trivial_single_output_loop(PyUFuncObject *ufunc, } if (op[nin] == NULL) { - Py_INCREF(dtypes[nin]); + Py_INCREF(context->descriptors[nin]); op[nin] = (PyArrayObject *) PyArray_NewFromDescr(&PyArray_Type, - dtypes[nin], operation_ndim, operation_shape, + context->descriptors[nin], operation_ndim, operation_shape, NULL, NULL, operation_order==NPY_ARRAY_F_CONTIGUOUS, NULL); if (op[nin] == NULL) { return -1; } - fixed_strides[nin] = dtypes[nin]->elsize; + fixed_strides[nin] = context->descriptors[nin]->elsize; } else { /* If any input overlaps with the output, we use the full path. 
*/ @@ -1232,7 +1287,7 @@ try_trivial_single_output_loop(PyUFuncObject *ufunc, } /* Call the __prepare_array__ if necessary */ - if (prepare_ufunc_output(ufunc, &op[nin], + if (prepare_ufunc_output((PyUFuncObject *)context->caller, &op[nin], arr_prep[0], full_args, 0) < 0) { return -1; } @@ -1243,46 +1298,88 @@ try_trivial_single_output_loop(PyUFuncObject *ufunc, */ char *data[NPY_MAXARGS]; npy_intp count = PyArray_MultiplyList(operation_shape, operation_ndim); - int needs_api = 0; NPY_BEGIN_THREADS_DEF; - PyUFuncGenericFunction innerloop; - void *innerloopdata = NULL; - if (ufunc->legacy_inner_loop_selector(ufunc, dtypes, - &innerloop, &innerloopdata, &needs_api) < 0) { + PyArrayMethod_StridedLoop *strided_loop; + NpyAuxData *auxdata = NULL; + NPY_ARRAYMETHOD_FLAGS flags = 0; + if (context->method->get_strided_loop(context, + 1, 0, fixed_strides, + &strided_loop, &auxdata, &flags) < 0) { return -1; } - - for (int iop = 0; iop < nop; iop++) { + for (int iop=0; iop < nop; iop++) { data[iop] = PyArray_BYTES(op[iop]); - needs_api |= PyDataType_REFCHK(dtypes[iop]); } - if (!needs_api) { + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + npy_clear_floatstatus_barrier((char *)context); + } + if (!(flags & NPY_METH_REQUIRES_PYAPI)) { NPY_BEGIN_THREADS_THRESHOLDED(count); } - innerloop(data, &count, fixed_strides, innerloopdata); + int res = strided_loop(context, data, &count, fixed_strides, auxdata); NPY_END_THREADS; + NPY_AUXDATA_FREE(auxdata); + + if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* NOTE: We could check float errors even when `res < 0` */ + const char *name = ufunc_get_name_cstr((PyUFuncObject *)context->caller); + res = _check_ufunc_fperr(errormask, extobj, name); + } + return res; +} + + +/* + * Check casting: It would be nice to just move this into the iterator + * or pass in the full cast information. But this can special case + * the logical functions and prints a better error message. 
+ */ +static NPY_INLINE int +validate_casting(PyArrayMethodObject *method, PyUFuncObject *ufunc, + PyArrayObject *ops[], PyArray_Descr *descriptors[], + NPY_CASTING casting) +{ + if (method->resolve_descriptors == &wrapped_legacy_resolve_descriptors) { + /* + * In this case the legacy type resolution was definitely called + * and we do not need to check (astropy/pyerfa relied on this). + */ + return 0; + } + if (PyUFunc_ValidateCasting(ufunc, casting, ops, descriptors) < 0) { + return -1; + } return 0; } +/* + * The ufunc loop implementation for both normal ufunc calls and masked calls + * when the iterator has to be used. + * + * See `PyUFunc_GenericFunctionInternal` for more information (where this is + * called from). + */ static int -execute_ufunc_loop(PyUFuncObject *ufunc, - int masked, - PyArrayObject **op, - PyArray_Descr **dtypes, - NPY_ORDER order, - npy_intp buffersize, - PyObject **arr_prep, - ufunc_full_args full_args, - npy_uint32 *op_flags) +execute_ufunc_loop(PyArrayMethod_Context *context, int masked, + PyArrayObject **op, NPY_ORDER order, npy_intp buffersize, + NPY_CASTING casting, + PyObject **arr_prep, ufunc_full_args full_args, + npy_uint32 *op_flags, int errormask, PyObject *extobj) { - int nin = ufunc->nin, nout = ufunc->nout; + PyUFuncObject *ufunc = (PyUFuncObject *)context->caller; + int nin = context->method->nin, nout = context->method->nout; int nop = nin + nout; + if (validate_casting(context->method, + ufunc, op, context->descriptors, casting) < 0) { + return -1; + } + if (masked) { assert(PyArray_TYPE(op[nop]) == NPY_BOOL); if (ufunc->_always_null_previously_masked_innerloop_selector != NULL) { @@ -1345,7 +1442,7 @@ execute_ufunc_loop(PyUFuncObject *ufunc, NpyIter *iter = NpyIter_AdvancedNew(nop + masked, op, iter_flags, order, NPY_UNSAFE_CASTING, - op_flags, dtypes, + op_flags, context->descriptors, -1, NULL, NULL, buffersize); if (iter == NULL) { return -1; @@ -1410,22 +1507,25 @@ execute_ufunc_loop(PyUFuncObject *ufunc, } /* - * 
Get the inner loop. + * Get the inner loop, with the possibility of specialization + * based on the fixed strides. */ - int needs_api = 0; - PyUFuncGenericFunction innerloop; - void *innerloopdata = NULL; + PyArrayMethod_StridedLoop *strided_loop; + NpyAuxData *auxdata; + npy_intp fixed_strides[NPY_MAXARGS]; + + NpyIter_GetInnerFixedStrideArray(iter, fixed_strides); + NPY_ARRAYMETHOD_FLAGS flags = 0; if (masked) { - if (PyUFunc_DefaultMaskedInnerLoopSelector(ufunc, - dtypes, &innerloop, (NpyAuxData **)&innerloopdata, - &needs_api) < 0) { + if (PyArrayMethod_GetMaskedStridedLoop(context, + 1, fixed_strides, &strided_loop, &auxdata, &flags) < 0) { NpyIter_Deallocate(iter); return -1; } } else { - if (ufunc->legacy_inner_loop_selector(ufunc, dtypes, - &innerloop, &innerloopdata, &needs_api) < 0) { + if (context->method->get_strided_loop(context, + 1, 0, fixed_strides, &strided_loop, &auxdata, &flags) < 0) { NpyIter_Deallocate(iter); return -1; } @@ -1434,87 +1534,45 @@ execute_ufunc_loop(PyUFuncObject *ufunc, /* Get the variables needed for the loop */ NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); if (iternext == NULL) { + NPY_AUXDATA_FREE(auxdata); NpyIter_Deallocate(iter); - if (masked) { - NPY_AUXDATA_FREE((NpyAuxData *)innerloopdata); - } return -1; } char **dataptr = NpyIter_GetDataPtrArray(iter); npy_intp *strides = NpyIter_GetInnerStrideArray(iter); npy_intp *countptr = NpyIter_GetInnerLoopSizePtr(iter); - needs_api |= NpyIter_IterationNeedsAPI(iter); + int needs_api = NpyIter_IterationNeedsAPI(iter); NPY_BEGIN_THREADS_DEF; - if (!needs_api) { + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + npy_clear_floatstatus_barrier((char *)context); + } + if (!needs_api && !(flags & NPY_METH_REQUIRES_PYAPI)) { NPY_BEGIN_THREADS_THRESHOLDED(full_size); } NPY_UF_DBG_PRINT("Actual inner loop:\n"); /* Execute the loop */ + int res; do { - NPY_UF_DBG_PRINT1("iterator loop count %d\n", (int)*count_ptr); - innerloop(dataptr, countptr, strides, 
innerloopdata); - } while (!(needs_api && PyErr_Occurred()) && iternext(iter)); + NPY_UF_DBG_PRINT1("iterator loop count %d\n", (int)*countptr); + res = strided_loop(context, dataptr, countptr, strides, auxdata); + } while (res == 0 && iternext(iter)); NPY_END_THREADS; - if (masked) { - NPY_AUXDATA_FREE((NpyAuxData *)innerloopdata); - } + NPY_AUXDATA_FREE(auxdata); - /* - * Currently `innerloop` may leave an error set, in this case - * NpyIter_Deallocate will always return an error as well. - */ - if (NpyIter_Deallocate(iter) == NPY_FAIL) { - return -1; + if (res == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* NOTE: We could check float errors even when `res < 0` */ + const char *name = ufunc_get_name_cstr((PyUFuncObject *)context->caller); + res = _check_ufunc_fperr(errormask, extobj, name); } - return 0; -} -/* - * ufunc - the ufunc to call - * trivial_loop_ok - 1 if no alignment, data conversion, etc required - * op - the operands (ufunc->nin + ufunc->nout of them) - * dtypes - the dtype of each operand - * order - the loop execution order/output memory order - * buffersize - how big of a buffer to use - * arr_prep - the __array_prepare__ functions for the outputs - * full_args - the original input, output PyObject * - * op_flags - per-operand flags, a combination of NPY_ITER_* constants - */ -static int -execute_legacy_ufunc_loop(PyUFuncObject *ufunc, - int trivial_loop_ok, - PyArrayObject **op, - PyArray_Descr **dtypes, - NPY_ORDER order, - npy_intp buffersize, - PyObject **arr_prep, - ufunc_full_args full_args, - npy_uint32 *op_flags) -{ - /* First check for the trivial cases that don't need an iterator */ - if (trivial_loop_ok && ufunc->nout == 1) { - int fast_path_result = try_trivial_single_output_loop(ufunc, - op, dtypes, order, arr_prep, full_args); - if (fast_path_result != -2) { - return fast_path_result; - } - } - - /* - * If no trivial loop matched, an iterator is required to - * resolve broadcasting, etc - */ - NPY_UF_DBG_PRINT("iterator 
loop\n"); - if (execute_ufunc_loop(ufunc, 0, op, dtypes, order, - buffersize, arr_prep, full_args, op_flags) < 0) { + if (!NpyIter_Deallocate(iter)) { return -1; } - - return 0; + return res; } @@ -2007,9 +2065,9 @@ _initialize_variable_parts(PyUFuncObject *ufunc, static int PyUFunc_GeneralizedFunctionInternal(PyUFuncObject *ufunc, - PyArray_Descr *operation_descrs[], + PyArrayMethodObject *ufuncimpl, PyArray_Descr *operation_descrs[], PyArrayObject *op[], PyObject *extobj, - NPY_ORDER order, + NPY_CASTING casting, NPY_ORDER order, PyObject *axis, PyObject *axes, int keepdims) { int nin, nout; @@ -2034,13 +2092,12 @@ PyUFunc_GeneralizedFunctionInternal(PyUFuncObject *ufunc, /* These parameters come from extobj= or from a TLS global */ int buffersize = 0, errormask = 0; - /* The selected inner loop */ - PyUFuncGenericFunction innerloop = NULL; - void *innerloopdata = NULL; /* The dimensions which get passed to the inner loop */ npy_intp inner_dimensions[NPY_MAXDIMS+1]; /* The strides which get passed to the inner loop */ npy_intp *inner_strides = NULL; + /* Auxiliary data allocated by the ufuncimpl (ArrayMethod) */ + NpyAuxData *auxdata = NULL; /* The sizes of the core dimensions (# entries is ufunc->core_num_dim_ix) */ npy_intp *core_dim_sizes = inner_dimensions + 1; @@ -2057,6 +2114,11 @@ PyUFunc_GeneralizedFunctionInternal(PyUFuncObject *ufunc, NPY_UF_DBG_PRINT1("\nEvaluating ufunc %s\n", ufunc_name); + if (validate_casting(ufuncimpl, + ufunc, op, operation_descrs, casting) < 0) { + return -1; + } + /* Initialize possibly variable parts to the values from the ufunc */ retval = _initialize_variable_parts(ufunc, op_core_num_dims, core_dim_sizes, core_dim_flags); @@ -2274,18 +2336,11 @@ PyUFunc_GeneralizedFunctionInternal(PyUFuncObject *ufunc, NPY_ITER_WRITEONLY | NPY_UFUNC_DEFAULT_OUTPUT_FLAGS, op_flags); - /* For the generalized ufunc, we get the loop right away too */ - retval = ufunc->legacy_inner_loop_selector(ufunc, - operation_descrs, &innerloop, 
&innerloopdata, &needs_api); - if (retval < 0) { - goto fail; - } /* * Set up the iterator per-op flags. For generalized ufuncs, we * can't do buffering, so must COPY or UPDATEIFCOPY. */ - iter_flags = ufunc->iter_flags | NPY_ITER_MULTI_INDEX | NPY_ITER_REFS_OK | @@ -2394,21 +2449,34 @@ PyUFunc_GeneralizedFunctionInternal(PyUFuncObject *ufunc, /* * The first nop strides are for the inner loop (but only can - * copy them after removing the core axes) + * copy them after removing the core axes). The strides will not change + * if the iterator is not buffered (they are effectively fixed). + * Supporting buffering would make sense, but probably would have to be + * done in the inner-loop itself (not the iterator). */ + assert(!NpyIter_IsBuffered(iter)); memcpy(inner_strides, NpyIter_GetInnerStrideArray(iter), NPY_SIZEOF_INTP * nop); -#if 0 - printf("strides: "); - for (i = 0; i < nop+core_dim_ixs_size; ++i) { - printf("%d ", (int)inner_strides[i]); + /* Final preparation of the arraymethod call */ + PyArrayMethod_Context context = { + .caller = (PyObject *)ufunc, + .method = ufuncimpl, + .descriptors = operation_descrs, + }; + PyArrayMethod_StridedLoop *strided_loop; + NPY_ARRAYMETHOD_FLAGS flags = 0; + + if (ufuncimpl->get_strided_loop(&context, 1, 0, inner_strides, + &strided_loop, &auxdata, &flags) < 0) { + goto fail; + } + needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0; + needs_api |= NpyIter_IterationNeedsAPI(iter); + if (!(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* Start with the floating-point exception flags cleared */ + npy_clear_floatstatus_barrier((char*)&iter); } - printf("\n"); -#endif - - /* Start with the floating-point exception flags cleared */ - npy_clear_floatstatus_barrier((char*)&iter); NPY_UF_DBG_PRINT("Executing inner loop\n"); @@ -2427,29 +2495,28 @@ PyUFunc_GeneralizedFunctionInternal(PyUFuncObject *ufunc, } dataptr = NpyIter_GetDataPtrArray(iter); count_ptr = NpyIter_GetInnerLoopSizePtr(iter); - needs_api = 
NpyIter_IterationNeedsAPI(iter); - if (!needs_api && !NpyIter_IterationNeedsAPI(iter)) { + if (!needs_api) { NPY_BEGIN_THREADS_THRESHOLDED(total_problem_size); } do { inner_dimensions[0] = *count_ptr; - innerloop(dataptr, inner_dimensions, inner_strides, innerloopdata); - } while (!(needs_api && PyErr_Occurred()) && iternext(iter)); + retval = strided_loop(&context, + dataptr, inner_dimensions, inner_strides, auxdata); + } while (retval == 0 && iternext(iter)); if (!needs_api && !NpyIter_IterationNeedsAPI(iter)) { NPY_END_THREADS; } } - /* Check whether any errors occurred during the loop */ - if (PyErr_Occurred() || - _check_ufunc_fperr(errormask, extobj, ufunc_name) < 0) { - retval = -1; - goto fail; + if (retval == 0 && !(flags & NPY_METH_NO_FLOATINGPOINT_ERRORS)) { + /* NOTE: We could check float errors even when `res < 0` */ + retval = _check_ufunc_fperr(errormask, extobj, ufunc_name); } PyArray_free(inner_strides); + NPY_AUXDATA_FREE(auxdata); if (NpyIter_Deallocate(iter) < 0) { retval = -1; } @@ -2464,6 +2531,7 @@ PyUFunc_GeneralizedFunctionInternal(PyUFuncObject *ufunc, fail: NPY_UF_DBG_PRINT1("Returning failure code %d\n", retval); PyArray_free(inner_strides); + NPY_AUXDATA_FREE(auxdata); NpyIter_Deallocate(iter); PyArray_free(remap_axis_memory); PyArray_free(remap_axis); @@ -2473,17 +2541,18 @@ fail: static int PyUFunc_GenericFunctionInternal(PyUFuncObject *ufunc, - PyArray_Descr *operation_descrs[], - PyArrayObject *op[], PyObject *extobj, NPY_ORDER order, + PyArrayMethodObject *ufuncimpl, PyArray_Descr *operation_descrs[], + PyArrayObject *op[], PyObject *extobj, + NPY_CASTING casting, NPY_ORDER order, PyObject *output_array_prepare[], ufunc_full_args full_args, PyArrayObject *wheremask) { int nin = ufunc->nin, nout = ufunc->nout, nop = nin + nout; - const char *ufunc_name = ufunc_name = ufunc_get_name_cstr(ufunc);; - int retval = -1; - npy_uint32 op_flags[NPY_MAXARGS]; + const char *ufunc_name = ufunc_get_name_cstr(ufunc); + npy_intp 
default_op_out_flags; + npy_uint32 op_flags[NPY_MAXARGS]; /* These parameters come from extobj= or from a TLS global */ int buffersize = 0, errormask = 0; @@ -2495,8 +2564,6 @@ PyUFunc_GenericFunctionInternal(PyUFuncObject *ufunc, return -1; } - NPY_UF_DBG_PRINT("Finding inner loop\n"); - if (wheremask != NULL) { /* Set up the flags. */ default_op_out_flags = NPY_ITER_NO_SUBTYPE | @@ -2513,6 +2580,13 @@ PyUFunc_GenericFunctionInternal(PyUFuncObject *ufunc, default_op_out_flags, op_flags); } + /* Final preparation of the arraymethod call */ + PyArrayMethod_Context context = { + .caller = (PyObject *)ufunc, + .method = ufuncimpl, + .descriptors = operation_descrs, + }; + /* Do the ufunc loop */ if (wheremask != NULL) { NPY_UF_DBG_PRINT("Executing masked inner loop\n"); @@ -2525,52 +2599,38 @@ PyUFunc_GenericFunctionInternal(PyUFuncObject *ufunc, op[nop] = wheremask; operation_descrs[nop] = NULL; - /* Set up the flags */ - - npy_clear_floatstatus_barrier((char*)&ufunc); - retval = execute_ufunc_loop(ufunc, 1, - op, operation_descrs, order, - buffersize, output_array_prepare, - full_args, op_flags); + return execute_ufunc_loop(&context, 1, + op, order, buffersize, casting, + output_array_prepare, full_args, op_flags, + errormask, extobj); } else { - NPY_UF_DBG_PRINT("Executing legacy inner loop\n"); + NPY_UF_DBG_PRINT("Executing normal inner loop\n"); /* * This checks whether a trivial loop is ok, making copies of - * scalar and one dimensional operands if that will help. - * Since it requires dtypes, it can only be called after - * ufunc->type_resolver + * scalar and one dimensional operands if that should help. 
*/ - int trivial_ok = check_for_trivial_loop(ufunc, - op, operation_descrs, buffersize); + int trivial_ok = check_for_trivial_loop(ufuncimpl, + op, operation_descrs, casting, buffersize); if (trivial_ok < 0) { return -1; } + if (trivial_ok && context.method->nout == 1) { + /* Try to handle everything without using the (heavy) iterator */ + int retval = try_trivial_single_output_loop(&context, + op, order, output_array_prepare, full_args, + errormask, extobj); + if (retval != -2) { + return retval; + } + } - /* check_for_trivial_loop on half-floats can overflow */ - npy_clear_floatstatus_barrier((char*)&ufunc); - - retval = execute_legacy_ufunc_loop(ufunc, trivial_ok, - op, operation_descrs, order, - buffersize, output_array_prepare, - full_args, op_flags); - } - if (retval < 0) { - return -1; - } - - /* - * Check whether any errors occurred during the loop. The loops should - * indicate this in retval, but since the inner-loop currently does not - * report errors, this does not happen in all branches (at this time). - */ - if (PyErr_Occurred() || - _check_ufunc_fperr(errormask, extobj, ufunc_name) < 0) { - return -1; + return execute_ufunc_loop(&context, 0, + op, order, buffersize, casting, + output_array_prepare, full_args, op_flags, + errormask, extobj); } - - return retval; } @@ -4248,83 +4308,30 @@ _get_dtype(PyObject *dtype_obj) { } -static int -_make_new_typetup( - int nop, PyArray_DTypeMeta *signature[], PyObject **out_typetup) { - *out_typetup = PyTuple_New(nop); - if (*out_typetup == NULL) { - return -1; - } - - int noncount = 0; - for (int i = 0; i < nop; i++) { - PyObject *item; - if (signature[i] == NULL) { - item = Py_None; - noncount++; - } - else { - if (!signature[i]->legacy || signature[i]->abstract) { - /* - * The legacy type resolution can't deal with these. - * This path will return `None` or so in the future to - * set an error later if the legacy type resolution is used. 
-                 */
-                PyErr_SetString(PyExc_RuntimeError,
-                        "Internal NumPy error: new DType in signature not yet "
-                        "supported. (This should be unreachable code!)");
-                Py_SETREF(*out_typetup, NULL);
-                return -1;
-            }
-            item = (PyObject *)signature[i]->singleton;
-        }
-        Py_INCREF(item);
-        PyTuple_SET_ITEM(*out_typetup, i, item);
-    }
-    if (noncount == nop) {
-        /* The whole signature was None, simply ignore type tuple */
-        Py_DECREF(*out_typetup);
-        *out_typetup = NULL;
-    }
-    return 0;
-}
-
-
 /*
- * Finish conversion parsing of the type tuple. NumPy always only honored
- * the type number for passed in descriptors/dtypes.
+ * Finish conversion parsing of the DType signature. NumPy always only
+ * honored the type number for passed in descriptors/dtypes.
  * The `dtype` argument is interpreted as the first output DType (not
  * descriptor).
  * Unlike the dtype of an `out` array, it influences loop selection!
  *
- * NOTE: This function replaces the type tuple if passed in (it steals
- *     the original reference and returns a new object and reference)!
- *     The caller must XDECREF the type tuple both on error or success.
- *
- * The function returns a new, normalized type-tuple.
+ * It is the caller's responsibility to clean `signature` and NULL it before
+ * calling.
  */
 static int
-_get_normalized_typetup(PyUFuncObject *ufunc,
-        PyObject *dtype_obj, PyObject *signature_obj, PyObject **out_typetup)
+_get_fixed_signature(PyUFuncObject *ufunc,
+        PyObject *dtype_obj, PyObject *signature_obj,
+        PyArray_DTypeMeta **signature)
 {
     if (dtype_obj == NULL && signature_obj == NULL) {
         return 0;
     }
-    int res = -1;
     int nin = ufunc->nin, nout = ufunc->nout, nop = nin + nout;
-    /*
-     * TODO: `signature` will be the main result in the future and
-     * not the typetup. (Type tuple construction can be deffered to when
-     * the legacy fallback is used.
- */ - PyArray_DTypeMeta *signature[NPY_MAXARGS]; - memset(signature, '\0', sizeof(*signature) * nop); if (dtype_obj != NULL) { if (dtype_obj == Py_None) { /* If `dtype=None` is passed, no need to do anything */ - assert(*out_typetup == NULL); return 0; } if (nout == 0) { @@ -4342,8 +4349,7 @@ _get_normalized_typetup(PyUFuncObject *ufunc, signature[i] = dtype; } Py_DECREF(dtype); - res = _make_new_typetup(nop, signature, out_typetup); - goto finish; + return 0; } assert(signature_obj != NULL); @@ -4359,32 +4365,46 @@ _get_normalized_typetup(PyUFuncObject *ufunc, if (PyTuple_GET_ITEM(signature_obj, 0) == Py_None) { PyErr_SetString(PyExc_TypeError, "a single item type tuple cannot contain None."); - goto finish; + return -1; } if (DEPRECATE("The use of a length 1 tuple for the ufunc " "`signature` is deprecated. Use `dtype` or fill the" "tuple with `None`s.") < 0) { - goto finish; + return -1; } /* Use the same logic as for `dtype=` */ - res = _get_normalized_typetup(ufunc, - PyTuple_GET_ITEM(signature_obj, 0), NULL, out_typetup); - goto finish; + return _get_fixed_signature(ufunc, + PyTuple_GET_ITEM(signature_obj, 0), NULL, signature); } if (n != nop) { PyErr_Format(PyExc_ValueError, "a type-tuple must be specified of length %d for ufunc '%s'", nop, ufunc_get_name_cstr(ufunc)); - goto finish; + return -1; } for (int i = 0; i < nop; ++i) { PyObject *item = PyTuple_GET_ITEM(signature_obj, i); if (item == Py_None) { continue; } - signature[i] = _get_dtype(item); - if (signature[i] == NULL) { - goto finish; + else { + signature[i] = _get_dtype(item); + if (signature[i] == NULL) { + return -1; + } + else if (i < nin && signature[i]->abstract) { + /* + * We reject abstract input signatures for now. These + * can probably be defined by finding the common DType with + * the actual input and using the result of this for the + * promotion. + */ + PyErr_SetString(PyExc_TypeError, + "Input DTypes to the signature must not be " + "abstract. 
The behaviour may be defined in the " + "future."); + return -1; + } } } } @@ -4394,7 +4414,7 @@ _get_normalized_typetup(PyUFuncObject *ufunc, if (PyBytes_Check(signature_obj)) { str_object = PyUnicode_FromEncodedObject(signature_obj, NULL, NULL); if (str_object == NULL) { - goto finish; + return -1; } } else { @@ -4406,7 +4426,7 @@ _get_normalized_typetup(PyUFuncObject *ufunc, const char *str = PyUnicode_AsUTF8AndSize(str_object, &length); if (str == NULL) { Py_DECREF(str_object); - goto finish; + return -1; } if (length != 1 && (length != nin+nout + 2 || @@ -4415,18 +4435,17 @@ _get_normalized_typetup(PyUFuncObject *ufunc, "a type-string for %s, %d typecode(s) before and %d after " "the -> sign", ufunc_get_name_cstr(ufunc), nin, nout); Py_DECREF(str_object); - goto finish; + return -1; } if (length == 1 && nin+nout != 1) { Py_DECREF(str_object); if (DEPRECATE("The use of a length 1 string for the ufunc " "`signature` is deprecated. Use `dtype` attribute or " "pass a tuple with `None`s.") < 0) { - goto finish; + return -1; } /* `signature="l"` is the same as `dtype="l"` */ - res = _get_normalized_typetup(ufunc, str_object, NULL, out_typetup); - goto finish; + return _get_fixed_signature(ufunc, str_object, NULL, signature); } else { for (int i = 0; i < nin+nout; ++i) { @@ -4434,7 +4453,7 @@ _get_normalized_typetup(PyUFuncObject *ufunc, PyArray_Descr *descr = PyArray_DescrFromType(str[istr]); if (descr == NULL) { Py_DECREF(str_object); - goto finish; + return -1; } signature[i] = NPY_DTYPE(descr); Py_INCREF(signature[i]); @@ -4446,15 +4465,79 @@ _get_normalized_typetup(PyUFuncObject *ufunc, else { PyErr_SetString(PyExc_TypeError, "the signature object to ufunc must be a string or a tuple."); - goto finish; + return -1; + } + return 0; +} + + +/* + * Fill in the actual descriptors used for the operation. This function + * supports falling back to the legacy `ufunc->type_resolver`. 
+ * + * We guarantee the array-method that all passed in descriptors are of the + * correct DType instance (i.e. a string can just fetch the length, it doesn't + * need to "cast" to string first). + */ +static int +resolve_descriptors(int nop, + PyUFuncObject *ufunc, PyArrayMethodObject *ufuncimpl, + PyArrayObject *operands[], PyArray_Descr *dtypes[], + PyArray_DTypeMeta *signature[], NPY_CASTING casting) +{ + int retval = -1; + PyArray_Descr *original_dtypes[NPY_MAXARGS]; + + for (int i = 0; i < nop; ++i) { + if (operands[i] == NULL) { + original_dtypes[i] = NULL; + } + else { + /* + * The dtype may mismatch the signature, in which case we need + * to make it fit before calling the resolution. + */ + PyArray_Descr *descr = PyArray_DTYPE(operands[i]); + original_dtypes[i] = PyArray_CastDescrToDType(descr, signature[i]); + if (original_dtypes[i] == NULL) { + nop = i; /* only this much is initialized */ + goto finish; + } + } + } + + NPY_UF_DBG_PRINT("Resolving the descriptors\n"); + + if (ufuncimpl->resolve_descriptors != &wrapped_legacy_resolve_descriptors) { + /* The default: use the `ufuncimpl` as nature intended it */ + NPY_CASTING safety = ufuncimpl->resolve_descriptors(ufuncimpl, + signature, original_dtypes, dtypes); + if (safety < 0) { + goto finish; + } + if (NPY_UNLIKELY(PyArray_MinCastSafety(safety, casting) != casting)) { + /* TODO: Currently impossible to reach (specialized unsafe loop) */ + PyErr_Format(PyExc_TypeError, + "The ufunc implementation for %s with the given dtype " + "signature is not possible under the casting rule %s", + ufunc_get_name_cstr(ufunc), npy_casting_to_string(casting)); + goto finish; + } + retval = 0; + } + else { + /* + * Fall-back to legacy resolver using `operands`, used exclusively + * for datetime64/timedelta64 and custom ufuncs (in pyerfa/astropy). 
+ */ + retval = ufunc->type_resolver(ufunc, casting, operands, NULL, dtypes); } - res = _make_new_typetup(nop, signature, out_typetup); finish: - for (int i =0; i < nop; i++) { - Py_XDECREF(signature[i]); + for (int i = 0; i < nop; i++) { + Py_XDECREF(original_dtypes[i]); } - return res; + return retval; } @@ -4553,13 +4636,16 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc, /* All following variables are cleared in the `fail` error path */ ufunc_full_args full_args; PyArrayObject *wheremask = NULL; - PyObject *typetup = NULL; + PyArray_DTypeMeta *signature[NPY_MAXARGS]; PyArrayObject *operands[NPY_MAXARGS]; + PyArray_DTypeMeta *operand_DTypes[NPY_MAXARGS]; PyArray_Descr *operation_descrs[NPY_MAXARGS]; PyObject *output_array_prepare[NPY_MAXARGS]; /* Initialize all arrays (we usually only need a small part) */ + memset(signature, 0, nop * sizeof(*signature)); memset(operands, 0, nop * sizeof(*operands)); + memset(operand_DTypes, 0, nop * sizeof(*operation_descrs)); memset(operation_descrs, 0, nop * sizeof(*operation_descrs)); memset(output_array_prepare, 0, nout * sizeof(*output_array_prepare)); @@ -4572,7 +4658,7 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc, */ /* Check number of arguments */ - if ((len_args < nin) || (len_args > nop)) { + if (NPY_UNLIKELY((len_args < nin) || (len_args > nop))) { PyErr_Format(PyExc_TypeError, "%s() takes from %d to %d positional arguments but " "%zd were given", @@ -4731,7 +4817,8 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc, * Parse the passed `dtype` or `signature` into an array containing * PyArray_DTypeMeta and/or None. 
*/ - if (_get_normalized_typetup(ufunc, dtype_obj, signature_obj, &typetup) < 0) { + if (_get_fixed_signature(ufunc, + dtype_obj, signature_obj, signature) < 0) { goto fail; } @@ -4739,7 +4826,13 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc, NPY_CASTING casting = NPY_DEFAULT_ASSIGN_CASTING; npy_bool subok = NPY_TRUE; int keepdims = -1; /* We need to know if it was passed */ - if (convert_ufunc_arguments(ufunc, full_args, operands, + npy_bool force_legacy_promotion; + npy_bool allow_legacy_promotion; + if (convert_ufunc_arguments(ufunc, + /* extract operand related information: */ + full_args, operands, + operand_DTypes, &force_legacy_promotion, &allow_legacy_promotion, + /* extract general information: */ order_obj, &order, casting_obj, &casting, subok_obj, &subok, @@ -4748,8 +4841,24 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc, goto fail; } - if (ufunc->type_resolver(ufunc, - casting, operands, typetup, operation_descrs) < 0) { + /* + * Note that part of the promotion is to the complete the signature + * (until here it only represents the fixed part and is usually NULLs). + * + * After promotion, we could push the following logic into the ArrayMethod + * in the future. For now, we do it here. The type resolution step can + * be shared between the ufunc and gufunc code. + */ + PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc, + operands, signature, + operand_DTypes, force_legacy_promotion, allow_legacy_promotion); + if (ufuncimpl == NULL) { + goto fail; + } + + /* Find the correct descriptors for the operation */ + if (resolve_descriptors(nop, ufunc, ufuncimpl, + operands, operation_descrs, signature, casting) < 0) { goto fail; } @@ -4761,20 +4870,17 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc, * Do the final preparations and call the inner-loop. 
*/ if (!ufunc->core_enabled) { - errval = PyUFunc_GenericFunctionInternal(ufunc, - operation_descrs, operands, - extobj, order, + errval = PyUFunc_GenericFunctionInternal(ufunc, ufuncimpl, + operation_descrs, operands, extobj, casting, order, output_array_prepare, full_args, /* for __array_prepare__ */ wheremask); } else { - errval = PyUFunc_GeneralizedFunctionInternal(ufunc, - operation_descrs, operands, - extobj, order, + errval = PyUFunc_GeneralizedFunctionInternal(ufunc, ufuncimpl, + operation_descrs, operands, extobj, casting, order, /* GUFuncs never (ever) called __array_prepare__! */ axis_obj, axes_obj, keepdims); } - if (errval < 0) { goto fail; } @@ -4785,6 +4891,7 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc, */ Py_XDECREF(wheremask); for (int i = 0; i < nop; i++) { + Py_XDECREF(operand_DTypes[i]); Py_DECREF(operation_descrs[i]); if (i < nin) { Py_DECREF(operands[i]); @@ -4793,22 +4900,21 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc, Py_XDECREF(output_array_prepare[i-nin]); } } - Py_XDECREF(typetup); - /* The following steals the references to the outputs: */ PyObject *result = replace_with_wrapped_result_and_return(ufunc, full_args, subok, operands+nin); Py_XDECREF(full_args.in); Py_XDECREF(full_args.out); + return result; fail: - Py_XDECREF(typetup); Py_XDECREF(full_args.in); Py_XDECREF(full_args.out); Py_XDECREF(wheremask); for (int i = 0; i < ufunc->nargs; i++) { Py_XDECREF(operands[i]); + Py_XDECREF(operand_DTypes[i]); Py_XDECREF(operation_descrs[i]); if (i < nout) { Py_XDECREF(output_array_prepare[i]); @@ -5084,6 +5190,28 @@ PyUFunc_FromFuncAndDataAndSignatureAndIdentity(PyUFuncGenericFunction *func, voi ufunc->legacy_inner_loop_selector = &PyUFunc_DefaultLegacyInnerLoopSelector; ufunc->_always_null_previously_masked_innerloop_selector = NULL; + ufunc->op_flags = NULL; + ufunc->_loops = NULL; + if (nin + nout != 0) { + ufunc->_dispatch_cache = PyArrayIdentityHash_New(nin + nout); + if (ufunc->_dispatch_cache == NULL) { + Py_DECREF(ufunc); + 
return NULL; + } + } + else { + /* + * Work around a test that seems to do this right now, it should not + * be a valid ufunc at all though, so. TODO: Remove... + */ + ufunc->_dispatch_cache = NULL; + } + ufunc->_loops = PyList_New(0); + if (ufunc->_loops == NULL) { + Py_DECREF(ufunc); + return NULL; + } + if (name == NULL) { ufunc->name = "?"; } @@ -5105,6 +5233,29 @@ PyUFunc_FromFuncAndDataAndSignatureAndIdentity(PyUFuncGenericFunction *func, voi return NULL; } } + + char *curr_types = ufunc->types; + for (int i = 0; i < ntypes * (nin + nout); i += nin + nout) { + /* + * Add all legacy wrapping loops here. This is normally not necessary, + * but makes sense. It could also help/be needed to avoid issues with + * ambiguous loops such as: `OO->?` and `OO->O` where in theory the + * wrong loop could be picked if only the second one is added. + */ + PyObject *info; + PyArray_DTypeMeta *op_dtypes[NPY_MAXARGS]; + for (int arg = 0; arg < nin + nout; arg++) { + op_dtypes[arg] = PyArray_DTypeFromTypeNum(curr_types[arg]); + /* These DTypes are immortal and adding INCREFs: so borrow it */ + Py_DECREF(op_dtypes[arg]); + } + curr_types += nin + nout; + + info = add_and_return_legacy_wrapping_ufunc_loop(ufunc, op_dtypes, 1); + if (info == NULL) { + return NULL; + } + } return (PyObject *)ufunc; } @@ -5320,6 +5471,8 @@ PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc, PyArray_Descr *descr; PyUFunc_Loop1d *funcdata; PyObject *key, *cobj; + PyArray_DTypeMeta *signature[NPY_MAXARGS]; + PyObject *signature_tuple = NULL; int i; int *newtypes=NULL; @@ -5348,13 +5501,67 @@ PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc, if (arg_types != NULL) { for (i = 0; i < ufunc->nargs; i++) { newtypes[i] = arg_types[i]; + signature[i] = PyArray_DTypeFromTypeNum(arg_types[i]); + Py_DECREF(signature[i]); /* DType can't be deleted... 
*/ } } else { for (i = 0; i < ufunc->nargs; i++) { newtypes[i] = usertype; + signature[i] = PyArray_DTypeFromTypeNum(usertype); + Py_DECREF(signature[i]); /* DType can't be deleted... */ + } + } + + signature_tuple = PyArray_TupleFromItems( + ufunc->nargs, (PyObject **)signature, 0); + if (signature_tuple == NULL) { + goto fail; + } + /* + * We add the loop to the list of all loops and promoters. If the + * equivalent loop was already added, skip this. + * Note that even then the ufunc is still modified: The legacy ArrayMethod + * already looks up the inner-loop from the ufunc (and this is replaced + * below!). + * If the existing one is not a legacy ArrayMethod, we raise currently: + * A new-style loop should not be replaced by an old-style one. + */ + int add_new_loop = 1; + for (Py_ssize_t j = 0; j < PyList_GET_SIZE(ufunc->_loops); j++) { + PyObject *item = PyList_GET_ITEM(ufunc->_loops, j); + PyObject *existing_tuple = PyTuple_GET_ITEM(item, 0); + + int cmp = PyObject_RichCompareBool(existing_tuple, signature_tuple, Py_EQ); + if (cmp < 0) { + goto fail; + } + if (!cmp) { + continue; + } + PyObject *registered = PyTuple_GET_ITEM(item, 1); + if (!PyObject_TypeCheck(registered, &PyArrayMethod_Type) || ( + (PyArrayMethodObject *)registered)->get_strided_loop != + &get_wrapped_legacy_ufunc_loop) { + PyErr_Format(PyExc_TypeError, + "A non-compatible loop was already registered for " + "ufunc %s and DTypes %S.", + ufunc_get_name_cstr(ufunc), signature_tuple); + goto fail; + } + /* The loop was already added */ + add_new_loop = 0; + break; + } + if (add_new_loop) { + PyObject *info = add_and_return_legacy_wrapping_ufunc_loop( + ufunc, signature, 0); + if (info == NULL) { + goto fail; } } + /* Clearing sets it to NULL for the error paths */ + Py_CLEAR(signature_tuple); funcdata->func = function; funcdata->arg_types = newtypes; @@ -5429,6 +5636,7 @@ PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc, fail: Py_DECREF(key); + Py_XDECREF(signature_tuple); 
PyArray_free(funcdata); PyArray_free(newtypes); if (!PyErr_Occurred()) PyErr_NoMemory(); @@ -5454,8 +5662,10 @@ ufunc_dealloc(PyUFuncObject *ufunc) if (ufunc->identity == PyUFunc_IdentityValue) { Py_DECREF(ufunc->identity_value); } - if (ufunc->obj != NULL) { - Py_DECREF(ufunc->obj); + Py_XDECREF(ufunc->obj); + Py_XDECREF(ufunc->_loops); + if (ufunc->_dispatch_cache != NULL) { + PyArrayIdentityHash_Dealloc(ufunc->_dispatch_cache); } PyObject_GC_Del(ufunc); } diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c index 468327b8c..a7d536656 100644 --- a/numpy/core/src/umath/ufunc_type_resolution.c +++ b/numpy/core/src/umath/ufunc_type_resolution.c @@ -1,4 +1,16 @@ /* + * NOTE: The type resolution defined in this file is considered legacy. + * + * The new mechanism separates type resolution and promotion into two + * distinct steps, as per NEP 43. + * Further, the functions in this file rely on the operands rather than + * only the DTypes/descriptors. They are still called and at this point + * vital (NumPy ~1.21), but should hopefully become largely irrelevant very + * quickly. + * + * At that point, this file should be deletable in its entirety. + * + * * This file implements type resolution for NumPy element-wise ufuncs. * This mechanism is still backwards-compatible with the pre-existing * legacy mechanism, so performs much slower than is necessary. 
@@ -89,9 +101,9 @@ raise_binary_type_reso_error(PyUFuncObject *ufunc, PyArrayObject **operands) { /** Helper function to raise UFuncNoLoopError * Always returns -1 to indicate the exception was raised, for convenience */ -static int +NPY_NO_EXPORT int raise_no_loop_found_error( - PyUFuncObject *ufunc, PyArray_Descr **dtypes) + PyUFuncObject *ufunc, PyObject **dtypes) { static PyObject *exc_type = NULL; @@ -102,8 +114,7 @@ raise_no_loop_found_error( return -1; } - PyObject *dtypes_tup = PyArray_TupleFromItems( - ufunc->nargs, (PyObject **)dtypes, 1); + PyObject *dtypes_tup = PyArray_TupleFromItems(ufunc->nargs, dtypes, 1); if (dtypes_tup == NULL) { return -1; } @@ -119,6 +130,7 @@ raise_no_loop_found_error( return -1; } + static int raise_casting_error( PyObject *exc_type, @@ -333,22 +345,30 @@ PyUFunc_SimpleBinaryComparisonTypeResolver(PyUFuncObject *ufunc, if (out_dtypes[0] == NULL) { return -1; } + out_dtypes[1] = out_dtypes[0]; + Py_INCREF(out_dtypes[1]); } else { /* Not doing anything will lead to a loop no found error. */ out_dtypes[0] = PyArray_DESCR(operands[0]); Py_INCREF(out_dtypes[0]); + out_dtypes[1] = PyArray_DESCR(operands[1]); + Py_INCREF(out_dtypes[1]); } - out_dtypes[1] = out_dtypes[0]; - Py_INCREF(out_dtypes[1]); } else { PyArray_Descr *descr; /* + * DEPRECATED 2021-03, NumPy 1.20 + * * If the type tuple was originally a single element (probably), * issue a deprecation warning, but otherwise accept it. Since the * result dtype is always boolean, this is not actually valid unless it * is `object` (but if there is an object input we already deferred). + * + * TODO: Once this deprecation is gone, the special case for + * `PyUFunc_SimpleBinaryComparisonTypeResolver` in dispatching.c + * can be removed. 
*/ if (PyTuple_Check(type_tup) && PyTuple_GET_SIZE(type_tup) == 3 && PyTuple_GET_ITEM(type_tup, 0) == Py_None && @@ -527,7 +547,7 @@ PyUFunc_SimpleUniformOperationTypeResolver( out_dtypes[iop] = PyArray_DESCR(operands[iop]); Py_INCREF(out_dtypes[iop]); } - raise_no_loop_found_error(ufunc, out_dtypes); + raise_no_loop_found_error(ufunc, (PyObject **)out_dtypes); for (iop = 0; iop < ufunc->nin; iop++) { Py_DECREF(out_dtypes[iop]); out_dtypes[iop] = NULL; @@ -1492,7 +1512,7 @@ PyUFunc_DefaultLegacyInnerLoopSelector(PyUFuncObject *ufunc, types += nargs; } - return raise_no_loop_found_error(ufunc, dtypes); + return raise_no_loop_found_error(ufunc, (PyObject **)dtypes); } diff --git a/numpy/core/src/umath/ufunc_type_resolution.h b/numpy/core/src/umath/ufunc_type_resolution.h index cd0ff4a0d..dd88a081a 100644 --- a/numpy/core/src/umath/ufunc_type_resolution.h +++ b/numpy/core/src/umath/ufunc_type_resolution.h @@ -135,4 +135,7 @@ PyUFunc_DefaultLegacyInnerLoopSelector(PyUFuncObject *ufunc, void **out_innerloopdata, int *out_needs_api); +NPY_NO_EXPORT int +raise_no_loop_found_error(PyUFuncObject *ufunc, PyObject **dtypes); + #endif diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py index 9d1d514fb..becd65b11 100644 --- a/numpy/core/tests/test_scalarmath.py +++ b/numpy/core/tests/test_scalarmath.py @@ -307,8 +307,8 @@ class TestModulus: # promotes to float which does not fit a = np.array([1, 2], np.int64) b = np.array([1, 2], np.uint64) - pattern = 'could not be coerced to provided output parameter' - with assert_raises_regex(TypeError, pattern): + with pytest.raises(TypeError, + match=r"Cannot cast ufunc 'floor_divide' output from"): a //= b diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py index 0251f21a9..dab11d948 100644 --- a/numpy/core/tests/test_ufunc.py +++ b/numpy/core/tests/test_ufunc.py @@ -164,8 +164,9 @@ class TestUfuncGenericLoops: except AttributeError: return lambda: getattr(np.core.umath, 
attr)(val) - num_arr = np.array([val], dtype=np.float64) - obj_arr = np.array([MyFloat(val)], dtype="O") + # Use 0-D arrays, to ensure the same element call + num_arr = np.array(val, dtype=np.float64) + obj_arr = np.array(MyFloat(val), dtype="O") with np.errstate(all="raise"): try: @@ -1711,9 +1712,17 @@ class TestUfunc: target = np.array([0, 2, 4], dtype=_rational_tests.rational) assert_equal(result, target) - # no output type should raise TypeError + # The new resolution means that we can (usually) find custom loops + # as long as they match exactly: + result = _rational_tests.test_add(a, b) + assert_equal(result, target) + + # But since we use the old type resolver, this may not work + # for dtype variations unless the output dtype is given: + result = _rational_tests.test_add(a, b.astype(np.uint16), out=c) + assert_equal(result, target) with assert_raises(TypeError): - _rational_tests.test_add(a, b) + _rational_tests.test_add(a, b.astype(np.uint16)) def test_operand_flags(self): a = np.arange(16, dtype='l').reshape(4, 4) @@ -2029,8 +2038,7 @@ class TestUfunc: np.true_divide, np.floor_divide, np.bitwise_and, np.bitwise_or, np.bitwise_xor, np.left_shift, np.right_shift, np.fmax, np.fmin, np.fmod, np.hypot, np.logaddexp, np.logaddexp2, - np.logical_and, np.logical_or, np.logical_xor, np.maximum, - np.minimum, np.mod, + np.maximum, np.minimum, np.mod, np.greater, np.greater_equal, np.less, np.less_equal, np.equal, np.not_equal] |