7 files changed, 456 insertions, 133 deletions
diff --git a/numpy/core/include/numpy/experimental_dtype_api.h b/numpy/core/include/numpy/experimental_dtype_api.h
index 554c7fb6c..effa66baf 100644
--- a/numpy/core/include/numpy/experimental_dtype_api.h
+++ b/numpy/core/include/numpy/experimental_dtype_api.h
@@ -181,6 +181,12 @@ typedef PyObject *_ufunc_addloop_fromspec_func(
 /*
  * Type of the C promoter function, which must be wrapped into a
  * PyCapsule with name "numpy._ufunc_promoter".
+ *
+ * Note that currently the output dtypes are always NULL unless they are
+ * also part of the signature.  This is an implementation detail and could
+ * change in the future.  However, in general promoters should not have a
+ * need for output dtypes.
+ * (There are potential use-cases, these are currently unsupported.)
  */
 typedef int promoter_function(PyObject *ufunc,
         PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[],
diff --git a/numpy/core/src/umath/dispatching.c b/numpy/core/src/umath/dispatching.c
index 8e99c0420..664288084 100644
--- a/numpy/core/src/umath/dispatching.c
+++ b/numpy/core/src/umath/dispatching.c
@@ -46,12 +46,16 @@
 
 #include "dispatching.h"
 #include "dtypemeta.h"
+#include "common_dtype.h"
 #include "npy_hashtable.h"
 #include "legacy_array_method.h"
 #include "ufunc_object.h"
 #include "ufunc_type_resolution.h"
 
 
+#define PROMOTION_DEBUG_TRACING 0
+
+
 /* forward declaration */
 static NPY_INLINE PyObject *
 promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc,
@@ -147,6 +151,23 @@ PyUFunc_AddLoop(PyUFuncObject *ufunc, PyObject *info, int ignore_duplicate)
  * (Based on `isinstance()`, the knowledge that non-abstract DTypes cannot
  * be subclassed is used, however.)
  *
+ * NOTE: This currently does not take into account output dtypes which do not
+ *       have to match.  The possible extension here is that if an output
+ *       is given (and thus an output dtype), but not part of the signature
+ *       we could ignore it for matching, but *prefer* a loop that matches
+ *       better.
+ *       Why is this not done currently?  First, it seems a niche feature that
+ *       loops can only be distinguished based on the output dtype.  Second,
+ *       there are some nasty theoretical things because:
+ *
+ *            np.add(f4, f4, out=f8)
+ *            np.add(f4, f4, out=f8, dtype=f8)
+ *
+ *       are different, the first uses the f4 loop, the second the f8 loop.
+ *       The problem is, that the current cache only uses the op_dtypes and
+ *       both are `(f4, f4, f8)`.  The cache would need to store also which
+ *       output was provided by `dtype=`/`signature=`.
+ *
  * @param ufunc
  * @param op_dtypes The DTypes that are either passed in (defined by an
  *        operand) or defined by the `signature` as also passed in as
@@ -159,17 +180,35 @@ PyUFunc_AddLoop(PyUFuncObject *ufunc, PyObject *info, int ignore_duplicate)
  */
 static int
 resolve_implementation_info(PyUFuncObject *ufunc,
-        PyArray_DTypeMeta *op_dtypes[], PyObject **out_info)
+        PyArray_DTypeMeta *op_dtypes[], npy_bool only_promoters,
+        PyObject **out_info)
 {
     int nin = ufunc->nin, nargs = ufunc->nargs;
     Py_ssize_t size = PySequence_Length(ufunc->_loops);
     PyObject *best_dtypes = NULL;
     PyObject *best_resolver_info = NULL;
 
+#if PROMOTION_DEBUG_TRACING
+    printf("Promoting for '%s' promoters only: %d\n",
+            ufunc->name ? ufunc->name : "<unknown>", (int)only_promoters);
+    printf("    DTypes: ");
+    PyObject *tmp = PyArray_TupleFromItems(ufunc->nargs, op_dtypes, 1);
+    PyObject_Print(tmp, stdout, 0);
+    Py_DECREF(tmp);
+    printf("\n");
+    Py_DECREF(tmp);
+#endif
+
     for (Py_ssize_t res_idx = 0; res_idx < size; res_idx++) {
         /* Test all resolvers  */
         PyObject *resolver_info = PySequence_Fast_GET_ITEM(
                 ufunc->_loops, res_idx);
+
+        if (only_promoters && PyObject_TypeCheck(
+                    PyTuple_GET_ITEM(resolver_info, 1), &PyArrayMethod_Type)) {
+            continue;
+        }
+
         PyObject *curr_dtypes = PyTuple_GET_ITEM(resolver_info, 0);
         /*
          * Test if the current resolver matches, it could make sense to
@@ -179,20 +218,31 @@ resolve_implementation_info(PyUFuncObject *ufunc,
 
         npy_bool matches = NPY_TRUE;
         /*
-         * NOTE: We check also the output DType.  In principle we do not
-         *       have to strictly match it (unless it is provided by the
-         *       `signature`).  This assumes that a (fallback) promoter will
-         *       unset the output DType if no exact match is found.
+         * NOTE: We currently match the output dtype exactly here, this is
+         *       actually only necessary if the signature includes.
+         *       Currently, we rely that op-dtypes[nin:nout] is NULLed if not.
          */
         for (Py_ssize_t i = 0; i < nargs; i++) {
             PyArray_DTypeMeta *given_dtype = op_dtypes[i];
             PyArray_DTypeMeta *resolver_dtype = (
                     (PyArray_DTypeMeta *)PyTuple_GET_ITEM(curr_dtypes, i));
             assert((PyObject *)given_dtype != Py_None);
-            if (given_dtype == NULL && i >= nin) {
-                /* Unspecified out always matches (see below for inputs) */
-                continue;
+            if (given_dtype == NULL) {
+                if (i >= nin) {
+                    /* Unspecified out always matches (see below for inputs) */
+                    continue;
+                }
+                /*
+                 * This is a reduce-like operation, which always have the form
+                 * `(res_DType, op_DType, res_DType)`.  If the first and last
+                 * dtype of the loops match, this should be reduce-compatible.
+                 */
+                if (PyTuple_GET_ITEM(curr_dtypes, 0)
+                        == PyTuple_GET_ITEM(curr_dtypes, 2)) {
+                    continue;
+                }
             }
+
             if (resolver_dtype == (PyArray_DTypeMeta *)Py_None) {
                 /* always matches */
                 continue;
@@ -204,24 +254,7 @@ resolve_implementation_info(PyUFuncObject *ufunc,
                 matches = NPY_FALSE;
                 break;
             }
-            if (given_dtype == NULL) {
-                /*
-                 * If an input was not specified, this is a reduce-like
-                 * operation: reductions use `(operand_DType, NULL, out_DType)`
-                 * as they only have a single operand.  This allows special
-                 * reduce promotion rules useful for example for sum/product.
-                 * E.g. `np.add.reduce([True, True])` promotes to integer.
-                 *
-                 * Continuing here allows a promoter to handle reduce-like
-                 * promotions explicitly if necessary.
-                 * TODO: The `!NPY_DT_is_abstract(resolver_dtype)` currently
-                 *       ensures that this is a promoter.  If we allow
-                 *       `ArrayMethods` to use abstract DTypes, we may have to
-                 *       reject it here or the `ArrayMethod` has to implement
-                 *       the reduce promotion.
-                 */
-                continue;
-            }
+
             int subclass = PyObject_IsSubclass(
                     (PyObject *)given_dtype, (PyObject *)resolver_dtype);
             if (subclass < 0) {
@@ -254,8 +287,12 @@ resolve_implementation_info(PyUFuncObject *ufunc,
              * In all cases, we give up resolution, since it would be
              * necessary to compare to two "best" cases.
              */
-            int unambiguously_equally_good = 1;
             for (Py_ssize_t i = 0; i < nargs; i++) {
+                if (i == ufunc->nin && current_best != -1) {
+                    /* inputs prefer one loop and outputs have lower priority */
+                    break;
+                }
+
                 int best;
 
                 PyObject *prev_dtype = PyTuple_GET_ITEM(best_dtypes, i);
@@ -265,50 +302,18 @@ resolve_implementation_info(PyUFuncObject *ufunc,
                     /* equivalent, so this entry does not matter */
                     continue;
                 }
-                /*
-                 * TODO: Even if the input is not specified, if we have
-                 *       abstract DTypes and one is a subclass of the other,
-                 *       the subclass should be considered a better match
-                 *       (subclasses are always more specific).
-                 */
-                /* Whether this (normally output) dtype was specified at all */
                 if (op_dtypes[i] == NULL) {
                     /*
-                     * When DType is completely unspecified, prefer abstract
-                     * over concrete, assuming it will resolve.
-                     * Furthermore, we cannot decide which abstract/None
-                     * is "better", only concrete ones which are subclasses
-                     * of Abstract ones are defined as worse.
+                     * If an a dtype is NULL it always matches, so there is no
+                     * point in defining one as more precise than the other.
                      */
-                    npy_bool prev_is_concrete = NPY_FALSE;
-                    npy_bool new_is_concrete = NPY_FALSE;
-                    if ((prev_dtype != Py_None) &&
-                            !NPY_DT_is_abstract((PyArray_DTypeMeta *)prev_dtype)) {
-                        prev_is_concrete = NPY_TRUE;
-                    }
-                    if ((new_dtype != Py_None) &&
-                            !NPY_DT_is_abstract((PyArray_DTypeMeta *)new_dtype)) {
-                        new_is_concrete = NPY_TRUE;
-                    }
-                    if (prev_is_concrete == new_is_concrete) {
-                        best = -1;
-                    }
-                    else if (prev_is_concrete) {
-                        unambiguously_equally_good = 0;
-                        best = 1;
-                    }
-                    else {
-                        unambiguously_equally_good = 0;
-                        best = 0;
-                    }
+                    continue;
                 }
                 /* If either is None, the other is strictly more specific */
-                else if (prev_dtype == Py_None) {
-                    unambiguously_equally_good = 0;
+                if (prev_dtype == Py_None) {
                     best = 1;
                 }
                 else if (new_dtype == Py_None) {
-                    unambiguously_equally_good = 0;
                     best = 0;
                 }
                 /*
@@ -318,20 +323,25 @@ resolve_implementation_info(PyUFuncObject *ufunc,
                 else if (!NPY_DT_is_abstract((PyArray_DTypeMeta *)prev_dtype) &&
                          !NPY_DT_is_abstract((PyArray_DTypeMeta *)new_dtype)) {
                     /*
-                     * Ambiguous unless the are identical (checked above),
-                     * but since they are concrete it does not matter which
-                     * best to compare.
+                     * Ambiguous unless they are identical (checked above),
+                     * or one matches exactly.
                      */
-                    best = -1;
+                    if (prev_dtype == (PyObject *)op_dtypes[i]) {
+                        best = 0;
+                    }
+                    else if (new_dtype == (PyObject *)op_dtypes[i]) {
+                        best = 1;
+                    }
+                    else {
+                        best = -1;
+                    }
                 }
                 else if (!NPY_DT_is_abstract((PyArray_DTypeMeta *)prev_dtype)) {
                     /* old is not abstract, so better (both not possible) */
-                    unambiguously_equally_good = 0;
                     best = 0;
                 }
                 else if (!NPY_DT_is_abstract((PyArray_DTypeMeta *)new_dtype)) {
                     /* new is not abstract, so better (both not possible) */
-                    unambiguously_equally_good = 0;
                     best = 1;
                 }
                 /*
@@ -349,6 +359,10 @@ resolve_implementation_info(PyUFuncObject *ufunc,
                     return -1;
                 }
 
+                if (best == -1) {
+                    /* no new info, nothing to update */
+                    continue;
+                }
                 if ((current_best != -1) && (current_best != best)) {
                     /*
                      * We need a clear best, this could be tricky, unless
@@ -367,15 +381,34 @@ resolve_implementation_info(PyUFuncObject *ufunc,
 
             if (current_best == -1) {
                 /*
-                 * TODO: It would be nice to have a "diagnostic mode" that
-                 *       informs if this happens! (An immediate error currently
-                 *       blocks later legacy resolution, but may work in the
-                 *       future.)
+                 * We could not find a best loop, but promoters should be
+                 * designed in a way to disambiguate such scenarios, so we
+                 * retry the whole lookup using only promoters.
+                 * (There is a small chance we already got two promoters.
+                 * We just redo it anyway for simplicity.)
                  */
-                if (unambiguously_equally_good) {
-                    /* unset the best resolver to indicate this */
-                    best_resolver_info = NULL;
-                    continue;
+                if (!only_promoters) {
+                    return resolve_implementation_info(ufunc,
+                            op_dtypes, NPY_TRUE, out_info);
+                }
+                /*
+                 * If this is already the retry, we are out of luck.  Promoters
+                 * should be designed in a way that this cannot happen!
+                 * (It should be noted, that the retry might not find anything
+                 * and we still do a legacy lookup later.)
+                 */
+                PyObject *given = PyArray_TupleFromItems(
+                        ufunc->nargs, (PyObject **)op_dtypes, 1);
+                if (given != NULL) {
+                    PyErr_Format(PyExc_RuntimeError,
+                            "Could not find a loop for the inputs:\n    %S\n"
+                            "The two promoters %S and %S matched the input "
+                            "equally well.  Promoters must be designed "
+                            "to be unambiguous.  NOTE: This indicates an error "
+                            "in NumPy or an extending library and should be "
+                            "reported.",
+                            given, best_dtypes, curr_dtypes);
+                    Py_DECREF(given);
                 }
                 *out_info = NULL;
                 return 0;
@@ -648,7 +681,8 @@ promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc,
      * in `resolve_implementation_info`.
      */
     if (info == NULL) {
-        if (resolve_implementation_info(ufunc, op_dtypes, &info) < 0) {
+        if (resolve_implementation_info(ufunc,
+                op_dtypes, NPY_FALSE, &info) < 0) {
             return NULL;
         }
         if (info != NULL && PyObject_TypeCheck(
@@ -663,35 +697,6 @@ promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc,
             }
             return info;
         }
-        else if (info == NULL && op_dtypes[0] == NULL) {
-            /*
-             * If we have a reduction, fill in the unspecified input/array
-             * assuming it should have the same dtype as the operand input
-             * (or the output one if given).
-             * Then, try again.  In some cases, this will choose different
-             * paths, such as `ll->?` instead of an `??->?` loop for `np.equal`
-             * when the input is `.l->.` (`.` meaning undefined).  This will
-             * then cause an error.  But cast to `?` would always lose
-             * information, and in many cases important information:
-             *
-             * ```python
-             * from operator import eq
-             * from functools import reduce
-             *
-             * reduce(eq, [1, 2, 3]) != reduce(eq, [True, True, True])
-             * ```
-             *
-             * The special cases being `logical_(and|or|xor)` which can always
-             * cast to boolean ahead of time and still give the right answer
-             * (unsafe cast to bool is fine here). We special case these at
-             * the time of this comment (NumPy 1.21).
-             */
-            assert(ufunc->nin == 2 && ufunc->nout == 1);
-            op_dtypes[0] = op_dtypes[2] != NULL ? op_dtypes[2] : op_dtypes[1];
-            Py_INCREF(op_dtypes[0]);
-            return promote_and_get_info_and_ufuncimpl(ufunc,
-                    ops, signature, op_dtypes, allow_legacy_promotion, 1);
-        }
     }
 
     /*
@@ -745,6 +750,14 @@ promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc,
  * only work with DType (classes/types).  This is because it has to ensure
  * that legacy (value-based promotion) is used when necessary.
  *
+ * NOTE: The machinery here currently ignores output arguments unless
+ *       they are part of the signature.  This slightly limits unsafe loop
+ *       specializations, which is important for the `ensure_reduce_compatible`
+ *       fallback mode.
+ *       To fix this, the caching mechanism (and dispatching) can be extended.
+ *       When/if that happens, the `ensure_reduce_compatible` could be
+ *       deprecated (it should never kick in because promotion kick in first).
+ *
  * @param ufunc The ufunc object, used mainly for the fallback.
  * @param ops The array operands (used only for the fallback).
  * @param signature As input, the DType signature fixed explicitly by the user.
@@ -754,9 +767,16 @@ promote_and_get_info_and_ufuncimpl(PyUFuncObject *ufunc,
  *        either by the `signature` or by an `operand`.
  *        (outputs and the second input can be NULL for reductions).
  *        NOTE: In some cases, the promotion machinery may currently modify
- *        these.
+ *        these including clearing the output.
  * @param force_legacy_promotion If set, we have to use the old type resolution
  *        to implement value-based promotion/casting.
+ * @param ensure_reduce_compatible Must be set for reductions, in which case
+ *        the found implementation is checked for reduce-like compatibility.
+ *        If it is *not* compatible and `signature[2] != NULL`, we assume its
+ *        output DType is correct (see NOTE above).
+ *        If removed, promotion may require information about whether this
+ *        is a reduction, so the more likely case is to always keep fixing this
+ *        when necessary, but push down the handling so it can be cached.
  */
 NPY_NO_EXPORT PyArrayMethodObject *
 promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
@@ -764,9 +784,10 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
         PyArray_DTypeMeta *signature[],
         PyArray_DTypeMeta *op_dtypes[],
         npy_bool force_legacy_promotion,
-        npy_bool allow_legacy_promotion)
+        npy_bool allow_legacy_promotion,
+        npy_bool ensure_reduce_compatible)
 {
-    int nargs = ufunc->nargs;
+    int nin = ufunc->nin, nargs = ufunc->nargs;
 
     /*
      * Get the actual DTypes we operate with by mixing the operand array
@@ -782,6 +803,15 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
             Py_XSETREF(op_dtypes[i], signature[i]);
             assert(i >= ufunc->nin || !NPY_DT_is_abstract(signature[i]));
         }
+        else if (i >= nin) {
+            /*
+             * We currently just ignore outputs if not in signature, this will
+             * always give the/a correct result (limits registering specialized
+             * loops which include the cast).
+             * (See also comment in resolve_implementation_info.)
+             */
+            Py_CLEAR(op_dtypes[i]);
+        }
     }
 
     if (force_legacy_promotion) {
@@ -809,8 +839,26 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
 
     PyArrayMethodObject *method = (PyArrayMethodObject *)PyTuple_GET_ITEM(info, 1);
 
-    /* Fill `signature` with final DTypes used by the ArrayMethod/inner-loop */
+    /*
+     * In certain cases (only the logical ufuncs really), the loop we found may
+     * not be reduce-compatible.  Since the machinery can't distinguish a
+     * reduction with an output from a normal ufunc call, we have to assume
+     * the result DType is correct and force it for the input (if not forced
+     * already).
+     * NOTE: This does assume that all loops are "safe" see the NOTE in this
+     *       comment.  That could be relaxed, in which case we may need to
+     *       cache if a call was for a reduction.
+     */
     PyObject *all_dtypes = PyTuple_GET_ITEM(info, 0);
+    if (ensure_reduce_compatible && signature[0] == NULL &&
+            PyTuple_GET_ITEM(all_dtypes, 0) != PyTuple_GET_ITEM(all_dtypes, 2)) {
+        signature[0] = (PyArray_DTypeMeta *)PyTuple_GET_ITEM(all_dtypes, 2);
+        Py_INCREF(signature[0]);
+        return promote_and_get_ufuncimpl(ufunc,
+                ops, signature, op_dtypes,
+                force_legacy_promotion, allow_legacy_promotion, NPY_FALSE);
+    }
+
     for (int i = 0; i < nargs; i++) {
         if (signature[i] == NULL) {
             signature[i] = (PyArray_DTypeMeta *)PyTuple_GET_ITEM(all_dtypes, i);
@@ -826,6 +874,112 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
 
 
 /*
+ * Generic promoter used by as a final fallback on ufuncs.  Most operations are
+ * homogeneous, so we can try to find the homogeneous dtype on the inputs
+ * and use that.
+ * We need to special case the reduction case, where op_dtypes[0] == NULL
+ * is possible.
+ */
+NPY_NO_EXPORT int
+default_ufunc_promoter(PyUFuncObject *ufunc,
+        PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[],
+        PyArray_DTypeMeta *new_op_dtypes[])
+{
+    if (ufunc->type_resolver == &PyUFunc_SimpleBinaryComparisonTypeResolver
+            && signature[0] == NULL && signature[1] == NULL
+            && signature[2] != NULL && signature[2]->type_num != NPY_BOOL) {
+        /* bail out, this is _only_ to give future/deprecation warning! */
+        return -1;
+    }
+
+    /* If nin < 2 promotion is a no-op, so it should not be registered */
+    assert(ufunc->nin > 1);
+    if (op_dtypes[0] == NULL) {
+        assert(ufunc->nin == 2 && ufunc->nout == 1);  /* must be reduction */
+        Py_INCREF(op_dtypes[1]);
+        new_op_dtypes[0] = op_dtypes[1];
+        Py_INCREF(op_dtypes[1]);
+        new_op_dtypes[1] = op_dtypes[1];
+        Py_INCREF(op_dtypes[1]);
+        new_op_dtypes[2] = op_dtypes[1];
+        return 0;
+    }
+    PyArray_DTypeMeta *common = NULL;
+    /*
+     * If a signature is used and homogeneous in its outputs use that
+     * (Could/should likely be rather applied to inputs also, although outs
+     * only could have some advantage and input dtypes are rarely enforced.)
+     */
+    for (int i = ufunc->nin; i < ufunc->nargs; i++) {
+        if (signature[i] != NULL) {
+            if (common == NULL) {
+                Py_INCREF(signature[i]);
+                common = signature[i];
+            }
+            else if (common != signature[i]) {
+                Py_CLEAR(common);  /* Not homogeneous, unset common */
+                break;
+            }
+        }
+    }
+    /* Otherwise, use the common DType of all input operands */
+    if (common == NULL) {
+        common = PyArray_PromoteDTypeSequence(ufunc->nin, op_dtypes);
+        if (common == NULL) {
+            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+                PyErr_Clear();  /* Do not propagate normal promotion errors */
+            }
+            return -1;
+        }
+    }
+
+    for (int i = 0; i < ufunc->nargs; i++) {
+        PyArray_DTypeMeta *tmp = common;
+        if (signature[i]) {
+            tmp = signature[i];  /* never replace a fixed one. */
+        }
+        Py_INCREF(tmp);
+        new_op_dtypes[i] = tmp;
+    }
+    for (int i = ufunc->nin; i < ufunc->nargs; i++) {
+        Py_XINCREF(op_dtypes[i]);
+        new_op_dtypes[i] = op_dtypes[i];
+    }
+
+    Py_DECREF(common);
+    return 0;
+}
+
+
+/*
+ * In some cases, we assume that there will only ever be object loops,
+ * and the object loop should *always* be chosen.
+ * (in those cases more specific loops should not really be registered, but
+ * we do not check that.)
+ *
+ * We default to this for "old-style" ufuncs which have exactly one loop
+ * consisting only of objects (during registration time, numba mutates this
+ * but presumably).
+ */
+NPY_NO_EXPORT int
+object_only_ufunc_promoter(PyUFuncObject *ufunc,
+        PyArray_DTypeMeta *NPY_UNUSED(op_dtypes[]),
+        PyArray_DTypeMeta *signature[],
+        PyArray_DTypeMeta *new_op_dtypes[])
+{
+    PyArray_DTypeMeta *object_DType = PyArray_DTypeFromTypeNum(NPY_OBJECT);
+
+    for (int i = 0; i < ufunc->nargs; i++) {
+        if (signature[i] == NULL) {
+            Py_INCREF(object_DType);
+            new_op_dtypes[i] = object_DType;
+        }
+    }
+    Py_DECREF(object_DType);
+    return 0;
+}
+
+/*
  * Special promoter for the logical ufuncs.  The logical ufuncs can always
  * use the ??->? and still get the correct output (as long as the output
  * is not supposed to be `object`).
@@ -843,6 +997,12 @@ logical_ufunc_promoter(PyUFuncObject *NPY_UNUSED(ufunc),
      */
     int force_object = 0;
 
+    if (signature[0] == NULL && signature[1] == NULL
+            && signature[2] != NULL && signature[2]->type_num != NPY_BOOL) {
+        /* bail out, this is _only_ to give future/deprecation warning! */
+        return -1;
+    }
+
     for (int i = 0; i < 3; i++) {
         PyArray_DTypeMeta *item;
         if (signature[i] != NULL) {
@@ -913,4 +1073,3 @@ install_logical_ufunc_promoter(PyObject *ufunc)
 
     return PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0);
 }
-
diff --git a/numpy/core/src/umath/dispatching.h b/numpy/core/src/umath/dispatching.h
index 2f314615d..a7e9e88d0 100644
--- a/numpy/core/src/umath/dispatching.h
+++ b/numpy/core/src/umath/dispatching.h
@@ -20,13 +20,25 @@ promote_and_get_ufuncimpl(PyUFuncObject *ufunc,
         PyArray_DTypeMeta *signature[],
         PyArray_DTypeMeta *op_dtypes[],
         npy_bool force_legacy_promotion,
-        npy_bool allow_legacy_promotion);
+        npy_bool allow_legacy_promotion,
+        npy_bool ensure_reduce_compatible);
 
 NPY_NO_EXPORT PyObject *
 add_and_return_legacy_wrapping_ufunc_loop(PyUFuncObject *ufunc,
         PyArray_DTypeMeta *operation_dtypes[], int ignore_duplicate);
 
 NPY_NO_EXPORT int
+default_ufunc_promoter(PyUFuncObject *ufunc,
+        PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[],
+        PyArray_DTypeMeta *new_op_dtypes[]);
+
+NPY_NO_EXPORT int
+object_only_ufunc_promoter(PyUFuncObject *ufunc,
+        PyArray_DTypeMeta *NPY_UNUSED(op_dtypes[]),
+        PyArray_DTypeMeta *signature[],
+        PyArray_DTypeMeta *new_op_dtypes[]);
+
+NPY_NO_EXPORT int
 install_logical_ufunc_promoter(PyObject *ufunc);
 
 
diff --git a/numpy/core/src/umath/legacy_array_method.c b/numpy/core/src/umath/legacy_array_method.c
index a423823d4..99de63aac 100644
--- a/numpy/core/src/umath/legacy_array_method.c
+++ b/numpy/core/src/umath/legacy_array_method.c
@@ -123,10 +123,40 @@ simple_legacy_resolve_descriptors(
         PyArray_Descr **given_descrs,
         PyArray_Descr **output_descrs)
 {
+    int i = 0;
     int nin = method->nin;
     int nout = method->nout;
 
-    for (int i = 0; i < nin + nout; i++) {
+    if (nin == 2 && nout == 1 && given_descrs[2] != NULL
+            && dtypes[0] == dtypes[2]) {
+        /*
+         * Could be a reduction, which requires `descr[0] is descr[2]`
+         * (identity) at least currently. This is because `op[0] is op[2]`.
+         * (If the output descriptor is not passed, the below works.)
+         */
+        output_descrs[2] = ensure_dtype_nbo(given_descrs[2]);
+        if (output_descrs[2] == NULL) {
+            Py_CLEAR(output_descrs[2]);
+            return -1;
+        }
+        Py_INCREF(output_descrs[2]);
+        output_descrs[0] = output_descrs[2];
+        if (dtypes[1] == dtypes[2]) {
+            /* Same for the second one (accumulation is stricter) */
+            Py_INCREF(output_descrs[2]);
+            output_descrs[1] = output_descrs[2];
+        }
+        else {
+            output_descrs[1] = ensure_dtype_nbo(given_descrs[1]);
+            if (output_descrs[1] == NULL) {
+                i = 2;
+                goto fail;
+            }
+        }
+        return NPY_NO_CASTING;
+    }
+
+    for (; i < nin + nout; i++) {
         if (given_descrs[i] != NULL) {
             output_descrs[i] = ensure_dtype_nbo(given_descrs[i]);
         }
@@ -146,7 +176,7 @@ simple_legacy_resolve_descriptors(
     return NPY_NO_CASTING;
 
   fail:
-    for (int i = 0; i < nin + nout; i++) {
+    for (; i >= 0; i--) {
         Py_CLEAR(output_descrs[i]);
     }
     return -1;
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 186f18a62..97dc9f00f 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -2717,11 +2717,11 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc,
         char *method)
 {
     /*
-     * Note that the `ops` is not realy correct.  But legacy resolution
+     * Note that the `ops` is not really correct.  But legacy resolution
      * cannot quite handle the correct ops (e.g. a NULL first item if `out`
-     * is NULL), and it should only matter in very strange cases.
+     * is NULL) so we pass `arr` instead in that case.
      */
-    PyArrayObject *ops[3] = {arr, arr, NULL};
+    PyArrayObject *ops[3] = {out ? out : arr, arr, out};
     /*
      * TODO: If `out` is not provided, arguably `initial` could define
      *       the first DType (and maybe also the out one), that way
@@ -2741,11 +2741,12 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc,
     }
 
     PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc,
-            ops, signature, operation_DTypes, NPY_FALSE, NPY_TRUE);
-    Py_DECREF(operation_DTypes[1]);
+            ops, signature, operation_DTypes, NPY_FALSE, NPY_FALSE, NPY_TRUE);
+    /* Output can currently get cleared, others XDECREF in case of error */
+    Py_XDECREF(operation_DTypes[1]);
     if (out != NULL) {
-        Py_DECREF(operation_DTypes[0]);
-        Py_DECREF(operation_DTypes[2]);
+        Py_XDECREF(operation_DTypes[0]);
+        Py_XDECREF(operation_DTypes[2]);
     }
     if (ufuncimpl == NULL) {
         return NULL;
@@ -2771,8 +2772,10 @@ reducelike_promote_and_resolve(PyUFuncObject *ufunc,
     if (out_descrs[0] != out_descrs[2] || (
             enforce_uniform_args && out_descrs[0] != out_descrs[1])) {
         PyErr_Format(PyExc_TypeError,
-                "the resolved dtypes are not compatible with %s.%s",
-                ufunc_get_name_cstr(ufunc), method);
+                "the resolved dtypes are not compatible with %s.%s. "
+                "Resolved (%R, %R, %R)",
+                ufunc_get_name_cstr(ufunc), method,
+                out_descrs[0], out_descrs[1], out_descrs[2]);
         goto fail;
     }
     /* TODO: This really should _not_ be unsafe casting (same above)! */
@@ -4852,7 +4855,8 @@ ufunc_generic_fastcall(PyUFuncObject *ufunc,
      */
     PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc,
             operands, signature,
-            operand_DTypes, force_legacy_promotion, allow_legacy_promotion);
+            operand_DTypes, force_legacy_promotion, allow_legacy_promotion,
+            NPY_FALSE);
     if (ufuncimpl == NULL) {
         goto fail;
     }
@@ -5190,6 +5194,61 @@ PyUFunc_FromFuncAndDataAndSignatureAndIdentity(PyUFuncGenericFunction *func, voi
 
         info = add_and_return_legacy_wrapping_ufunc_loop(ufunc, op_dtypes, 1);
         if (info == NULL) {
+            Py_DECREF(ufunc);
+            return NULL;
+        }
+    }
+
+    PyObject *promoter = NULL;
+    if (ufunc->ntypes == 1) {
+        npy_bool all_object = NPY_TRUE;
+        for (int i = 0; i < ufunc->nargs; i++) {
+            if (ufunc->types[i] != NPY_OBJECT) {
+                all_object = NPY_FALSE;
+                break;
+            }
+        }
+        if (all_object) {
+            promoter = PyCapsule_New(&object_only_ufunc_promoter,
+                    "numpy._ufunc_promoter", NULL);
+            if (promoter == NULL) {
+                Py_DECREF(ufunc);
+                return NULL;
+            }
+        }
+    }
+    if (promoter == NULL && ufunc->nin > 1) {
+        promoter = PyCapsule_New(&default_ufunc_promoter,
+                "numpy._ufunc_promoter", NULL);
+        if (promoter == NULL) {
+            Py_DECREF(ufunc);
+            return NULL;
+        }
+    }
+    if (promoter != NULL) {
+        /* Always install default promoter using the common DType */
+        PyObject *dtype_tuple = PyTuple_New(ufunc->nargs);
+        if (dtype_tuple == NULL) {
+            Py_DECREF(promoter);
+            Py_DECREF(ufunc);
+            return NULL;
+        }
+        for (int i = 0; i < ufunc->nargs; i++) {
+            Py_INCREF(Py_None);
+            PyTuple_SET_ITEM(dtype_tuple, i, Py_None);
+        }
+        PyObject *info = PyTuple_Pack(2, dtype_tuple, promoter);
+        Py_DECREF(dtype_tuple);
+        Py_DECREF(promoter);
+        if (info == NULL) {
+            Py_DECREF(ufunc);
+            return NULL;
+        }
+
+        int res = PyUFunc_AddLoop((PyUFuncObject *)ufunc, info, 0);
+        Py_DECREF(info);
+        if (res < 0) {
+            Py_DECREF(ufunc);
             return NULL;
         }
     }
@@ -5963,7 +6022,7 @@ ufunc_at(PyUFuncObject *ufunc, PyObject *args)
 
     PyArrayMethodObject *ufuncimpl = promote_and_get_ufuncimpl(ufunc,
             operands, signature, operand_DTypes,
-            force_legacy_promotion, allow_legacy_promotion);
+            force_legacy_promotion, allow_legacy_promotion, NPY_FALSE);
     if (ufuncimpl == NULL) {
         goto fail;
     }
diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py
index b95d669a8..50da7b800 100644
--- a/numpy/core/tests/test_datetime.py
+++ b/numpy/core/tests/test_datetime.py
@@ -2033,15 +2033,15 @@ class TestDateTime:
         # subtracting two datetime64 works, but we cannot reduce it, since
         # the result of that subtraction will have a different dtype.
         arr = np.array(["2021-12-02", "2019-05-12"], dtype="M8[ms]")
-        msg = r"the resolved dtypes are not compatible with subtract\."
+        msg = r"ufunc 'subtract' did not contain a loop with signature "
 
-        with pytest.raises(TypeError, match=msg + "reduce"):
+        with pytest.raises(TypeError, match=msg):
             np.subtract.reduce(arr)
 
-        with pytest.raises(TypeError, match=msg + "accumulate"):
+        with pytest.raises(TypeError, match=msg):
             np.subtract.accumulate(arr)
 
-        with pytest.raises(TypeError, match=msg + "reduceat"):
+        with pytest.raises(TypeError, match=msg):
             np.subtract.reduceat(arr, [0])
 
     def test_datetime_busday_offset(self):
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index ef0bac957..76e4cdcfd 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -1762,12 +1762,15 @@ class TestUfunc:
         result = _rational_tests.test_add(a, b)
         assert_equal(result, target)
 
-        # But since we use the old type resolver, this may not work
-        # for dtype variations unless the output dtype is given:
+        # This works even more generally, so long the default common-dtype
+        # promoter works out:
         result = _rational_tests.test_add(a, b.astype(np.uint16), out=c)
         assert_equal(result, target)
+
+        # But, it can be fooled, e.g. (use scalars, which forces legacy
+        # type resolution to kick in, which then fails):
         with assert_raises(TypeError):
-            _rational_tests.test_add(a, b.astype(np.uint16))
+            _rational_tests.test_add(a, np.uint16(2))
 
     def test_operand_flags(self):
         a = np.arange(16, dtype='l').reshape(4, 4)
@@ -2123,6 +2126,17 @@ class TestUfunc:
         c = np.array([1., 2.])
         assert_array_equal(ufunc(a, c), ufunc([True, True], True))
         assert ufunc.reduce(a) == True
+        # check that the output has no effect:
+        out = np.zeros(2, dtype=np.int32)
+        expected = ufunc([True, True], True).astype(out.dtype)
+        assert_array_equal(ufunc(a, c, out=out), expected)
+        out = np.zeros((), dtype=np.int32)
+        assert ufunc.reduce(a, out=out) == True
+        # Last check, test reduction when out and a match (the complexity here
+        # is that the "i,i->?" may seem right, but should not match.
+        a = np.array([3], dtype="i")
+        out = np.zeros((), dtype=a.dtype)
+        assert ufunc.reduce(a, out=out) == 1
 
     @pytest.mark.parametrize("ufunc",
              [np.logical_and, np.logical_or, np.logical_xor])
@@ -2134,6 +2148,49 @@ class TestUfunc:
             # It would be safe, but not equiv casting:
             ufunc(a, c, out=out, casting="equiv")
 
+    def test_reducelike_out_promotes(self):
+        # Check that the out argument to reductions is considered for
+        # promotion.  See also gh-20455.
+        # Note that these paths could prefer `initial=` in the future and
+        # do not up-cast to the default integer for add and prod
+        arr = np.ones(1000, dtype=np.uint8)
+        out = np.zeros((), dtype=np.uint16)
+        assert np.add.reduce(arr, out=out) == 1000
+        arr[:10] = 2
+        assert np.multiply.reduce(arr, out=out) == 2**10
+
+        # For legacy dtypes, the signature currently has to be forced if `out=`
+        # is passed.  The two paths below should differ, without `dtype=` the
+        # expected result should be: `np.prod(arr.astype("f8")).astype("f4")`!
+        arr = np.full(5, 2**25-1, dtype=np.int64)
+
+        # float32 and int64 promote to float64:
+        res = np.zeros((), dtype=np.float32)
+        # If `dtype=` is passed, the calculation is forced to float32:
+        single_res = np.zeros((), dtype=np.float32)
+        np.multiply.reduce(arr, out=single_res, dtype=np.float32)
+        assert single_res != res
+
+    def test_reducelike_output_needs_identical_cast(self):
+        # Checks the case where the we have a simple byte-swap works, maily
+        # tests that this is not rejected directly.
+        # (interesting because we require descriptor identity in reducelikes).
+        arr = np.ones(20, dtype="f8")
+        out = np.empty((), dtype=arr.dtype.newbyteorder())
+        expected = np.add.reduce(arr)
+        np.add.reduce(arr, out=out)
+        assert_array_equal(expected, out)
+        # Check reduceat:
+        out = np.empty(2, dtype=arr.dtype.newbyteorder())
+        expected = np.add.reduceat(arr, [0, 1])
+        np.add.reduceat(arr, [0, 1], out=out)
+        assert_array_equal(expected, out)
+        # And accumulate:
+        out = np.empty(arr.shape, dtype=arr.dtype.newbyteorder())
+        expected = np.add.accumulate(arr)
+        np.add.accumulate(arr, out=out)
+        assert_array_equal(expected, out)
+
     def test_reduce_noncontig_output(self):
         # Check that reduction deals with non-contiguous output arrays
         # appropriately.