author     Matti Picus <matti.picus@gmail.com>    2022-05-07 20:02:19 +0300
committer  GitHub <noreply@github.com>            2022-05-07 20:02:19 +0300
commit     37cb0f8463345648add0e1e98fd146c966146981 (patch)
tree       d2771e0f43118ef11abc5858bd124f18ce67d6c8
parent     e3a9e1a4188a27359950029be4d82d87c4a618b2 (diff)
parent     c2af30347d99d8bfcac23a7002b9229b829b9030 (diff)
download   numpy-37cb0f8463345648add0e1e98fd146c966146981.tar.gz
Merge pull request #21188 from seberg/scalar-math-rewrite
MAINT,ENH: Rewrite scalar math logic
-rw-r--r--  benchmarks/benchmarks/bench_scalar.py              |   34
-rw-r--r--  doc/release/upcoming_changes/21188.performance.rst |    8
-rw-r--r--  numpy/core/setup_common.py                         |    2
-rw-r--r--  numpy/core/src/multiarray/abstractdtypes.c         |    3
-rw-r--r--  numpy/core/src/multiarray/array_coercion.c         |   10
-rw-r--r--  numpy/core/src/multiarray/array_coercion.h         |    3
-rw-r--r--  numpy/core/src/multiarray/scalarapi.c              |    8
-rw-r--r--  numpy/core/src/umath/scalarmath.c.src              | 1254
-rw-r--r--  numpy/core/tests/test_scalarmath.py                |  179
9 files changed, 1010 insertions(+), 491 deletions(-)
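The benchmark additions below time mixed-dtype scalar arithmetic against the 0-d array (ufunc) path. A rough standalone equivalent of what they measure (illustrative only, not part of the commit; timing numbers are machine dependent) looks like:

    import timeit
    import numpy as np

    # Mixed scalar + scalar with differing dtypes takes the scalar fast path
    # when the result type matches one of the inputs.
    int32 = np.int32(2)
    other = np.float64(2)
    print(timeit.timeit(lambda: int32 + other, number=100_000))

    # A 0-d array + scalar goes through the full ufunc machinery instead.
    int32arr = np.array(2, dtype=np.int32)
    print(timeit.timeit(lambda: int32arr + other, number=100_000))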
diff --git a/benchmarks/benchmarks/bench_scalar.py b/benchmarks/benchmarks/bench_scalar.py index 219e48bed..650daa89d 100644 --- a/benchmarks/benchmarks/bench_scalar.py +++ b/benchmarks/benchmarks/bench_scalar.py @@ -10,6 +10,8 @@ class ScalarMath(Benchmark): param_names = ["type"] def setup(self, typename): self.num = np.dtype(typename).type(2) + self.int32 = np.int32(2) + self.int32arr = np.array(2, dtype=np.int32) def time_addition(self, typename): n = self.num @@ -31,3 +33,35 @@ class ScalarMath(Benchmark): n = self.num res = abs(abs(abs(abs(abs(abs(abs(abs(abs(abs(n)))))))))) + def time_add_int32_other(self, typename): + # Some mixed cases are fast, some are slow; this documents these + # differences. (As of this writing, it was fast if the type of the + # result is one of the inputs.) + int32 = self.int32 + other = self.num + int32 + other + int32 + other + int32 + other + int32 + other + int32 + other + + def time_add_int32arr_and_other(self, typename): + # `arr + scalar` hits the normal ufunc (array) paths. + int32 = self.int32arr + other = self.num + int32 + other + int32 + other + int32 + other + int32 + other + int32 + other + + def time_add_other_and_int32arr(self, typename): + # `scalar + arr` at some point hit scalar paths in some cases, and + # these paths could be optimized more easily. + int32 = self.int32arr + other = self.num + other + int32 + other + int32 + other + int32 + other + int32 + other + int32 diff --git a/doc/release/upcoming_changes/21188.performance.rst b/doc/release/upcoming_changes/21188.performance.rst new file mode 100644 index 000000000..ce497572a --- /dev/null +++ b/doc/release/upcoming_changes/21188.performance.rst @@ -0,0 +1,8 @@ +Faster operations on NumPy scalars +---------------------------------- +Many operations on NumPy scalars are now significantly faster, although +rare operations (e.g. with 0-D arrays rather than scalars) may be slower +in some cases. +However, even with these improvements, users who want the best performance +for their scalars may want to convert a known NumPy scalar into a Python +one using `scalar.item()`.
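The `scalar.item()` advice in the release note amounts to the following pattern; a minimal sketch (the loop body and counts are made up for illustration):

    import numpy as np

    x = np.float64(0.5)

    # Convert the NumPy scalar to a plain Python float once, outside the
    # hot loop; arithmetic on Python floats bypasses NumPy's scalar paths.
    xf = x.item()  # equivalently: float(x)

    total = 0.0
    for _ in range(10_000):
        total += xf * 2.0  # pure Python-float arithmetic from here on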
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py index 1143872b1..44fa0a651 100644 --- a/numpy/core/setup_common.py +++ b/numpy/core/setup_common.py @@ -183,6 +183,8 @@ OPTIONAL_FUNCTION_ATTRIBUTES = [('__attribute__((optimize("unroll-loops")))', 'attribute_optimize_unroll_loops'), ('__attribute__((optimize("O3")))', 'attribute_optimize_opt_3'), + ('__attribute__((optimize("O2")))', + 'attribute_optimize_opt_2'), ('__attribute__((nonnull (1)))', 'attribute_nonnull'), ('__attribute__((target ("avx")))', diff --git a/numpy/core/src/multiarray/abstractdtypes.c b/numpy/core/src/multiarray/abstractdtypes.c index cc1d7fad8..b0345c46b 100644 --- a/numpy/core/src/multiarray/abstractdtypes.c +++ b/numpy/core/src/multiarray/abstractdtypes.c @@ -259,6 +259,7 @@ NPY_NO_EXPORT PyArray_DTypeMeta PyArray_PyIntAbstractDType = {{{ .tp_name = "numpy._IntegerAbstractDType", },}, .flags = NPY_DT_ABSTRACT, + .type_num = -1, .dt_slots = &pyintabstractdtype_slots, }; @@ -276,6 +277,7 @@ NPY_NO_EXPORT PyArray_DTypeMeta PyArray_PyFloatAbstractDType = {{{ .tp_name = "numpy._FloatAbstractDType", },}, .flags = NPY_DT_ABSTRACT, + .type_num = -1, .dt_slots = &pyfloatabstractdtype_slots, }; @@ -293,5 +295,6 @@ NPY_NO_EXPORT PyArray_DTypeMeta PyArray_PyComplexAbstractDType = {{{ .tp_name = "numpy._ComplexAbstractDType", },}, .flags = NPY_DT_ABSTRACT, + .type_num = -1, .dt_slots = &pycomplexabstractdtype_slots, }; diff --git a/numpy/core/src/multiarray/array_coercion.c b/numpy/core/src/multiarray/array_coercion.c index ff77d6883..562e4f008 100644 --- a/numpy/core/src/multiarray/array_coercion.c +++ b/numpy/core/src/multiarray/array_coercion.c @@ -230,6 +230,16 @@ npy_discover_dtype_from_pytype(PyTypeObject *pytype) return (PyArray_DTypeMeta *)DType; } +/* + * Note: This function never fails, but will return `NULL` for unknown scalars + * and `None` for known array-likes (e.g. tuple, list, ndarray). + */ +NPY_NO_EXPORT PyObject * +PyArray_DiscoverDTypeFromScalarType(PyTypeObject *pytype) +{ + return (PyObject *)npy_discover_dtype_from_pytype(pytype); +} + /** * Find the correct DType class for the given python type. If flags is NULL diff --git a/numpy/core/src/multiarray/array_coercion.h b/numpy/core/src/multiarray/array_coercion.h index f2482cecc..63d543cf7 100644 --- a/numpy/core/src/multiarray/array_coercion.h +++ b/numpy/core/src/multiarray/array_coercion.h @@ -19,6 +19,9 @@ NPY_NO_EXPORT int _PyArray_MapPyTypeToDType( PyArray_DTypeMeta *DType, PyTypeObject *pytype, npy_bool userdef); +NPY_NO_EXPORT PyObject * +PyArray_DiscoverDTypeFromScalarType(PyTypeObject *pytype); + NPY_NO_EXPORT int PyArray_Pack(PyArray_Descr *descr, char *item, PyObject *value); diff --git a/numpy/core/src/multiarray/scalarapi.c b/numpy/core/src/multiarray/scalarapi.c index 8ed91d26c..40f1da2c4 100644 --- a/numpy/core/src/multiarray/scalarapi.c +++ b/numpy/core/src/multiarray/scalarapi.c @@ -312,6 +312,14 @@ PyArray_FromScalar(PyObject *scalar, PyArray_Descr *outcode) NPY_NO_EXPORT PyObject * PyArray_ScalarFromObject(PyObject *object) { + if (DEPRECATE( + "PyArray_ScalarFromObject() is deprecated and scheduled for " + "removal. If you are using this (undocumented) function, " + "please notify the NumPy developers to look for solutions." 
+ "(Deprecated in NumPy 1.23)") < 0) { + return NULL; + } + PyObject *ret = NULL; if (PyArray_IsZeroDim(object)) { diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src index 402e6b561..f273cdd91 100644 --- a/numpy/core/src/umath/scalarmath.c.src +++ b/numpy/core/src/umath/scalarmath.c.src @@ -26,6 +26,13 @@ #include "binop_override.h" #include "npy_longdouble.h" +#include "array_coercion.h" +#include "common.h" +#include "can_cast_table.h" + +/* TODO: Used for some functions, should possibly move these to npy_math.h */ +#include "loops.h" + /* Basic operations: * * BINARY: @@ -45,23 +52,22 @@ * #name = byte, short, int, long, longlong# * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# */ -static void +static NPY_INLINE int @name@_ctype_add(@type@ a, @type@ b, @type@ *out) { *out = a + b; if ((*out^a) >= 0 || (*out^b) >= 0) { - return; + return 0; } - npy_set_floatstatus_overflow(); - return; + return NPY_FPE_OVERFLOW; } -static void + +static NPY_INLINE int @name@_ctype_subtract(@type@ a, @type@ b, @type@ *out) { *out = a - b; if ((*out^a) >= 0 || (*out^~b) >= 0) { - return; + return 0; } - npy_set_floatstatus_overflow(); - return; + return NPY_FPE_OVERFLOW; } /**end repeat**/ @@ -69,23 +75,22 @@ static void * #name = ubyte, ushort, uint, ulong, ulonglong# * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong# */ -static void +static NPY_INLINE int @name@_ctype_add(@type@ a, @type@ b, @type@ *out) { *out = a + b; if (*out >= a && *out >= b) { - return; + return 0; } - npy_set_floatstatus_overflow(); - return; + return NPY_FPE_OVERFLOW; } -static void + +static NPY_INLINE int @name@_ctype_subtract(@type@ a, @type@ b, @type@ *out) { *out = a - b; if (a >= b) { - return; + return 0; } - npy_set_floatstatus_overflow(); - return; + return NPY_FPE_OVERFLOW; } /**end repeat**/ @@ -108,18 +113,19 @@ static void * #neg = (1,0)*4# */ #if NPY_SIZEOF_@SIZE@ > NPY_SIZEOF_@SIZENAME@ -static void +static NPY_INLINE int @name@_ctype_multiply(@type@ a, @type@ b, @type@ *out) { @big@ temp; temp = ((@big@) a) * ((@big@) b); *out = (@type@) temp; #if @neg@ - if (temp > NPY_MAX_@NAME@ || temp < NPY_MIN_@NAME@) + if (temp > NPY_MAX_@NAME@ || temp < NPY_MIN_@NAME@) { #else - if (temp > NPY_MAX_@NAME@) + if (temp > NPY_MAX_@NAME@) { #endif - npy_set_floatstatus_overflow(); - return; + return NPY_FPE_OVERFLOW; + } + return 0; } #endif /**end repeat**/ @@ -133,12 +139,12 @@ static void * #SIZE = INT*2, LONG*2, LONGLONG*2# */ #if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_@SIZE@ -static void +static NPY_INLINE int @name@_ctype_multiply(@type@ a, @type@ b, @type@ *out) { if (npy_mul_with_overflow_@name@(out, a, b)) { - npy_set_floatstatus_overflow(); + return NPY_FPE_OVERFLOW; } - return; + return 0; } #endif /**end repeat**/ @@ -151,16 +157,16 @@ static void * npy_long, npy_ulong, npy_longlong, npy_ulonglong# * #neg = (1,0)*5# */ -static void +static NPY_INLINE int @name@_ctype_divide(@type@ a, @type@ b, @type@ *out) { if (b == 0) { - npy_set_floatstatus_divbyzero(); *out = 0; + return NPY_FPE_DIVIDEBYZERO; } #if @neg@ else if (b == -1 && a < 0 && a == -a) { - npy_set_floatstatus_overflow(); *out = a / b; + return NPY_FPE_OVERFLOW; } #endif else { @@ -174,17 +180,20 @@ static void #else *out = a / b; #endif + return 0; } } #define @name@_ctype_floor_divide @name@_ctype_divide -static void +static NPY_INLINE int @name@_ctype_remainder(@type@ a, @type@ b, @type@ *out) { if (a == 0 || b == 0) { - if (b == 0) npy_set_floatstatus_divbyzero(); *out = 0; - return; + 
if (b == 0) { + return NPY_FPE_DIVIDEBYZERO; + } + return 0; } #if @neg@ else if ((a > 0) == (b > 0)) { @@ -198,6 +207,7 @@ static void #else *out = a % b; #endif + return 0; } /**end repeat**/ @@ -205,10 +215,15 @@ static void * * #name = byte, ubyte, short, ushort, int, uint, long, * ulong, longlong, ulonglong# - * #otyp = npy_float*4, npy_double*6# */ -#define @name@_ctype_true_divide(a, b, out) \ - *(out) = ((@otyp@) (a)) / ((@otyp@) (b)); + +static NPY_INLINE int +@name@_ctype_true_divide(npy_@name@ a, npy_@name@ b, npy_double *out) +{ + *out = (npy_double)a / (npy_double)b; + return 0; +} + /**end repeat**/ /* b will always be positive in this call */ @@ -221,17 +236,17 @@ static void * #upc = BYTE, UBYTE, SHORT, USHORT, INT, UINT, * LONG, ULONG, LONGLONG, ULONGLONG# */ -static void +static NPY_INLINE int @name@_ctype_power(@type@ a, @type@ b, @type@ *out) { @type@ tmp; if (b == 0) { *out = 1; - return; + return 0; } if (a == 1) { *out = 1; - return; + return 0; } tmp = b & 1 ? a : 1; @@ -244,6 +259,7 @@ static void b >>= 1; } *out = tmp; + return 0; } /**end repeat**/ @@ -261,12 +277,28 @@ static void * #op = &, ^, |# */ -#define @name@_ctype_@oper@(arg1, arg2, out) *(out) = (arg1) @op@ (arg2) +static NPY_INLINE int +@name@_ctype_@oper@(@type@ arg1, @type@ arg2, @type@ *out) +{ + *out = arg1 @op@ arg2; + return 0; +} /**end repeat1**/ -#define @name@_ctype_lshift(arg1, arg2, out) *(out) = npy_lshift@suffix@(arg1, arg2) -#define @name@_ctype_rshift(arg1, arg2, out) *(out) = npy_rshift@suffix@(arg1, arg2) +static NPY_INLINE int +@name@_ctype_lshift(@type@ arg1, @type@ arg2, @type@ *out) +{ + *out = npy_lshift@suffix@(arg1, arg2); + return 0; +} + +static NPY_INLINE int +@name@_ctype_rshift(@type@ arg1, @type@ arg2, @type@ *out) +{ + *out = npy_rshift@suffix@(arg1, arg2); + return 0; +} /**end repeat**/ @@ -275,135 +307,162 @@ static void * #type = npy_float, npy_double, npy_longdouble# * #c = f, , l# */ -#define @name@_ctype_add(a, b, outp) *(outp) = (a) + (b) -#define @name@_ctype_subtract(a, b, outp) *(outp) = (a) - (b) -#define @name@_ctype_multiply(a, b, outp) *(outp) = (a) * (b) -#define @name@_ctype_divide(a, b, outp) *(outp) = (a) / (b) + +/**begin repeat1 + * #OP = +, -, *, /# + * #oper = add, subtract, multiply, divide# + */ + +static NPY_INLINE int +@name@_ctype_@oper@(@type@ a, @type@ b, @type@ *out) +{ + *out = a @OP@ b; + return 0; +} + +/**end repeat1**/ + #define @name@_ctype_true_divide @name@_ctype_divide -static void +static NPY_INLINE int @name@_ctype_floor_divide(@type@ a, @type@ b, @type@ *out) { *out = npy_floor_divide@c@(a, b); + return 0; } -static void +static NPY_INLINE int @name@_ctype_remainder(@type@ a, @type@ b, @type@ *out) { *out = npy_remainder@c@(a, b); + return 0; } -static void +static NPY_INLINE int @name@_ctype_divmod(@type@ a, @type@ b, @type@ *out1, @type@ *out2) { *out1 = npy_divmod@c@(a, b, out2); + return 0; } /**end repeat**/ -#define half_ctype_add(a, b, outp) *(outp) = \ - npy_float_to_half(npy_half_to_float(a) + npy_half_to_float(b)) -#define half_ctype_subtract(a, b, outp) *(outp) = \ - npy_float_to_half(npy_half_to_float(a) - npy_half_to_float(b)) -#define half_ctype_multiply(a, b, outp) *(outp) = \ - npy_float_to_half(npy_half_to_float(a) * npy_half_to_float(b)) -#define half_ctype_divide(a, b, outp) *(outp) = \ - npy_float_to_half(npy_half_to_float(a) / npy_half_to_float(b)) +/**begin repeat + * #OP = +, -, *, /# + * #oper = add, subtract, multiply, divide# + */ + +static NPY_INLINE int +half_ctype_@oper@(npy_half a, npy_half b, 
npy_half *out) +{ + float res = npy_half_to_float(a) @OP@ npy_half_to_float(b); + *out = npy_float_to_half(res); + return 0; +} + +/**end repeat**/ #define half_ctype_true_divide half_ctype_divide -static void -half_ctype_floor_divide(npy_half a, npy_half b, npy_half *out) { +static NPY_INLINE int +half_ctype_floor_divide(npy_half a, npy_half b, npy_half *out) +{ npy_half mod; if (!b) { - *out = a / b; - } else { + float res = npy_half_to_float(a) / npy_half_to_float(b); + *out = npy_float_to_half(res); + } + else { *out = npy_half_divmod(a, b, &mod); } + return 0; } -static void -half_ctype_remainder(npy_half a, npy_half b, npy_half *out) { +static NPY_INLINE int +half_ctype_remainder(npy_half a, npy_half b, npy_half *out) +{ npy_half_divmod(a, b, out); + return 0; } -static void -half_ctype_divmod(npy_half a, npy_half b, npy_half *out1, npy_half *out2) { +static NPY_INLINE int +half_ctype_divmod(npy_half a, npy_half b, npy_half *out1, npy_half *out2) +{ *out1 = npy_half_divmod(a, b, out2); + return 0; } /**begin repeat * #name = cfloat, cdouble, clongdouble# + * #type = npy_cfloat, npy_cdouble, npy_clongdouble# + * #TYPE = CFLOAT, CDOUBLE, CLONGDOUBLE# * #rname = float, double, longdouble# * #rtype = npy_float, npy_double, npy_longdouble# * #c = f,,l# */ -#define @name@_ctype_add(a, b, outp) do{ \ - (outp)->real = (a).real + (b).real; \ - (outp)->imag = (a).imag + (b).imag; \ - } while(0) -#define @name@_ctype_subtract(a, b, outp) do{ \ - (outp)->real = (a).real - (b).real; \ - (outp)->imag = (a).imag - (b).imag; \ - } while(0) -#define @name@_ctype_multiply(a, b, outp) do{ \ - (outp)->real = (a).real * (b).real - (a).imag * (b).imag; \ - (outp)->imag = (a).real * (b).imag + (a).imag * (b).real; \ - } while(0) -/* Algorithm identical to that in loops.c.src, for consistency */ -#define @name@_ctype_divide(a, b, outp) do{ \ - @rtype@ in1r = (a).real; \ - @rtype@ in1i = (a).imag; \ - @rtype@ in2r = (b).real; \ - @rtype@ in2i = (b).imag; \ - @rtype@ in2r_abs = npy_fabs@c@(in2r); \ - @rtype@ in2i_abs = npy_fabs@c@(in2i); \ - if (in2r_abs >= in2i_abs) { \ - if (in2r_abs == 0 && in2i_abs == 0) { \ - /* divide by zero should yield a complex inf or nan */ \ - (outp)->real = in1r/in2r_abs; \ - (outp)->imag = in1i/in2i_abs; \ - } \ - else { \ - @rtype@ rat = in2i/in2r; \ - @rtype@ scl = 1.0@c@/(in2r + in2i*rat); \ - (outp)->real = (in1r + in1i*rat)*scl; \ - (outp)->imag = (in1i - in1r*rat)*scl; \ - } \ - } \ - else { \ - @rtype@ rat = in2r/in2i; \ - @rtype@ scl = 1.0@c@/(in2i + in2r*rat); \ - (outp)->real = (in1r*rat + in1i)*scl; \ - (outp)->imag = (in1i*rat - in1r)*scl; \ - } \ - } while(0) +static NPY_INLINE int +@name@_ctype_add(@type@ a, @type@ b, @type@ *out) +{ + out->real = a.real + b.real; + out->imag = a.imag + b.imag; + return 0; +} + +static NPY_INLINE int +@name@_ctype_subtract(@type@ a, @type@ b, @type@ *out) +{ + out->real = a.real - b.real; + out->imag = a.imag - b.imag; + return 0; +} + + +/* + * TODO: Mark as to work around FPEs not being issues on clang 12. + * This should be removed when possible. 
+ */ +static NPY_INLINE int +@name@_ctype_multiply( @type@ a, @type@ b, @type@ *out) +{ + out->real = a.real * b.real - a.imag * b.imag; + out->imag = a.real * b.imag + a.imag * b.real; + return 0; +} + +/* Use the ufunc loop directly to avoid duplicating the complicated logic */ +static NPY_INLINE int +@name@_ctype_divide(@type@ a, @type@ b, @type@ *out) +{ + char *args[3] = {(char *)&a, (char *)&b, (char *)out}; + npy_intp steps[3]; + npy_intp size = 1; + @TYPE@_divide(args, &size, steps, NULL); + return 0; +} #define @name@_ctype_true_divide @name@_ctype_divide -#define @name@_ctype_floor_divide(a, b, outp) do { \ - @rname@_ctype_floor_divide( \ - ((a).real*(b).real + (a).imag*(b).imag), \ - ((b).real*(b).real + (b).imag*(b).imag), \ - &((outp)->real)); \ - (outp)->imag = 0; \ - } while(0) /**end repeat**/ /**begin repeat * #name = byte, ubyte, short, ushort, int, uint, long, ulong, - * longlong, ulonglong, cfloat, cdouble, clongdouble# + * longlong, ulonglong# */ -#define @name@_ctype_divmod(a, b, out, out2) { \ - @name@_ctype_floor_divide(a, b, out); \ - @name@_ctype_remainder(a, b, out2); \ - } + +static NPY_INLINE int +@name@_ctype_divmod(npy_@name@ a, npy_@name@ b, npy_@name@ *out, npy_@name@ *out2) +{ + int res = @name@_ctype_floor_divide(a, b, out); + res |= @name@_ctype_remainder(a, b, out2); + return res; +} + /**end repeat**/ @@ -413,20 +472,22 @@ half_ctype_divmod(npy_half a, npy_half b, npy_half *out1, npy_half *out2) { * #c = f,,l# */ -static void +static NPY_INLINE int @name@_ctype_power(@type@ a, @type@ b, @type@ *out) { *out = npy_pow@c@(a, b); + return 0; } /**end repeat**/ -static void +static NPY_INLINE int half_ctype_power(npy_half a, npy_half b, npy_half *out) { const npy_float af = npy_half_to_float(a); const npy_float bf = npy_half_to_float(b); const npy_float outf = npy_powf(af,bf); *out = npy_float_to_half(outf); + return 0; } /**begin repeat @@ -438,20 +499,23 @@ half_ctype_power(npy_half a, npy_half b, npy_half *out) * npy_float, npy_double, npy_longdouble# * #uns = (0,1)*5,0*3# */ -static void +static NPY_INLINE int @name@_ctype_negative(@type@ a, @type@ *out) { + *out = -a; #if @uns@ - npy_set_floatstatus_overflow(); + return NPY_FPE_OVERFLOW; +#else + return 0; #endif - *out = -a; } /**end repeat**/ -static void +static NPY_INLINE int half_ctype_negative(npy_half a, npy_half *out) { *out = a^0x8000u; + return 0; } @@ -459,11 +523,12 @@ half_ctype_negative(npy_half a, npy_half *out) * #name = cfloat, cdouble, clongdouble# * #type = npy_cfloat, npy_cdouble, npy_clongdouble# */ -static void +static NPY_INLINE int @name@_ctype_negative(@type@ a, @type@ *out) { out->real = -a.real; out->imag = -a.imag; + return 0; } /**end repeat**/ @@ -475,10 +540,11 @@ static void * npy_long, npy_ulong, npy_longlong, npy_ulonglong, * npy_half, npy_float, npy_double, npy_longdouble# */ -static void +static NPY_INLINE int @name@_ctype_positive(@type@ a, @type@ *out) { *out = a; + return 0; } /**end repeat**/ @@ -487,17 +553,19 @@ static void * #type = npy_cfloat, npy_cdouble, npy_clongdouble# * #c = f,,l# */ -static void +static NPY_INLINE int @name@_ctype_positive(@type@ a, @type@ *out) { out->real = a.real; out->imag = a.imag; + return 0; } -static void +static NPY_INLINE int @name@_ctype_power(@type@ a, @type@ b, @type@ *out) { *out = npy_cpow@c@(a, b); + return 0; } /**end repeat**/ @@ -515,10 +583,11 @@ static void * #name = byte, short, int, long, longlong# * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong# */ -static void +static NPY_INLINE int 
@name@_ctype_absolute(@type@ a, @type@ *out) { *out = (a < 0 ? -a : a); + return 0; } /**end repeat**/ @@ -527,17 +596,19 @@ static void * #type = npy_float, npy_double, npy_longdouble# * #c = f,,l# */ -static void +static NPY_INLINE int @name@_ctype_absolute(@type@ a, @type@ *out) { *out = npy_fabs@c@(a); + return 0; } /**end repeat**/ -static void +static NPY_INLINE int half_ctype_absolute(npy_half a, npy_half *out) { *out = a&0x7fffu; + return 0; } /**begin repeat * #name = cfloat, cdouble, clongdouble# * #type = npy_cfloat, npy_cdouble, npy_clongdouble# * #rtype = npy_float, npy_double, npy_longdouble# * #c = f,,l# */ -static void +static NPY_INLINE int @name@_ctype_absolute(@type@ a, @rtype@ *out) { *out = npy_cabs@c@(a); + return 0; } /**end repeat**/ @@ -558,196 +630,400 @@ static void * ulong, longlong, ulonglong# */ -#define @name@_ctype_invert(a, out) *(out) = ~a; +static NPY_INLINE int +@name@_ctype_invert(npy_@name@ a, npy_@name@ *out) +{ + *out = ~a; + return 0; +} /**end repeat**/ /*** END OF BASIC CODE **/ -/* The general strategy for commutative binary operators is to +/* + * How binary operators work + * ------------------------- + * + * All binary (numeric) operators use the larger of the two types, with the + * exception of unsigned int and signed int mixed cases which must promote + * to a larger type. + * + * The strategy employed for all binary operations is that we coerce the other + * scalar if it is safe to do. E.g. for `float64 + float32` the `float64` can + * convert the `float32` and do the operation as `float64 + float64`. + * OTOH, for `float32 + float64` it is not safe, and we should defer to `float64`. + * + * So we have multiple possible paths: + * - The other scalar is a subclass. In principle *both* inputs could be + * different subclasses. In this case it would make sense to defer, but + * Python's `int` does not try this either, so we do not here: + * + * class A(int): pass + * class B(int): + * def __add__(self, other): return "b" + * __radd__ = __add__ + * + * A(1) + B(1) # returns 2 + * B(1) + A(1) # returns "b" + * + * - The other scalar can be converted: All is good, we do the operation. + * - The other scalar cannot be converted; there are two possibilities: + * - The reverse should work, so we return NotImplemented to defer. + * (If self is a subclass, this will end up in the "unknown" path.) + * - Neither works (e.g. `uint8 + int8`): We currently use the array path. + * - The other object is an unknown. It could be either a scalar, an array, + * or an array-like (including a list!). Because NumPy scalars pretend to be + * arrays, we fall into the array fallback path here _normally_ (through + * the generic scalar path). + * First we check if we should defer, though. + * + * The last possibility is awkward and leads to very confusing situations. + * The problem is that usually we should defer (return NotImplemented) + * in that path. + * If the other object is a NumPy array (or array-like) it will know what to + * do. If NumPy knows that it is a scalar (not generic `object`), then it + * would make sense to try and use the "array path" (i.e. deal with it + * using the ufunc machinery). + * + * But this overlooks two things that currently work: + * + * 1. `np.float64(3) * [1, 2, 3]` happily returns an array result. + * 2. `np.int32(3) * decimal.Decimal(3)` works! (see below) + * + * The first must work, because scalars pretend to be arrays, which means + * they inherit the greedy "convert the other object to an array" logic. + * This may be a questionable choice, but is fine.
+ * (As of now, it is not negotiable, since NumPy often converts 0-D arrays + * to scalars.) + * + * The second one is more confusing. This also works by using the ufunc + * machinery (array path), but it works because: + * + * np.add(np.int32(3), decimal.Decimal(3)) + * + * will convert the `int32` to an int32 array, and the decimal to an object + * array. It then *casts* the `int32` array to an object array. + * The casting step CONVERTS the integer to a Python integer. The ufunc object + * loop will then call back into Python scalar logic. - * - * 1) Convert the types to the common type if both are scalars (0 return) - * 2) If both are not scalars use ufunc machinery (-2 return) - * 3) If both are scalars but cannot be cast to the right type - * return NotImplemented (-1 return) + * The above would be recursive, if it were not for the conversion of the int32 + * to a Python integer! + * This leads us to the EXCEEDINGLY IMPORTANT special case: - * - * 4) Perform the function on the C-type. - * 5) If an error condition occurred, check to see - * what the current error-handling is and handle the error. + * WARNING: longdouble and clongdouble do NOT convert to a Python scalar + * when cast to object. Thus they MUST NEVER take the array-path. + * However, they STILL should defer at least for + * `np.longdouble(3) + array`. - * - * 6) Construct and return the output scalar. + * + * As a general note, in the above we defer exactly when we know that deferring + * will work. `longdouble` uses the "simple" logic of generally deferring + * though, because it would otherwise easily run into an infinite recursion. + * + * + * The future?! + * ------------ + * + * This is very tricky and it would be nice to formalize away that "recursive" + * path we currently use. I (seberg) currently have no great idea on this; + * this is more brainstorming! + * + * If both are scalars (known to NumPy), they have a DType and we may be able + * to do the ufunc promotion to make sure there is no risk of recursion. + * + * In principle always deferring would probably be clean. But we likely cannot + * do that? There is also an issue that it is nice that we allow adding a + * DType for an existing Python scalar (which will not know about NumPy + * scalars). + * The DType/ufunc machinery teaches NumPy how arrays will work with that + * Python scalar, but the DType may need to help us decide whether we should + * defer (return NotImplemented) or try using the ufunc machinery (or a + * simplified ufunc-like machinery limited to scalars). */ + +/* + * Enum used to describe the space of possibilities when converting the second + * argument to a binary operation. + */ +typedef enum { + /* An error occurred (should not really happen/be possible) */ + CONVERSION_ERROR = -1, + /* A known NumPy scalar, but of higher precision: we defer */ + DEFER_TO_OTHER_KNOWN_SCALAR, + /* Conversion was successful (known scalar of less precision) */ + CONVERSION_SUCCESS, + /* + * Other object is an unknown scalar or array-like; we (typically) use + * the generic path, which normally ends up in the ufunc machinery. + */ + OTHER_IS_UNKNOWN_OBJECT, + /* + * Promotion necessary + */ + PROMOTION_REQUIRED, + /* + * The other object may be a subclass, conversion is successful.
We do + * not special case this as Python's `int` does not either + */ + OTHER_IS_SUBCLASS, +} conversion_result; + /**begin repeat * #name = byte, ubyte, short, ushort, int, uint, * long, ulong, longlong, ulonglong, - * half, float, longdouble, + * half, float, double, longdouble, * cfloat, cdouble, clongdouble# - * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, - * npy_long, npy_ulong, npy_longlong, npy_ulonglong, - * npy_half, npy_float, npy_longdouble, - * npy_cfloat, npy_cdouble, npy_clongdouble# * #Name = Byte, UByte, Short, UShort, Int, UInt, * Long, ULong, LongLong, ULongLong, - * Half, Float, LongDouble, + * Half, Float, Double, LongDouble, * CFloat, CDouble, CLongDouble# - * #TYPE = NPY_BYTE, NPY_UBYTE, NPY_SHORT, NPY_USHORT, NPY_INT, NPY_UINT, - * NPY_LONG, NPY_ULONG, NPY_LONGLONG, NPY_ULONGLONG, - * NPY_HALF, NPY_FLOAT, NPY_LONGDOUBLE, - * NPY_CFLOAT, NPY_CDOUBLE, NPY_CLONGDOUBLE# + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG, + * HALF, FLOAT, DOUBLE, LONGDOUBLE, + * CFLOAT, CDOUBLE, CLONGDOUBLE# + * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, + * npy_long, npy_ulong, npy_longlong, npy_ulonglong, + * npy_half, npy_float, npy_double, npy_longdouble, + * npy_cfloat, npy_cdouble, npy_clongdouble# */ -static int -_@name@_convert_to_ctype(PyObject *a, @type@ *arg1) -{ - PyObject *temp; - - if (PyArray_IsScalar(a, @Name@)) { - *arg1 = PyArrayScalar_VAL(a, @Name@); - return 0; - } - else if (PyArray_IsScalar(a, Generic)) { - PyArray_Descr *descr1; +#define IS_@TYPE@ 1 - if (!PyArray_IsScalar(a, Number)) { - return -1; - } - descr1 = PyArray_DescrFromTypeObject((PyObject *)Py_TYPE(a)); - if (PyArray_CanCastSafely(descr1->type_num, @TYPE@)) { - PyArray_CastScalarDirect(a, descr1, arg1, @TYPE@); - Py_DECREF(descr1); - return 0; - } - else { - Py_DECREF(descr1); - return -1; - } - } - else if (PyArray_GetPriority(a, NPY_PRIORITY) > NPY_PRIORITY) { - return -2; - } - else if ((temp = PyArray_ScalarFromObject(a)) != NULL) { - int retval = _@name@_convert_to_ctype(temp, arg1); +#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO] - Py_DECREF(temp); - return retval; - } - return -2; -} +/* + * TODO: This whole thing is awkward, and we should create a helper header to + * define inline functions that convert single elements for all numeric + * types. That could then also be used to define all cast loops. + * (Even if that may get more complex for SIMD at some point.) + * For now, half casts could be optimized because of that. + */ -/**end repeat**/ +#if defined(IS_HALF) + #define CONVERT_TO_RESULT(value) \ + *result = npy_float_to_half((float)(value)) +#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE) + #define CONVERT_TO_RESULT(value) \ + result->real = value; \ + result->imag = 0 +#else + #define CONVERT_TO_RESULT(value) *result = value +#endif -/* Same as above but added exact checks against known python types for speed */ +#define GET_VALUE_OR_DEFER(OTHER, Other, value) \ + case NPY_##OTHER: \ + if (IS_SAFE(NPY_##OTHER, NPY_@TYPE@)) { \ + assert(Py_TYPE(value) == &Py##Other##ArrType_Type); \ + CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other)); \ + ret = CONVERSION_SUCCESS; \ + } \ + else if (IS_SAFE(NPY_@TYPE@, NPY_##OTHER)) { \ + /* + * If self can cast safely to other, this is clear: + * we should definitely defer. 
+ */ \ + ret = DEFER_TO_OTHER_KNOWN_SCALAR; \ + } \ + else { \ + /* Otherwise, we must promote */ \ + ret = PROMOTION_REQUIRED; \ + } \ + break; -/* - * #name = double# - * #type = npy_double# - * #Name = Double# - * #TYPE = NPY_DOUBLE# - * #PYCHECKEXACT = PyFloat_CheckExact# - * #PYEXTRACTCTYPE = PyFloat_AS_DOUBLE# +/* + * Complex to complex (and rejecting complex to real) is a bit different: */ -static int -_@name@_convert_to_ctype(PyObject *a, @type@ *arg1) -{ - PyObject *temp; - - if (@PYCHECKEXACT@(a)){ - *arg1 = @PYEXTRACTCTYPE@(a); - return 0; - } - - if (PyArray_IsScalar(a, @Name@)) { - *arg1 = PyArrayScalar_VAL(a, @Name@); - return 0; - } - else if (PyArray_IsScalar(a, Generic)) { - PyArray_Descr *descr1; +#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE) + +#define GET_CVALUE_OR_DEFER(OTHER, Other, value) \ + case NPY_##OTHER: \ + if (IS_SAFE(NPY_##OTHER, NPY_@TYPE@)) { \ + assert(Py_TYPE(value) == &Py##Other##ArrType_Type); \ + result->real = PyArrayScalar_VAL(value, Other).real; \ + result->imag = PyArrayScalar_VAL(value, Other).imag; \ + ret = 1; \ + } \ + else if (IS_SAFE(NPY_@TYPE@, NPY_##OTHER)) { \ + ret = DEFER_TO_OTHER_KNOWN_SCALAR; \ + } \ + else { \ + ret = PROMOTION_REQUIRED; \ + } \ + break; - if (!PyArray_IsScalar(a, Number)) { - return -1; - } - descr1 = PyArray_DescrFromTypeObject((PyObject *)Py_TYPE(a)); - if (PyArray_CanCastSafely(descr1->type_num, @TYPE@)) { - PyArray_CastScalarDirect(a, descr1, arg1, @TYPE@); - Py_DECREF(descr1); - return 0; - } - else { - Py_DECREF(descr1); - return -1; - } - } - else if (PyArray_GetPriority(a, NPY_PRIORITY) > NPY_PRIORITY) { - return -2; - } - else if ((temp = PyArray_ScalarFromObject(a)) != NULL) { - int retval = _@name@_convert_to_ctype(temp, arg1); +#else - Py_DECREF(temp); - return retval; - } - return -2; -} +/* Getting a complex value to real is never safe: */ +#define GET_CVALUE_OR_DEFER(OTHER, Other, value) \ + case NPY_##OTHER: \ + if (IS_SAFE(NPY_@TYPE@, NPY_##OTHER)) { \ + ret = DEFER_TO_OTHER_KNOWN_SCALAR; \ + } \ + else { \ + ret = PROMOTION_REQUIRED; \ + } \ + break; -/**end repeat**/ +#endif -/**begin repeat - * #name = byte, ubyte, short, ushort, int, uint, - * long, ulong, longlong, ulonglong, - * half, float, double, cfloat, cdouble# - * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, - * npy_long, npy_ulong, npy_longlong, npy_ulonglong, - * npy_half, npy_float, npy_double, npy_cfloat, npy_cdouble# +/** + * Convert the value to its own type and store the result. + * + * @param value The value to convert (if compatible) + * @param result The result value (output) + * @result The result value indicating what we did with `value` or what type + * of object it is (see `conversion_result`). */ -static int -_@name@_convert2_to_ctypes(PyObject *a, @type@ *arg1, - PyObject *b, @type@ *arg2) +static NPY_INLINE conversion_result +convert_to_@name@(PyObject *value, @type@ *result) { - int ret; - ret = _@name@_convert_to_ctype(a, arg1); - if (ret < 0) { - return ret; + if (Py_TYPE(value) == &Py@Name@ArrType_Type) { + *result = PyArrayScalar_VAL(value, @Name@); + return CONVERSION_SUCCESS; } - ret = _@name@_convert_to_ctype(b, arg2); - if (ret < 0) { - return ret; + /* Optimize the identical scalar specifically. */ + if (PyArray_IsScalar(value, @Name@)) { + *result = PyArrayScalar_VAL(value, @Name@); + /* + * In principle, special asymmetric handling could be possible for + * subclasses. But in practice we do not even bother later.
+ */ + return OTHER_IS_SUBCLASS; } - return 0; -} -/**end repeat**/ -/**begin repeat - * #name = longdouble, clongdouble# - * #type = npy_longdouble, npy_clongdouble# - */ + /* + * Then we check for the basic Python types float, int, and complex. + * (this is a bit tedious to do right for complex). + */ + if (PyBool_Check(value)) { + CONVERT_TO_RESULT(value == Py_True); + return CONVERSION_SUCCESS; + } -static int -_@name@_convert2_to_ctypes(PyObject *a, @type@ *arg1, - PyObject *b, @type@ *arg2) -{ - int ret; - ret = _@name@_convert_to_ctype(a, arg1); - if (ret == -2) { - ret = -3; + if (IS_SAFE(NPY_DOUBLE, NPY_@TYPE@) && PyFloat_CheckExact(value)) { + CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value)); + return CONVERSION_SUCCESS; } - if (ret < 0) { - return ret; + + if (IS_SAFE(NPY_LONG, NPY_@TYPE@) && PyLong_CheckExact(value)) { + int overflow; + long val = PyLong_AsLongAndOverflow(value, &overflow); + if (overflow) { + return OTHER_IS_UNKNOWN_OBJECT; /* handle as if arbitrary object */ + } + if (error_converting(val)) { + return CONVERSION_ERROR; /* should not be possible */ + } + CONVERT_TO_RESULT(val); + return CONVERSION_SUCCESS; } - ret = _@name@_convert_to_ctype(b, arg2); - if (ret == -2) { - ret = -3; + +#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE) + if (IS_SAFE(NPY_CDOUBLE, NPY_@TYPE@) && PyComplex_CheckExact(value)) { + Py_complex val = PyComplex_AsCComplex(value); + if (error_converting(val.real)) { + return CONVERSION_ERROR; /* should not be possible */ + } + result->real = val.real; + result->imag = val.imag; + return CONVERSION_SUCCESS; } - if (ret < 0) { - return ret; +#endif /* defined(IS_CFLOAT) || ... */ + + PyObject *dtype = PyArray_DiscoverDTypeFromScalarType(Py_TYPE(value)); + if (dtype == Py_None) { + Py_DECREF(dtype); + /* Signal that this is an array or array-like: Defer to array logic */ + return OTHER_IS_UNKNOWN_OBJECT; } - return 0; + else if (dtype == NULL) { + /* + * The input is an unknown python object. This should probably defer + * but only does so for float128. + * For all other cases, we defer to the array logic. If the object + * is indeed not an array-like, this will end up converting the NumPy + * scalar to a Python scalar and then try again. + * The logic is that the ufunc casts the input to object, which does + * the conversion. + */ + return OTHER_IS_UNKNOWN_OBJECT; + } + /* + * Otherwise, we have a clear NumPy scalar, find if it is a compatible + * builtin scalar. + * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number, + * extracting the value if it is safe and otherwise deferring. + * (Safety is known at compile time, so the switch statement should be + * simplified by the compiler accordingly.) + * If we have a scalar that is not listed or not safe, we defer to it. + * + * We should probably defer more aggressively, but that is too big a change, + * since it would disable `np.float64(1.) * [1, 2, 3, 4]`. 
+ */ + int ret; /* set by the GET_VALUE_OR_DEFER macro */ + switch (((PyArray_DTypeMeta *)dtype)->type_num) { + GET_VALUE_OR_DEFER(BOOL, Bool, value); + /* UInts */ + GET_VALUE_OR_DEFER(UBYTE, UByte, value); + GET_VALUE_OR_DEFER(USHORT, UShort, value); + GET_VALUE_OR_DEFER(UINT, UInt, value); + GET_VALUE_OR_DEFER(ULONG, ULong, value); + GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value); + /* Ints */ + GET_VALUE_OR_DEFER(BYTE, Byte, value); + GET_VALUE_OR_DEFER(SHORT, Short, value); + GET_VALUE_OR_DEFER(INT, Int, value); + GET_VALUE_OR_DEFER(LONG, Long, value); + GET_VALUE_OR_DEFER(LONGLONG, LongLong, value); + /* Floats */ + case NPY_HALF: + if (IS_SAFE(NPY_HALF, NPY_@TYPE@)) { + assert(Py_TYPE(value) == &PyHalfArrType_Type); + CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half))); + ret = 1; + } + else if (IS_SAFE(NPY_@TYPE@, NPY_HALF)) { + ret = DEFER_TO_OTHER_KNOWN_SCALAR; + } + else { + ret = PROMOTION_REQUIRED; + } + break; + GET_VALUE_OR_DEFER(FLOAT, Float, value); + GET_VALUE_OR_DEFER(DOUBLE, Double, value); + GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value); + /* Complex: We should still defer, but the code won't work... */ + GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value); + GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value); + GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value); + default: + /* + * If there is no match, this is an unknown scalar object. It + * would make sense to defer generously here, but it should also + * always be safe to use the array path. + * The issue is, that the other scalar may or may not be designed + * to deal with NumPy scalars. Without knowing that, we cannot + * defer (which would be much faster potentially). + * TODO: We could add a DType flag to allow opting in to deferring! + */ + ret = OTHER_IS_UNKNOWN_OBJECT; + } + Py_DECREF(dtype); + return ret; } +#undef IS_SAFE +#undef CONVERT_TO_RESULT +#undef GET_VALUE_OR_DEFER +#undef GET_CVALUE_OR_DEFER +#undef IS_@TYPE@ + /**end repeat**/ @@ -756,102 +1032,145 @@ _@name@_convert2_to_ctypes(PyObject *a, @type@ *arg1, * #name = (byte, ubyte, short, ushort, int, uint, * long, ulong, longlong, ulonglong)*12, * (half, float, double, longdouble, - * cfloat, cdouble, clongdouble)*5, - * (half, float, double, longdouble)*2# + * cfloat, cdouble, clongdouble)*4, + * (half, float, double, longdouble)*3# * #Name = (Byte, UByte, Short, UShort, Int, UInt, * Long, ULong,LongLong,ULongLong)*12, * (Half, Float, Double, LongDouble, - * CFloat, CDouble, CLongDouble)*5, - * (Half, Float, Double, LongDouble)*2# + * CFloat, CDouble, CLongDouble)*4, + * (Half, Float, Double, LongDouble)*3# * #type = (npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, * npy_long, npy_ulong, npy_longlong, npy_ulonglong)*12, * (npy_half, npy_float, npy_double, npy_longdouble, - * npy_cfloat, npy_cdouble, npy_clongdouble)*5, - * (npy_half, npy_float, npy_double, npy_longdouble)*2# + * npy_cfloat, npy_cdouble, npy_clongdouble)*4, + * (npy_half, npy_float, npy_double, npy_longdouble)*3# * * #oper = add*10, subtract*10, multiply*10, remainder*10, * divmod*10, floor_divide*10, lshift*10, rshift*10, and*10, * or*10, xor*10, true_divide*10, - * add*7, subtract*7, multiply*7, floor_divide*7, true_divide*7, - * divmod*4, remainder*4# + * add*7, subtract*7, multiply*7, true_divide*7, + * floor_divide*4, divmod*4, remainder*4# * - * #fperr = 1*60,0*50,1*10, - * 1*35, - * 1*8# + * #fperr = 0*110, 1*10, + * 1*28, 1*12# * #twoout = 0*40,1*10,0*70, - * 0*35, - * 1*4,0*4# + * 0*28, + * 0*4,1*4,0*4# * #otype = (npy_byte, npy_ubyte, npy_short, 
npy_ushort, npy_int, npy_uint, * npy_long, npy_ulong, npy_longlong, npy_ulonglong)*11, - * npy_float*4, npy_double*6, + * npy_double*10, * (npy_half, npy_float, npy_double, npy_longdouble, - * npy_cfloat, npy_cdouble, npy_clongdouble)*5, - * (npy_half, npy_float, npy_double, npy_longdouble)*2# + * npy_cfloat, npy_cdouble, npy_clongdouble)*4, + * (npy_half, npy_float, npy_double, npy_longdouble)*3# * #OName = (Byte, UByte, Short, UShort, Int, UInt, * Long, ULong, LongLong, ULongLong)*11, - * Float*4, Double*6, + * Double*10, * (Half, Float, Double, LongDouble, - * CFloat, CDouble, CLongDouble)*5, - * (Half, Float, Double, LongDouble)*2# + * CFloat, CDouble, CLongDouble)*4, + * (Half, Float, Double, LongDouble)*3# */ +#define IS_@name@ static PyObject * @name@_@oper@(PyObject *a, PyObject *b) { PyObject *ret; - @type@ arg1, arg2; - @otype@ out; + @type@ arg1, arg2, other_val; -#if @twoout@ - @otype@ out2; - PyObject *obj; -#endif - -#if @fperr@ - int retstatus; - int first; -#endif + /* + * Check if this operation may be considered forward. Note `is_forward` + * does not imply that we can defer to a subclass `b`; we need to check + * `BINOP_IS_FORWARD` for that (it takes into account that both may be + * identical classes). + */ + int is_forward = (Py_TYPE(a)->tp_as_number != NULL + && (void *)(Py_TYPE(a)->tp_as_number->nb_@oper@) == (void*)(@name@_@oper@)); - BINOP_GIVE_UP_IF_NEEDED(a, b, nb_@oper@, @name@_@oper@); + /* + * Extract the other value (if it is compatible). Otherwise, decide + * how to deal with it. This is somewhat complicated. + * + * Note: This pattern is used multiple times below. + */ + PyObject *other = is_forward ? b : a; - switch(_@name@_convert2_to_ctypes(a, &arg1, b, &arg2)) { - case 0: - break; - case -1: - /* one of them can't be cast safely must be mixed-types*/ - return PyArray_Type.tp_as_number->nb_@oper@(a,b); - case -2: - /* use default handling */ - if (PyErr_Occurred()) { - return NULL; - } + conversion_result res = convert_to_@name@(other, &other_val); + switch (res) { + case CONVERSION_ERROR: + return NULL; /* an error occurred (should never happen) */ + case DEFER_TO_OTHER_KNOWN_SCALAR: + /* + * Defer to other; this is normally a forward operation. However, + * it could be backward if an operation is undefined forward. + * An example is the complex remainder: `complex % bool` will defer + * even though it would normally handle the operation. + */ + Py_RETURN_NOTIMPLEMENTED; + case CONVERSION_SUCCESS: + break; /* successfully extracted the value; we can proceed */ + case OTHER_IS_UNKNOWN_OBJECT: +#if defined(IS_longdouble) || defined(IS_clongdouble) + Py_RETURN_NOTIMPLEMENTED; +#endif + BINOP_GIVE_UP_IF_NEEDED(a, b, nb_@oper@, @name@_@oper@); + case PROMOTION_REQUIRED: + /* + * Either an array-like, unknown scalar or we need to promote. + * + * TODO: We could special case the promotion case here for much + * better speed and to deal with integer overflow warnings + * correctly. (e.g. `uint8 * int8` cannot warn). + */ return PyGenericArrType_Type.tp_as_number->nb_@oper@(a,b); - case -3: + case OTHER_IS_SUBCLASS: /* - * special case for longdouble and clongdouble - * because they have a recursive getitem in their dtype + * Success converting. We _could_ in principle defer in cases + * where the other subclass does not inherit the behavior. In + * practice not even Python's `int` attempts this, so we also punt.
*/ - Py_INCREF(Py_NotImplemented); - return Py_NotImplemented; + break; } #if @fperr@ - npy_clear_floatstatus_barrier((char*)&out); + npy_clear_floatstatus_barrier((char*)&arg1); +#endif + if (is_forward) { + arg1 = PyArrayScalar_VAL(a, @Name@); + arg2 = other_val; + } + else { + arg1 = other_val; + arg2 = PyArrayScalar_VAL(b, @Name@); + } + + /* + * Prepare the actual calculation. + */ + @otype@ out; +#if @twoout@ + @otype@ out2; + PyObject *obj; #endif + + /* * here we do the actual calculation with arg1 and arg2 * as a function call. + * Note that `retstatus` is the "floating point error" value for integer + * functions. Float functions should always return 0, and then use + * the following `npy_get_floatstatus_barrier`. */ #if @twoout@ - @name@_ctype_@oper@(arg1, arg2, (@otype@ *)&out, &out2); + int retstatus = @name@_ctype_@oper@(arg1, arg2, &out, &out2); #else - @name@_ctype_@oper@(arg1, arg2, (@otype@ *)&out); + int retstatus = @name@_ctype_@oper@(arg1, arg2, &out); #endif #if @fperr@ /* Check status flag. If it is set, then look up what to do */ - retstatus = npy_get_floatstatus_barrier((char*)&out); + retstatus |= npy_get_floatstatus_barrier((char*)&out); +#endif if (retstatus) { int bufsize, errmask; PyObject *errobj; @@ -860,14 +1179,13 @@ static PyObject * &errobj) < 0) { return NULL; } - first = 1; + int first = 1; if (PyUFunc_handlefperr(errmask, errobj, retstatus, &first)) { Py_XDECREF(errobj); return NULL; } Py_XDECREF(errobj); } -#endif #if @twoout@ @@ -899,6 +1217,9 @@ static PyObject * return ret; } + +#undef IS_@name@ + /**end repeat**/ #define _IS_ZERO(x) (x == 0) @@ -920,17 +1241,6 @@ static PyObject * * Half, Float, Double, LongDouble, * CFloat, CDouble, CLongDouble# * - * #oname = float*4, double*6, half, float, double, longdouble, - * cfloat, cdouble, clongdouble# - * - * #otype = npy_float*4, npy_double*6, npy_half, npy_float, - * npy_double, npy_longdouble, - * npy_cfloat, npy_cdouble, npy_clongdouble# - * - * #OName = Float*4, Double*6, Half, Float, - * Double, LongDouble, - * CFloat, CDouble, CLongDouble# - * * #isint = 1*10,0*7# * #isuint = (0,1)*5,0*7# * #cmplx = 0*14,1*3# @@ -938,46 +1248,71 @@ static PyObject * * #zero = 0*10, NPY_HALF_ZERO, 0*6# * #one = 1*10, NPY_HALF_ONE, 1*6# */ +#define IS_@name@ static PyObject * @name@_power(PyObject *a, PyObject *b, PyObject *modulo) { + if (modulo != Py_None) { + /* modular exponentiation is not implemented (gh-8804) */ + Py_INCREF(Py_NotImplemented); + return Py_NotImplemented; + } + PyObject *ret; - @type@ arg1, arg2, out; + @type@ arg1, arg2, other_val; - BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, @name@_power); + int is_forward = (Py_TYPE(a)->tp_as_number != NULL + && (void *)(Py_TYPE(a)->tp_as_number->nb_power) == (void*)(@name@_power)); - switch(_@name@_convert2_to_ctypes(a, &arg1, b, &arg2)) { - case 0: - break; - case -1: - /* can't cast both safely mixed-types? */ - return PyArray_Type.tp_as_number->nb_power(a,b,modulo); - case -2: - /* use default handling */ - if (PyErr_Occurred()) { - return NULL; - } - return PyGenericArrType_Type.tp_as_number->nb_power(a,b,modulo); - case -3: - default: + /* + * Extract the other value (if it is compatible). See the generic + * (non power) version above for detailed notes. + */ + PyObject *other = is_forward ? 
b : a; + + int res = convert_to_@name@(other, &other_val); + switch (res) { + case CONVERSION_ERROR: + return NULL; /* an error occurred (should never happen) */ + case DEFER_TO_OTHER_KNOWN_SCALAR: + Py_RETURN_NOTIMPLEMENTED; + case CONVERSION_SUCCESS: + break; /* successfully extracted the value; we can proceed */ + case OTHER_IS_UNKNOWN_OBJECT: +#if defined(IS_longdouble) || defined(IS_clongdouble) + Py_RETURN_NOTIMPLEMENTED; +#endif + BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, @name@_power); + case PROMOTION_REQUIRED: + return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo); + case OTHER_IS_SUBCLASS: /* - * special case for longdouble and clongdouble - * because they have a recursive getitem in their dtype + * Success converting. We _could_ in principle defer in cases + * where the other subclass does not inherit the behavior. In + * practice not even Python's `int` attempts this, so we also punt. */ - Py_INCREF(Py_NotImplemented); - return Py_NotImplemented; - } - - if (modulo != Py_None) { - /* modular exponentiation is not implemented (gh-8804) */ - Py_INCREF(Py_NotImplemented); - return Py_NotImplemented; + break; } #if !@isint@ - npy_clear_floatstatus_barrier((char*)&out); + npy_clear_floatstatus_barrier((char*)&arg1); #endif + + if (is_forward) { + arg1 = PyArrayScalar_VAL(a, @Name@); + arg2 = other_val; + } + else { + arg1 = other_val; + arg2 = PyArrayScalar_VAL(b, @Name@); + } + + /* + * Prepare the actual calculation: + */ + @type@ out; + /* * here we do the actual calculation with arg1 and arg2 * as a function call. @@ -989,11 +1324,12 @@ static PyObject * return NULL; } #endif - @name@_ctype_power(arg1, arg2, &out); + int retstatus = @name@_ctype_power(arg1, arg2, &out); #if !@isint@ /* Check status flag. If it is set, then look up what to do */ - int retstatus = npy_get_floatstatus_barrier((char*)&out); + retstatus |= npy_get_floatstatus_barrier((char*)&out); +#endif if (retstatus) { int bufsize, errmask; PyObject *errobj; @@ -1009,7 +1345,6 @@ static PyObject * } Py_XDECREF(errobj); } -#endif ret = PyArrayScalar_New(@Name@); if (ret == NULL) { @@ -1021,78 +1356,28 @@ static PyObject * } +#undef IS_@name@ /**end repeat**/ #undef _IS_ZERO /**begin repeat * - * #name = cfloat, cdouble# - * - */ -/**begin repeat1 - * - * #oper = divmod, remainder# - * - */ -#define @name@_@oper@ NULL -/**end repeat1**/ -/**end repeat**/ -/**begin repeat - * - * #oper = divmod, remainder# + * #name = (cfloat, cdouble, clongdouble)*3# + * #oper = floor_divide*3, divmod*3, remainder*3# * */ /* -Complex numbers do not support remainder operations. Unfortunately, -the type inference for long doubles is complicated, and if a remainder -operation is not defined - if the relevant field is left NULL - then -operations between long doubles and objects lead to an infinite recursion -instead of a TypeError. This should ensure that once everything gets -converted to complex long doubles you correctly get a reasonably -informative TypeError. This fixes the last part of bug gh-18548. -*/ - + * Complex numbers do not support remainder, so we manually make sure that the + * operation is not defined. This is/was especially important for longdoubles + * due to their tendency to recurse for some operations, see gh-18548. + * (We need to define the slots to avoid inheriting it.)
+ */ static PyObject * -clongdouble_@oper@(PyObject *a, PyObject *b) +@name@_@oper@(PyObject *NPY_UNUSED(a), PyObject *NPY_UNUSED(b)) { - npy_clongdouble arg1, arg2; - - BINOP_GIVE_UP_IF_NEEDED(a, b, nb_@oper@, clongdouble_@oper@); - - switch(_clongdouble_convert2_to_ctypes(a, &arg1, b, &arg2)) { - case 0: - break; - case -1: - /* one of them can't be cast safely must be mixed-types*/ - return PyArray_Type.tp_as_number->nb_@oper@(a,b); - case -2: - /* use default handling */ - if (PyErr_Occurred()) { - return NULL; - } - return PyGenericArrType_Type.tp_as_number->nb_@oper@(a,b); - case -3: - /* - * special case for longdouble and clongdouble - * because they have a recursive getitem in their dtype - */ - Py_INCREF(Py_NotImplemented); - return Py_NotImplemented; - } - - /* - * here we do the actual calculation with arg1 and arg2 - * as a function call. - */ - PyErr_SetString(PyExc_TypeError, "complex long doubles do not support remainder"); - return NULL; + Py_RETURN_NOTIMPLEMENTED; } /**end repeat**/ @@ -1124,6 +1409,14 @@ clongdouble_@oper@(PyObject *a, PyObject *b) * byte, ubyte, short, ushort, int, uint, * long, ulong, longlong, ulonglong# * + * #Name = (Byte, UByte, Short, UShort, Int, UInt, + * Long, ULong, LongLong, ULongLong, + * Half, Float, Double, LongDouble, + * CFloat, CDouble, CLongDouble)*3, + * + * Byte, UByte, Short, UShort, Int, UInt, + * Long, ULong, LongLong, ULongLong# + * * #type = (npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, * npy_long, npy_ulong, npy_longlong, npy_ulonglong, * npy_half, npy_float, npy_double, npy_longdouble, @@ -1161,32 +1454,19 @@ clongdouble_@oper@(PyObject *a, PyObject *b) static PyObject * @name@_@oper@(PyObject *a) { - @type@ arg1; + @type@ val; @otype@ out; PyObject *ret; - switch(_@name@_convert_to_ctype(a, &arg1)) { - case 0: - break; - case -1: - /* can't cast both safely use different add function */ - Py_INCREF(Py_NotImplemented); - return Py_NotImplemented; - case -2: - /* use default handling */ - if (PyErr_Occurred()) { - return NULL; - } - return PyGenericArrType_Type.tp_as_number->nb_@oper@(a); - } + + val = PyArrayScalar_VAL(a, @Name@); + + @name@_ctype_@oper@(val, &out); /* - * here we do the actual calculation with arg1 and arg2 - * make it a function call. + * TODO: Complex absolute should check floating point flags. */ - @name@_ctype_@oper@(arg1, &out); - ret = PyArrayScalar_New(@OName@); PyArrayScalar_ASSIGN(ret, @OName@, out); @@ -1210,6 +1490,10 @@ static PyObject * * uint, long, ulong, longlong, ulonglong, * half, float, double, longdouble, * cfloat, cdouble, clongdouble# + * #Name = Byte, UByte, Short, UShort, Int, UInt, + * Long, ULong, LongLong, ULongLong, + * Half, Float, Double, LongDouble, + * CFloat, CDouble, CLongDouble# * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, * npy_uint, npy_long, npy_ulong, npy_longlong, npy_ulonglong, * npy_half, npy_float, npy_double, npy_longdouble, @@ -1221,24 +1505,14 @@ static int @name@_bool(PyObject *a) { int ret; - @type@ arg1; + @type@ val; - if (_@name@_convert_to_ctype(a, &arg1) < 0) { - if (PyErr_Occurred()) { - return -1; - } - return PyGenericArrType_Type.tp_as_number->nb_bool(a); - } - - /* - * here we do the actual calculation with arg1 and arg2 - * make it a function call. 
- */ + val = PyArrayScalar_VAL(a, @Name@); #if @simp@ - ret = @nonzero@(arg1); + ret = @nonzero@(val); #else - ret = (@nonzero@(arg1.real) || @nonzero@(arg1.imag)); + ret = (@nonzero@(val.real) || @nonzero@(val.imag)); #endif return ret; @@ -1321,7 +1595,7 @@ static PyObject * * #to_ctype = , , , , , , , , , , npy_half_to_double, , , , , , # * #func = PyFloat_FromDouble*17# */ -static NPY_INLINE PyObject * +static PyObject * @name@_float(PyObject *obj) { #if @cmplx@ @@ -1353,36 +1627,50 @@ static NPY_INLINE PyObject * * long, ulong, longlong, ulonglong, * half, float, double, longdouble, * cfloat, cdouble, clongdouble# + * #Name = Byte, UByte, Short, UShort, Int, UInt, + * Long, ULong, LongLong, ULongLong, + * Half, Float, Double, LongDouble, + * CFloat, CDouble, CLongDouble# + * #simp = def*10, def_half, def*3, cmplx*3# */ +#define IS_@name@ + static PyObject* @name@_richcompare(PyObject *self, PyObject *other, int cmp_op) { npy_@name@ arg1, arg2; - int out=0; - - RICHCMP_GIVE_UP_IF_NEEDED(self, other); + int out = 0; - switch(_@name@_convert2_to_ctypes(self, &arg1, other, &arg2)) { - case 0: - break; - case -1: - /* can't cast both safely use different add function */ - case -2: - /* use ufunc */ - if (PyErr_Occurred()) { - return NULL; - } - return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op); - case -3: - /* - * special case for longdouble and clongdouble - * because they have a recursive getitem in their dtype - */ - Py_INCREF(Py_NotImplemented); - return Py_NotImplemented; + /* + * Extract the other value (if it is compatible). + */ + int res = convert_to_@name@(other, &arg2); + switch (res) { + case CONVERSION_ERROR: + return NULL; /* an error occurred (should never happen) */ + case DEFER_TO_OTHER_KNOWN_SCALAR: + Py_RETURN_NOTIMPLEMENTED; + case CONVERSION_SUCCESS: + break; /* successfully extracted the value; we can proceed */ + case OTHER_IS_UNKNOWN_OBJECT: +#if defined(IS_longdouble) || defined(IS_clongdouble) + Py_RETURN_NOTIMPLEMENTED; +#endif + RICHCMP_GIVE_UP_IF_NEEDED(self, other); + case PROMOTION_REQUIRED: + return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op); + case OTHER_IS_SUBCLASS: + /* + * Success converting. We _could_ in principle defer in cases + * where the other subclass does not inherit the behavior. In + * practice not even Python's `int` attempts this, so we also punt. + * (This is also even trickier for richcompare, though.)
+ */ + break; } + arg1 = PyArrayScalar_VAL(self, @Name@); + /* here we do the actual calculation with arg1 and arg2 */ switch (cmp_op) { case Py_EQ: @@ -1412,6 +1700,8 @@ static PyObject* PyArrayScalar_RETURN_FALSE; } } + +#undef IS_@name@ /**end repeat**/ /**begin repeat diff --git a/numpy/core/tests/test_scalarmath.py b/numpy/core/tests/test_scalarmath.py index f190a83ba..a2b801760 100644 --- a/numpy/core/tests/test_scalarmath.py +++ b/numpy/core/tests/test_scalarmath.py @@ -8,6 +8,7 @@ from numpy.compat import _pep440 import pytest from hypothesis import given, settings from hypothesis.strategies import sampled_from +from hypothesis.extra import numpy as hynp import numpy as np from numpy.testing import ( @@ -24,6 +25,14 @@ types = [np.bool_, np.byte, np.ubyte, np.short, np.ushort, np.intc, np.uintc, floating_types = np.floating.__subclasses__() complex_floating_types = np.complexfloating.__subclasses__() +objecty_things = [object(), None] + +reasonable_operators_for_scalars = [ + operator.lt, operator.le, operator.eq, operator.ne, operator.ge, + operator.gt, operator.add, operator.floordiv, operator.mod, + operator.mul, operator.pow, operator.sub, operator.truediv, +] + # This compares scalarmath against ufuncs. @@ -66,6 +75,41 @@ class TestTypes: np.add(1, 1) +@pytest.mark.slow +@settings(max_examples=10000, deadline=2000) +@given(sampled_from(reasonable_operators_for_scalars), + hynp.arrays(dtype=hynp.scalar_dtypes(), shape=()), + hynp.arrays(dtype=hynp.scalar_dtypes(), shape=())) +def test_array_scalar_ufunc_equivalence(op, arr1, arr2): + """ + This is a thorough test attempting to cover important promotion paths + and ensuring that arrays and scalars stay as aligned as possible. + However, if it creates troubles, it should maybe just be removed. + """ + scalar1 = arr1[()] + scalar2 = arr2[()] + assert isinstance(scalar1, np.generic) + assert isinstance(scalar2, np.generic) + + if arr1.dtype.kind == "c" or arr2.dtype.kind == "c": + comp_ops = {operator.ge, operator.gt, operator.le, operator.lt} + if op in comp_ops and (np.isnan(scalar1) or np.isnan(scalar2)): + pytest.xfail("complex comp ufuncs use sort-order, scalars do not.") + + # ignore fpe's since they may just mismatch for integers anyway. 
+ with warnings.catch_warnings(), np.errstate(all="ignore"): + # Comparisons DeprecationWarnings replacing errors (2022-03): + warnings.simplefilter("error", DeprecationWarning) + try: + res = op(arr1, arr2) + except Exception as e: + with pytest.raises(type(e)): + op(scalar1, scalar2) + else: + scalar_res = op(scalar1, scalar2) + assert_array_equal(scalar_res, res) + + class TestBaseMath: def test_blocked(self): # test alignments offsets for simd instructions @@ -778,15 +822,6 @@ def recursionlimit(n): sys.setrecursionlimit(o) -objecty_things = [object(), None] -reasonable_operators_for_scalars = [ - operator.lt, operator.le, operator.eq, operator.ne, operator.ge, - operator.gt, operator.add, operator.floordiv, operator.mod, - operator.mul, operator.matmul, operator.pow, operator.sub, - operator.truediv, -] - - @given(sampled_from(objecty_things), sampled_from(reasonable_operators_for_scalars), sampled_from(types)) @@ -843,3 +878,129 @@ def test_clongdouble_inf_loop(op): op(None, np.longdouble(3)) except TypeError: pass + + +@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) +@pytest.mark.parametrize("operation", [ + lambda min, max: max + max, + lambda min, max: min - max, + lambda min, max: max * max], ids=["+", "-", "*"]) +def test_scalar_integer_operation_overflow(dtype, operation): + st = np.dtype(dtype).type + min = st(np.iinfo(dtype).min) + max = st(np.iinfo(dtype).max) + + with pytest.warns(RuntimeWarning, match="overflow encountered"): + operation(min, max) + + +@pytest.mark.parametrize("dtype", np.typecodes["Integer"]) +@pytest.mark.parametrize("operation", [ + lambda min, neg_1: abs(min), + lambda min, neg_1: min * neg_1, + lambda min, neg_1: min // neg_1], ids=["abs", "*", "//"]) +def test_scalar_signed_integer_overflow(dtype, operation): + # The minimum signed integer can "overflow" for some additional operations + st = np.dtype(dtype).type + min = st(np.iinfo(dtype).min) + neg_1 = st(-1) + + with pytest.warns(RuntimeWarning, match="overflow encountered"): + operation(min, neg_1) + + +@pytest.mark.parametrize("dtype", np.typecodes["UnsignedInteger"]) +@pytest.mark.xfail # TODO: the check is quite simply missing! 
+def test_scalar_unsigned_integer_overflow(dtype): + val = np.dtype(dtype).type(8) + with pytest.warns(RuntimeWarning, match="overflow encountered"): + -val + + +@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) +@pytest.mark.parametrize("operation", [ + lambda val, zero: val // zero, + lambda val, zero: val % zero, ], ids=["//", "%"]) +def test_scalar_integer_operation_divbyzero(dtype, operation): + st = np.dtype(dtype).type + val = st(100) + zero = st(0) + + with pytest.warns(RuntimeWarning, match="divide by zero"): + operation(val, zero) + + +ops_with_names = [ + ("__lt__", "__gt__", operator.lt, True), + ("__le__", "__ge__", operator.le, True), + ("__eq__", "__eq__", operator.eq, True), + # Note __op__ and __rop__ may be identical here: + ("__ne__", "__ne__", operator.ne, True), + ("__gt__", "__lt__", operator.gt, True), + ("__ge__", "__le__", operator.ge, True), + ("__floordiv__", "__rfloordiv__", operator.floordiv, False), + ("__truediv__", "__rtruediv__", operator.truediv, False), + ("__add__", "__radd__", operator.add, False), + ("__mod__", "__rmod__", operator.mod, False), + ("__mul__", "__rmul__", operator.mul, False), + ("__pow__", "__rpow__", operator.pow, False), + ("__sub__", "__rsub__", operator.sub, False), +] + + +@pytest.mark.parametrize(["__op__", "__rop__", "op", "cmp"], ops_with_names) +@pytest.mark.parametrize("sctype", [np.float32, np.float64, np.longdouble]) +def test_subclass_deferral(sctype, __op__, __rop__, op, cmp): + """ + This test covers scalar subclass deferral. Note that this is exceedingly + complicated, especially since it tends to fall back to the array paths and + these additionally add the "array priority" mechanism. + + The behaviour was modified subtly in 1.22 (to make it closer to how Python + scalars work). Due to its complexity and the fact that subclassing NumPy + scalars is probably a bad idea to begin with, there is probably room + for adjustments here. + """ + class myf_simple1(sctype): + pass + + class myf_simple2(sctype): + pass + + def defer(self, other): + return NotImplemented + + def op_func(self, other): + return __op__ + + def rop_func(self, other): + return __rop__ + + myf_op = type("myf_op", (sctype,), {__op__: op_func, __rop__: rop_func}) + + # inheritance has to override, or this is correctly lost: + res = op(myf_simple1(1), myf_simple2(2)) + assert type(res) == sctype or type(res) == np.bool_ + assert op(myf_simple1(1), myf_simple2(2)) == op(1, 2) # inherited + + # Two independent subclasses do not really define an order. This could + # be attempted, but we do not, since Python's `int` does not either: + assert op(myf_op(1), myf_simple1(2)) == __op__ + assert op(myf_simple1(1), myf_op(2)) == op(1, 2) # inherited + + +@pytest.mark.parametrize(["__op__", "__rop__", "op", "cmp"], ops_with_names) +@pytest.mark.parametrize("pytype", [float, int, complex]) +def test_pyscalar_subclasses(pytype, __op__, __rop__, op, cmp): + def op_func(self, other): + return __op__ + + def rop_func(self, other): + return __rop__ + + myf = type("myf", (pytype,), + {__op__: op_func, __rop__: rop_func, "__array_ufunc__": None}) + + # Just like normally, we should never presume we can modify the float. + assert op(myf(1), np.float64(2)) == __op__ + assert op(np.float64(1), myf(2)) == __rop__
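The final test above pins down the deferral contract for Python-scalar subclasses: a subclass that sets `__array_ufunc__ = None` takes over both operand orders, because the NumPy scalar returns NotImplemented. A standalone sketch of that behaviour (the class name is made up; the semantics mirror `test_pyscalar_subclasses`):

    import operator
    import numpy as np

    class MyFloat(float):
        # Opting out of the ufunc machinery makes NumPy defer to this class.
        __array_ufunc__ = None

        def __add__(self, other):
            return "__add__"

        def __radd__(self, other):
            return "__radd__"

    # The subclass wins from either side; np.float64 defers.
    assert operator.add(MyFloat(1), np.float64(2)) == "__add__"
    assert operator.add(np.float64(1), MyFloat(2)) == "__radd__"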