author     Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>  2021-11-18 12:51:00 -0800
committer  Developer-Ecosystem-Engineering <65677710+Developer-Ecosystem-Engineering@users.noreply.github.com>  2021-11-18 12:51:00 -0800
commit     2ff7ab64d4e7d5928e96ca95b85350aa9caa2b63 (patch)
tree       0628e1e3bbbd38bf921d5046d62dbbbcc96f2258 /numpy
parent     a5130840fe6e2aa0ff12d95ccdfdadc5589bb88a (diff)
download   numpy-2ff7ab64d4e7d5928e96ca95b85350aa9caa2b63.tar.gz
Reorganize NEON min/max implementation to be more generic
Thank you @seiko2plus for the excellent example.
Reorganized the code so that it can be reused for other architectures; the core implementations and unroll factors should be the same as before for ARM NEON. Beyond reorganizing, we've added default implementations using universal intrinsics for non-NEON architectures. Additionally, we've moved most of the min, max, fmin, and fmax implementations to a new dispatchable source file: numpy/core/src/umath/loops_minmax.dispatch.c.src
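For context, here is a minimal sketch of what a binary maximum loop looks like in the universal-intrinsics (npyv_*) style the new dispatch file is written in, so the same source can compile for NEON, SSE, AVX, VSX, and so on. This is not the committed code: the function name and the contiguous-float32 assumption are illustrative, and unrolling, strided inputs, and NaN handling are omitted.
```c
/*
 * Hedged sketch, not the committed implementation: a contiguous binary
 * "maximum" loop using NumPy's universal intrinsics (npyv_*) with a
 * scalar tail.  Assumes this file is built inside numpy/core/src so
 * "simd/simd.h" and npy_intp are available, and float32 operands.
 */
#include "simd/simd.h"

static void
max_contig_f32(const float *a, const float *b, float *dst, npy_intp len)
{
    npy_intp i = 0;
#if NPY_SIMD
    const int vstep = npyv_nlanes_f32;                // lanes per vector
    for (; i + vstep <= len; i += vstep) {
        npyv_f32 va = npyv_load_f32(a + i);
        npyv_f32 vb = npyv_load_f32(b + i);
        npyv_store_f32(dst + i, npyv_max_f32(va, vb));
    }
#endif
    for (; i < len; ++i) {                            // scalar tail
        dst[i] = a[i] > b[i] ? a[i] : b[i];
    }
}
```
The actual dispatch source goes further than this: 8x-unrolled reductions, 6x-unrolled binary loops, and separate NaN-propagating (maximum/minimum) and NaN-ignoring (fmax/fmin) variants.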
**Testing**
- Apple silicon M1 native (arm64 / aarch64) -- No test failures
- Apple silicon M1 Rosetta (x86_64) -- No new test failures
- iMacPro1,1 (AVX512F) -- No test failures
**Benchmarks**
- Apple silicon M1 native (arm64 / aarch64)
- Similar improvements as before the reorg (comparison below)
- x86_64 (both Apple silicon M1 Rosetta and iMacPro1,1 AVX512F)
- Some x86_64 benchmarks are better, some are worse
Apple silicon M1 native (arm64 / aarch64), compared with the original implementation (before the reorg):
```
before after ratio
[559ddede] [a3463b09]
<gh-issue-17989/improve-neon-min-max> <gh-issue-17989/feedback/round-1>
+ 6.45±0.04μs 7.07±0.09μs 1.10 bench_lib.Nan.time_nanargmin(200, 0.1)
+ 32.1±0.3μs 35.2±0.2μs 1.10 bench_ufunc_strides.Unary.time_ufunc(<ufunc '_ones_like'>, 2, 1, 'd')
+ 29.1±0.02μs 31.8±0.05μs 1.10 bench_core.Core.time_array_int_l1000
+ 69.0±0.2μs 75.3±3μs 1.09 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'logical_not'>, 2, 4, 'f')
+ 92.0±1μs 99.5±0.5μs 1.08 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'logical_not'>, 4, 4, 'd')
+ 9.29±0.1μs 9.99±0.5μs 1.08 bench_ma.UFunc.time_1d(True, True, 10)
+ 338±0.6μs 362±10μs 1.07 bench_function_base.Sort.time_sort('quick', 'int16', ('random',))
+ 4.21±0.03μs 4.48±0.2μs 1.07 bench_core.CountNonzero.time_count_nonzero_multi_axis(3, 100, <class 'str'>)
+ 12.3±0.06μs 13.1±0.7μs 1.06 bench_function_base.Median.time_even_small
+ 1.27±0μs 1.35±0.06μs 1.06 bench_itemselection.PutMask.time_dense(False, 'float16')
+ 139±1ns 147±6ns 1.06 bench_core.Core.time_array_1
+ 33.7±0.01μs 35.5±2μs 1.05 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'reciprocal'>, 2, 4, 'f')
+ 69.4±0.1μs 73.1±0.2μs 1.05 bench_ufunc_strides.Unary.time_ufunc(<ufunc 'logical_not'>, 4, 4, 'f')
+ 225±0.09μs 237±9μs 1.05 bench_random.Bounded.time_bounded('PCG64', [<class 'numpy.uint32'>, 2047])
- 15.7±0.5μs 14.9±0.03μs 0.95 bench_core.CountNonzero.time_count_nonzero_axis(2, 10000, <class 'numpy.int64'>)
- 34.2±2μs 32.0±0.03μs 0.94 bench_ufunc_strides.Unary.time_ufunc(<ufunc '_ones_like'>, 4, 2, 'f')
- 1.03±0.05ms 955±3μs 0.92 bench_lib.Nan.time_nanargmax(200000, 50.0)
- 6.97±0.08μs 6.43±0.02μs 0.92 bench_ma.UFunc.time_scalar(True, False, 10)
- 5.41±0μs 4.98±0.01μs 0.92 bench_ufunc_strides.AVX_cmplx_arithmetic.time_ufunc('subtract', 2, 'F')
- 22.4±0.01μs 20.6±0.02μs 0.92 bench_core.Core.time_array_float64_l1000
- 1.51±0.01ms 1.38±0ms 0.92 bench_core.CorrConv.time_correlate(1000, 10000, 'same')
- 10.1±0.2μs 9.27±0.09μs 0.92 bench_ufunc.UFunc.time_ufunc_types('invert')
- 8.50±0.02μs 7.80±0.09μs 0.92 bench_indexing.ScalarIndexing.time_assign_cast(1)
- 29.5±0.2μs 26.6±0.03μs 0.90 bench_ma.Concatenate.time_it('masked', 100)
- 2.09±0.02ms 1.87±0ms 0.90 bench_ma.UFunc.time_2d(True, True, 1000)
- 298±10μs 267±0.3μs 0.89 bench_app.MaxesOfDots.time_it
- 10.7±0.2μs 9.60±0.02μs 0.89 bench_ma.UFunc.time_1d(True, True, 100)
- 567±3μs 505±2μs 0.89 bench_lib.Nan.time_nanargmax(200000, 90.0)
- 342±0.9μs 282±5μs 0.83 bench_lib.Nan.time_nanargmax(200000, 2.0)
- 307±0.7μs 244±0.8μs 0.80 bench_lib.Nan.time_nanargmax(200000, 0.1)
- 309±1μs 241±0.1μs 0.78 bench_lib.Nan.time_nanargmax(200000, 0)
```
Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/code_generators/generate_umath.py       |  10
-rw-r--r--  numpy/core/src/common/simd/neon/math.h             |  66
-rw-r--r--  numpy/core/src/umath/loops.c.src                   |  89
-rw-r--r--  numpy/core/src/umath/loops.h.src                   |  14
-rw-r--r--  numpy/core/src/umath/loops.h.src.orig              | 717
-rw-r--r--  numpy/core/src/umath/loops_minmax.dispatch.c.src   | 967
6 files changed, 1105 insertions, 758 deletions
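The new scalar helpers in loops_minmax.dispatch.c.src (see the diff below) encode two NaN conventions: maximum/minimum propagate NaN, while fmax/fmin return the non-NaN operand when only one input is NaN. A small standalone C illustration of that difference; the helper function below is ours, not part of the patch:
```c
/*
 * Standalone illustration of the two NaN conventions: "propagate NaN"
 * (np.maximum-style) versus C99 fmaxf, which prefers the non-NaN operand.
 * The helper below is illustrative, not code from the patch.
 */
#include <math.h>
#include <stdio.h>

/* np.maximum-style: a NaN in either operand propagates to the result. */
static float max_propagate_nan(float a, float b)
{
    return (a >= b || isnan(a)) ? a : b;
}

int main(void)
{
    float x = nanf("");
    printf("maximum(NaN, 1.0f) = %f\n", max_propagate_nan(x, 1.0f)); /* nan */
    printf("fmax(NaN, 1.0f)    = %f\n", fmaxf(x, 1.0f));             /* 1.0 */
    return 0;
}
```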
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index c20468a8f..9fa87a11e 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -516,16 +516,14 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.maximum'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(ints+inexactvec, dispatch=[('loops_minmax', ints+inexactvec)]), - TD(noobj, simd=[('avx512f', 'fd')]), + TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]), TD(O, f='npy_ObjectMax') ), 'minimum': Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.minimum'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(ints+inexactvec, dispatch=[('loops_minmax', ints+inexactvec)]), - TD(noobj, simd=[('avx512f', 'fd')]), + TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]), TD(O, f='npy_ObjectMin') ), 'clip': @@ -539,7 +537,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmax'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(inexactvec, dispatch=[('loops_minmax', inexactvec)]), + TD('fdg', dispatch=[('loops_minmax', 'fdg')]), TD(noobj), TD(O, f='npy_ObjectMax') ), @@ -547,7 +545,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmin'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(inexactvec, dispatch=[('loops_minmax', inexactvec)]), + TD('fdg', dispatch=[('loops_minmax', 'fdg')]), TD(noobj), TD(O, f='npy_ObjectMin') ), diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 32b5bc89e..19ea6f22f 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -153,70 +153,4 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return vbslq_s64(npyv_cmplt_s64(a, b), a, b); } -#define npyv_reduce_max_u8 vmaxvq_u8 -#define npyv_reduce_max_s8 vmaxvq_s8 -#define npyv_reduce_max_u16 vmaxvq_u16 -#define npyv_reduce_max_s16 vmaxvq_s16 -#define npyv_reduce_max_u32 vmaxvq_u32 -#define npyv_reduce_max_s32 vmaxvq_s32 -NPY_FINLINE npyv_lanetype_u64 npyv_reduce_max_u64(npyv_u64 v) -{ - npyv_lanetype_u64 a = vgetq_lane_u64(v, 0); - npyv_lanetype_u64 b = vgetq_lane_u64(v, 1); - npyv_lanetype_u64 result = (a > b) ? a : b; - return result; -} -NPY_FINLINE npyv_lanetype_s64 npyv_reduce_max_s64(npyv_s64 v) -{ - npyv_lanetype_s64 a = vgetq_lane_s64(v, 0); - npyv_lanetype_s64 b = vgetq_lane_s64(v, 1); - npyv_lanetype_s64 result = (a > b) ? a : b; - return result; -} -#define npyv_reduce_max_f32 vmaxvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_max_f64 vmaxvq_f64 -#endif // NPY_SIMD_F64 - -// Minimum, supports IEEE floating-point arithmetic (IEC 60559), -// - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. -#define npyv_reduce_maxp_f32 vmaxnmvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_maxp_f64 vmaxnmvq_f64 -#endif // NPY_SIMD_F64 - -#define npyv_reduce_min_u8 vminvq_u8 -#define npyv_reduce_min_s8 vminvq_s8 -#define npyv_reduce_min_u16 vminvq_u16 -#define npyv_reduce_min_s16 vminvq_s16 -#define npyv_reduce_min_u32 vminvq_u32 -#define npyv_reduce_min_s32 vminvq_s32 -NPY_FINLINE npyv_lanetype_u64 npyv_reduce_min_u64(npyv_u64 v) -{ - npyv_lanetype_u64 a = vgetq_lane_u64(v, 0); - npyv_lanetype_u64 b = vgetq_lane_u64(v, 1); - npyv_lanetype_u64 result = (a < b) ? 
a : b; - return result; -} -NPY_FINLINE npyv_lanetype_s64 npyv_reduce_min_s64(npyv_s64 v) -{ - npyv_lanetype_s64 a = vgetq_lane_s64(v, 0); - npyv_lanetype_s64 b = vgetq_lane_s64(v, 1); - npyv_lanetype_s64 result = (a < b) ? a : b; - return result; -} -#define npyv_reduce_min_f32 vminvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_min_f64 vminvq_f64 -#endif // NPY_SIMD_F64 - -// Minimum, supports IEEE floating-point arithmetic (IEC 60559), -// - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. -#define npyv_reduce_minp_f32 vminnmvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_minp_f64 vminnmvq_f64 -#endif // NPY_SIMD_F64 - #endif // _NPY_SIMD_NEON_MATH_H diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 950ea011e..fa7844014 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -13,7 +13,6 @@ #include "numpy/npy_math.h" #include "numpy/halffloat.h" #include "lowlevel_strided_loops.h" -#include "loops.h" #include "npy_pycompat.h" @@ -551,7 +550,6 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void * npy_double, npy_double, npy_double, npy_double# * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0# * #c = hh,uhh,h,uh,,u,l,ul,ll,ull# - * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# */ #define @TYPE@_floor_divide @TYPE@_divide @@ -726,34 +724,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void /**end repeat1**/ -#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -/**begin repeat1 - * #kind = maximum, minimum# - * #OP = >, <# - **/ - -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - io1 = (io1 @OP@ in2) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2; - } - } -} - -/**end repeat1**/ -#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ - NPY_NO_EXPORT void @TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -1597,7 +1567,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #c = f, , l# * #C = F, , L# - * #HAVE_NEON_IMPL = 1*2, HAVE_NEON_IMPL_LONGDOUBLE# */ /**begin repeat1 * #kind = equal, not_equal, less, less_equal, greater, greater_equal, @@ -1720,65 +1689,7 @@ NPY_NO_EXPORT void } npy_clear_floatstatus_barrier((char*)dimensions); } - -#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* */ - if (IS_BINARY_REDUCE) { - if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - } - else { - BINARY_LOOP { - @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? 
in1 : in2; - *((@type@ *)op1) = in1; - } - } - npy_clear_floatstatus_barrier((char*)dimensions); -} -#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -/**end repeat1**/ - -#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -/**begin repeat1 - * #kind = fmax, fmin# - * #OP = >=, <=# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* */ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - io1 = (io1 @OP@ in2 || npy_isnan(in2)) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2; - } - } - npy_clear_floatstatus_barrier((char*)dimensions); -} /**end repeat1**/ -#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ NPY_NO_EXPORT void @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index aab1b5bbb..90115006f 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -23,14 +23,6 @@ #define BOOL_fmin BOOL_minimum /* - * The ARM NEON implementation of min/max for longdouble, longlong, and - * ulonglong assume that they're all 64-bits. If that's not true, then we'll - * use the scalar implementations instead. - */ -#define HAVE_NEON_IMPL_LONGDOUBLE (NPY_BITSOF_LONGDOUBLE == 64) -#define HAVE_NEON_IMPL_LONGLONG (NPY_BITSOF_LONGLONG == 64) - -/* ***************************************************************************** ** BOOLEAN LOOPS ** ***************************************************************************** @@ -680,32 +672,26 @@ PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, vo /**begin repeat * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, * LONG, ULONG, LONGLONG, ULONGLONG# - * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# */ -#if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**begin repeat1 * #kind = maximum, minimum# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) /**end repeat1**/ -#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**end repeat**/ //---------- Float ---------- /**begin repeat * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #HAVE_NEON_IMPL = 1, 1, HAVE_NEON_IMPL_LONGDOUBLE# */ - #if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**begin repeat1 * #kind = maximum, minimum, fmax, fmin# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) /**end repeat1**/ -#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**end repeat**/ /* diff --git a/numpy/core/src/umath/loops.h.src.orig b/numpy/core/src/umath/loops.h.src.orig new file mode 100644 index 000000000..aab1b5bbb --- /dev/null +++ b/numpy/core/src/umath/loops.h.src.orig @@ -0,0 +1,717 @@ +/* -*- c -*- */ +/* + * vim:syntax=c + */ + +#ifndef _NPY_UMATH_LOOPS_H_ +#define _NPY_UMATH_LOOPS_H_ + +#ifndef NPY_NO_EXPORT + #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#define BOOL_invert BOOL_logical_not +#define BOOL_add BOOL_logical_or +#define BOOL_bitwise_and BOOL_logical_and +#define BOOL_bitwise_or BOOL_logical_or +#define 
BOOL_logical_xor BOOL_not_equal +#define BOOL_bitwise_xor BOOL_logical_xor +#define BOOL_multiply BOOL_logical_and +#define BOOL_maximum BOOL_logical_or +#define BOOL_minimum BOOL_logical_and +#define BOOL_fmax BOOL_maximum +#define BOOL_fmin BOOL_minimum + +/* + * The ARM NEON implementation of min/max for longdouble, longlong, and + * ulonglong assume that they're all 64-bits. If that's not true, then we'll + * use the scalar implementations instead. + */ +#define HAVE_NEON_IMPL_LONGDOUBLE (NPY_BITSOF_LONGDOUBLE == 64) +#define HAVE_NEON_IMPL_LONGLONG (NPY_BITSOF_LONGLONG == 64) + +/* + ***************************************************************************** + ** BOOLEAN LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal, + * logical_and, logical_or, absolute, logical_not# + **/ +NPY_NO_EXPORT void +BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + +NPY_NO_EXPORT void +BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat + * #kind = isnan, isinf, isfinite# + **/ +NPY_NO_EXPORT void +BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + +/* + ***************************************************************************** + ** INTEGER LOOPS + ***************************************************************************** + */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithmetic.dispatch.h" +#endif + +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + BYTE, SHORT, INT, LONG, LONGLONG# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + +/**begin repeat + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + */ + +/**begin repeat1 + * both signed and unsigned integer types + * #s = , u# + * #S = , U# + */ + +#define @S@@TYPE@_floor_divide @S@@TYPE@_divide +#define @S@@TYPE@_fmax @S@@TYPE@_maximum +#define @S@@TYPE@_fmin @S@@TYPE@_minimum + +NPY_NO_EXPORT void +@S@@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat2 + * #isa = , _avx2# + */ + +NPY_NO_EXPORT void +@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat3 + * Arithmetic + * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, + * left_shift, right_shift# + * #OP = +, -,*, &, |, ^, <<, 
>># + */ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat3**/ + +/**begin repeat3 + * #kind = equal, not_equal, greater, greater_equal, less, less_equal, + * logical_and, logical_or# + * #OP = ==, !=, >, >=, <, <=, &&, ||# + */ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat3**/ + +NPY_NO_EXPORT void +@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +/**begin repeat2 + * #kind = maximum, minimum# + * #OP = >, <# + **/ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +NPY_NO_EXPORT void +@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_fmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat2 + * #kind = isnan, isinf, isfinite# + **/ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +/**end repeat1**/ + +/**end repeat**/ + +/* + ***************************************************************************** + ** FLOAT LOOPS ** + ***************************************************************************** + */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #kind = sqrt, absolute, square, reciprocal# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_umath_fp.dispatch.h" +#endif + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, 
void *NPY_UNUSED(func))) + +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = sin, cos# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +/**end repeat**/ + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #func = maximum, minimum# + */ +NPY_NO_EXPORT void +@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_trigonometric.dispatch.h" +#endif +/**begin repeat + * #func = sin, cos# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_exponent_log.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * # kind = exp, log, frexp, ldexp# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = rint, ceil, floor, trunc# + */ + +/**begin repeat1 +* #TYPE = FLOAT, DOUBLE# +*/ + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat2 + * #isa = avx512f, fma# + */ +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); +/**end repeat2**/ +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * Float types + * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# + * #c = f, f, , l# + * #C = F, F, , L# + */ + + +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal, + * logical_and, logical_or# + * #OP = ==, !=, <, <=, >, >=, &&, ||# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing# + * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing# + **/ + +/**begin repeat2 + * #ISA = , _avx512_skx# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ +/**end repeat1**/ + +/**begin repeat1 + * #kind = maximum, minimum# + * #OP = >=, <=# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = fmax, fmin# + * #OP = >=, <=# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + 
+NPY_NO_EXPORT void +@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + + +/* + ***************************************************************************** + ** COMPLEX LOOPS ** + ***************************************************************************** + */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + */ +/**begin repeat1 + * #kind = add, subtract, multiply# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi)); +#define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi)); +#define CGT(xr,xi,yr,yi) (xr > yr || (xr == yr && xi > yi)); +#define CLT(xr,xi,yr,yi) (xr < yr || (xr == yr && xi < yi)); +#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi); +#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi); + +/**begin repeat + * complex types + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #c = f, , l# + * #C = F, , L# + */ + +/**begin repeat1 + * arithmetic + * #kind = add, subtract, multiply# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind= greater, greater_equal, less, less_equal, equal, not_equal# + * #OP = CGT, CGE, CLT, CLE, CEQ, CNE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const 
*steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + #kind = logical_and, logical_or# + #OP1 = ||, ||# + #OP2 = &&, ||# +*/ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**begin repeat1 + * #kind = isnan, isinf, isfinite# + * #func = npy_isnan, npy_isinf, npy_isfinite# + * #OP = ||, ||, &&# + **/ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat1 + * #isa = , _avx512f# + */ + +NPY_NO_EXPORT void +C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = maximum, minimum# + * #OP = CGE, CLE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = fmax, fmin# + * #OP = CGE, CLE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**end repeat**/ + +#undef CGE +#undef CLE +#undef CGT +#undef CLT +#undef CEQ +#undef CNE + +/* + ***************************************************************************** + ** DATETIME LOOPS ** + ***************************************************************************** + */ + +NPY_NO_EXPORT void +TIMEDELTA_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat + * #TYPE = DATETIME, TIMEDELTA# + */ + +NPY_NO_EXPORT void +@TYPE@_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +#define @TYPE@_isnan @TYPE@_isnat + +NPY_NO_EXPORT void 
+@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat1 + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + * #OP = ==, !=, >, >=, <, <=# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = maximum, minimum, fmin, fmax# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**end repeat**/ + +NPY_NO_EXPORT void +DATETIME_Mm_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +DATETIME_mM_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +DATETIME_Mm_M_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +DATETIME_MM_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mq_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_qm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_md_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_md_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_d_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/* Special case equivalents to above functions */ +#define TIMEDELTA_mq_m_floor_divide TIMEDELTA_mq_m_divide +#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide +/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */ + +/* + ***************************************************************************** + ** OBJECT LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + * #OP = EQ, NE, GT, GE, LT, LE# + */ +/**begin repeat1 + * #suffix = , _OO_O# + */ +NPY_NO_EXPORT void +OBJECT@suffix@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ +/**end repeat**/ + 
+NPY_NO_EXPORT void +OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func); + +/* + ***************************************************************************** + ** MIN/MAX LOOPS ** + ***************************************************************************** + */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_minmax.dispatch.h" +#endif + +//---------- Integers ---------- + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# + */ +#if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**begin repeat1 + * #kind = maximum, minimum# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**end repeat**/ + +//---------- Float ---------- + + /**begin repeat + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #HAVE_NEON_IMPL = 1, 1, HAVE_NEON_IMPL_LONGDOUBLE# + */ + #if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**begin repeat1 + * #kind = maximum, minimum, fmax, fmin# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**end repeat**/ + +/* + ***************************************************************************** + ** END LOOPS ** + ***************************************************************************** + */ + +#endif diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src index 06aed1bb7..c2eddcee6 100644 --- a/numpy/core/src/umath/loops_minmax.dispatch.c.src +++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src @@ -1,6 +1,6 @@ /*@targets ** $maxopt baseline - ** neon + ** neon asimd **/ #define _UMATHMODULE #define _MULTIARRAYMODULE @@ -13,659 +13,460 @@ // Provides the various *_LOOP macros #include "fast_loop_macros.h" +/******************************************************************************* + ** Scalar intrinsics + ******************************************************************************/ +// signed/unsigned int +#define scalar_max_i(A, B) ((A > B) ? A : B) +#define scalar_min_i(A, B) ((A < B) ? A : B) +// fp, propagates NaNs +#if !defined(__aarch64__) +#define scalar_max_f(A, B) ((A >= B || npy_isnan(A)) ? A : B) +#define scalar_max_d scalar_max_f +#define scalar_max_l scalar_max_f +#define scalar_min_f(A, B) ((A <= B || npy_isnan(A)) ? 
A : B) +#define scalar_min_d scalar_min_f +#define scalar_min_l scalar_min_f +// fp, ignores NaNs +#define scalar_maxp_f fmaxf +#define scalar_maxp_d fmax +#define scalar_maxp_l fmaxl +#define scalar_minp_f fminf +#define scalar_minp_d fmin +#define scalar_minp_l fminl +#elif defined(__aarch64__) +/**begin repeat + * #type = npy_float, npy_double, npy_longdouble# + * #scalar_sfx = f, d, l# + * #asm_sfx = s, d, d# + */ +/**begin repeat1 + * #op = max, min, maxp, minp# + * #asm_instr = fmax, fmin, fmaxnm, fminnm# + */ +static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){ + @type@ result = 0; + __asm( + "@asm_instr@ %@asm_sfx@[result], %@asm_sfx@[a], %@asm_sfx@[b]" + : [result] "=w" (result) + : [a] "w" (a), [b] "w" (b) + ); + return result; +} +/**end repeat1**/ +/**end repeat**/ +#endif // !defined(__aarch64__) -//############################################################################## -//## Maximum, Minimum -//############################################################################## - -#ifdef NPY_HAVE_NEON -//****************************************************************************** -//** NEON Min / Max -//****************************************************************************** - - -//------------------------------------------------------------------------------ -//-- Integer -//------------------------------------------------------------------------------ +/******************************************************************************* + ** extra SIMD intrinsics + ******************************************************************************/ +#if NPY_SIMD +/**begin repeat + * #sfx = s8, u8, s16, u16, s32, u32, s64, u64# + * #is_64 = 0*6, 1*2# + */ +#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__) + #if !@is_64@ + #define npyv_reduce_min_@sfx@ vminvq_@sfx@ + #define npyv_reduce_max_@sfx@ vmaxvq_@sfx@ + #else + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_min_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0); + npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1); + npyv_lanetype_@sfx@ result = (a < b) ? a : b; + return result; + } + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_max_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0); + npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1); + npyv_lanetype_@sfx@ result = (a > b) ? 
a : b; + return result; + } + #endif // !@is_64@ +#else + /**begin repeat1 + * #intrin = min, max# + */ + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@]; + npyv_storea_@sfx@(s, v); + npyv_lanetype_@sfx@ result = s[0]; + for(int i=1; i<npyv_nlanes_@sfx@; ++i){ + result = scalar_@intrin@_i(result, s[i]); + } + return result; + } + /**end repeat1**/ +#endif +/**end repeat**/ +#endif // NPY_SIMD /**begin repeat - * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, - * LONG, ULONG, LONGLONG, ULONGLONG# - * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, - * npy_long, npy_ulong, npy_longlong, npy_ulonglong# - * #sfx = s8, u8, s16, u16, s32, u32, - * s64, u64, s64, u64# - * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# + * #sfx = f32, f64# + * #bsfx = b32, b64# + * #simd_chk = NPY_SIMD, NPY_SIMD_F64# + * #scalar_sfx = f, d# */ +#if @simd_chk@ +#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__) + #define npyv_minn_@sfx@ vminq_@sfx@ + #define npyv_maxn_@sfx@ vmaxq_@sfx@ + #define npyv_reduce_minn_@sfx@ vminvq_@sfx@ + #define npyv_reduce_maxn_@sfx@ vmaxvq_@sfx@ + #define npyv_reduce_minp_@sfx@ vminnmvq_@sfx@ + #define npyv_reduce_maxp_@sfx@ vmaxnmvq_@sfx@ +#else + /**begin repeat1 + * #intrin = min, max# + */ + // propagates NaNs + NPY_FINLINE npyv_@sfx@ npyv_@intrin@n_@sfx@(npyv_@sfx@ a, npyv_@sfx@ b) + { + npyv_@sfx@ result = npyv_@intrin@_@sfx@(a, b); + result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b); + result = npyv_select_@sfx@(npyv_notnan_@sfx@(a), result, a); + return result; + } + /**end repeat1**/ + /**begin repeat1 + * #intrin = minn, maxn, minp, maxp# + * #scalar_intrin = min, max, minp, maxp# + */ + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@]; + npyv_storea_@sfx@(s, v); + npyv_lanetype_@sfx@ result = s[0]; + for(int i=1; i<npyv_nlanes_@sfx@; ++i){ + result = scalar_@scalar_intrin@_@scalar_sfx@(result, s[i]); + } + return result; + } + /**end repeat1**/ +#endif +#endif // simd_chk +/**end repeat**/ -// Implementation below assumes longlong and ulonglong are 64-bit. -#if @HAVE_NEON_IMPL@ +/******************************************************************************* + ** Defining the SIMD kernels + ******************************************************************************/ +/**begin repeat + * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64# + * #simd_chk = NPY_SIMD*9, NPY_SIMD_F64# + * #is_fp = 0*8, 1, 1# + * #scalar_sfx = i*8, f, d# + */ /**begin repeat1 -* Arithmetic -* # kind = maximum, minimum# -* # OP = >, <# -* # vop = max, min# -*/ - -#define SCALAR_OP @TYPE@_scalar_@vop@ -static inline @type@ SCALAR_OP(@type@ a, @type@ b){ - return ((a @OP@ b) ? a : b); -} + * # intrin = max, min, maxp, minp# + * # fp_only = 0, 0, 1, 1# + */ +#define SCALAR_OP scalar_@intrin@_@scalar_sfx@ +#if @simd_chk@ && (!@fp_only@ || (@is_fp@ && @fp_only@)) +#if @is_fp@ && !@fp_only@ + #define V_INTRIN npyv_@intrin@n_@sfx@ // propagates NaNs + #define V_REDUCE_INTRIN npyv_reduce_@intrin@n_@sfx@ +#else + #define V_INTRIN npyv_@intrin@_@sfx@ + #define V_REDUCE_INTRIN npyv_reduce_@intrin@_@sfx@ +#endif -static inline npy_intp -simd_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +// contiguous input. 
+static inline void +simd_reduce_c_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npyv_lanetype_@sfx@ *op1, npy_intp len) { - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - @type@ *ip2 = (@type@ *)args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - + // Note, 8x unroll was chosen for best results on Apple M1 const int vectorsPerLoop = 8; - const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@); npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + npy_intp i = 0; + npyv_lanetype_@sfx@ io1 = op1[0]; + // SIMD if possible - if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ - #define NPYV_CAST (const npyv_lanetype_@sfx@ *) - npyv_@sfx@ m0 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 0 * elemPerVector]); - npyv_@sfx@ m1 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 1 * elemPerVector]); - npyv_@sfx@ m2 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 2 * elemPerVector]); - npyv_@sfx@ m3 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 3 * elemPerVector]); - npyv_@sfx@ m4 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 4 * elemPerVector]); - npyv_@sfx@ m5 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 5 * elemPerVector]); - npyv_@sfx@ m6 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 6 * elemPerVector]); - npyv_@sfx@ m7 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 7 * elemPerVector]); + if((i+elemPerLoop) <= len){ + npyv_@sfx@ m0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]); + npyv_@sfx@ m1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]); + npyv_@sfx@ m2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]); + npyv_@sfx@ m3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]); + npyv_@sfx@ m4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]); + npyv_@sfx@ m5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]); + npyv_@sfx@ m6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]); + npyv_@sfx@ m7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]); i += elemPerLoop; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - npyv_@sfx@ v0 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 0 * elemPerVector]); - npyv_@sfx@ v1 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 1 * elemPerVector]); - npyv_@sfx@ v2 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 2 * elemPerVector]); - npyv_@sfx@ v3 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 3 * elemPerVector]); - npyv_@sfx@ v4 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 4 * elemPerVector]); - npyv_@sfx@ v5 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 5 * elemPerVector]); - npyv_@sfx@ v6 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 6 * elemPerVector]); - npyv_@sfx@ v7 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 7 * elemPerVector]); - - m0 = npyv_@vop@_@sfx@(m0, v0); - m1 = npyv_@vop@_@sfx@(m1, v1); - m2 = npyv_@vop@_@sfx@(m2, v2); - m3 = npyv_@vop@_@sfx@(m3, v3); - m4 = npyv_@vop@_@sfx@(m4, v4); - m5 = npyv_@vop@_@sfx@(m5, v5); - m6 = npyv_@vop@_@sfx@(m6, v6); - m7 = npyv_@vop@_@sfx@(m7, v7); + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]); + npyv_@sfx@ v6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]); + npyv_@sfx@ v7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]); + + m0 = V_INTRIN(m0, v0); + m1 = V_INTRIN(m1, v1); + m2 = V_INTRIN(m2, v2); + m3 = V_INTRIN(m3, v3); + m4 = V_INTRIN(m4, v4); + m5 = V_INTRIN(m5, v5); + m6 = V_INTRIN(m6, v6); + m7 = V_INTRIN(m7, v7); } - #undef 
NPYV_CAST + m0 = V_INTRIN(m0, m1); + m2 = V_INTRIN(m2, m3); + m4 = V_INTRIN(m4, m5); + m6 = V_INTRIN(m6, m7); - m0 = npyv_@vop@_@sfx@(m0, m1); - m2 = npyv_@vop@_@sfx@(m2, m3); - m4 = npyv_@vop@_@sfx@(m4, m5); - m6 = npyv_@vop@_@sfx@(m6, m7); + m0 = V_INTRIN(m0, m2); + m4 = V_INTRIN(m4, m6); - m0 = npyv_@vop@_@sfx@(m0, m2); - m4 = npyv_@vop@_@sfx@(m4, m6); + m0 = V_INTRIN(m0, m4); - m0 = npyv_@vop@_@sfx@(m0, m4); - - @type@ r = npyv_reduce_@vop@_@sfx@(m0); + npyv_lanetype_@sfx@ r = V_REDUCE_INTRIN(m0); io1 = SCALAR_OP(io1, r); } - *((@type@ *)iop1) = io1; - - return i; -} - -static inline npy_intp -scalar_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - char *ip2 = args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - - // 8x scalar - npy_intp elemPerLoop = 8; - if((i+elemPerLoop) <= n){ - @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); - @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); - @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); - @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); - @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); - @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); - @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); - @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); - - i += elemPerLoop; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); - @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); - @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); - @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); - @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); - @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); - @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); - @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); - - m0 = SCALAR_OP(m0, v0); - m1 = SCALAR_OP(m1, v1); - m2 = SCALAR_OP(m2, v2); - m3 = SCALAR_OP(m3, v3); - m4 = SCALAR_OP(m4, v4); - m5 = SCALAR_OP(m5, v5); - m6 = SCALAR_OP(m6, v6); - m7 = SCALAR_OP(m7, v7); - } - - m0 = SCALAR_OP(m0, m1); - m2 = SCALAR_OP(m2, m3); - m4 = SCALAR_OP(m4, m5); - m6 = SCALAR_OP(m6, m7); - - m0 = SCALAR_OP(m0, m2); - m4 = SCALAR_OP(m4, m6); - - m0 = SCALAR_OP(m0, m4); - - io1 = SCALAR_OP(io1, m0); - } - - *((@type@ *)iop1) = io1; - - return i; -} - -static inline void -run_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - char *ip2 = args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - npy_intp i; - - const int vectorsPerLoop = 8; - const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); - npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; - - i = 0; - - if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ - // SIMD - do as many iterations as we can with vectors - i = simd_reduce_@TYPE@_@kind@(args, dimensions, steps, i); - } else{ - // Unrolled scalar - do as many iterations as we can with unrolled loops - i = scalar_reduce_@TYPE@_@kind@(args, dimensions, steps, i); - } - // Scalar - finish up any remaining iterations - io1 = iop1[0]; - ip2 += i * is2; - for(; i<n; ++i, ip2 += is2){ - const @type@ in2 = *(@type@ *)ip2; + for(; i<len; ++i){ + const npyv_lanetype_@sfx@ in2 = ip[i]; io1 = SCALAR_OP(io1, in2); } - *((@type@ *)iop1) = io1; + op1[0] = io1; } -static inline npy_intp -simd_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +// contiguous inputs and output. 
+static inline void +simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanetype_@sfx@ *ip2, + npyv_lanetype_@sfx@ *op1, npy_intp len) { - char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; - npy_intp n = dimensions[0]; - + // Note, 6x unroll was chosen for best results on Apple M1 const int vectorsPerLoop = 6; - const int elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + const int elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@); int elemPerLoop = vectorsPerLoop * elemPerVector; - if(IS_BINARY_STRIDE_ONE(sizeof(@type@), NPY_SIMD_WIDTH)){ - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - npyv_@sfx@ v0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 0 * elemPerVector) * is1)); - npyv_@sfx@ v1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 1 * elemPerVector) * is1)); - npyv_@sfx@ v2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 2 * elemPerVector) * is1)); - npyv_@sfx@ v3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 3 * elemPerVector) * is1)); - npyv_@sfx@ v4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 4 * elemPerVector) * is1)); - npyv_@sfx@ v5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 5 * elemPerVector) * is1)); - - npyv_@sfx@ u0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 0 * elemPerVector) * is2)); - npyv_@sfx@ u1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 1 * elemPerVector) * is2)); - npyv_@sfx@ u2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 2 * elemPerVector) * is2)); - npyv_@sfx@ u3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 3 * elemPerVector) * is2)); - npyv_@sfx@ u4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 4 * elemPerVector) * is2)); - npyv_@sfx@ u5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 5 * elemPerVector) * is2)); - - npyv_@sfx@ m0 = npyv_@vop@_@sfx@(v0, u0); - npyv_@sfx@ m1 = npyv_@vop@_@sfx@(v1, u1); - npyv_@sfx@ m2 = npyv_@vop@_@sfx@(v2, u2); - npyv_@sfx@ m3 = npyv_@vop@_@sfx@(v3, u3); - npyv_@sfx@ m4 = npyv_@vop@_@sfx@(v4, u4); - npyv_@sfx@ m5 = npyv_@vop@_@sfx@(v5, u5); - - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 0 * elemPerVector) * os1), m0); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 1 * elemPerVector) * os1), m1); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 2 * elemPerVector) * os1), m2); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 3 * elemPerVector) * os1), m3); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 4 * elemPerVector) * os1), m4); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 5 * elemPerVector) * os1), m5); - } + npy_intp i = 0; + + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(&ip1[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip1[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(&ip1[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(&ip1[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(&ip1[i + 5 * elemPerVector]); + + npyv_@sfx@ u0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); + npyv_@sfx@ u1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); + npyv_@sfx@ u2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); + npyv_@sfx@ u3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); + npyv_@sfx@ u4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); + npyv_@sfx@ u5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); + + 
npyv_@sfx@ m0 = V_INTRIN(v0, u0); + npyv_@sfx@ m1 = V_INTRIN(v1, u1); + npyv_@sfx@ m2 = V_INTRIN(v2, u2); + npyv_@sfx@ m3 = V_INTRIN(v3, u3); + npyv_@sfx@ m4 = V_INTRIN(v4, u4); + npyv_@sfx@ m5 = V_INTRIN(v5, u5); + + npyv_store_@sfx@(&op1[i + 0 * elemPerVector], m0); + npyv_store_@sfx@(&op1[i + 1 * elemPerVector], m1); + npyv_store_@sfx@(&op1[i + 2 * elemPerVector], m2); + npyv_store_@sfx@(&op1[i + 3 * elemPerVector], m3); + npyv_store_@sfx@(&op1[i + 4 * elemPerVector], m4); + npyv_store_@sfx@(&op1[i + 5 * elemPerVector], m5); } - - return i; -} - -static inline npy_intp -scalar_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; - npy_intp n = dimensions[0]; - - npy_intp elemPerLoop = 4; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - /* Note, we can't just load all, do all ops, then store all here. - * Sometimes ufuncs are called with `accumulate`, which makes the - * assumption that previous iterations have finished before next - * iteration. For example, the output of iteration 2 depends on the - * result of iteration 1. - */ - - /**begin repeat2 - * #unroll = 0, 1, 2, 3# - */ - @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1)); - @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2)); - *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@); - /**end repeat2**/ - } - - return i; -} - -static inline void -run_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - BINARY_DEFS - - i = 0; - - if(IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH) && - nomemoverlap(ip1, is1 * n, op1, os1 * n) && - nomemoverlap(ip2, is2 * n, op1, os1 * n)) - { - // SIMD - do as many iterations as we can with vectors - i = simd_binary_@TYPE@_@kind@(args, dimensions, steps, i); - } else { - // Unrolled scalar - do as many iterations as we can with unrolled loops - i = scalar_binary_@TYPE@_@kind@(args, dimensions, steps, i); - } - // Scalar - finish up any remaining iterations - ip1 += is1 * i; - ip2 += is2 * i; - op1 += os1 * i; - for(; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1){ - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; + for(; i<len; ++i){ + const npyv_lanetype_@sfx@ in1 = ip1[i]; + const npyv_lanetype_@sfx@ in2 = ip2[i]; - *((@type@ *)op1) = SCALAR_OP(in1, in2); + op1[i] = SCALAR_OP(in1, in2); } } -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - run_reduce_@TYPE@_@kind@(args, dimensions, steps); - } else { - run_binary_@TYPE@_@kind@(args, dimensions, steps); - } -} +#undef V_INTRIN +#undef V_REDUCE_INTRIN -#undef SCALAR_OP +#endif // simd_chk && (!fp_only || (is_fp && fp_only)) +#undef SCALAR_OP /**end repeat1**/ - -#endif // @HAVE_NEON_IMPL@ - /**end repeat**/ - -//------------------------------------------------------------------------------ -//-- Float -//------------------------------------------------------------------------------ - +/******************************************************************************* + ** Defining ufunc inner functions + ******************************************************************************/ /**begin repeat - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #type = npy_float, npy_double, npy_longdouble# - * #sfx = f32, f64, f64# - * #asmType = s, d, d# - * #HAVE_NEON_IMPL = 
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ *         BYTE, SHORT, INT, LONG, LONGLONG,
+ *         FLOAT, DOUBLE, LONGDOUBLE#
+ *
+ * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG,
+ *          BYTE, SHORT, INT, LONG, LONGLONG,
+ *          FLOAT, DOUBLE, LONGDOUBLE#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ *         float, double, long double#
+ *
+ * #is_fp = 0*10, 1*3#
+ * #is_unsigend = 1*5, 0*5, 0*3#
+ * #scalar_sfx = i*10, f, d, l#
  */
-
-// Implementation below assumes longdouble is 64-bit.
-#if @HAVE_NEON_IMPL@
-
+#undef TO_SIMD_SFX
+#if 0
 /**begin repeat1
-* Arithmetic
-* # kind = maximum, minimum, fmax, fmin#
-* # OP = >=, <=, >=, <=#
-* # vop = max, min, maxp, minp#
-* # asmInstr = fmax, fmin, fmaxnm, fminnm#
-* # PROPAGATE_NAN = 1, 1, 0, 0#
-*/
-
-#define SCALAR_OP @TYPE@_scalar_@vop@
-static inline @type@ SCALAR_OP(@type@ a, @type@ b){
-#ifdef __aarch64__
-    @type@ result = 0;
-    __asm(
-        "@asmInstr@ %@asmType@[result], %@asmType@[a], %@asmType@[b]"
-        : [result] "=w" (result)
-        : [a] "w" (a), [b] "w" (b)
-    );
-    return result;
-#else
-
-#if @PROPAGATE_NAN@
-    return ((a @OP@ b || npy_isnan(a)) ? a : b);
-#else
-    return ((a @OP@ b || npy_isnan(b)) ? a : b);
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
+    #if @is_fp@
+        #define TO_SIMD_SFX(X) X##_f@len@
+        #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif @is_unsigend@
+        #define TO_SIMD_SFX(X) X##_u@len@
+    #else
+        #define TO_SIMD_SFX(X) X##_s@len@
+    #endif
+/**end repeat1**/
 #endif
-#endif // __aarch64__
-}
-static inline npy_intp
-simd_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i)
-{
-    @type@ *iop1 = (@type@ *)args[0];
-    @type@ io1 = iop1[0];
-    @type@ *ip2 = (@type@ *)args[1];
-    npy_intp is2 = steps[1];
-    npy_intp n = dimensions[0];
-
-    const int vectorsPerLoop = 8;
-    const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@);
-    npy_intp elemPerLoop = vectorsPerLoop * elemPerVector;
-
-    // SIMD if possible
-    if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){
-        npyv_@sfx@ m0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]);
-        npyv_@sfx@ m1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]);
-        npyv_@sfx@ m2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]);
-        npyv_@sfx@ m3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]);
-        npyv_@sfx@ m4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]);
-        npyv_@sfx@ m5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]);
-        npyv_@sfx@ m6 = npyv_load_@sfx@(&ip2[i + 6 * elemPerVector]);
-        npyv_@sfx@ m7 = npyv_load_@sfx@(&ip2[i + 7 * elemPerVector]);
 /**begin repeat1
+ * # kind = maximum, minimum, fmax, fmin#
+ * # intrin = max, min, maxp, minp#
+ * # fp_only = 0, 0, 1, 1#
+ */
+#if !@fp_only@ || (@is_fp@ && @fp_only@)
+#define SCALAR_OP scalar_@intrin@_@scalar_sfx@
-        i += elemPerLoop;
-        for(; (i+elemPerLoop)<=n; i+=elemPerLoop){
-            npyv_@sfx@ v0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]);
-            npyv_@sfx@ v1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]);
-            npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]);
-            npyv_@sfx@ v3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]);
-            npyv_@sfx@ v4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]);
-            npyv_@sfx@ v5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]);
-            npyv_@sfx@ v6 = npyv_load_@sfx@(&ip2[i + 6 * elemPerVector]);
-            npyv_@sfx@ v7 = npyv_load_@sfx@(&ip2[i + 7 * elemPerVector]);
-
-            m0 = npyv_@vop@_@sfx@(m0, v0);
-            m1 = npyv_@vop@_@sfx@(m1, v1);
-            m2 = npyv_@vop@_@sfx@(m2, v2);
-            m3 = npyv_@vop@_@sfx@(m3, v3);
-            m4 = npyv_@vop@_@sfx@(m4, v4);
-            m5 = npyv_@vop@_@sfx@(m5, v5);
-            m6 = npyv_@vop@_@sfx@(m6, v6);
-            m7 = npyv_@vop@_@sfx@(m7, v7);
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0;
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(@type@)) {
+            TO_SIMD_SFX(simd_reduce_c_@intrin@)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
         }
-
-        m0 = npyv_@vop@_@sfx@(m0, m1);
-        m2 = npyv_@vop@_@sfx@(m2, m3);
-        m4 = npyv_@vop@_@sfx@(m4, m5);
-        m6 = npyv_@vop@_@sfx@(m6, m7);
-
-        m0 = npyv_@vop@_@sfx@(m0, m2);
-        m4 = npyv_@vop@_@sfx@(m4, m6);
-
-        m0 = npyv_@vop@_@sfx@(m0, m4);
-
-        @type@ r = npyv_reduce_@vop@_@sfx@(m0);
-
-        io1 = SCALAR_OP(io1, r);
     }
-
-    *((@type@ *)iop1) = io1;
-
-    return i;
-}
-
-static inline npy_intp
-scalar_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i)
-{
-    @type@ *iop1 = (@type@ *)args[0];
-    @type@ io1 = iop1[0];
-    char *ip2 = args[1];
-    npy_intp is2 = steps[1];
-    npy_intp n = dimensions[0];
-
-    // 8x scalar
-    npy_intp elemPerLoop = 8;
-    if((i+elemPerLoop) <= n){
-        @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2));
-        @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2));
-        @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2));
-        @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2));
-        @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2));
-        @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2));
-        @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2));
-        @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2));
-
-        i += elemPerLoop;
-        for(; (i+elemPerLoop)<=n; i+=elemPerLoop){
-            @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2));
-            @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2));
-            @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2));
-            @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2));
-            @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2));
-            @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2));
-            @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2));
-            @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2));
-
-            m0 = SCALAR_OP(m0, v0);
-            m1 = SCALAR_OP(m1, v1);
-            m2 = SCALAR_OP(m2, v2);
-            m3 = SCALAR_OP(m3, v3);
-            m4 = SCALAR_OP(m4, v4);
-            m5 = SCALAR_OP(m5, v5);
-            m6 = SCALAR_OP(m6, v6);
-            m7 = SCALAR_OP(m7, v7);
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+             !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(@type@, @type@)) {
+            TO_SIMD_SFX(simd_binary_ccc_@intrin@)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
         }
     }
-
-        m0 = SCALAR_OP(m0, m1);
-        m2 = SCALAR_OP(m2, m3);
-        m4 = SCALAR_OP(m4, m5);
-        m6 = SCALAR_OP(m6, m7);
-
-        m0 = SCALAR_OP(m0, m2);
-        m4 = SCALAR_OP(m4, m6);
-
-        m0 = SCALAR_OP(m0, m4);
-
-        io1 = SCALAR_OP(io1, m0);
-    }
-
-    *((@type@ *)iop1) = io1;
-
-    return i;
-}
-
-static inline void
-run_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-    @type@ *iop1 = (@type@ *)args[0];
-    @type@ io1 = iop1[0];
-    char *ip2 = args[1];
-    npy_intp is2 = steps[1];
-    npy_intp n = dimensions[0];
-    npy_intp i;
-
-    const int vectorsPerLoop = 8;
-    const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@);
-    npy_intp elemPerLoop = vectorsPerLoop * elemPerVector;
-
-    i = 0;
-
-    if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){
-        // SIMD - do as many iterations as we can with vectors
-        i = simd_reduce_@TYPE@_@kind@(args, dimensions, steps, i);
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2));
+            @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2));
+            @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2));
+            @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2));
+            @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2));
+            @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2));
+            @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2));
+            @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2));
+                @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2));
+                @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2));
+                @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2));
+                @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2));
+                @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2));
+                @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2));
+                @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+            *((@type@ *)op1) = SCALAR_OP(*((@type@ *)op1), m0);
+        }
     } else{
-        // Unrolled scalar - do as many iterations as we can with unrolled loops
-        i = scalar_reduce_@TYPE@_@kind@(args, dimensions, steps, i);
-    }
-
-    // Scalar - finish up any remaining iterations
-    io1 = iop1[0];
-    ip2 += i * is2;
-    for(; i<n; ++i, ip2 += is2){
-        const @type@ in2 = *(@type@ *)ip2;
-        io1 = SCALAR_OP(io1, in2);
-    }
-    *((@type@ *)iop1) = io1;
-}
-
-static inline npy_intp
-simd_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i)
-{
-    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
-    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
-    npy_intp n = dimensions[0];
-
-    const int vectorsPerLoop = 6;
-    const int elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@);
-    int elemPerLoop = vectorsPerLoop * elemPerVector;
-
-    if(IS_BINARY_STRIDE_ONE(sizeof(@type@), NPY_SIMD_WIDTH)){
-        for(; (i+elemPerLoop)<=n; i+=elemPerLoop){
-            npyv_@sfx@ v0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 0 * elemPerVector) * is1));
-            npyv_@sfx@ v1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 1 * elemPerVector) * is1));
-            npyv_@sfx@ v2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 2 * elemPerVector) * is1));
-            npyv_@sfx@ v3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 3 * elemPerVector) * is1));
-            npyv_@sfx@ v4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 4 * elemPerVector) * is1));
-            npyv_@sfx@ v5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 5 * elemPerVector) * is1));
-
-            npyv_@sfx@ u0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 0 * elemPerVector) * is2));
-            npyv_@sfx@ u1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 1 * elemPerVector) * is2));
-            npyv_@sfx@ u2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 2 * elemPerVector) * is2));
-            npyv_@sfx@ u3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 3 * elemPerVector) * is2));
-            npyv_@sfx@ u4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 4 * elemPerVector) * is2));
-            npyv_@sfx@ u5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 5 * elemPerVector) * is2));
-
-            npyv_@sfx@ m0 = npyv_@vop@_@sfx@(v0, u0);
-            npyv_@sfx@ m1 = npyv_@vop@_@sfx@(v1, u1);
-            npyv_@sfx@ m2 = npyv_@vop@_@sfx@(v2, u2);
-            npyv_@sfx@ m3 = npyv_@vop@_@sfx@(v3, u3);
-            npyv_@sfx@ m4 = npyv_@vop@_@sfx@(v4, u4);
-            npyv_@sfx@ m5 = npyv_@vop@_@sfx@(v5, u5);
-
-            npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 0 * elemPerVector) * os1), m0);
-            npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 1 * elemPerVector) * os1), m1);
-            npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 2 * elemPerVector) * os1), m2);
-            npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 3 * elemPerVector) * os1), m3);
-            npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 4 * elemPerVector) * os1), m4);
-            npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 5 * elemPerVector) * os1), m5);
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration. For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            /**begin repeat2
+             * #unroll = 0, 1, 2, 3#
+             */
+            @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1));
+            @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2));
+            *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@);
+            /**end repeat2**/
         }
     }
-
-    return i;
-}
-
-static inline npy_intp
-scalar_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i)
-{
-    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
-    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];
-    npy_intp n = dimensions[0];
-
-    npy_intp elemPerLoop = 4;
-    for(; (i+elemPerLoop)<=n; i+=elemPerLoop){
-        /* Note, we can't just load all, do all ops, then store all here.
-         * Sometimes ufuncs are called with `accumulate`, which makes the
-         * assumption that previous iterations have finished before next
-         * iteration. For example, the output of iteration 2 depends on the
-         * result of iteration 1.
-         */
-
-        /**begin repeat2
-         * #unroll = 0, 1, 2, 3#
-         */
-        @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1));
-        @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2));
-        *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@);
-        /**end repeat2**/
-    }
-
-    return i;
-}
-
-static inline void
-run_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-    BINARY_DEFS
-
-    i = 0;
-
-    if(IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH) &&
-       nomemoverlap(ip1, is1 * n, op1, os1 * n) &&
-       nomemoverlap(ip2, is2 * n, op1, os1 * n))
-    {
-        // SIMD - do as many iterations as we can with vectors
-        i = simd_binary_@TYPE@_@kind@(args, dimensions, steps, i);
-    } else {
-        // Unrolled scalar - do as many iterations as we can with unrolled loops
-        i = scalar_binary_@TYPE@_@kind@(args, dimensions, steps, i);
-    }
-
-    // Scalar - finish up any remaining iterations
+#endif // NPY_DISABLE_OPTIMIZATION
     ip1 += is1 * i;
     ip2 += is2 * i;
     op1 += os1 * i;
-    for(; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1){
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
-
        *((@type@ *)op1) = SCALAR_OP(in1, in2);
     }
-}
-
-
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
-(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (IS_BINARY_REDUCE) {
-        run_reduce_@TYPE@_@kind@(args, dimensions, steps);
-    } else {
-        run_binary_@TYPE@_@kind@(args, dimensions, steps);
-    }
-
+#ifdef TO_SIMD_SFX
+clear_fp:;
+#endif
+#if @is_fp@
     npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
 }
 #undef SCALAR_OP
+#endif // !fp_only || (is_fp && fp_only)
 /**end repeat1**/
-
 /**end repeat**/
-#endif // NPY_HAVE_NEON
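
**Sketch: what the contiguous binary kernel expands to**

For readers who don't work with NumPy's `.c.src` templating every day, below is a rough sketch of what `simd_binary_ccc_@intrin@_@sfx@` boils down to for one concrete instantiation (float32 `maximum`, assuming `V_INTRIN` resolves to `npyv_max_f32` via the macro setup that sits earlier in this file). The real template writes the six loads/ops/stores out explicitly rather than in an inner loop so each vector gets its own register chain; this is an illustration meant to be read inside the dispatch source (where `simd/simd.h` and the npy headers are already available), not a drop-in replacement.

```c
// Illustrative expansion only: float32 "maximum", contiguous inputs and output.
static inline void
sketch_binary_ccc_max_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *ip2,
                          npyv_lanetype_f32 *op1, npy_intp len)
{
    // 6 vectors per iteration, matching the template's unroll factor
    const int vectorsPerLoop = 6;
    const int elemPerVector  = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_f32);
    int elemPerLoop = vectorsPerLoop * elemPerVector;
    npy_intp i = 0;

    for (; (i + elemPerLoop) <= len; i += elemPerLoop) {
        // the template spells these six steps out; rolled up here for brevity
        for (int k = 0; k < vectorsPerLoop; ++k) {
            npyv_f32 v = npyv_load_f32(&ip1[i + k * elemPerVector]);
            npyv_f32 u = npyv_load_f32(&ip2[i + k * elemPerVector]);
            npyv_store_f32(&op1[i + k * elemPerVector], npyv_max_f32(v, u));
        }
    }
    // scalar tail; stands in for SCALAR_OP, which for "maximum" propagates NaN
    for (; i < len; ++i) {
        op1[i] = (ip1[i] >= ip2[i] || npy_isnan(ip1[i])) ? ip1[i] : ip2[i];
    }
}
```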
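**Sketch: the TO_SIMD_SFX token pasting**

The `TO_SIMD_SFX` block above is plain token pasting: for each `@TYPE@` it decides whether a matching SIMD lane suffix exists (`f32`/`f64` for floats, signed/unsigned integer suffixes otherwise) and, if so, glues that suffix onto a function stem; if no suffix is defined, the inner loop falls back to the scalar unrolls. A stripped-down, standalone sketch of the idea (names here are illustrative, not the dispatcher's real output):

```c
#include <stdio.h>

/* Pretend we are in the FLOAT pass of the repeat block: the type is
 * 32-bit floating point, so the template defines TO_SIMD_SFX(X) as X##_f32. */
#define TO_SIMD_SFX(X) X##_f32

static void simd_binary_ccc_max_f32(void) { puts("dispatched to the f32 kernel"); }

int main(void)
{
#ifdef TO_SIMD_SFX
    /* TO_SIMD_SFX(simd_binary_ccc_max) expands to simd_binary_ccc_max_f32,
     * which is how the inner loop picks the per-type kernel at compile time. */
    TO_SIMD_SFX(simd_binary_ccc_max)();
#endif
    return 0;
}
```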
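**Sketch: why the reduce path keeps eight accumulators**

The 8x scalar unroll in the reduce branch is the classic multiple-accumulator trick: `maximum.reduce` has a serial dependency on a single running value, so the template keeps eight partial results live and folds them together pairwise at the end. A minimal standalone version of the same pattern (a hypothetical helper on plain `float`, using `fmaxf` purely for the shape of the loop; its NaN handling differs from `np.maximum`):

```c
#include <stddef.h>
#include <math.h>

// Hypothetical helper showing the 8-accumulator reduce pattern; assumes n >= 1.
static float reduce_max_unrolled(const float *x, size_t n)
{
    float acc = x[0];
    size_t i = 1;
    if (n - i >= 8) {
        float m0 = x[i+0], m1 = x[i+1], m2 = x[i+2], m3 = x[i+3];
        float m4 = x[i+4], m5 = x[i+5], m6 = x[i+6], m7 = x[i+7];
        for (i += 8; i + 8 <= n; i += 8) {
            // eight independent chains instead of one serial dependency
            m0 = fmaxf(m0, x[i+0]); m1 = fmaxf(m1, x[i+1]);
            m2 = fmaxf(m2, x[i+2]); m3 = fmaxf(m3, x[i+3]);
            m4 = fmaxf(m4, x[i+4]); m5 = fmaxf(m5, x[i+5]);
            m6 = fmaxf(m6, x[i+6]); m7 = fmaxf(m7, x[i+7]);
        }
        // pairwise tree combine, same shape as the template above
        m0 = fmaxf(m0, m1); m2 = fmaxf(m2, m3);
        m4 = fmaxf(m4, m5); m6 = fmaxf(m6, m7);
        m0 = fmaxf(m0, m2); m4 = fmaxf(m4, m6);
        acc = fmaxf(acc, fmaxf(m0, m4));
    }
    for (; i < n; ++i) {
        acc = fmaxf(acc, x[i]);
    }
    return acc;
}
```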
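**Sketch: the `accumulate` aliasing hazard behind the 4x unroll**

The comment about `accumulate` in the 4x scalar unroll is worth spelling out. Roughly speaking, for `np.maximum.accumulate` the ufunc machinery calls this same binary inner loop with the output buffer aliasing the first input shifted by one element, so the value stored in iteration `i` is read back as `in1` in iteration `i + 1`. That is why each unrolled step must complete its store before the next step loads, and why the SIMD path is gated on `is_mem_overlap`. A toy model of that aliased call, under the simplifying assumption of unit strides measured in elements:

```c
#include <stdio.h>

// Toy model of the aliased "accumulate" call: out[i] depends on out[i-1].
// in2 is the data being accumulated; out doubles as the running first input.
static void maximum_accumulate_toy(const double *in2, double *out, int len)
{
    for (int i = 1; i < len; ++i) {
        const double in1 = out[i - 1];   // result of the previous iteration
        out[i] = in1 >= in2[i] ? in1 : in2[i];
        // Batching several loads up front and storing afterwards would read
        // stale values here -- exactly the hazard the template's comment describes.
    }
}

int main(void)
{
    double buf[5] = {3, 1, 4, 1, 5};
    maximum_accumulate_toy(buf, buf, 5);   // running maximum: 3 3 4 4 5
    for (int i = 0; i < 5; ++i) {
        printf("%g ", buf[i]);
    }
    printf("\n");
    return 0;
}
```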