Diffstat (limited to 'numpy')
-rw-r--r--  numpy/core/code_generators/generate_umath.py     |  10
-rw-r--r--  numpy/core/src/common/simd/neon/math.h           |  66
-rw-r--r--  numpy/core/src/umath/loops.c.src                 |  89
-rw-r--r--  numpy/core/src/umath/loops.h.src                 |  14
-rw-r--r--  numpy/core/src/umath/loops.h.src.orig            | 717
-rw-r--r--  numpy/core/src/umath/loops_minmax.dispatch.c.src | 967
6 files changed, 1105 insertions, 758 deletions
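The new loops_minmax dispatch source in this patch distinguishes two scalar semantics: maximum/minimum propagate NaN (if either operand is NaN the result is NaN), while fmax/fmin return NaN only when both operands are NaN. A minimal standalone C sketch of those two rules follows; it is illustration only, not part of the patch, and the helper names are hypothetical (the propagating variant mirrors the patch's non-aarch64 scalar_max_d fallback, the non-propagating one simply defers to C99 fmax).

#include <math.h>
#include <stdio.h>

/* Propagating semantics (maximum/minimum): if either operand is NaN,
 * the result is NaN. Same predicate as the patch's scalar fallback:
 * (A >= B || npy_isnan(A)) ? A : B */
static double max_propagate_nan(double a, double b)
{
    return (a >= b || isnan(a)) ? a : b;
}

/* Non-propagating semantics (fmax/fmin): NaN is returned only when
 * both operands are NaN. C99 fmax() already behaves this way. */
static double max_ignore_nan(double a, double b)
{
    return fmax(a, b);
}

int main(void)
{
    printf("%g %g\n", max_propagate_nan(NAN, 1.0), max_ignore_nan(NAN, 1.0));
    /* typically prints: nan 1 */
    return 0;
}
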
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index c20468a8f..9fa87a11e 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -516,16 +516,14 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.maximum'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(ints+inexactvec, dispatch=[('loops_minmax', ints+inexactvec)]), - TD(noobj, simd=[('avx512f', 'fd')]), + TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]), TD(O, f='npy_ObjectMax') ), 'minimum': Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.minimum'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(ints+inexactvec, dispatch=[('loops_minmax', ints+inexactvec)]), - TD(noobj, simd=[('avx512f', 'fd')]), + TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]), TD(O, f='npy_ObjectMin') ), 'clip': @@ -539,7 +537,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmax'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(inexactvec, dispatch=[('loops_minmax', inexactvec)]), + TD('fdg', dispatch=[('loops_minmax', 'fdg')]), TD(noobj), TD(O, f='npy_ObjectMax') ), @@ -547,7 +545,7 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmin'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(inexactvec, dispatch=[('loops_minmax', inexactvec)]), + TD('fdg', dispatch=[('loops_minmax', 'fdg')]), TD(noobj), TD(O, f='npy_ObjectMin') ), diff --git a/numpy/core/src/common/simd/neon/math.h b/numpy/core/src/common/simd/neon/math.h index 32b5bc89e..19ea6f22f 100644 --- a/numpy/core/src/common/simd/neon/math.h +++ b/numpy/core/src/common/simd/neon/math.h @@ -153,70 +153,4 @@ NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) return vbslq_s64(npyv_cmplt_s64(a, b), a, b); } -#define npyv_reduce_max_u8 vmaxvq_u8 -#define npyv_reduce_max_s8 vmaxvq_s8 -#define npyv_reduce_max_u16 vmaxvq_u16 -#define npyv_reduce_max_s16 vmaxvq_s16 -#define npyv_reduce_max_u32 vmaxvq_u32 -#define npyv_reduce_max_s32 vmaxvq_s32 -NPY_FINLINE npyv_lanetype_u64 npyv_reduce_max_u64(npyv_u64 v) -{ - npyv_lanetype_u64 a = vgetq_lane_u64(v, 0); - npyv_lanetype_u64 b = vgetq_lane_u64(v, 1); - npyv_lanetype_u64 result = (a > b) ? a : b; - return result; -} -NPY_FINLINE npyv_lanetype_s64 npyv_reduce_max_s64(npyv_s64 v) -{ - npyv_lanetype_s64 a = vgetq_lane_s64(v, 0); - npyv_lanetype_s64 b = vgetq_lane_s64(v, 1); - npyv_lanetype_s64 result = (a > b) ? a : b; - return result; -} -#define npyv_reduce_max_f32 vmaxvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_max_f64 vmaxvq_f64 -#endif // NPY_SIMD_F64 - -// Minimum, supports IEEE floating-point arithmetic (IEC 60559), -// - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. -#define npyv_reduce_maxp_f32 vmaxnmvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_maxp_f64 vmaxnmvq_f64 -#endif // NPY_SIMD_F64 - -#define npyv_reduce_min_u8 vminvq_u8 -#define npyv_reduce_min_s8 vminvq_s8 -#define npyv_reduce_min_u16 vminvq_u16 -#define npyv_reduce_min_s16 vminvq_s16 -#define npyv_reduce_min_u32 vminvq_u32 -#define npyv_reduce_min_s32 vminvq_s32 -NPY_FINLINE npyv_lanetype_u64 npyv_reduce_min_u64(npyv_u64 v) -{ - npyv_lanetype_u64 a = vgetq_lane_u64(v, 0); - npyv_lanetype_u64 b = vgetq_lane_u64(v, 1); - npyv_lanetype_u64 result = (a < b) ? 
a : b; - return result; -} -NPY_FINLINE npyv_lanetype_s64 npyv_reduce_min_s64(npyv_s64 v) -{ - npyv_lanetype_s64 a = vgetq_lane_s64(v, 0); - npyv_lanetype_s64 b = vgetq_lane_s64(v, 1); - npyv_lanetype_s64 result = (a < b) ? a : b; - return result; -} -#define npyv_reduce_min_f32 vminvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_min_f64 vminvq_f64 -#endif // NPY_SIMD_F64 - -// Minimum, supports IEEE floating-point arithmetic (IEC 60559), -// - If one of the two vectors contains NaN, the equivalent element of the other vector is set -// - Only if both corresponded elements are NaN, NaN is set. -#define npyv_reduce_minp_f32 vminnmvq_f32 -#if NPY_SIMD_F64 - #define npyv_reduce_minp_f64 vminnmvq_f64 -#endif // NPY_SIMD_F64 - #endif // _NPY_SIMD_NEON_MATH_H diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 950ea011e..fa7844014 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -13,7 +13,6 @@ #include "numpy/npy_math.h" #include "numpy/halffloat.h" #include "lowlevel_strided_loops.h" -#include "loops.h" #include "npy_pycompat.h" @@ -551,7 +550,6 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void * npy_double, npy_double, npy_double, npy_double# * #SIGNED = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0# * #c = hh,uhh,h,uh,,u,l,ul,ll,ull# - * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# */ #define @TYPE@_floor_divide @TYPE@_divide @@ -726,34 +724,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void /**end repeat1**/ -#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -/**begin repeat1 - * #kind = maximum, minimum# - * #OP = >, <# - **/ - -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - io1 = (io1 @OP@ in2) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - *((@type@ *)op1) = (in1 @OP@ in2) ? in1 : in2; - } - } -} - -/**end repeat1**/ -#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ - NPY_NO_EXPORT void @TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { @@ -1597,7 +1567,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# * #c = f, , l# * #C = F, , L# - * #HAVE_NEON_IMPL = 1*2, HAVE_NEON_IMPL_LONGDOUBLE# */ /**begin repeat1 * #kind = equal, not_equal, less, less_equal, greater, greater_equal, @@ -1720,65 +1689,7 @@ NPY_NO_EXPORT void } npy_clear_floatstatus_barrier((char*)dimensions); } - -#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* */ - if (IS_BINARY_REDUCE) { - if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - } - else { - BINARY_LOOP { - @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? 
in1 : in2; - *((@type@ *)op1) = in1; - } - } - npy_clear_floatstatus_barrier((char*)dimensions); -} -#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -/**end repeat1**/ - -#if !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ -/**begin repeat1 - * #kind = fmax, fmin# - * #OP = >=, <=# - **/ -NPY_NO_EXPORT void -@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - /* */ - if (IS_BINARY_REDUCE) { - BINARY_REDUCE_LOOP(@type@) { - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - io1 = (io1 @OP@ in2 || npy_isnan(in2)) ? io1 : in2; - } - *((@type@ *)iop1) = io1; - } - else { - BINARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; - /* Order of operations important for MSVC 2015 */ - *((@type@ *)op1) = (in1 @OP@ in2 || npy_isnan(in2)) ? in1 : in2; - } - } - npy_clear_floatstatus_barrier((char*)dimensions); -} /**end repeat1**/ -#endif // !defined(NPY_HAVE_NEON) || !@HAVE_NEON_IMPL@ NPY_NO_EXPORT void @TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index aab1b5bbb..90115006f 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -23,14 +23,6 @@ #define BOOL_fmin BOOL_minimum /* - * The ARM NEON implementation of min/max for longdouble, longlong, and - * ulonglong assume that they're all 64-bits. If that's not true, then we'll - * use the scalar implementations instead. - */ -#define HAVE_NEON_IMPL_LONGDOUBLE (NPY_BITSOF_LONGDOUBLE == 64) -#define HAVE_NEON_IMPL_LONGLONG (NPY_BITSOF_LONGLONG == 64) - -/* ***************************************************************************** ** BOOLEAN LOOPS ** ***************************************************************************** @@ -680,32 +672,26 @@ PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, vo /**begin repeat * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, * LONG, ULONG, LONGLONG, ULONGLONG# - * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# */ -#if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**begin repeat1 * #kind = maximum, minimum# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) /**end repeat1**/ -#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**end repeat**/ //---------- Float ---------- /**begin repeat * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #HAVE_NEON_IMPL = 1, 1, HAVE_NEON_IMPL_LONGDOUBLE# */ - #if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**begin repeat1 * #kind = maximum, minimum, fmax, fmin# */ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) /**end repeat1**/ -#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ /**end repeat**/ /* diff --git a/numpy/core/src/umath/loops.h.src.orig b/numpy/core/src/umath/loops.h.src.orig new file mode 100644 index 000000000..aab1b5bbb --- /dev/null +++ b/numpy/core/src/umath/loops.h.src.orig @@ -0,0 +1,717 @@ +/* -*- c -*- */ +/* + * vim:syntax=c + */ + +#ifndef _NPY_UMATH_LOOPS_H_ +#define _NPY_UMATH_LOOPS_H_ + +#ifndef NPY_NO_EXPORT + #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN +#endif + +#define BOOL_invert BOOL_logical_not +#define BOOL_add BOOL_logical_or +#define BOOL_bitwise_and BOOL_logical_and +#define BOOL_bitwise_or BOOL_logical_or +#define 
BOOL_logical_xor BOOL_not_equal +#define BOOL_bitwise_xor BOOL_logical_xor +#define BOOL_multiply BOOL_logical_and +#define BOOL_maximum BOOL_logical_or +#define BOOL_minimum BOOL_logical_and +#define BOOL_fmax BOOL_maximum +#define BOOL_fmin BOOL_minimum + +/* + * The ARM NEON implementation of min/max for longdouble, longlong, and + * ulonglong assume that they're all 64-bits. If that's not true, then we'll + * use the scalar implementations instead. + */ +#define HAVE_NEON_IMPL_LONGDOUBLE (NPY_BITSOF_LONGDOUBLE == 64) +#define HAVE_NEON_IMPL_LONGLONG (NPY_BITSOF_LONGLONG == 64) + +/* + ***************************************************************************** + ** BOOLEAN LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal, + * logical_and, logical_or, absolute, logical_not# + **/ +NPY_NO_EXPORT void +BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + +NPY_NO_EXPORT void +BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat + * #kind = isnan, isinf, isfinite# + **/ +NPY_NO_EXPORT void +BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + +/* + ***************************************************************************** + ** INTEGER LOOPS + ***************************************************************************** + */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithmetic.dispatch.h" +#endif + +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + BYTE, SHORT, INT, LONG, LONGLONG# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat**/ + +/**begin repeat + * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG# + */ + +/**begin repeat1 + * both signed and unsigned integer types + * #s = , u# + * #S = , U# + */ + +#define @S@@TYPE@_floor_divide @S@@TYPE@_divide +#define @S@@TYPE@_fmax @S@@TYPE@_maximum +#define @S@@TYPE@_fmin @S@@TYPE@_minimum + +NPY_NO_EXPORT void +@S@@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat2 + * #isa = , _avx2# + */ + +NPY_NO_EXPORT void +@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat3 + * Arithmetic + * #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor, + * left_shift, right_shift# + * #OP = +, -,*, &, |, ^, <<, 
>># + */ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat3**/ + +/**begin repeat3 + * #kind = equal, not_equal, greater, greater_equal, less, less_equal, + * logical_and, logical_or# + * #OP = ==, !=, >, >=, <, <=, &&, ||# + */ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat3**/ + +NPY_NO_EXPORT void +@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +/**begin repeat2 + * #kind = maximum, minimum# + * #OP = >, <# + **/ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +NPY_NO_EXPORT void +@S@@TYPE@_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_fmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat2 + * #kind = isnan, isinf, isfinite# + **/ +NPY_NO_EXPORT void +@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ + +/**end repeat1**/ + +/**end repeat**/ + +/* + ***************************************************************************** + ** FLOAT LOOPS ** + ***************************************************************************** + */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #kind = sqrt, absolute, square, reciprocal# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_umath_fp.dispatch.h" +#endif + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, 
void *NPY_UNUSED(func))) + +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = sin, cos# + */ + +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) + +/**end repeat**/ + +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * #func = maximum, minimum# + */ +NPY_NO_EXPORT void +@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_trigonometric.dispatch.h" +#endif +/**begin repeat + * #func = sin, cos# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_@func@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_exponent_log.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE# + */ +/**begin repeat1 + * # kind = exp, log, frexp, ldexp# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( + char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func) +)) +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * #func = rint, ceil, floor, trunc# + */ + +/**begin repeat1 +* #TYPE = FLOAT, DOUBLE# +*/ + +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat2 + * #isa = avx512f, fma# + */ +NPY_NO_EXPORT NPY_GCC_OPT_3 void +@TYPE@_@func@_@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); +/**end repeat2**/ +/**end repeat1**/ +/**end repeat**/ + +/**begin repeat + * Float types + * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# + * #c = f, f, , l# + * #C = F, F, , L# + */ + + +/**begin repeat1 + * Arithmetic + * # kind = add, subtract, multiply, divide# + * # OP = +, -, *, /# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = equal, not_equal, less, less_equal, greater, greater_equal, + * logical_and, logical_or# + * #OP = ==, !=, <, <=, >, >=, &&, ||# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing# + * #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing# + **/ + +/**begin repeat2 + * #ISA = , _avx512_skx# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat2**/ +/**end repeat1**/ + +/**begin repeat1 + * #kind = maximum, minimum# + * #OP = >=, <=# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = fmax, fmin# + * #OP = >=, <=# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + 
+NPY_NO_EXPORT void +@TYPE@_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +@TYPE@_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat**/ + + +/* + ***************************************************************************** + ** COMPLEX LOOPS ** + ***************************************************************************** + */ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_arithm_fp.dispatch.h" +#endif +/**begin repeat + * #TYPE = CFLOAT, CDOUBLE# + */ +/**begin repeat1 + * #kind = add, subtract, multiply# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#define CGE(xr,xi,yr,yi) (xr > yr || (xr == yr && xi >= yi)); +#define CLE(xr,xi,yr,yi) (xr < yr || (xr == yr && xi <= yi)); +#define CGT(xr,xi,yr,yi) (xr > yr || (xr == yr && xi > yi)); +#define CLT(xr,xi,yr,yi) (xr < yr || (xr == yr && xi < yi)); +#define CEQ(xr,xi,yr,yi) (xr == yr && xi == yi); +#define CNE(xr,xi,yr,yi) (xr != yr || xi != yi); + +/**begin repeat + * complex types + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #c = f, , l# + * #C = F, , L# + */ + +/**begin repeat1 + * arithmetic + * #kind = add, subtract, multiply# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind= greater, greater_equal, less, less_equal, equal, not_equal# + * #OP = CGT, CGE, CLT, CLE, CEQ, CNE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const 
*steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + #kind = logical_and, logical_or# + #OP1 = ||, ||# + #OP2 = &&, ||# +*/ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**begin repeat1 + * #kind = isnan, isinf, isfinite# + * #func = npy_isnan, npy_isinf, npy_isfinite# + * #OP = ||, ||, &&# + **/ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +C@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat1 + * #isa = , _avx512f# + */ + +NPY_NO_EXPORT void +C@TYPE@_conjugate@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_absolute@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_square@isa@(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data)); +/**end repeat1**/ + +NPY_NO_EXPORT void +C@TYPE@__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +C@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat1 + * #kind = maximum, minimum# + * #OP = CGE, CLE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = fmax, fmin# + * #OP = CGE, CLE# + */ +NPY_NO_EXPORT void +C@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**end repeat**/ + +#undef CGE +#undef CLE +#undef CGT +#undef CLT +#undef CEQ +#undef CNE + +/* + ***************************************************************************** + ** DATETIME LOOPS ** + ***************************************************************************** + */ + +NPY_NO_EXPORT void +TIMEDELTA_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/**begin repeat + * #TYPE = DATETIME, TIMEDELTA# + */ + +NPY_NO_EXPORT void +@TYPE@_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +@TYPE@_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +#define @TYPE@_isnan @TYPE@_isnat + +NPY_NO_EXPORT void 
+@TYPE@__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +/**begin repeat1 + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + * #OP = ==, !=, >, >=, <, <=# + */ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**begin repeat1 + * #kind = maximum, minimum, fmin, fmax# + **/ +NPY_NO_EXPORT void +@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ + +/**end repeat**/ + +NPY_NO_EXPORT void +DATETIME_Mm_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)); + +NPY_NO_EXPORT void +DATETIME_mM_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +DATETIME_Mm_M_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +DATETIME_MM_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mq_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_qm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_md_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_md_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_d_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +/* Special case equivalents to above functions */ +#define TIMEDELTA_mq_m_floor_divide TIMEDELTA_mq_m_divide +#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide +/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */ + +/* + ***************************************************************************** + ** OBJECT LOOPS ** + ***************************************************************************** + */ + +/**begin repeat + * #kind = equal, not_equal, greater, greater_equal, less, less_equal# + * #OP = EQ, NE, GT, GE, LT, LE# + */ +/**begin repeat1 + * #suffix = , _OO_O# + */ +NPY_NO_EXPORT void +OBJECT@suffix@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +/**end repeat1**/ +/**end repeat**/ + 
+NPY_NO_EXPORT void +OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func); + +/* + ***************************************************************************** + ** MIN/MAX LOOPS ** + ***************************************************************************** + */ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_minmax.dispatch.h" +#endif + +//---------- Integers ---------- + +/**begin repeat + * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, + * LONG, ULONG, LONGLONG, ULONGLONG# + * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# + */ +#if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**begin repeat1 + * #kind = maximum, minimum# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**end repeat**/ + +//---------- Float ---------- + + /**begin repeat + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + * #HAVE_NEON_IMPL = 1, 1, HAVE_NEON_IMPL_LONGDOUBLE# + */ + #if defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**begin repeat1 + * #kind = maximum, minimum, fmax, fmin# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +#endif // defined(NPY_HAVE_NEON) && @HAVE_NEON_IMPL@ +/**end repeat**/ + +/* + ***************************************************************************** + ** END LOOPS ** + ***************************************************************************** + */ + +#endif diff --git a/numpy/core/src/umath/loops_minmax.dispatch.c.src b/numpy/core/src/umath/loops_minmax.dispatch.c.src index 06aed1bb7..c2eddcee6 100644 --- a/numpy/core/src/umath/loops_minmax.dispatch.c.src +++ b/numpy/core/src/umath/loops_minmax.dispatch.c.src @@ -1,6 +1,6 @@ /*@targets ** $maxopt baseline - ** neon + ** neon asimd **/ #define _UMATHMODULE #define _MULTIARRAYMODULE @@ -13,659 +13,460 @@ // Provides the various *_LOOP macros #include "fast_loop_macros.h" +/******************************************************************************* + ** Scalar intrinsics + ******************************************************************************/ +// signed/unsigned int +#define scalar_max_i(A, B) ((A > B) ? A : B) +#define scalar_min_i(A, B) ((A < B) ? A : B) +// fp, propagates NaNs +#if !defined(__aarch64__) +#define scalar_max_f(A, B) ((A >= B || npy_isnan(A)) ? A : B) +#define scalar_max_d scalar_max_f +#define scalar_max_l scalar_max_f +#define scalar_min_f(A, B) ((A <= B || npy_isnan(A)) ? 
A : B) +#define scalar_min_d scalar_min_f +#define scalar_min_l scalar_min_f +// fp, ignores NaNs +#define scalar_maxp_f fmaxf +#define scalar_maxp_d fmax +#define scalar_maxp_l fmaxl +#define scalar_minp_f fminf +#define scalar_minp_d fmin +#define scalar_minp_l fminl +#elif defined(__aarch64__) +/**begin repeat + * #type = npy_float, npy_double, npy_longdouble# + * #scalar_sfx = f, d, l# + * #asm_sfx = s, d, d# + */ +/**begin repeat1 + * #op = max, min, maxp, minp# + * #asm_instr = fmax, fmin, fmaxnm, fminnm# + */ +static inline @type@ scalar_@op@_@scalar_sfx@(@type@ a, @type@ b){ + @type@ result = 0; + __asm( + "@asm_instr@ %@asm_sfx@[result], %@asm_sfx@[a], %@asm_sfx@[b]" + : [result] "=w" (result) + : [a] "w" (a), [b] "w" (b) + ); + return result; +} +/**end repeat1**/ +/**end repeat**/ +#endif // !defined(__aarch64__) -//############################################################################## -//## Maximum, Minimum -//############################################################################## - -#ifdef NPY_HAVE_NEON -//****************************************************************************** -//** NEON Min / Max -//****************************************************************************** - - -//------------------------------------------------------------------------------ -//-- Integer -//------------------------------------------------------------------------------ +/******************************************************************************* + ** extra SIMD intrinsics + ******************************************************************************/ +#if NPY_SIMD +/**begin repeat + * #sfx = s8, u8, s16, u16, s32, u32, s64, u64# + * #is_64 = 0*6, 1*2# + */ +#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__) + #if !@is_64@ + #define npyv_reduce_min_@sfx@ vminvq_@sfx@ + #define npyv_reduce_max_@sfx@ vmaxvq_@sfx@ + #else + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_min_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0); + npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1); + npyv_lanetype_@sfx@ result = (a < b) ? a : b; + return result; + } + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_max_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ a = vgetq_lane_@sfx@(v, 0); + npyv_lanetype_@sfx@ b = vgetq_lane_@sfx@(v, 1); + npyv_lanetype_@sfx@ result = (a > b) ? 
a : b; + return result; + } + #endif // !@is_64@ +#else + /**begin repeat1 + * #intrin = min, max# + */ + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@]; + npyv_storea_@sfx@(s, v); + npyv_lanetype_@sfx@ result = s[0]; + for(int i=1; i<npyv_nlanes_@sfx@; ++i){ + result = scalar_@intrin@_i(result, s[i]); + } + return result; + } + /**end repeat1**/ +#endif +/**end repeat**/ +#endif // NPY_SIMD /**begin repeat - * #TYPE = BYTE, UBYTE, SHORT, USHORT, INT, UINT, - * LONG, ULONG, LONGLONG, ULONGLONG# - * #type = npy_byte, npy_ubyte, npy_short, npy_ushort, npy_int, npy_uint, - * npy_long, npy_ulong, npy_longlong, npy_ulonglong# - * #sfx = s8, u8, s16, u16, s32, u32, - * s64, u64, s64, u64# - * #HAVE_NEON_IMPL = 1*8, HAVE_NEON_IMPL_LONGLONG*2# + * #sfx = f32, f64# + * #bsfx = b32, b64# + * #simd_chk = NPY_SIMD, NPY_SIMD_F64# + * #scalar_sfx = f, d# */ +#if @simd_chk@ +#if defined(NPY_HAVE_ASIMD) && defined(__aarch64__) + #define npyv_minn_@sfx@ vminq_@sfx@ + #define npyv_maxn_@sfx@ vmaxq_@sfx@ + #define npyv_reduce_minn_@sfx@ vminvq_@sfx@ + #define npyv_reduce_maxn_@sfx@ vmaxvq_@sfx@ + #define npyv_reduce_minp_@sfx@ vminnmvq_@sfx@ + #define npyv_reduce_maxp_@sfx@ vmaxnmvq_@sfx@ +#else + /**begin repeat1 + * #intrin = min, max# + */ + // propagates NaNs + NPY_FINLINE npyv_@sfx@ npyv_@intrin@n_@sfx@(npyv_@sfx@ a, npyv_@sfx@ b) + { + npyv_@sfx@ result = npyv_@intrin@_@sfx@(a, b); + result = npyv_select_@sfx@(npyv_notnan_@sfx@(b), result, b); + result = npyv_select_@sfx@(npyv_notnan_@sfx@(a), result, a); + return result; + } + /**end repeat1**/ + /**begin repeat1 + * #intrin = minn, maxn, minp, maxp# + * #scalar_intrin = min, max, minp, maxp# + */ + NPY_FINLINE npyv_lanetype_@sfx@ npyv_reduce_@intrin@_@sfx@(npyv_@sfx@ v) + { + npyv_lanetype_@sfx@ s[npyv_nlanes_@sfx@]; + npyv_storea_@sfx@(s, v); + npyv_lanetype_@sfx@ result = s[0]; + for(int i=1; i<npyv_nlanes_@sfx@; ++i){ + result = scalar_@scalar_intrin@_@scalar_sfx@(result, s[i]); + } + return result; + } + /**end repeat1**/ +#endif +#endif // simd_chk +/**end repeat**/ -// Implementation below assumes longlong and ulonglong are 64-bit. -#if @HAVE_NEON_IMPL@ +/******************************************************************************* + ** Defining the SIMD kernels + ******************************************************************************/ +/**begin repeat + * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64# + * #simd_chk = NPY_SIMD*9, NPY_SIMD_F64# + * #is_fp = 0*8, 1, 1# + * #scalar_sfx = i*8, f, d# + */ /**begin repeat1 -* Arithmetic -* # kind = maximum, minimum# -* # OP = >, <# -* # vop = max, min# -*/ - -#define SCALAR_OP @TYPE@_scalar_@vop@ -static inline @type@ SCALAR_OP(@type@ a, @type@ b){ - return ((a @OP@ b) ? a : b); -} + * # intrin = max, min, maxp, minp# + * # fp_only = 0, 0, 1, 1# + */ +#define SCALAR_OP scalar_@intrin@_@scalar_sfx@ +#if @simd_chk@ && (!@fp_only@ || (@is_fp@ && @fp_only@)) +#if @is_fp@ && !@fp_only@ + #define V_INTRIN npyv_@intrin@n_@sfx@ // propagates NaNs + #define V_REDUCE_INTRIN npyv_reduce_@intrin@n_@sfx@ +#else + #define V_INTRIN npyv_@intrin@_@sfx@ + #define V_REDUCE_INTRIN npyv_reduce_@intrin@_@sfx@ +#endif -static inline npy_intp -simd_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +// contiguous input. 
+static inline void +simd_reduce_c_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npyv_lanetype_@sfx@ *op1, npy_intp len) { - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - @type@ *ip2 = (@type@ *)args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - + // Note, 8x unroll was chosen for best results on Apple M1 const int vectorsPerLoop = 8; - const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@); npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; + npy_intp i = 0; + npyv_lanetype_@sfx@ io1 = op1[0]; + // SIMD if possible - if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ - #define NPYV_CAST (const npyv_lanetype_@sfx@ *) - npyv_@sfx@ m0 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 0 * elemPerVector]); - npyv_@sfx@ m1 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 1 * elemPerVector]); - npyv_@sfx@ m2 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 2 * elemPerVector]); - npyv_@sfx@ m3 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 3 * elemPerVector]); - npyv_@sfx@ m4 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 4 * elemPerVector]); - npyv_@sfx@ m5 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 5 * elemPerVector]); - npyv_@sfx@ m6 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 6 * elemPerVector]); - npyv_@sfx@ m7 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 7 * elemPerVector]); + if((i+elemPerLoop) <= len){ + npyv_@sfx@ m0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]); + npyv_@sfx@ m1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]); + npyv_@sfx@ m2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]); + npyv_@sfx@ m3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]); + npyv_@sfx@ m4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]); + npyv_@sfx@ m5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]); + npyv_@sfx@ m6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]); + npyv_@sfx@ m7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]); i += elemPerLoop; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - npyv_@sfx@ v0 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 0 * elemPerVector]); - npyv_@sfx@ v1 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 1 * elemPerVector]); - npyv_@sfx@ v2 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 2 * elemPerVector]); - npyv_@sfx@ v3 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 3 * elemPerVector]); - npyv_@sfx@ v4 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 4 * elemPerVector]); - npyv_@sfx@ v5 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 5 * elemPerVector]); - npyv_@sfx@ v6 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 6 * elemPerVector]); - npyv_@sfx@ v7 = npyv_load_@sfx@(NPYV_CAST &ip2[i + 7 * elemPerVector]); - - m0 = npyv_@vop@_@sfx@(m0, v0); - m1 = npyv_@vop@_@sfx@(m1, v1); - m2 = npyv_@vop@_@sfx@(m2, v2); - m3 = npyv_@vop@_@sfx@(m3, v3); - m4 = npyv_@vop@_@sfx@(m4, v4); - m5 = npyv_@vop@_@sfx@(m5, v5); - m6 = npyv_@vop@_@sfx@(m6, v6); - m7 = npyv_@vop@_@sfx@(m7, v7); + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(&ip[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(&ip[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(&ip[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(&ip[i + 5 * elemPerVector]); + npyv_@sfx@ v6 = npyv_load_@sfx@(&ip[i + 6 * elemPerVector]); + npyv_@sfx@ v7 = npyv_load_@sfx@(&ip[i + 7 * elemPerVector]); + + m0 = V_INTRIN(m0, v0); + m1 = V_INTRIN(m1, v1); + m2 = V_INTRIN(m2, v2); + m3 = V_INTRIN(m3, v3); + m4 = V_INTRIN(m4, v4); + m5 = V_INTRIN(m5, v5); + m6 = V_INTRIN(m6, v6); + m7 = V_INTRIN(m7, v7); } - #undef 
NPYV_CAST + m0 = V_INTRIN(m0, m1); + m2 = V_INTRIN(m2, m3); + m4 = V_INTRIN(m4, m5); + m6 = V_INTRIN(m6, m7); - m0 = npyv_@vop@_@sfx@(m0, m1); - m2 = npyv_@vop@_@sfx@(m2, m3); - m4 = npyv_@vop@_@sfx@(m4, m5); - m6 = npyv_@vop@_@sfx@(m6, m7); + m0 = V_INTRIN(m0, m2); + m4 = V_INTRIN(m4, m6); - m0 = npyv_@vop@_@sfx@(m0, m2); - m4 = npyv_@vop@_@sfx@(m4, m6); + m0 = V_INTRIN(m0, m4); - m0 = npyv_@vop@_@sfx@(m0, m4); - - @type@ r = npyv_reduce_@vop@_@sfx@(m0); + npyv_lanetype_@sfx@ r = V_REDUCE_INTRIN(m0); io1 = SCALAR_OP(io1, r); } - *((@type@ *)iop1) = io1; - - return i; -} - -static inline npy_intp -scalar_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - char *ip2 = args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - - // 8x scalar - npy_intp elemPerLoop = 8; - if((i+elemPerLoop) <= n){ - @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); - @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); - @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); - @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); - @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); - @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); - @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); - @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); - - i += elemPerLoop; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); - @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); - @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); - @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); - @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); - @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); - @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); - @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); - - m0 = SCALAR_OP(m0, v0); - m1 = SCALAR_OP(m1, v1); - m2 = SCALAR_OP(m2, v2); - m3 = SCALAR_OP(m3, v3); - m4 = SCALAR_OP(m4, v4); - m5 = SCALAR_OP(m5, v5); - m6 = SCALAR_OP(m6, v6); - m7 = SCALAR_OP(m7, v7); - } - - m0 = SCALAR_OP(m0, m1); - m2 = SCALAR_OP(m2, m3); - m4 = SCALAR_OP(m4, m5); - m6 = SCALAR_OP(m6, m7); - - m0 = SCALAR_OP(m0, m2); - m4 = SCALAR_OP(m4, m6); - - m0 = SCALAR_OP(m0, m4); - - io1 = SCALAR_OP(io1, m0); - } - - *((@type@ *)iop1) = io1; - - return i; -} - -static inline void -run_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - char *ip2 = args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - npy_intp i; - - const int vectorsPerLoop = 8; - const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); - npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; - - i = 0; - - if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ - // SIMD - do as many iterations as we can with vectors - i = simd_reduce_@TYPE@_@kind@(args, dimensions, steps, i); - } else{ - // Unrolled scalar - do as many iterations as we can with unrolled loops - i = scalar_reduce_@TYPE@_@kind@(args, dimensions, steps, i); - } - // Scalar - finish up any remaining iterations - io1 = iop1[0]; - ip2 += i * is2; - for(; i<n; ++i, ip2 += is2){ - const @type@ in2 = *(@type@ *)ip2; + for(; i<len; ++i){ + const npyv_lanetype_@sfx@ in2 = ip[i]; io1 = SCALAR_OP(io1, in2); } - *((@type@ *)iop1) = io1; + op1[0] = io1; } -static inline npy_intp -simd_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) +// contiguous inputs and output. 
+static inline void +simd_binary_ccc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip1, const npyv_lanetype_@sfx@ *ip2, + npyv_lanetype_@sfx@ *op1, npy_intp len) { - char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; - npy_intp n = dimensions[0]; - + // Note, 6x unroll was chosen for best results on Apple M1 const int vectorsPerLoop = 6; - const int elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); + const int elemPerVector = NPY_SIMD_WIDTH / sizeof(npyv_lanetype_@sfx@); int elemPerLoop = vectorsPerLoop * elemPerVector; - if(IS_BINARY_STRIDE_ONE(sizeof(@type@), NPY_SIMD_WIDTH)){ - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - npyv_@sfx@ v0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 0 * elemPerVector) * is1)); - npyv_@sfx@ v1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 1 * elemPerVector) * is1)); - npyv_@sfx@ v2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 2 * elemPerVector) * is1)); - npyv_@sfx@ v3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 3 * elemPerVector) * is1)); - npyv_@sfx@ v4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 4 * elemPerVector) * is1)); - npyv_@sfx@ v5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 5 * elemPerVector) * is1)); - - npyv_@sfx@ u0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 0 * elemPerVector) * is2)); - npyv_@sfx@ u1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 1 * elemPerVector) * is2)); - npyv_@sfx@ u2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 2 * elemPerVector) * is2)); - npyv_@sfx@ u3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 3 * elemPerVector) * is2)); - npyv_@sfx@ u4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 4 * elemPerVector) * is2)); - npyv_@sfx@ u5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 5 * elemPerVector) * is2)); - - npyv_@sfx@ m0 = npyv_@vop@_@sfx@(v0, u0); - npyv_@sfx@ m1 = npyv_@vop@_@sfx@(v1, u1); - npyv_@sfx@ m2 = npyv_@vop@_@sfx@(v2, u2); - npyv_@sfx@ m3 = npyv_@vop@_@sfx@(v3, u3); - npyv_@sfx@ m4 = npyv_@vop@_@sfx@(v4, u4); - npyv_@sfx@ m5 = npyv_@vop@_@sfx@(v5, u5); - - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 0 * elemPerVector) * os1), m0); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 1 * elemPerVector) * os1), m1); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 2 * elemPerVector) * os1), m2); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 3 * elemPerVector) * os1), m3); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 4 * elemPerVector) * os1), m4); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 5 * elemPerVector) * os1), m5); - } + npy_intp i = 0; + + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + npyv_@sfx@ v0 = npyv_load_@sfx@(&ip1[i + 0 * elemPerVector]); + npyv_@sfx@ v1 = npyv_load_@sfx@(&ip1[i + 1 * elemPerVector]); + npyv_@sfx@ v2 = npyv_load_@sfx@(&ip1[i + 2 * elemPerVector]); + npyv_@sfx@ v3 = npyv_load_@sfx@(&ip1[i + 3 * elemPerVector]); + npyv_@sfx@ v4 = npyv_load_@sfx@(&ip1[i + 4 * elemPerVector]); + npyv_@sfx@ v5 = npyv_load_@sfx@(&ip1[i + 5 * elemPerVector]); + + npyv_@sfx@ u0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); + npyv_@sfx@ u1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); + npyv_@sfx@ u2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); + npyv_@sfx@ u3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); + npyv_@sfx@ u4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); + npyv_@sfx@ u5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); + + 
npyv_@sfx@ m0 = V_INTRIN(v0, u0); + npyv_@sfx@ m1 = V_INTRIN(v1, u1); + npyv_@sfx@ m2 = V_INTRIN(v2, u2); + npyv_@sfx@ m3 = V_INTRIN(v3, u3); + npyv_@sfx@ m4 = V_INTRIN(v4, u4); + npyv_@sfx@ m5 = V_INTRIN(v5, u5); + + npyv_store_@sfx@(&op1[i + 0 * elemPerVector], m0); + npyv_store_@sfx@(&op1[i + 1 * elemPerVector], m1); + npyv_store_@sfx@(&op1[i + 2 * elemPerVector], m2); + npyv_store_@sfx@(&op1[i + 3 * elemPerVector], m3); + npyv_store_@sfx@(&op1[i + 4 * elemPerVector], m4); + npyv_store_@sfx@(&op1[i + 5 * elemPerVector], m5); } - - return i; -} - -static inline npy_intp -scalar_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; - npy_intp n = dimensions[0]; - - npy_intp elemPerLoop = 4; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - /* Note, we can't just load all, do all ops, then store all here. - * Sometimes ufuncs are called with `accumulate`, which makes the - * assumption that previous iterations have finished before next - * iteration. For example, the output of iteration 2 depends on the - * result of iteration 1. - */ - - /**begin repeat2 - * #unroll = 0, 1, 2, 3# - */ - @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1)); - @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2)); - *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@); - /**end repeat2**/ - } - - return i; -} - -static inline void -run_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - BINARY_DEFS - - i = 0; - - if(IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH) && - nomemoverlap(ip1, is1 * n, op1, os1 * n) && - nomemoverlap(ip2, is2 * n, op1, os1 * n)) - { - // SIMD - do as many iterations as we can with vectors - i = simd_binary_@TYPE@_@kind@(args, dimensions, steps, i); - } else { - // Unrolled scalar - do as many iterations as we can with unrolled loops - i = scalar_binary_@TYPE@_@kind@(args, dimensions, steps, i); - } - // Scalar - finish up any remaining iterations - ip1 += is1 * i; - ip2 += is2 * i; - op1 += os1 * i; - for(; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1){ - const @type@ in1 = *(@type@ *)ip1; - const @type@ in2 = *(@type@ *)ip2; + for(; i<len; ++i){ + const npyv_lanetype_@sfx@ in1 = ip1[i]; + const npyv_lanetype_@sfx@ in2 = ip2[i]; - *((@type@ *)op1) = SCALAR_OP(in1, in2); + op1[i] = SCALAR_OP(in1, in2); } } -NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) -(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (IS_BINARY_REDUCE) { - run_reduce_@TYPE@_@kind@(args, dimensions, steps); - } else { - run_binary_@TYPE@_@kind@(args, dimensions, steps); - } -} +#undef V_INTRIN +#undef V_REDUCE_INTRIN -#undef SCALAR_OP +#endif // simd_chk && (!fp_only || (is_fp && fp_only)) +#undef SCALAR_OP /**end repeat1**/ - -#endif // @HAVE_NEON_IMPL@ - /**end repeat**/ - -//------------------------------------------------------------------------------ -//-- Float -//------------------------------------------------------------------------------ - +/******************************************************************************* + ** Defining ufunc inner functions + ******************************************************************************/ /**begin repeat - * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# - * #type = npy_float, npy_double, npy_longdouble# - * #sfx = f32, f64, f64# - * #asmType = s, d, d# - * #HAVE_NEON_IMPL = 
1*2, HAVE_NEON_IMPL_LONGDOUBLE# + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * + * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_byte, npy_short, npy_int, npy_long, npy_longlong, + * float, double, long double# + * + * #is_fp = 0*10, 1*3# + * #is_unsigend = 1*5, 0*5, 0*3# + * #scalar_sfx = i*10, f, d, l# */ - -// Implementation below assumes longdouble is 64-bit. -#if @HAVE_NEON_IMPL@ - +#undef TO_SIMD_SFX +#if 0 /**begin repeat1 -* Arithmetic -* # kind = maximum, minimum, fmax, fmin# -* # OP = >=, <=, >=, <=# -* # vop = max, min, maxp, minp# -* # asmInstr = fmax, fmin, fmaxnm, fminnm# -* # PROPAGATE_NAN = 1, 1, 0, 0# -*/ - -#define SCALAR_OP @TYPE@_scalar_@vop@ -static inline @type@ SCALAR_OP(@type@ a, @type@ b){ -#ifdef __aarch64__ - @type@ result = 0; - __asm( - "@asmInstr@ %@asmType@[result], %@asmType@[a], %@asmType@[b]" - : [result] "=w" (result) - : [a] "w" (a), [b] "w" (b) - ); - return result; -#else - -#if @PROPAGATE_NAN@ - return ((a @OP@ b || npy_isnan(a)) ? a : b); -#else - return ((a @OP@ b || npy_isnan(b)) ? a : b); + * #len = 8, 16, 32, 64# + */ +#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@ + #if @is_fp@ + #define TO_SIMD_SFX(X) X##_f@len@ + #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64 + #undef TO_SIMD_SFX + #endif + #elif @is_unsigend@ + #define TO_SIMD_SFX(X) X##_u@len@ + #else + #define TO_SIMD_SFX(X) X##_s@len@ + #endif +/**end repeat1**/ #endif -#endif // __aarch64__ -} -static inline npy_intp -simd_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - @type@ *ip2 = (@type@ *)args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - - const int vectorsPerLoop = 8; - const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); - npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; - - // SIMD if possible - if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ - npyv_@sfx@ m0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); - npyv_@sfx@ m1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); - npyv_@sfx@ m2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); - npyv_@sfx@ m3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); - npyv_@sfx@ m4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); - npyv_@sfx@ m5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); - npyv_@sfx@ m6 = npyv_load_@sfx@(&ip2[i + 6 * elemPerVector]); - npyv_@sfx@ m7 = npyv_load_@sfx@(&ip2[i + 7 * elemPerVector]); +/**begin repeat1 + * # kind = maximum, minimum, fmax, fmin# + * # intrin = max, min, maxp, minp# + * # fp_only = 0, 0, 1, 1# + */ +#if !@fp_only@ || (@is_fp@ && @fp_only@) +#define SCALAR_OP scalar_@intrin@_@scalar_sfx@ - i += elemPerLoop; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - npyv_@sfx@ v0 = npyv_load_@sfx@(&ip2[i + 0 * elemPerVector]); - npyv_@sfx@ v1 = npyv_load_@sfx@(&ip2[i + 1 * elemPerVector]); - npyv_@sfx@ v2 = npyv_load_@sfx@(&ip2[i + 2 * elemPerVector]); - npyv_@sfx@ v3 = npyv_load_@sfx@(&ip2[i + 3 * elemPerVector]); - npyv_@sfx@ v4 = npyv_load_@sfx@(&ip2[i + 4 * elemPerVector]); - npyv_@sfx@ v5 = npyv_load_@sfx@(&ip2[i + 5 * elemPerVector]); - npyv_@sfx@ v6 = npyv_load_@sfx@(&ip2[i + 6 * elemPerVector]); - npyv_@sfx@ v7 = npyv_load_@sfx@(&ip2[i + 7 * elemPerVector]); - - m0 = npyv_@vop@_@sfx@(m0, v0); - m1 = npyv_@vop@_@sfx@(m1, v1); - m2 = 
npyv_@vop@_@sfx@(m2, v2); - m3 = npyv_@vop@_@sfx@(m3, v3); - m4 = npyv_@vop@_@sfx@(m4, v4); - m5 = npyv_@vop@_@sfx@(m5, v5); - m6 = npyv_@vop@_@sfx@(m6, v6); - m7 = npyv_@vop@_@sfx@(m7, v7); +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; + npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], + len = dimensions[0]; + npy_intp i = 0; +#ifdef TO_SIMD_SFX + #undef STYPE + #define STYPE TO_SIMD_SFX(npyv_lanetype) + if (IS_BINARY_REDUCE) { + // reduce and contiguous + if (is2 == sizeof(@type@)) { + TO_SIMD_SFX(simd_reduce_c_@intrin@)( + (STYPE*)ip2, (STYPE*)op1, len + ); + goto clear_fp; } - - m0 = npyv_@vop@_@sfx@(m0, m1); - m2 = npyv_@vop@_@sfx@(m2, m3); - m4 = npyv_@vop@_@sfx@(m4, m5); - m6 = npyv_@vop@_@sfx@(m6, m7); - - m0 = npyv_@vop@_@sfx@(m0, m2); - m4 = npyv_@vop@_@sfx@(m4, m6); - - m0 = npyv_@vop@_@sfx@(m0, m4); - - @type@ r = npyv_reduce_@vop@_@sfx@(m0); - - io1 = SCALAR_OP(io1, r); } - - *((@type@ *)iop1) = io1; - - return i; -} - -static inline npy_intp -scalar_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - char *ip2 = args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - - // 8x scalar - npy_intp elemPerLoop = 8; - if((i+elemPerLoop) <= n){ - @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); - @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); - @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); - @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); - @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); - @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); - @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); - @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); - - i += elemPerLoop; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); - @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); - @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); - @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); - @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); - @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); - @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); - @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); - - m0 = SCALAR_OP(m0, v0); - m1 = SCALAR_OP(m1, v1); - m2 = SCALAR_OP(m2, v2); - m3 = SCALAR_OP(m3, v3); - m4 = SCALAR_OP(m4, v4); - m5 = SCALAR_OP(m5, v5); - m6 = SCALAR_OP(m6, v6); - m7 = SCALAR_OP(m7, v7); + else if (!is_mem_overlap(ip1, is1, op1, os1, len) && + !is_mem_overlap(ip2, is2, op1, os1, len) + ) { + // no overlap and operands are binary contiguous + if (IS_BINARY_CONT(@type@, @type@)) { + TO_SIMD_SFX(simd_binary_ccc_@intrin@)( + (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len + ); + goto clear_fp; } - - m0 = SCALAR_OP(m0, m1); - m2 = SCALAR_OP(m2, m3); - m4 = SCALAR_OP(m4, m5); - m6 = SCALAR_OP(m6, m7); - - m0 = SCALAR_OP(m0, m2); - m4 = SCALAR_OP(m4, m6); - - m0 = SCALAR_OP(m0, m4); - - io1 = SCALAR_OP(io1, m0); } - - *((@type@ *)iop1) = io1; - - return i; -} - -static inline void -run_reduce_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ - @type@ *iop1 = (@type@ *)args[0]; - @type@ io1 = iop1[0]; - char *ip2 = args[1]; - npy_intp is2 = steps[1]; - npy_intp n = dimensions[0]; - npy_intp i; - - const int vectorsPerLoop = 8; - const size_t elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); - npy_intp elemPerLoop = vectorsPerLoop * elemPerVector; - - i = 0; - - 
if((i+elemPerLoop) <= n && is2 == sizeof(@type@)){ - // SIMD - do as many iterations as we can with vectors - i = simd_reduce_@TYPE@_@kind@(args, dimensions, steps, i); +#endif // TO_SIMD_SFX +#ifndef NPY_DISABLE_OPTIMIZATION + // scalar unrolls + if (IS_BINARY_REDUCE) { + // Note, 8x unroll was chosen for best results on Apple M1 + npy_intp elemPerLoop = 8; + if((i+elemPerLoop) <= len){ + @type@ m0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ m1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ m2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ m3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ m4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ m5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ m6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ m7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + i += elemPerLoop; + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + @type@ v0 = *((@type@ *)(ip2 + (i + 0) * is2)); + @type@ v1 = *((@type@ *)(ip2 + (i + 1) * is2)); + @type@ v2 = *((@type@ *)(ip2 + (i + 2) * is2)); + @type@ v3 = *((@type@ *)(ip2 + (i + 3) * is2)); + @type@ v4 = *((@type@ *)(ip2 + (i + 4) * is2)); + @type@ v5 = *((@type@ *)(ip2 + (i + 5) * is2)); + @type@ v6 = *((@type@ *)(ip2 + (i + 6) * is2)); + @type@ v7 = *((@type@ *)(ip2 + (i + 7) * is2)); + + m0 = SCALAR_OP(m0, v0); + m1 = SCALAR_OP(m1, v1); + m2 = SCALAR_OP(m2, v2); + m3 = SCALAR_OP(m3, v3); + m4 = SCALAR_OP(m4, v4); + m5 = SCALAR_OP(m5, v5); + m6 = SCALAR_OP(m6, v6); + m7 = SCALAR_OP(m7, v7); + } + + m0 = SCALAR_OP(m0, m1); + m2 = SCALAR_OP(m2, m3); + m4 = SCALAR_OP(m4, m5); + m6 = SCALAR_OP(m6, m7); + + m0 = SCALAR_OP(m0, m2); + m4 = SCALAR_OP(m4, m6); + + m0 = SCALAR_OP(m0, m4); + + *((@type@ *)op1) = SCALAR_OP(*((@type@ *)op1), m0); + } } else{ - // Unrolled scalar - do as many iterations as we can with unrolled loops - i = scalar_reduce_@TYPE@_@kind@(args, dimensions, steps, i); - } - - // Scalar - finish up any remaining iterations - io1 = iop1[0]; - ip2 += i * is2; - for(; i<n; ++i, ip2 += is2){ - const @type@ in2 = *(@type@ *)ip2; - io1 = SCALAR_OP(io1, in2); - } - *((@type@ *)iop1) = io1; -} - -static inline npy_intp -simd_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; - npy_intp n = dimensions[0]; - - const int vectorsPerLoop = 6; - const int elemPerVector = NPY_SIMD_WIDTH / sizeof(@type@); - int elemPerLoop = vectorsPerLoop * elemPerVector; - - if(IS_BINARY_STRIDE_ONE(sizeof(@type@), NPY_SIMD_WIDTH)){ - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - npyv_@sfx@ v0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 0 * elemPerVector) * is1)); - npyv_@sfx@ v1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 1 * elemPerVector) * is1)); - npyv_@sfx@ v2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 2 * elemPerVector) * is1)); - npyv_@sfx@ v3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 3 * elemPerVector) * is1)); - npyv_@sfx@ v4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 4 * elemPerVector) * is1)); - npyv_@sfx@ v5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip1 + (i + 5 * elemPerVector) * is1)); - - npyv_@sfx@ u0 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 0 * elemPerVector) * is2)); - npyv_@sfx@ u1 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 1 * elemPerVector) * is2)); - npyv_@sfx@ u2 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 2 * elemPerVector) * is2)); - 
npyv_@sfx@ u3 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 3 * elemPerVector) * is2)); - npyv_@sfx@ u4 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 4 * elemPerVector) * is2)); - npyv_@sfx@ u5 = npyv_load_@sfx@((const npyv_lanetype_@sfx@ *)(ip2 + (i + 5 * elemPerVector) * is2)); - - npyv_@sfx@ m0 = npyv_@vop@_@sfx@(v0, u0); - npyv_@sfx@ m1 = npyv_@vop@_@sfx@(v1, u1); - npyv_@sfx@ m2 = npyv_@vop@_@sfx@(v2, u2); - npyv_@sfx@ m3 = npyv_@vop@_@sfx@(v3, u3); - npyv_@sfx@ m4 = npyv_@vop@_@sfx@(v4, u4); - npyv_@sfx@ m5 = npyv_@vop@_@sfx@(v5, u5); - - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 0 * elemPerVector) * os1), m0); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 1 * elemPerVector) * os1), m1); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 2 * elemPerVector) * os1), m2); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 3 * elemPerVector) * os1), m3); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 4 * elemPerVector) * os1), m4); - npyv_store_@sfx@((npyv_lanetype_@sfx@ *)(op1 + (i + 5 * elemPerVector) * os1), m5); + // Note, 4x unroll was chosen for best results on Apple M1 + npy_intp elemPerLoop = 4; + for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ + /* Note, we can't just load all, do all ops, then store all here. + * Sometimes ufuncs are called with `accumulate`, which makes the + * assumption that previous iterations have finished before next + * iteration. For example, the output of iteration 2 depends on the + * result of iteration 1. + */ + + /**begin repeat2 + * #unroll = 0, 1, 2, 3# + */ + @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1)); + @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2)); + *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@); + /**end repeat2**/ } } - - return i; -} - -static inline npy_intp -scalar_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, npy_intp i) -{ - char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; - npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2]; - npy_intp n = dimensions[0]; - - npy_intp elemPerLoop = 4; - for(; (i+elemPerLoop)<=n; i+=elemPerLoop){ - /* Note, we can't just load all, do all ops, then store all here. - * Sometimes ufuncs are called with `accumulate`, which makes the - * assumption that previous iterations have finished before next - * iteration. For example, the output of iteration 2 depends on the - * result of iteration 1. 
-         */
-
-        /**begin repeat2
-         * #unroll = 0, 1, 2, 3#
-         */
-        @type@ v@unroll@ = *((@type@ *)(ip1 + (i + @unroll@) * is1));
-        @type@ u@unroll@ = *((@type@ *)(ip2 + (i + @unroll@) * is2));
-        *((@type@ *)(op1 + (i + @unroll@) * os1)) = SCALAR_OP(v@unroll@, u@unroll@);
-        /**end repeat2**/
-    }
-
-    return i;
-}
-
-static inline void
-run_binary_@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-    BINARY_DEFS
-
-    i = 0;
-
-    if(IS_BLOCKABLE_BINARY(sizeof(@type@), NPY_SIMD_WIDTH) &&
-       nomemoverlap(ip1, is1 * n, op1, os1 * n) &&
-       nomemoverlap(ip2, is2 * n, op1, os1 * n))
-    {
-        // SIMD - do as many iterations as we can with vectors
-        i = simd_binary_@TYPE@_@kind@(args, dimensions, steps, i);
-    } else {
-        // Unrolled scalar - do as many iterations as we can with unrolled loops
-        i = scalar_binary_@TYPE@_@kind@(args, dimensions, steps, i);
-    }
-
-    // Scalar - finish up any remaining iterations
+#endif // NPY_DISABLE_OPTIMIZATION
     ip1 += is1 * i;
     ip2 += is2 * i;
     op1 += os1 * i;
-    for(; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1){
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
         const @type@ in1 = *(@type@ *)ip1;
         const @type@ in2 = *(@type@ *)ip2;
-
         *((@type@ *)op1) = SCALAR_OP(in1, in2);
     }
-}
-
-
-NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
-(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    if (IS_BINARY_REDUCE) {
-        run_reduce_@TYPE@_@kind@(args, dimensions, steps);
-    } else {
-        run_binary_@TYPE@_@kind@(args, dimensions, steps);
-    }
-
+#ifdef TO_SIMD_SFX
+clear_fp:;
+#endif
+#if @is_fp@
     npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
 }
 #undef SCALAR_OP
+#endif // !fp_only || (is_fp && fp_only)
 /**end repeat1**/
-
-#endif // @HAVE_NEON_IMPL@
-
 /**end repeat**/
-#endif // NPY_HAVE_NEON
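
For reference, both the removed NEON-only SCALAR_OP and the scalar_@intrin@_@scalar_sfx@ helpers used as SCALAR_OP in the new loops keep NumPy's usual NaN rules: maximum/minimum propagate a NaN from either operand, while fmax/fmin return NaN only when both operands are NaN. A minimal standalone sketch of the two scalar expressions from the diff (plain C, isnan from <math.h> standing in for npy_isnan, the aarch64 inline-asm fast path left out, and the scalar_*_d names reused only for illustration):

    #include <math.h>
    #include <stdio.h>

    /* maximum: NaN propagates -- when a is NaN the comparison is false and
     * isnan(a) forces a (the NaN) to be returned; a NaN in b falls through
     * to the "return b" branch. */
    static double scalar_max_d(double a, double b) {
        return (a >= b || isnan(a)) ? a : b;
    }

    /* fmax: NaN is suppressed -- a lone NaN in b is ignored because isnan(b)
     * selects a; only when both operands are NaN does a NaN come back. */
    static double scalar_maxp_d(double a, double b) {
        return (a >= b || isnan(b)) ? a : b;
    }

    int main(void) {
        double qnan = NAN;
        printf("maximum(1, NaN) = %g\n", scalar_max_d(1.0, qnan));   /* nan */
        printf("fmax(1, NaN)    = %g\n", scalar_maxp_d(1.0, qnan));  /* 1   */
        printf("fmax(NaN, NaN)  = %g\n", scalar_maxp_d(qnan, qnan)); /* nan */
        return 0;
    }

The only difference between the two forms is which operand gets the isnan() escape hatch, which is exactly the PROPAGATE_NAN split in the removed implementation.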
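
The TO_SIMD_SFX macro introduced in the new repeat block is pure token pasting: for each element type it appends the matching universal-intrinsics lane suffix (_u8, _s32, _f32, ...), and it is left undefined when the lane has no SIMD support (for example 64-bit floats without NPY_SIMD_F64), so the whole "#ifdef TO_SIMD_SFX" branch compiles away. A reduced sketch of the same dispatch pattern, hardcoded to the f32 case, with long in place of npy_intp and a plain-C stub where the real npyv kernel would be:

    #include <stdio.h>

    /* Hardcoded stand-in for the per-type definition in the diff: for a
     * 32-bit float lane the template resolves TO_SIMD_SFX(X) to X##_f32. */
    #define TO_SIMD_SFX(X) X##_f32
    typedef float npyv_lanetype_f32;

    /* Placeholder body only; the real kernel runs npyv_max_f32 on full
     * vectors and has different NaN behaviour. */
    static void simd_binary_ccc_max_f32(const npyv_lanetype_f32 *ip1,
                                        const npyv_lanetype_f32 *ip2,
                                        npyv_lanetype_f32 *op1, long len)
    {
        for (long i = 0; i < len; ++i) {
            op1[i] = ip1[i] > ip2[i] ? ip1[i] : ip2[i];
        }
    }

    int main(void) {
        float a[4] = {1, 5, 3, 7}, b[4] = {4, 2, 6, 0}, r[4];
        /* Expands to simd_binary_ccc_max_f32(a, b, r, 4), the same call
         * shape the dispatch function uses for the contiguous binary case. */
        TO_SIMD_SFX(simd_binary_ccc_max)(a, b, r, 4);
        for (int i = 0; i < 4; ++i) {
            printf("%g ", r[i]);  /* 4 5 6 7 */
        }
        printf("\n");
        return 0;
    }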
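
The 4x-unrolled strided fallback in the new dispatch function loads, computes and stores one element before touching the next, exactly as its in-code note says, because calls such as np.maximum.accumulate hand the inner loop an output that overlaps the running-result input shifted by one element. A small aliasing sketch (ordinary C arrays, not the real ufunc machinery) of why each store must land before the next iteration's load:

    #include <stdio.h>

    int main(void) {
        /* Rough stand-in for maximum.accumulate: the "first input" of step i
         * is the output of step i-1, so out aliases an input shifted by one. */
        double data[6] = {3, 1, 4, 1, 5, 2};
        double out[6];
        out[0] = data[0];

        for (int i = 1; i < 6; ++i) {
            double in1 = out[i - 1];          /* must already hold step i-1 */
            double in2 = data[i];
            out[i] = in1 > in2 ? in1 : in2;   /* store before the next load */
        }
        for (int i = 0; i < 6; ++i) {
            printf("%g ", out[i]);            /* 3 3 4 4 5 5 */
        }
        printf("\n");
        /* Loading several in1 values up front (a wider load/compute/store
         * batch) would read out[i-1] before it has been written and keep
         * stale values, which is the failure mode the per-element loop
         * avoids. */
        return 0;
    }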
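
The 8x scalar reduce unroll kept in the new dispatch function, like the NEON reduce loop it replaces, uses the same shape throughout: eight independent accumulators m0..m7 updated every pass, folded pairwise only once at the end, so consecutive iterations never form a single serial dependency chain. A scalar illustration of that accumulate-then-fold order for a hypothetical reduce_max over a plain array (not the npyv kernel itself):

    #include <stdio.h>

    /* Pairwise-folding max reduction with 8 independent accumulators,
     * mirroring the m0..m7 pattern in the diff with scalars instead of
     * npyv vectors.  Assumes n >= 1. */
    static double reduce_max(const double *x, long n)
    {
        double m[8];
        long i = 0;
        if (n >= 8) {
            for (int k = 0; k < 8; ++k) {
                m[k] = x[k];
            }
            /* Eight independent updates per pass: no accumulator depends on
             * another until the final fold. */
            for (i = 8; i + 8 <= n; i += 8) {
                for (int k = 0; k < 8; ++k) {
                    if (x[i + k] > m[k]) m[k] = x[i + k];
                }
            }
            /* fold: (m0,m1)(m2,m3)(m4,m5)(m6,m7) -> (m0,m2)(m4,m6) -> (m0,m4) */
            for (int step = 1; step < 8; step *= 2) {
                for (int k = 0; k + step < 8; k += 2 * step) {
                    if (m[k + step] > m[k]) m[k] = m[k + step];
                }
            }
        } else {
            m[0] = x[0];
            i = 1;
        }
        for (; i < n; ++i) {   /* scalar tail, as in the diff's final loop */
            if (x[i] > m[0]) m[0] = x[i];
        }
        return m[0];
    }

    int main(void) {
        double x[11] = {1, 9, 2, 8, 3, 7, 4, 6, 5, 0, 10};
        printf("%g\n", reduce_max(x, 11));  /* 10 */
        return 0;
    }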