 .gitignore                                      |   1
 benchmarks/benchmarks/bench_ma.py               | 146
 numpy/core/code_generators/generate_umath.py    |   2
 numpy/core/meson.build                          |   1
 numpy/core/setup.py                             |   1
 numpy/core/src/umath/loops.c.src                |  19
 numpy/core/src/umath/loops.h.src                |  37
 numpy/core/src/umath/loops_unary.dispatch.c.src | 364
 numpy/core/src/umath/simd.inc.src               |  67
 numpy/core/tests/test_umath.py                  |  29
 numpy/ma/bench.py                               | 130
 numpy/tests/test_public_api.py                  |   1
 12 files changed, 577 insertions(+), 221 deletions(-)
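The new numpy/core/src/umath/loops_unary.dispatch.c.src below negates integers with the two's-complement identity -x == (x ^ -1) - (-1) (i.e. ~x + 1) and negates floats by XORing the IEEE sign bit with the bit pattern of -0.0. A minimal NumPy sketch of both identities (an illustration for readers, not part of the patch):

import numpy as np

x = np.array([-7, 0, 42], dtype=np.int32)
# integers: -x == (x ^ -1) - (-1) == ~x + 1; NumPy's integer
# arithmetic wraps modulo 2**32, matching the vector instructions
assert np.array_equal((x ^ np.int32(-1)) - np.int32(-1), -x)

f = np.array([-2.5, 0.0, 1.5], dtype=np.float32)
# floats: -f is f with the sign bit flipped, i.e. XOR with
# 0x80000000, the bit pattern of np.float32(-0.0)
neg = (f.view(np.uint32) ^ np.uint32(0x80000000)).view(np.float32)
assert np.array_equal(neg, -f)
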
diff --git a/.gitignore b/.gitignore index 6f63498e0..9851fcc77 100644 --- a/.gitignore +++ b/.gitignore @@ -216,6 +216,7 @@ numpy/core/src/_simd/_simd.dispatch.c numpy/core/src/_simd/_simd_data.inc numpy/core/src/_simd/_simd_inc.h # umath module +numpy/core/src/umath/loops_unary.dispatch.c numpy/core/src/umath/loops_unary_fp.dispatch.c numpy/core/src/umath/loops_arithm_fp.dispatch.c numpy/core/src/umath/loops_arithmetic.dispatch.c diff --git a/benchmarks/benchmarks/bench_ma.py b/benchmarks/benchmarks/bench_ma.py index 0247065a7..49ccf92fe 100644 --- a/benchmarks/benchmarks/bench_ma.py +++ b/benchmarks/benchmarks/bench_ma.py @@ -119,3 +119,149 @@ class Concatenate(Benchmark): def time_it(self, mode, n): np.ma.concatenate(self.args) + + +class MAFunctions1v(Benchmark): + param_names = ['mtype', 'func', 'msize'] + params = [['np', 'np.ma'], + ['sin', 'log', 'sqrt'], + ['small', 'big']] + + def setup(self, mtype, func, msize): + xs = np.random.uniform(-1, 1, 6).reshape(2, 3) + m1 = [[True, False, False], [False, False, True]] + xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + maskx = xl > 0.8 + self.nmxs = np.ma.array(xs, mask=m1) + self.nmxl = np.ma.array(xl, mask=maskx) + + def time_functions_1v(self, mtype, func, msize): + # fun = {'np.ma.sin': np.ma.sin, 'np.sin': np.sin}[func] + fun = eval(f"{mtype}.{func}") + if msize == 'small': + fun(self.nmxs) + elif msize == 'big': + fun(self.nmxl) + + +class MAMethod0v(Benchmark): + param_names = ['method', 'msize'] + params = [['ravel', 'transpose', 'compressed', 'conjugate'], + ['small', 'big']] + + def setup(self, method, msize): + xs = np.random.uniform(-1, 1, 6).reshape(2, 3) + m1 = [[True, False, False], [False, False, True]] + xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + maskx = xl > 0.8 + self.nmxs = np.ma.array(xs, mask=m1) + self.nmxl = np.ma.array(xl, mask=maskx) + + def time_methods_0v(self, method, msize): + if msize == 'small': + mdat = self.nmxs + elif msize == 'big': + mdat = self.nmxl + getattr(mdat, method)() + + +class MAFunctions2v(Benchmark): + param_names = ['mtype', 'func', 'msize'] + params = [['np', 'np.ma'], + ['multiply', 'divide', 'power'], + ['small', 'big']] + + def setup(self, mtype, func, msize): + # Small arrays + xs = np.random.uniform(-1, 1, 6).reshape(2, 3) + ys = np.random.uniform(-1, 1, 6).reshape(2, 3) + m1 = [[True, False, False], [False, False, True]] + m2 = [[True, False, True], [False, False, True]] + self.nmxs = np.ma.array(xs, mask=m1) + self.nmys = np.ma.array(ys, mask=m2) + # Big arrays + xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + yl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + maskx = xl > 0.8 + masky = yl < -0.8 + self.nmxl = np.ma.array(xl, mask=maskx) + self.nmyl = np.ma.array(yl, mask=masky) + + def time_functions_2v(self, mtype, func, msize): + fun = eval(f"{mtype}.{func}") + if msize == 'small': + fun(self.nmxs, self.nmys) + elif msize == 'big': + fun(self.nmxl, self.nmyl) + + +class MAMethodGetItem(Benchmark): + param_names = ['margs', 'msize'] + params = [[0, (0, 0), [0, -1]], + ['small', 'big']] + + def setup(self, margs, msize): + xs = np.random.uniform(-1, 1, 6).reshape(2, 3) + m1 = [[True, False, False], [False, False, True]] + xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + maskx = xl > 0.8 + self.nmxs = np.ma.array(xs, mask=m1) + self.nmxl = np.ma.array(xl, mask=maskx) + + def time_methods_getitem(self, margs, msize): + if msize == 'small': + mdat = self.nmxs + elif msize == 'big': + mdat = self.nmxl + getattr(mdat, 
'__getitem__')(margs) + + +class MAMethodSetItem(Benchmark): + param_names = ['margs', 'mset', 'msize'] + params = [[0, (0, 0), (-1, 0)], + [17, np.ma.masked], + ['small', 'big']] + + def setup(self, margs, mset, msize): + xs = np.random.uniform(-1, 1, 6).reshape(2, 3) + m1 = [[True, False, False], [False, False, True]] + xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + maskx = xl > 0.8 + self.nmxs = np.ma.array(xs, mask=m1) + self.nmxl = np.ma.array(xl, mask=maskx) + + def time_methods_setitem(self, margs, mset, msize): + if msize == 'small': + mdat = self.nmxs + elif msize == 'big': + mdat = self.nmxl + getattr(mdat, '__setitem__')(margs, mset) + + +class Where(Benchmark): + param_names = ['mtype', 'msize'] + params = [['np', 'np.ma'], + ['small', 'big']] + + def setup(self, mtype, msize): + # Small arrays + xs = np.random.uniform(-1, 1, 6).reshape(2, 3) + ys = np.random.uniform(-1, 1, 6).reshape(2, 3) + m1 = [[True, False, False], [False, False, True]] + m2 = [[True, False, True], [False, False, True]] + self.nmxs = np.ma.array(xs, mask=m1) + self.nmys = np.ma.array(ys, mask=m2) + # Big arrays + xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + yl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + maskx = xl > 0.8 + masky = yl < -0.8 + self.nmxl = np.ma.array(xl, mask=maskx) + self.nmyl = np.ma.array(yl, mask=masky) + + def time_where(self, mtype, msize): + fun = eval(f"{mtype}.where") + if msize == 'small': + fun(self.nmxs > 2, self.nmxs, self.nmys) + elif msize == 'big': + fun(self.nmxl > 2, self.nmxl, self.nmyl) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 40382b8ae..768c8deee 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -426,7 +426,7 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.negative'), 'PyUFunc_NegativeTypeResolver', - TD(ints+flts+timedeltaonly, simd=[('avx2', ints)]), + TD(ints+flts+timedeltaonly, dispatch=[('loops_unary', ints+'fdg')]), TD(cmplx, f='neg'), TD(O, f='PyNumber_Negative'), ), diff --git a/numpy/core/meson.build b/numpy/core/meson.build index 2b23d3f14..50cd8ccc5 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -747,6 +747,7 @@ src_umath = [ src_file.process('src/umath/loops_modulo.dispatch.c.src'), src_file.process('src/umath/loops_trigonometric.dispatch.c.src'), src_file.process('src/umath/loops_umath_fp.dispatch.c.src'), + src_file.process('src/umath/loops_unary.dispatch.c.src'), src_file.process('src/umath/loops_unary_fp.dispatch.c.src'), src_file.process('src/umath/matmul.c.src'), src_file.process('src/umath/matmul.h.src'), diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 4b61bd855..da5bc64c0 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -1005,6 +1005,7 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'loops.h.src'), join('src', 'umath', 'loops_utils.h.src'), join('src', 'umath', 'loops.c.src'), + join('src', 'umath', 'loops_unary.dispatch.c.src'), join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index fe5aa9374..0b4856847 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -601,14 +601,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void #if @CHK@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void 
-@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = -in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void @TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { UNARY_LOOP_FAST(@type@, npy_bool, *out = !in); @@ -1546,17 +1538,6 @@ NPY_NO_EXPORT void } NPY_NO_EXPORT void -@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (!run_unary_simd_negative_@TYPE@(args, dimensions, steps)) { - UNARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - *((@type@ *)op1) = -in1; - } - } -} - -NPY_NO_EXPORT void @TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 424e204c1..e3a410968 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -140,9 +140,6 @@ NPY_NO_EXPORT void @S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void @S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void @@ -206,6 +203,23 @@ NPY_NO_EXPORT void /**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary.dispatch.h" +#endif +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG# + */ +/**begin repeat1 + * #kind = negative# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + + /* ***************************************************************************** ** FLOAT LOOPS ** @@ -226,6 +240,20 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**end repeat**/ #ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + */ +/**begin repeat1 + * #kind = negative# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION #include "loops_arithm_fp.dispatch.h" #endif /**begin repeat @@ -362,6 +390,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# * #c = f, f, , l# * #C = F, F, , L# + * #half = 1, 0, 0, 0# */ /**begin repeat1 @@ -440,8 +469,10 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +#if @half@ NPY_NO_EXPORT void @TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +#endif NPY_NO_EXPORT void @TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); diff --git a/numpy/core/src/umath/loops_unary.dispatch.c.src b/numpy/core/src/umath/loops_unary.dispatch.c.src new file mode 100644 index 000000000..1e2a81d20 --- /dev/null +++ b/numpy/core/src/umath/loops_unary.dispatch.c.src @@ -0,0 +1,364 @@ +/*@targets + ** $maxopt 
baseline + ** neon asimd + ** sse2 avx2 avx512_skx + ** vsx2 + ** vx vxe + **/ + +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "numpy/npy_math.h" +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/******************************************************************************* + ** Scalar ops + ******************************************************************************/ +#define scalar_negative(X) (-X) + +/******************************************************************************* + ** extra SIMD intrinsics + ******************************************************************************/ + +#if NPY_SIMD + +/**begin repeat + * #sfx = s8, u8, s16, u16, s32, u32, s64, u64# + * #ssfx = 8, 8, 16, 16, 32, 32, 64, 64# + */ +static NPY_INLINE npyv_@sfx@ +npyv_negative_@sfx@(npyv_@sfx@ v) +{ +#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || @ssfx@ < 64) + return npyv_reinterpret_@sfx@_s@ssfx@(vnegq_s@ssfx@(npyv_reinterpret_s@ssfx@_@sfx@(v))); +#else + // (x ^ -1) + 1 + const npyv_@sfx@ m1 = npyv_setall_@sfx@((npyv_lanetype_@sfx@)-1); + return npyv_sub_@sfx@(npyv_xor_@sfx@(v, m1), m1); +#endif +} +/**end repeat**/ + +/**begin repeat + * #sfx = f32, f64# + * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# + * #fd = f, # + */ +#if @VCHK@ +static NPY_INLINE npyv_@sfx@ +npyv_negative_@sfx@(npyv_@sfx@ v) +{ +#if defined(NPY_HAVE_NEON) + return vnegq_@sfx@(v); +#else + // (v ^ signmask) + const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@); + return npyv_xor_@sfx@(v, signmask); +#endif +} +#endif // @VCHK@ +/**end repeat**/ + +#endif // NPY_SIMD + +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ +/**begin repeat + * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64# + * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64# + * #is_fp = 0*8, 1*2# + * #supports_ncontig = 0*4,1*6# + */ +/**begin repeat1 + * #kind = negative# + * #intrin = negative# + * #unroll = 4# + */ +#if @simd_chk@ +#if @unroll@ < 1 +#error "Unroll must be at least 1" +#elif NPY_SIMD != 128 && @unroll@ > 2 +// Avoid memory bandwidth bottleneck for larger SIMD +#define UNROLL 2 +#else +#define UNROLL @unroll@ +#endif +// contiguous inputs and output. 
+static NPY_INLINE void +simd_unary_cc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, + npyv_lanetype_@sfx@ *op, + npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * UNROLL; + + // unrolled vector loop + for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @U@ + npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep); + npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@); + npyv_store_@sfx@(op + @U@ * vstep, r_@U@); + #endif + /**end repeat2**/ + } + // single vector loop + for (; len >= vstep; len -= vstep, ip += vstep, op +=vstep) { + npyv_@sfx@ v = npyv_load_@sfx@(ip); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(v); + npyv_store_@sfx@(op, r); + } + // scalar finish up any remaining iterations + for (; len > 0; --len, ++ip, ++op) { + *op = scalar_@intrin@(*ip); + } +} + +#if @supports_ncontig@ +// contiguous input, non-contiguous output +static NPY_INLINE void +simd_unary_cn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, + npyv_lanetype_@sfx@ *op, npy_intp ostride, + npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * UNROLL; + + // unrolled vector loop + for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @U@ + npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep); + npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@); + npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@); + #endif + /**end repeat2**/ + } + // single vector loop + for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) { + npyv_@sfx@ v = npyv_load_@sfx@(ip); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(v); + npyv_storen_@sfx@(op, ostride, r); + } + // scalar finish up any remaining iterations + for (; len > 0; --len, ++ip, op += ostride) { + *op = scalar_@intrin@(*ip); + } +} +// non-contiguous input, contiguous output +static NPY_INLINE void +simd_unary_nc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride, + npyv_lanetype_@sfx@ *op, + npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * UNROLL; + + // unrolled vector loop + for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @U@ + npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride); + npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@); + npyv_store_@sfx@(op + @U@ * vstep, r_@U@); + #endif + /**end repeat2**/ + } + // single vector loop + for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) { + npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(v); + npyv_store_@sfx@(op, r); + } + // scalar finish up any remaining iterations + for (; len > 0; --len, ip += istride, ++op) { + *op = scalar_@intrin@(*ip); + } +} +// non-contiguous input and output +// limit unroll to 2x +#if UNROLL > 2 +#undef UNROLL +#define UNROLL 2 +#endif +static NPY_INLINE void +simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride, + npyv_lanetype_@sfx@ *op, npy_intp ostride, + npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * UNROLL; + + // unrolled vector loop + for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if 
UNROLL > @U@ + npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride); + npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@); + npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@); + #endif + /**end repeat2**/ + } + // single vector loop + for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) { + npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(v); + npyv_storen_@sfx@(op, ostride, r); + } + // scalar finish up any remaining iterations + for (; len > 0; --len, ip += istride, op += ostride) { + *op = scalar_@intrin@(*ip); + } +} +#endif // @supports_ncontig@ +#undef UNROLL +#endif // @simd_chk@ +/*end repeat1**/ +/**end repeat**/ + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * + * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_byte, npy_short, npy_int, npy_long, npy_longlong, + * npy_float, npy_double, npy_longdouble# + * + * #is_fp = 0*10, 1*3# + * #is_unsigned = 1*5, 0*5, 0*3# + * #supports_ncontig = 0*2, 1*3, 0*2, 1*3, 1*3# + */ +#undef TO_SIMD_SFX +#if 0 +/**begin repeat1 + * #len = 8, 16, 32, 64# + */ +#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@ + #if @is_fp@ + #define TO_SIMD_SFX(X) X##_f@len@ + #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32 + #undef TO_SIMD_SFX + #endif + #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64 + #undef TO_SIMD_SFX + #endif + #elif @is_unsigned@ + #define TO_SIMD_SFX(X) X##_u@len@ + #else + #define TO_SIMD_SFX(X) X##_s@len@ + #endif +/**end repeat1**/ +#endif + +/**begin repeat1 + * #kind = negative# + * #intrin = negative# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + char *ip = args[0], *op = args[1]; + npy_intp istep = steps[0], ostep = steps[1], + len = dimensions[0]; +#ifdef TO_SIMD_SFX + #undef STYPE + #define STYPE TO_SIMD_SFX(npyv_lanetype) + if (!is_mem_overlap(ip, istep, op, ostep, len)) { + if (IS_UNARY_CONT(@type@, @type@)) { + // no overlap and operands are contiguous + TO_SIMD_SFX(simd_unary_cc_@intrin@)( + (STYPE*)ip, (STYPE*)op, len + ); + goto clear; + } + #if @supports_ncontig@ + const npy_intp istride = istep / sizeof(STYPE); + const npy_intp ostride = ostep / sizeof(STYPE); + if (TO_SIMD_SFX(npyv_loadable_stride)(istride) && + TO_SIMD_SFX(npyv_storable_stride)(ostride)) + { + if (istride == 1 && ostride != 1) { + // contiguous input, non-contiguous output + TO_SIMD_SFX(simd_unary_cn_@intrin@)( + (STYPE*)ip, (STYPE*)op, ostride, len + ); + goto clear; + } + else if (istride != 1 && ostride == 1) { + // non-contiguous input, contiguous output + TO_SIMD_SFX(simd_unary_nc_@intrin@)( + (STYPE*)ip, istride, (STYPE*)op, len + ); + goto clear; + } + // SSE2 does better with unrolled scalar for heavy non-contiguous + #if !defined(NPY_HAVE_SSE2) + else if (istride != 1 && ostride != 1) { + // non-contiguous input and output + TO_SIMD_SFX(simd_unary_nn_@intrin@)( + (STYPE*)ip, istride, (STYPE*)op, ostride, len + ); + goto clear; + } + #endif + } + #endif // @supports_ncontig@ + } +#endif // TO_SIMD_SFX +#ifndef 
NPY_DISABLE_OPTIMIZATION + /* + * scalar unrolls + * 8x unroll performed best on + * - Apple M1 Native / arm64 + * - Apple M1 Rosetta / SSE42 + * - iMacPro / AVX512 + */ + #define UNROLL 8 + for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @U@ + const @type@ in_@U@ = *((const @type@ *)(ip + @U@ * istep)); + *((@type@ *)(op + @U@ * ostep)) = scalar_@intrin@(in_@U@); + #endif + /**end repeat2**/ + } +#endif // NPY_DISABLE_OPTIMIZATION + for (; len > 0; --len, ip += istep, op += ostep) { + *((@type@ *)op) = scalar_@intrin@(*(const @type@ *)ip); + } +#ifdef TO_SIMD_SFX +clear: + npyv_cleanup(); +#endif +#if @is_fp@ + npy_clear_floatstatus_barrier((char*)dimensions); +#endif +} +/**end repeat**/ + +#undef NEGATIVE_CONTIG_ONLY diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 5351ec1fa..6fc1501c9 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -129,39 +129,9 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c * #vector = 1, 1, 0# * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 # */ - -/**begin repeat1 - * #func = negative# - * #check = IS_BLOCKABLE_UNARY# - * #name = unary# - */ - -#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - -/* prototypes */ -static void -sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n); - -#endif - -static inline int -run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - if (@check@(sizeof(@type@), VECTOR_SIZE_BYTES)) { - sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]); - return 1; - } -#endif - return 0; -} - -/**end repeat1**/ - /**begin repeat1 * #kind = isnan, isfinite, isinf, signbit# */ - #if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS static void @@ -181,9 +151,7 @@ run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const * #endif return 0; } - /**end repeat1**/ - /**end repeat**/ /* @@ -426,41 +394,6 @@ sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n) } /**end repeat1**/ - -static void -sse2_negative_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n) -{ - /* - * get 0x7FFFFFFF mask (everything but signbit set) - * float & ~mask will remove the sign, float ^ mask flips the sign - * this is equivalent to how the compiler implements fabs on amd64 - */ - const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@); - - /* align output to VECTOR_SIZE_BYTES bytes */ - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) { - op[i] = -ip[i]; - } - assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) || - npy_is_aligned(&op[i], VECTOR_SIZE_BYTES)); - if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip[i]); - @vpre@_store_@vsuf@(&op[i], @vpre@_xor_@vsuf@(mask, a)); - } - } - else { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]); - @vpre@_store_@vsuf@(&op[i], @vpre@_xor_@vsuf@(mask, a)); - } - } - LOOP_BLOCKED_END { - op[i] = -ip[i]; - } -} -/**end repeat1**/ - /**end repeat**/ /* bunch of helper functions used in ISA_exp/log_FLOAT*/ diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index f21a88acd..88ab7e014 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -2635,6 +2635,35 @@ class TestAbsoluteNegative: np.abs(d, 
out=d) np.abs(np.ones_like(d), out=d) + @pytest.mark.parametrize("dtype", ['d', 'f', 'int32', 'int64']) + @pytest.mark.parametrize("big", [True, False]) + def test_noncontiguous(self, dtype, big): + data = np.array([-1.0, 1.0, -0.0, 0.0, 2.2251e-308, -2.5, 2.5, -6, + 6, -2.2251e-308, -8, 10], dtype=dtype) + expect = np.array([1.0, -1.0, 0.0, -0.0, -2.2251e-308, 2.5, -2.5, 6, + -6, 2.2251e-308, 8, -10], dtype=dtype) + if big: + data = np.repeat(data, 10) + expect = np.repeat(expect, 10) + out = np.ndarray(data.shape, dtype=dtype) + ncontig_in = data[1::2] + ncontig_out = out[1::2] + contig_in = np.array(ncontig_in) + # contig in, contig out + assert_array_equal(np.negative(contig_in), expect[1::2]) + # contig in, ncontig out + assert_array_equal(np.negative(contig_in, out=ncontig_out), + expect[1::2]) + # ncontig in, contig out + assert_array_equal(np.negative(ncontig_in), expect[1::2]) + # ncontig in, ncontig out + assert_array_equal(np.negative(ncontig_in, out=ncontig_out), + expect[1::2]) + # contig in, contig out, nd stride + data_split = np.array(np.array_split(data, 2)) + expect_split = np.array(np.array_split(expect, 2)) + assert_equal(np.negative(data_split), expect_split) + class TestPositive: def test_valid(self): diff --git a/numpy/ma/bench.py b/numpy/ma/bench.py deleted file mode 100644 index 56865683d..000000000 --- a/numpy/ma/bench.py +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python3 - -import timeit -import numpy - - -############################################################################### -# Global variables # -############################################################################### - - -# Small arrays -xs = numpy.random.uniform(-1, 1, 6).reshape(2, 3) -ys = numpy.random.uniform(-1, 1, 6).reshape(2, 3) -zs = xs + 1j * ys -m1 = [[True, False, False], [False, False, True]] -m2 = [[True, False, True], [False, False, True]] -nmxs = numpy.ma.array(xs, mask=m1) -nmys = numpy.ma.array(ys, mask=m2) -nmzs = numpy.ma.array(zs, mask=m1) - -# Big arrays -xl = numpy.random.uniform(-1, 1, 100*100).reshape(100, 100) -yl = numpy.random.uniform(-1, 1, 100*100).reshape(100, 100) -zl = xl + 1j * yl -maskx = xl > 0.8 -masky = yl < -0.8 -nmxl = numpy.ma.array(xl, mask=maskx) -nmyl = numpy.ma.array(yl, mask=masky) -nmzl = numpy.ma.array(zl, mask=maskx) - - -############################################################################### -# Functions # -############################################################################### - - -def timer(s, v='', nloop=500, nrep=3): - units = ["s", "ms", "µs", "ns"] - scaling = [1, 1e3, 1e6, 1e9] - print("%s : %-50s : " % (v, s), end=' ') - varnames = ["%ss,nm%ss,%sl,nm%sl" % tuple(x*4) for x in 'xyz'] - setup = 'from __main__ import numpy, ma, %s' % ','.join(varnames) - Timer = timeit.Timer(stmt=s, setup=setup) - best = min(Timer.repeat(nrep, nloop)) / nloop - if best > 0.0: - order = min(-int(numpy.floor(numpy.log10(best)) // 3), 3) - else: - order = 3 - print("%d loops, best of %d: %.*g %s per loop" % (nloop, nrep, - 3, - best * scaling[order], - units[order])) - - -def compare_functions_1v(func, nloop=500, - xs=xs, nmxs=nmxs, xl=xl, nmxl=nmxl): - funcname = func.__name__ - print("-"*50) - print(f'{funcname} on small arrays') - module, data = "numpy.ma", "nmxs" - timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop) - - print("%s on large arrays" % funcname) - module, data = "numpy.ma", "nmxl" - timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop) - return - -def 
compare_methods(methodname, args, vars='x', nloop=500, test=True, - xs=xs, nmxs=nmxs, xl=xl, nmxl=nmxl): - print("-"*50) - print(f'{methodname} on small arrays') - data, ver = f'nm{vars}l', 'numpy.ma' - timer("%(data)s.%(methodname)s(%(args)s)" % locals(), v=ver, nloop=nloop) - - print("%s on large arrays" % methodname) - data, ver = "nm%sl" % vars, 'numpy.ma' - timer("%(data)s.%(methodname)s(%(args)s)" % locals(), v=ver, nloop=nloop) - return - -def compare_functions_2v(func, nloop=500, test=True, - xs=xs, nmxs=nmxs, - ys=ys, nmys=nmys, - xl=xl, nmxl=nmxl, - yl=yl, nmyl=nmyl): - funcname = func.__name__ - print("-"*50) - print(f'{funcname} on small arrays') - module, data = "numpy.ma", "nmxs,nmys" - timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop) - - print(f'{funcname} on large arrays') - module, data = "numpy.ma", "nmxl,nmyl" - timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop) - return - - -if __name__ == '__main__': - compare_functions_1v(numpy.sin) - compare_functions_1v(numpy.log) - compare_functions_1v(numpy.sqrt) - - compare_functions_2v(numpy.multiply) - compare_functions_2v(numpy.divide) - compare_functions_2v(numpy.power) - - compare_methods('ravel', '', nloop=1000) - compare_methods('conjugate', '', 'z', nloop=1000) - compare_methods('transpose', '', nloop=1000) - compare_methods('compressed', '', nloop=1000) - compare_methods('__getitem__', '0', nloop=1000) - compare_methods('__getitem__', '(0,0)', nloop=1000) - compare_methods('__getitem__', '[0,-1]', nloop=1000) - compare_methods('__setitem__', '0, 17', nloop=1000, test=False) - compare_methods('__setitem__', '(0,0), 17', nloop=1000, test=False) - - print("-"*50) - print("__setitem__ on small arrays") - timer('nmxs.__setitem__((-1,0),numpy.ma.masked)', 'numpy.ma ', nloop=10000) - - print("-"*50) - print("__setitem__ on large arrays") - timer('nmxl.__setitem__((-1,0),numpy.ma.masked)', 'numpy.ma ', nloop=10000) - - print("-"*50) - print("where on small arrays") - timer('numpy.ma.where(nmxs>2,nmxs,nmys)', 'numpy.ma ', nloop=1000) - print("-"*50) - print("where on large arrays") - timer('numpy.ma.where(nmxl>2,nmxl,nmyl)', 'numpy.ma ', nloop=100) diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py index 8f654f3d9..28ed2f1df 100644 --- a/numpy/tests/test_public_api.py +++ b/numpy/tests/test_public_api.py @@ -280,7 +280,6 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [ "lib.utils", "linalg.lapack_lite", "linalg.linalg", - "ma.bench", "ma.core", "ma.testutils", "ma.timer_comparison", |
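
The hand-rolled timer script numpy/ma/bench.py deleted above is superseded by the asv benchmark classes added to benchmarks/benchmarks/bench_ma.py, which the airspeed-velocity harness discovers and times automatically (e.g. via asv run). One class can also be smoke-tested by hand; the snippet below is a hypothetical driver, assuming the benchmarks package is importable from the repository root — asv normally instantiates these classes itself:

from benchmarks.benchmarks.bench_ma import MAFunctions1v

bm = MAFunctions1v()
bm.setup('np.ma', 'sin', 'big')              # builds the masked 100x100 array
bm.time_functions_1v('np.ma', 'sin', 'big')  # the statement asv would time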
