-rw-r--r--  .gitignore                                        1
-rw-r--r--  benchmarks/benchmarks/bench_ma.py               146
-rw-r--r--  numpy/core/code_generators/generate_umath.py      2
-rw-r--r--  numpy/core/meson.build                            1
-rw-r--r--  numpy/core/setup.py                               1
-rw-r--r--  numpy/core/src/umath/loops.c.src                 19
-rw-r--r--  numpy/core/src/umath/loops.h.src                 37
-rw-r--r--  numpy/core/src/umath/loops_unary.dispatch.c.src 364
-rw-r--r--  numpy/core/src/umath/simd.inc.src                67
-rw-r--r--  numpy/core/tests/test_umath.py                   29
-rw-r--r--  numpy/ma/bench.py                               130
-rw-r--r--  numpy/tests/test_public_api.py                    1
12 files changed, 577 insertions(+), 221 deletions(-)
diff --git a/.gitignore b/.gitignore
index 6f63498e0..9851fcc77 100644
--- a/.gitignore
+++ b/.gitignore
@@ -216,6 +216,7 @@ numpy/core/src/_simd/_simd.dispatch.c
numpy/core/src/_simd/_simd_data.inc
numpy/core/src/_simd/_simd_inc.h
# umath module
+numpy/core/src/umath/loops_unary.dispatch.c
numpy/core/src/umath/loops_unary_fp.dispatch.c
numpy/core/src/umath/loops_arithm_fp.dispatch.c
numpy/core/src/umath/loops_arithmetic.dispatch.c
diff --git a/benchmarks/benchmarks/bench_ma.py b/benchmarks/benchmarks/bench_ma.py
index 0247065a7..49ccf92fe 100644
--- a/benchmarks/benchmarks/bench_ma.py
+++ b/benchmarks/benchmarks/bench_ma.py
@@ -119,3 +119,149 @@ class Concatenate(Benchmark):
def time_it(self, mode, n):
np.ma.concatenate(self.args)
+
+
+class MAFunctions1v(Benchmark):
+ param_names = ['mtype', 'func', 'msize']
+ params = [['np', 'np.ma'],
+ ['sin', 'log', 'sqrt'],
+ ['small', 'big']]
+
+ def setup(self, mtype, func, msize):
+ xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+ m1 = [[True, False, False], [False, False, True]]
+ xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+ maskx = xl > 0.8
+ self.nmxs = np.ma.array(xs, mask=m1)
+ self.nmxl = np.ma.array(xl, mask=maskx)
+
+ def time_functions_1v(self, mtype, func, msize):
+ # fun = {'np.ma.sin': np.ma.sin, 'np.sin': np.sin}[func]
+ fun = eval(f"{mtype}.{func}")
+ if msize == 'small':
+ fun(self.nmxs)
+ elif msize == 'big':
+ fun(self.nmxl)
+
+
+class MAMethod0v(Benchmark):
+ param_names = ['method', 'msize']
+ params = [['ravel', 'transpose', 'compressed', 'conjugate'],
+ ['small', 'big']]
+
+ def setup(self, method, msize):
+ xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+ m1 = [[True, False, False], [False, False, True]]
+ xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+ maskx = xl > 0.8
+ self.nmxs = np.ma.array(xs, mask=m1)
+ self.nmxl = np.ma.array(xl, mask=maskx)
+
+ def time_methods_0v(self, method, msize):
+ if msize == 'small':
+ mdat = self.nmxs
+ elif msize == 'big':
+ mdat = self.nmxl
+ getattr(mdat, method)()
+
+
+class MAFunctions2v(Benchmark):
+ param_names = ['mtype', 'func', 'msize']
+ params = [['np', 'np.ma'],
+ ['multiply', 'divide', 'power'],
+ ['small', 'big']]
+
+ def setup(self, mtype, func, msize):
+ # Small arrays
+ xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+ ys = np.random.uniform(-1, 1, 6).reshape(2, 3)
+ m1 = [[True, False, False], [False, False, True]]
+ m2 = [[True, False, True], [False, False, True]]
+ self.nmxs = np.ma.array(xs, mask=m1)
+ self.nmys = np.ma.array(ys, mask=m2)
+ # Big arrays
+ xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+ yl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+ maskx = xl > 0.8
+ masky = yl < -0.8
+ self.nmxl = np.ma.array(xl, mask=maskx)
+ self.nmyl = np.ma.array(yl, mask=masky)
+
+ def time_functions_2v(self, mtype, func, msize):
+ fun = eval(f"{mtype}.{func}")
+ if msize == 'small':
+ fun(self.nmxs, self.nmys)
+ elif msize == 'big':
+ fun(self.nmxl, self.nmyl)
+
+
+class MAMethodGetItem(Benchmark):
+ param_names = ['margs', 'msize']
+ params = [[0, (0, 0), [0, -1]],
+ ['small', 'big']]
+
+ def setup(self, margs, msize):
+ xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+ m1 = [[True, False, False], [False, False, True]]
+ xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+ maskx = xl > 0.8
+ self.nmxs = np.ma.array(xs, mask=m1)
+ self.nmxl = np.ma.array(xl, mask=maskx)
+
+ def time_methods_getitem(self, margs, msize):
+ if msize == 'small':
+ mdat = self.nmxs
+ elif msize == 'big':
+ mdat = self.nmxl
+ getattr(mdat, '__getitem__')(margs)
+
+
+class MAMethodSetItem(Benchmark):
+ param_names = ['margs', 'mset', 'msize']
+ params = [[0, (0, 0), (-1, 0)],
+ [17, np.ma.masked],
+ ['small', 'big']]
+
+ def setup(self, margs, mset, msize):
+ xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+ m1 = [[True, False, False], [False, False, True]]
+ xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+ maskx = xl > 0.8
+ self.nmxs = np.ma.array(xs, mask=m1)
+ self.nmxl = np.ma.array(xl, mask=maskx)
+
+ def time_methods_setitem(self, margs, mset, msize):
+ if msize == 'small':
+ mdat = self.nmxs
+ elif msize == 'big':
+ mdat = self.nmxl
+ getattr(mdat, '__setitem__')(margs, mset)
+
+
+class Where(Benchmark):
+ param_names = ['mtype', 'msize']
+ params = [['np', 'np.ma'],
+ ['small', 'big']]
+
+ def setup(self, mtype, msize):
+ # Small arrays
+ xs = np.random.uniform(-1, 1, 6).reshape(2, 3)
+ ys = np.random.uniform(-1, 1, 6).reshape(2, 3)
+ m1 = [[True, False, False], [False, False, True]]
+ m2 = [[True, False, True], [False, False, True]]
+ self.nmxs = np.ma.array(xs, mask=m1)
+ self.nmys = np.ma.array(ys, mask=m2)
+ # Big arrays
+ xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+ yl = np.random.uniform(-1, 1, 100*100).reshape(100, 100)
+ maskx = xl > 0.8
+ masky = yl < -0.8
+ self.nmxl = np.ma.array(xl, mask=maskx)
+ self.nmyl = np.ma.array(yl, mask=masky)
+
+ def time_where(self, mtype, msize):
+ fun = eval(f"{mtype}.where")
+ if msize == 'small':
+ fun(self.nmxs > 2, self.nmxs, self.nmys)
+ elif msize == 'big':
+ fun(self.nmxl > 2, self.nmxl, self.nmyl)
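The classes above follow asv's benchmark protocol: params defines the parameter grid, setup builds the masked arrays for each combination, and the time_* methods are the timed bodies. For a quick sanity check outside asv, the same np-vs-np.ma comparison can be reproduced with timeit; a minimal sketch mirroring the "big" setup (100x100 data, roughly 10% masked):

    import timeit
    import numpy as np

    # Mirror the benchmark's "big" case: 100x100 uniform data, mask where x > 0.8.
    xl = np.random.uniform(-1, 1, 100 * 100).reshape(100, 100)
    nmxl = np.ma.array(xl, mask=xl > 0.8)

    for mod, data in [(np, xl), (np.ma, nmxl)]:
        per_call = timeit.timeit(lambda: mod.sin(data), number=1000) / 1000
        print("%-9s sin: %.1f us per call" % (mod.__name__, per_call * 1e6))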
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 40382b8ae..768c8deee 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -426,7 +426,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.negative'),
'PyUFunc_NegativeTypeResolver',
- TD(ints+flts+timedeltaonly, simd=[('avx2', ints)]),
+ TD(ints+flts+timedeltaonly, dispatch=[('loops_unary', ints+'fdg')]),
TD(cmplx, f='neg'),
TD(O, f='PyNumber_Negative'),
),
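This hunk switches the integer and floating-point negative loops from the old simd=[('avx2', ints)] attribute to the new loops_unary dispatch source. The string passed to the dispatch entry is made of NumPy type characters; assuming ints is the usual 'bBhHiIlLqQ' constant from generate_umath.py, ints+'fdg' covers all fixed-width integers plus float, double, and long double. An illustrative sketch decoding the characters:

    import numpy as np

    # 'bBhHiIlLqQ' is assumed to match generate_umath.py's ints constant.
    for char in 'bBhHiIlLqQ' + 'fdg':
        print(char, '->', np.dtype(char).name)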
diff --git a/numpy/core/meson.build b/numpy/core/meson.build
index 2b23d3f14..50cd8ccc5 100644
--- a/numpy/core/meson.build
+++ b/numpy/core/meson.build
@@ -747,6 +747,7 @@ src_umath = [
src_file.process('src/umath/loops_modulo.dispatch.c.src'),
src_file.process('src/umath/loops_trigonometric.dispatch.c.src'),
src_file.process('src/umath/loops_umath_fp.dispatch.c.src'),
+ src_file.process('src/umath/loops_unary.dispatch.c.src'),
src_file.process('src/umath/loops_unary_fp.dispatch.c.src'),
src_file.process('src/umath/matmul.c.src'),
src_file.process('src/umath/matmul.h.src'),
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 4b61bd855..da5bc64c0 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -1005,6 +1005,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'loops.h.src'),
join('src', 'umath', 'loops_utils.h.src'),
join('src', 'umath', 'loops.c.src'),
+ join('src', 'umath', 'loops_unary.dispatch.c.src'),
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index fe5aa9374..0b4856847 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -601,14 +601,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
#if @CHK@
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
-@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- UNARY_LOOP_FAST(@type@, @type@, *out = -in);
-}
-#endif
-
-#if @CHK@
-NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
@@ -1546,17 +1538,6 @@ NPY_NO_EXPORT void
}
NPY_NO_EXPORT void
-@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (!run_unary_simd_negative_@TYPE@(args, dimensions, steps)) {
- UNARY_LOOP {
- const @type@ in1 = *(@type@ *)ip1;
- *((@type@ *)op1) = -in1;
- }
- }
-}
-
-NPY_NO_EXPORT void
@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
{
UNARY_LOOP {
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 424e204c1..e3a410968 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -140,9 +140,6 @@ NPY_NO_EXPORT void
@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
-@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
-
-NPY_NO_EXPORT void
@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
NPY_NO_EXPORT void
@@ -206,6 +203,23 @@ NPY_NO_EXPORT void
/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_unary.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG#
+ */
+/**begin repeat1
+ * #kind = negative#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+
/*
*****************************************************************************
** FLOAT LOOPS **
@@ -226,6 +240,20 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
/**end repeat**/
#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_unary.dispatch.h"
+#endif
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ */
+/**begin repeat1
+ * #kind = negative#
+ */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/**end repeat1**/
+/**end repeat**/
+
+#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_arithm_fp.dispatch.h"
#endif
/**begin repeat
@@ -362,6 +390,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, (
* #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE#
* #c = f, f, , l#
* #C = F, F, , L#
+ * #half = 1, 0, 0, 0#
*/
/**begin repeat1
@@ -440,8 +469,10 @@ NPY_NO_EXPORT void
NPY_NO_EXPORT void
@TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#if @half@
NPY_NO_EXPORT void
@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
NPY_NO_EXPORT void
@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
diff --git a/numpy/core/src/umath/loops_unary.dispatch.c.src b/numpy/core/src/umath/loops_unary.dispatch.c.src
new file mode 100644
index 000000000..1e2a81d20
--- /dev/null
+++ b/numpy/core/src/umath/loops_unary.dispatch.c.src
@@ -0,0 +1,364 @@
+/*@targets
+ ** $maxopt baseline
+ ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
+ ** vx vxe
+ **/
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*******************************************************************************
+ ** Scalar ops
+ ******************************************************************************/
+#define scalar_negative(X) (-X)
+
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+
+/**begin repeat
+ * #sfx = s8, u8, s16, u16, s32, u32, s64, u64#
+ * #ssfx = 8, 8, 16, 16, 32, 32, 64, 64#
+ */
+static NPY_INLINE npyv_@sfx@
+npyv_negative_@sfx@(npyv_@sfx@ v)
+{
+#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || @ssfx@ < 64)
+ return npyv_reinterpret_@sfx@_s@ssfx@(vnegq_s@ssfx@(npyv_reinterpret_s@ssfx@_@sfx@(v)));
+#else
+ // (x ^ -1) + 1
+ const npyv_@sfx@ m1 = npyv_setall_@sfx@((npyv_lanetype_@sfx@)-1);
+ return npyv_sub_@sfx@(npyv_xor_@sfx@(v, m1), m1);
+#endif
+}
+/**end repeat**/
+
+/**begin repeat
+ * #sfx = f32, f64#
+ * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64#
+ * #fd = f, #
+ */
+#if @VCHK@
+static NPY_INLINE npyv_@sfx@
+npyv_negative_@sfx@(npyv_@sfx@ v)
+{
+#if defined(NPY_HAVE_NEON)
+ return vnegq_@sfx@(v);
+#else
+ // (v ^ signmask)
+ const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@);
+ return npyv_xor_@sfx@(v, signmask);
+#endif
+}
+#endif // @VCHK@
+/**end repeat**/
+
+#endif // NPY_SIMD
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/**begin repeat
+ * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64#
+ * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64#
+ * #is_fp = 0*8, 1*2#
+ * #supports_ncontig = 0*4, 1*6#
+ */
+/**begin repeat1
+ * #kind = negative#
+ * #intrin = negative#
+ * #unroll = 4#
+ */
+#if @simd_chk@
+#if @unroll@ < 1
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && @unroll@ > 2
+// Avoid memory bandwidth bottleneck for larger SIMD
+#define UNROLL 2
+#else
+#define UNROLL @unroll@
+#endif
+// contiguous input and output.
+static NPY_INLINE void
+simd_unary_cc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip,
+ npyv_lanetype_@sfx@ *op,
+ npy_intp len)
+{
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * UNROLL;
+
+ // unrolled vector loop
+ for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep);
+ npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+ npyv_store_@sfx@(op + @U@ * vstep, r_@U@);
+ #endif
+ /**end repeat2**/
+ }
+ // single vector loop
+    for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) {
+ npyv_@sfx@ v = npyv_load_@sfx@(ip);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+ npyv_store_@sfx@(op, r);
+ }
+    // scalar loop to finish off any remaining iterations
+ for (; len > 0; --len, ++ip, ++op) {
+ *op = scalar_@intrin@(*ip);
+ }
+}
+
+#if @supports_ncontig@
+// contiguous input, non-contiguous output
+static NPY_INLINE void
+simd_unary_cn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip,
+ npyv_lanetype_@sfx@ *op, npy_intp ostride,
+ npy_intp len)
+{
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * UNROLL;
+
+ // unrolled vector loop
+ for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep);
+ npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+ npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@);
+ #endif
+ /**end repeat2**/
+ }
+ // single vector loop
+ for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
+ npyv_@sfx@ v = npyv_load_@sfx@(ip);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+ npyv_storen_@sfx@(op, ostride, r);
+ }
+    // scalar loop to finish off any remaining iterations
+ for (; len > 0; --len, ++ip, op += ostride) {
+ *op = scalar_@intrin@(*ip);
+ }
+}
+// non-contiguous input, contiguous output
+static NPY_INLINE void
+simd_unary_nc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
+ npyv_lanetype_@sfx@ *op,
+ npy_intp len)
+{
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * UNROLL;
+
+ // unrolled vector loop
+ for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride);
+ npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+ npyv_store_@sfx@(op + @U@ * vstep, r_@U@);
+ #endif
+ /**end repeat2**/
+ }
+ // single vector loop
+ for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
+ npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+ npyv_store_@sfx@(op, r);
+ }
+    // scalar loop to finish off any remaining iterations
+ for (; len > 0; --len, ip += istride, ++op) {
+ *op = scalar_@intrin@(*ip);
+ }
+}
+// non-contiguous input and output
+// limit unroll to 2x
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+static NPY_INLINE void
+simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride,
+ npyv_lanetype_@sfx@ *op, npy_intp ostride,
+ npy_intp len)
+{
+ const int vstep = npyv_nlanes_@sfx@;
+ const int wstep = vstep * UNROLL;
+
+ // unrolled vector loop
+ for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride);
+ npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@);
+ npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@);
+ #endif
+ /**end repeat2**/
+ }
+ // single vector loop
+ for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+ npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride);
+ npyv_@sfx@ r = npyv_@intrin@_@sfx@(v);
+ npyv_storen_@sfx@(op, ostride, r);
+ }
+    // scalar loop to finish off any remaining iterations
+ for (; len > 0; --len, ip += istride, op += ostride) {
+ *op = scalar_@intrin@(*ip);
+ }
+}
+#endif // @supports_ncontig@
+#undef UNROLL
+#endif // @simd_chk@
+/**end repeat1**/
+/**end repeat**/
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * FLOAT, DOUBLE, LONGDOUBLE#
+ *
+ * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * FLOAT, DOUBLE, LONGDOUBLE#
+ * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_float, npy_double, npy_longdouble#
+ *
+ * #is_fp = 0*10, 1*3#
+ * #is_unsigned = 1*5, 0*5, 0*3#
+ * #supports_ncontig = 0*2, 1*3, 0*2, 1*3, 1*3#
+ */
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@
+ #if @is_fp@
+ #define TO_SIMD_SFX(X) X##_f@len@
+ #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32
+ #undef TO_SIMD_SFX
+ #endif
+ #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64
+ #undef TO_SIMD_SFX
+ #endif
+ #elif @is_unsigned@
+ #define TO_SIMD_SFX(X) X##_u@len@
+ #else
+ #define TO_SIMD_SFX(X) X##_s@len@
+ #endif
+/**end repeat1**/
+#endif
+
+/**begin repeat1
+ * #kind = negative#
+ * #intrin = negative#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ char *ip = args[0], *op = args[1];
+ npy_intp istep = steps[0], ostep = steps[1],
+ len = dimensions[0];
+#ifdef TO_SIMD_SFX
+ #undef STYPE
+ #define STYPE TO_SIMD_SFX(npyv_lanetype)
+ if (!is_mem_overlap(ip, istep, op, ostep, len)) {
+ if (IS_UNARY_CONT(@type@, @type@)) {
+ // no overlap and operands are contiguous
+ TO_SIMD_SFX(simd_unary_cc_@intrin@)(
+ (STYPE*)ip, (STYPE*)op, len
+ );
+ goto clear;
+ }
+ #if @supports_ncontig@
+ const npy_intp istride = istep / sizeof(STYPE);
+ const npy_intp ostride = ostep / sizeof(STYPE);
+ if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+ TO_SIMD_SFX(npyv_storable_stride)(ostride))
+ {
+ if (istride == 1 && ostride != 1) {
+ // contiguous input, non-contiguous output
+ TO_SIMD_SFX(simd_unary_cn_@intrin@)(
+ (STYPE*)ip, (STYPE*)op, ostride, len
+ );
+ goto clear;
+ }
+ else if (istride != 1 && ostride == 1) {
+ // non-contiguous input, contiguous output
+ TO_SIMD_SFX(simd_unary_nc_@intrin@)(
+ (STYPE*)ip, istride, (STYPE*)op, len
+ );
+ goto clear;
+ }
+ // SSE2 does better with unrolled scalar for heavy non-contiguous
+ #if !defined(NPY_HAVE_SSE2)
+ else if (istride != 1 && ostride != 1) {
+ // non-contiguous input and output
+ TO_SIMD_SFX(simd_unary_nn_@intrin@)(
+ (STYPE*)ip, istride, (STYPE*)op, ostride, len
+ );
+ goto clear;
+ }
+ #endif
+ }
+ #endif // @supports_ncontig@
+ }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+ /*
+ * scalar unrolls
+ * 8x unroll performed best on
+ * - Apple M1 Native / arm64
+ * - Apple M1 Rosetta / SSE42
+ * - iMacPro / AVX512
+ */
+ #define UNROLL 8
+ for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
+ /**begin repeat2
+ * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @U@
+ const @type@ in_@U@ = *((const @type@ *)(ip + @U@ * istep));
+ *((@type@ *)(op + @U@ * ostep)) = scalar_@intrin@(in_@U@);
+ #endif
+ /**end repeat2**/
+ }
+#endif // NPY_DISABLE_OPTIMIZATION
+ for (; len > 0; --len, ip += istep, op += ostep) {
+ *((@type@ *)op) = scalar_@intrin@(*(const @type@ *)ip);
+ }
+#ifdef TO_SIMD_SFX
+clear:
+ npyv_cleanup();
+#endif
+#if @is_fp@
+ npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/**end repeat**/
+
+#undef NEGATIVE_CONTIG_ONLY
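The kernels in this new file reduce to two scalar identities, vectorized per target: integer negation as (x ^ -1) + 1 (the kernel's npyv_sub(npyv_xor(v, -1), -1) computes the equivalent (x ^ -1) - (-1)), and floating-point negation as an XOR with the sign-bit mask -0.0. Both identities can be checked from Python; a minimal sketch using bit views for the float32 case:

    import numpy as np

    # Integer kernels: -x == (x ^ -1) + 1 on two's complement hardware.
    x = np.array([1, -5, 0, 127], dtype=np.int32)
    assert np.array_equal((x ^ -1) + 1, -x)

    # Float kernels: negation flips only the sign bit, i.e. XOR with -0.0.
    f = np.array([1.5, -0.0, 3.25], dtype=np.float32)
    flipped = (f.view(np.uint32) ^ np.uint32(0x80000000)).view(np.float32)
    assert np.array_equal(flipped, -f)
    print("both identities hold")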
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 5351ec1fa..6fc1501c9 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -129,39 +129,9 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c
* #vector = 1, 1, 0#
* #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 #
*/
-
-/**begin repeat1
- * #func = negative#
- * #check = IS_BLOCKABLE_UNARY#
- * #name = unary#
- */
-
-#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
-
-/* prototypes */
-static void
-sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n);
-
-#endif
-
-static inline int
-run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
- if (@check@(sizeof(@type@), VECTOR_SIZE_BYTES)) {
- sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]);
- return 1;
- }
-#endif
- return 0;
-}
-
-/**end repeat1**/
-
/**begin repeat1
* #kind = isnan, isfinite, isinf, signbit#
*/
-
#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS
static void
@@ -181,9 +151,7 @@ run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *
#endif
return 0;
}
-
/**end repeat1**/
-
/**end repeat**/
/*
@@ -426,41 +394,6 @@ sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n)
}
/**end repeat1**/
-
-static void
-sse2_negative_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n)
-{
- /*
- * get 0x7FFFFFFF mask (everything but signbit set)
- * float & ~mask will remove the sign, float ^ mask flips the sign
- * this is equivalent to how the compiler implements fabs on amd64
- */
- const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@);
-
- /* align output to VECTOR_SIZE_BYTES bytes */
- LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) {
- op[i] = -ip[i];
- }
- assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) ||
- npy_is_aligned(&op[i], VECTOR_SIZE_BYTES));
- if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_load_@vsuf@(&ip[i]);
- @vpre@_store_@vsuf@(&op[i], @vpre@_xor_@vsuf@(mask, a));
- }
- }
- else {
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]);
- @vpre@_store_@vsuf@(&op[i], @vpre@_xor_@vsuf@(mask, a));
- }
- }
- LOOP_BLOCKED_END {
- op[i] = -ip[i];
- }
-}
-/**end repeat1**/
-
/**end repeat**/
/* bunch of helper functions used in ISA_exp/log_FLOAT*/
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index f21a88acd..88ab7e014 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -2635,6 +2635,35 @@ class TestAbsoluteNegative:
np.abs(d, out=d)
np.abs(np.ones_like(d), out=d)
+ @pytest.mark.parametrize("dtype", ['d', 'f', 'int32', 'int64'])
+ @pytest.mark.parametrize("big", [True, False])
+ def test_noncontiguous(self, dtype, big):
+ data = np.array([-1.0, 1.0, -0.0, 0.0, 2.2251e-308, -2.5, 2.5, -6,
+ 6, -2.2251e-308, -8, 10], dtype=dtype)
+ expect = np.array([1.0, -1.0, 0.0, -0.0, -2.2251e-308, 2.5, -2.5, 6,
+ -6, 2.2251e-308, 8, -10], dtype=dtype)
+ if big:
+ data = np.repeat(data, 10)
+ expect = np.repeat(expect, 10)
+ out = np.ndarray(data.shape, dtype=dtype)
+ ncontig_in = data[1::2]
+ ncontig_out = out[1::2]
+ contig_in = np.array(ncontig_in)
+ # contig in, contig out
+ assert_array_equal(np.negative(contig_in), expect[1::2])
+ # contig in, ncontig out
+ assert_array_equal(np.negative(contig_in, out=ncontig_out),
+ expect[1::2])
+ # ncontig in, contig out
+ assert_array_equal(np.negative(ncontig_in), expect[1::2])
+ # ncontig in, ncontig out
+ assert_array_equal(np.negative(ncontig_in, out=ncontig_out),
+ expect[1::2])
+ # contig in, contig out, nd stride
+ data_split = np.array(np.array_split(data, 2))
+ expect_split = np.array(np.array_split(expect, 2))
+ assert_equal(np.negative(data_split), expect_split)
+
class TestPositive:
def test_valid(self):
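The test above exercises every stride combination the new inner loop dispatches on (contiguous and non-contiguous input and output, plus an n-dimensional stride case). Note that the SIMD paths are only entered when is_mem_overlap reports no aliasing between input and output, so overlapping in-place calls take the scalar fallback and remain correct. A short usage sketch:

    import numpy as np

    a = np.arange(8.0)
    np.negative(a, out=a)  # fully aliased in/out; the loop checks overlap before using SIMD
    print(a)

    b = np.arange(12.0)
    np.negative(b[::3], out=b[::3])  # strided in-place view, also handled
    print(b[::3])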
diff --git a/numpy/ma/bench.py b/numpy/ma/bench.py
deleted file mode 100644
index 56865683d..000000000
--- a/numpy/ma/bench.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python3
-
-import timeit
-import numpy
-
-
-###############################################################################
-# Global variables #
-###############################################################################
-
-
-# Small arrays
-xs = numpy.random.uniform(-1, 1, 6).reshape(2, 3)
-ys = numpy.random.uniform(-1, 1, 6).reshape(2, 3)
-zs = xs + 1j * ys
-m1 = [[True, False, False], [False, False, True]]
-m2 = [[True, False, True], [False, False, True]]
-nmxs = numpy.ma.array(xs, mask=m1)
-nmys = numpy.ma.array(ys, mask=m2)
-nmzs = numpy.ma.array(zs, mask=m1)
-
-# Big arrays
-xl = numpy.random.uniform(-1, 1, 100*100).reshape(100, 100)
-yl = numpy.random.uniform(-1, 1, 100*100).reshape(100, 100)
-zl = xl + 1j * yl
-maskx = xl > 0.8
-masky = yl < -0.8
-nmxl = numpy.ma.array(xl, mask=maskx)
-nmyl = numpy.ma.array(yl, mask=masky)
-nmzl = numpy.ma.array(zl, mask=maskx)
-
-
-###############################################################################
-# Functions #
-###############################################################################
-
-
-def timer(s, v='', nloop=500, nrep=3):
- units = ["s", "ms", "µs", "ns"]
- scaling = [1, 1e3, 1e6, 1e9]
- print("%s : %-50s : " % (v, s), end=' ')
- varnames = ["%ss,nm%ss,%sl,nm%sl" % tuple(x*4) for x in 'xyz']
- setup = 'from __main__ import numpy, ma, %s' % ','.join(varnames)
- Timer = timeit.Timer(stmt=s, setup=setup)
- best = min(Timer.repeat(nrep, nloop)) / nloop
- if best > 0.0:
- order = min(-int(numpy.floor(numpy.log10(best)) // 3), 3)
- else:
- order = 3
- print("%d loops, best of %d: %.*g %s per loop" % (nloop, nrep,
- 3,
- best * scaling[order],
- units[order]))
-
-
-def compare_functions_1v(func, nloop=500,
- xs=xs, nmxs=nmxs, xl=xl, nmxl=nmxl):
- funcname = func.__name__
- print("-"*50)
- print(f'{funcname} on small arrays')
- module, data = "numpy.ma", "nmxs"
- timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop)
-
- print("%s on large arrays" % funcname)
- module, data = "numpy.ma", "nmxl"
- timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop)
- return
-
-def compare_methods(methodname, args, vars='x', nloop=500, test=True,
- xs=xs, nmxs=nmxs, xl=xl, nmxl=nmxl):
- print("-"*50)
- print(f'{methodname} on small arrays')
- data, ver = f'nm{vars}l', 'numpy.ma'
- timer("%(data)s.%(methodname)s(%(args)s)" % locals(), v=ver, nloop=nloop)
-
- print("%s on large arrays" % methodname)
- data, ver = "nm%sl" % vars, 'numpy.ma'
- timer("%(data)s.%(methodname)s(%(args)s)" % locals(), v=ver, nloop=nloop)
- return
-
-def compare_functions_2v(func, nloop=500, test=True,
- xs=xs, nmxs=nmxs,
- ys=ys, nmys=nmys,
- xl=xl, nmxl=nmxl,
- yl=yl, nmyl=nmyl):
- funcname = func.__name__
- print("-"*50)
- print(f'{funcname} on small arrays')
- module, data = "numpy.ma", "nmxs,nmys"
- timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop)
-
- print(f'{funcname} on large arrays')
- module, data = "numpy.ma", "nmxl,nmyl"
- timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop)
- return
-
-
-if __name__ == '__main__':
- compare_functions_1v(numpy.sin)
- compare_functions_1v(numpy.log)
- compare_functions_1v(numpy.sqrt)
-
- compare_functions_2v(numpy.multiply)
- compare_functions_2v(numpy.divide)
- compare_functions_2v(numpy.power)
-
- compare_methods('ravel', '', nloop=1000)
- compare_methods('conjugate', '', 'z', nloop=1000)
- compare_methods('transpose', '', nloop=1000)
- compare_methods('compressed', '', nloop=1000)
- compare_methods('__getitem__', '0', nloop=1000)
- compare_methods('__getitem__', '(0,0)', nloop=1000)
- compare_methods('__getitem__', '[0,-1]', nloop=1000)
- compare_methods('__setitem__', '0, 17', nloop=1000, test=False)
- compare_methods('__setitem__', '(0,0), 17', nloop=1000, test=False)
-
- print("-"*50)
- print("__setitem__ on small arrays")
- timer('nmxs.__setitem__((-1,0),numpy.ma.masked)', 'numpy.ma ', nloop=10000)
-
- print("-"*50)
- print("__setitem__ on large arrays")
- timer('nmxl.__setitem__((-1,0),numpy.ma.masked)', 'numpy.ma ', nloop=10000)
-
- print("-"*50)
- print("where on small arrays")
- timer('numpy.ma.where(nmxs>2,nmxs,nmys)', 'numpy.ma ', nloop=1000)
- print("-"*50)
- print("where on large arrays")
- timer('numpy.ma.where(nmxl>2,nmxl,nmyl)', 'numpy.ma ', nloop=100)
diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py
index 8f654f3d9..28ed2f1df 100644
--- a/numpy/tests/test_public_api.py
+++ b/numpy/tests/test_public_api.py
@@ -280,7 +280,6 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [
"lib.utils",
"linalg.lapack_lite",
"linalg.linalg",
- "ma.bench",
"ma.core",
"ma.testutils",
"ma.timer_comparison",