 .gitignore                                      |   1
 benchmarks/benchmarks/bench_ma.py               | 146
 numpy/core/code_generators/generate_umath.py    |   2
 numpy/core/meson.build                          |   1
 numpy/core/setup.py                             |   1
 numpy/core/src/umath/loops.c.src                |  19
 numpy/core/src/umath/loops.h.src                |  37
 numpy/core/src/umath/loops_unary.dispatch.c.src | 364
 numpy/core/src/umath/simd.inc.src               |  67
 numpy/core/tests/test_umath.py                  |  29
 numpy/ma/bench.py                               | 130
 numpy/tests/test_public_api.py                  |   1
 12 files changed, 577 insertions(+), 221 deletions(-)
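The new numpy/core/src/umath/loops_unary.dispatch.c.src below negates integers with the two's-complement identity -x == (x ^ -1) - (-1) (i.e. ~x + 1) and negates floats by XORing the IEEE sign bit with the bit pattern of -0.0. A minimal NumPy sketch of both identities (an illustration for readers, not part of the patch):

import numpy as np

x = np.array([-7, 0, 42], dtype=np.int32)
# integers: -x == (x ^ -1) - (-1) == ~x + 1; NumPy's integer
# arithmetic wraps modulo 2**32, matching the vector instructions
assert np.array_equal((x ^ np.int32(-1)) - np.int32(-1), -x)

f = np.array([-2.5, 0.0, 1.5], dtype=np.float32)
# floats: -f is f with the sign bit flipped, i.e. XOR with
# 0x80000000, the bit pattern of np.float32(-0.0)
neg = (f.view(np.uint32) ^ np.uint32(0x80000000)).view(np.float32)
assert np.array_equal(neg, -f)
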
diff --git a/.gitignore b/.gitignore index 6f63498e0..9851fcc77 100644 --- a/.gitignore +++ b/.gitignore @@ -216,6 +216,7 @@ numpy/core/src/_simd/_simd.dispatch.c numpy/core/src/_simd/_simd_data.inc numpy/core/src/_simd/_simd_inc.h # umath module +numpy/core/src/umath/loops_unary.dispatch.c numpy/core/src/umath/loops_unary_fp.dispatch.c numpy/core/src/umath/loops_arithm_fp.dispatch.c numpy/core/src/umath/loops_arithmetic.dispatch.c diff --git a/benchmarks/benchmarks/bench_ma.py b/benchmarks/benchmarks/bench_ma.py index 0247065a7..49ccf92fe 100644 --- a/benchmarks/benchmarks/bench_ma.py +++ b/benchmarks/benchmarks/bench_ma.py @@ -119,3 +119,149 @@ class Concatenate(Benchmark): def time_it(self, mode, n): np.ma.concatenate(self.args) + + +class MAFunctions1v(Benchmark): + param_names = ['mtype', 'func', 'msize'] + params = [['np', 'np.ma'], + ['sin', 'log', 'sqrt'], + ['small', 'big']] + + def setup(self, mtype, func, msize): + xs = np.random.uniform(-1, 1, 6).reshape(2, 3) + m1 = [[True, False, False], [False, False, True]] + xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + maskx = xl > 0.8 + self.nmxs = np.ma.array(xs, mask=m1) + self.nmxl = np.ma.array(xl, mask=maskx) + + def time_functions_1v(self, mtype, func, msize): + # fun = {'np.ma.sin': np.ma.sin, 'np.sin': np.sin}[func] + fun = eval(f"{mtype}.{func}") + if msize == 'small': + fun(self.nmxs) + elif msize == 'big': + fun(self.nmxl) + + +class MAMethod0v(Benchmark): + param_names = ['method', 'msize'] + params = [['ravel', 'transpose', 'compressed', 'conjugate'], + ['small', 'big']] + + def setup(self, method, msize): + xs = np.random.uniform(-1, 1, 6).reshape(2, 3) + m1 = [[True, False, False], [False, False, True]] + xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + maskx = xl > 0.8 + self.nmxs = np.ma.array(xs, mask=m1) + self.nmxl = np.ma.array(xl, mask=maskx) + + def time_methods_0v(self, method, msize): + if msize == 'small': + mdat = self.nmxs + elif msize == 'big': + mdat = self.nmxl + getattr(mdat, method)() + + +class MAFunctions2v(Benchmark): + param_names = ['mtype', 'func', 'msize'] + params = [['np', 'np.ma'], + ['multiply', 'divide', 'power'], + ['small', 'big']] + + def setup(self, mtype, func, msize): + # Small arrays + xs = np.random.uniform(-1, 1, 6).reshape(2, 3) + ys = np.random.uniform(-1, 1, 6).reshape(2, 3) + m1 = [[True, False, False], [False, False, True]] + m2 = [[True, False, True], [False, False, True]] + self.nmxs = np.ma.array(xs, mask=m1) + self.nmys = np.ma.array(ys, mask=m2) + # Big arrays + xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + yl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + maskx = xl > 0.8 + masky = yl < -0.8 + self.nmxl = np.ma.array(xl, mask=maskx) + self.nmyl = np.ma.array(yl, mask=masky) + + def time_functions_2v(self, mtype, func, msize): + fun = eval(f"{mtype}.{func}") + if msize == 'small': + fun(self.nmxs, self.nmys) + elif msize == 'big': + fun(self.nmxl, self.nmyl) + + +class MAMethodGetItem(Benchmark): + param_names = ['margs', 'msize'] + params = [[0, (0, 0), [0, -1]], + ['small', 'big']] + + def setup(self, margs, msize): + xs = np.random.uniform(-1, 1, 6).reshape(2, 3) + m1 = [[True, False, False], [False, False, True]] + xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + maskx = xl > 0.8 + self.nmxs = np.ma.array(xs, mask=m1) + self.nmxl = np.ma.array(xl, mask=maskx) + + def time_methods_getitem(self, margs, msize): + if msize == 'small': + mdat = self.nmxs + elif msize == 'big': + mdat = self.nmxl + getattr(mdat, 
'__getitem__')(margs) + + +class MAMethodSetItem(Benchmark): + param_names = ['margs', 'mset', 'msize'] + params = [[0, (0, 0), (-1, 0)], + [17, np.ma.masked], + ['small', 'big']] + + def setup(self, margs, mset, msize): + xs = np.random.uniform(-1, 1, 6).reshape(2, 3) + m1 = [[True, False, False], [False, False, True]] + xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + maskx = xl > 0.8 + self.nmxs = np.ma.array(xs, mask=m1) + self.nmxl = np.ma.array(xl, mask=maskx) + + def time_methods_setitem(self, margs, mset, msize): + if msize == 'small': + mdat = self.nmxs + elif msize == 'big': + mdat = self.nmxl + getattr(mdat, '__setitem__')(margs, mset) + + +class Where(Benchmark): + param_names = ['mtype', 'msize'] + params = [['np', 'np.ma'], + ['small', 'big']] + + def setup(self, mtype, msize): + # Small arrays + xs = np.random.uniform(-1, 1, 6).reshape(2, 3) + ys = np.random.uniform(-1, 1, 6).reshape(2, 3) + m1 = [[True, False, False], [False, False, True]] + m2 = [[True, False, True], [False, False, True]] + self.nmxs = np.ma.array(xs, mask=m1) + self.nmys = np.ma.array(ys, mask=m2) + # Big arrays + xl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + yl = np.random.uniform(-1, 1, 100*100).reshape(100, 100) + maskx = xl > 0.8 + masky = yl < -0.8 + self.nmxl = np.ma.array(xl, mask=maskx) + self.nmyl = np.ma.array(yl, mask=masky) + + def time_where(self, mtype, msize): + fun = eval(f"{mtype}.where") + if msize == 'small': + fun(self.nmxs > 2, self.nmxs, self.nmys) + elif msize == 'big': + fun(self.nmxl > 2, self.nmxl, self.nmyl) diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 40382b8ae..768c8deee 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -426,7 +426,7 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.negative'), 'PyUFunc_NegativeTypeResolver', - TD(ints+flts+timedeltaonly, simd=[('avx2', ints)]), + TD(ints+flts+timedeltaonly, dispatch=[('loops_unary', ints+'fdg')]), TD(cmplx, f='neg'), TD(O, f='PyNumber_Negative'), ), diff --git a/numpy/core/meson.build b/numpy/core/meson.build index 2b23d3f14..50cd8ccc5 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -747,6 +747,7 @@ src_umath = [ src_file.process('src/umath/loops_modulo.dispatch.c.src'), src_file.process('src/umath/loops_trigonometric.dispatch.c.src'), src_file.process('src/umath/loops_umath_fp.dispatch.c.src'), + src_file.process('src/umath/loops_unary.dispatch.c.src'), src_file.process('src/umath/loops_unary_fp.dispatch.c.src'), src_file.process('src/umath/matmul.c.src'), src_file.process('src/umath/matmul.h.src'), diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 4b61bd855..da5bc64c0 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -1005,6 +1005,7 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'loops.h.src'), join('src', 'umath', 'loops_utils.h.src'), join('src', 'umath', 'loops.c.src'), + join('src', 'umath', 'loops_unary.dispatch.c.src'), join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index fe5aa9374..0b4856847 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -601,14 +601,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void #if @CHK@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void 
-@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - UNARY_LOOP_FAST(@type@, @type@, *out = -in); -} -#endif - -#if @CHK@ -NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void @TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { UNARY_LOOP_FAST(@type@, npy_bool, *out = !in); @@ -1546,17 +1538,6 @@ NPY_NO_EXPORT void } NPY_NO_EXPORT void -@TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (!run_unary_simd_negative_@TYPE@(args, dimensions, steps)) { - UNARY_LOOP { - const @type@ in1 = *(@type@ *)ip1; - *((@type@ *)op1) = -in1; - } - } -} - -NPY_NO_EXPORT void @TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) { UNARY_LOOP { diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 424e204c1..e3a410968 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -140,9 +140,6 @@ NPY_NO_EXPORT void @S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void -@S@@TYPE@_negative@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); - -NPY_NO_EXPORT void @S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); NPY_NO_EXPORT void @@ -206,6 +203,23 @@ NPY_NO_EXPORT void /**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary.dispatch.h" +#endif +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG# + */ +/**begin repeat1 + * #kind = negative# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + + /* ***************************************************************************** ** FLOAT LOOPS ** @@ -226,6 +240,20 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, /**end repeat**/ #ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_unary.dispatch.h" +#endif +/**begin repeat + * #TYPE = FLOAT, DOUBLE, LONGDOUBLE# + */ +/**begin repeat1 + * #kind = negative# + */ +NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) +/**end repeat1**/ +/**end repeat**/ + +#ifndef NPY_DISABLE_OPTIMIZATION #include "loops_arithm_fp.dispatch.h" #endif /**begin repeat @@ -362,6 +390,7 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@, ( * #TYPE = HALF, FLOAT, DOUBLE, LONGDOUBLE# * #c = f, f, , l# * #C = F, F, , L# + * #half = 1, 0, 0, 0# */ /**begin repeat1 @@ -440,8 +469,10 @@ NPY_NO_EXPORT void NPY_NO_EXPORT void @TYPE@_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +#if @half@ NPY_NO_EXPORT void @TYPE@_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +#endif NPY_NO_EXPORT void @TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); diff --git a/numpy/core/src/umath/loops_unary.dispatch.c.src b/numpy/core/src/umath/loops_unary.dispatch.c.src new file mode 100644 index 000000000..1e2a81d20 --- /dev/null +++ b/numpy/core/src/umath/loops_unary.dispatch.c.src @@ -0,0 +1,364 @@ +/*@targets + ** $maxopt 
baseline + ** neon asimd + ** sse2 avx2 avx512_skx + ** vsx2 + ** vx vxe + **/ + +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "numpy/npy_math.h" +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/******************************************************************************* + ** Scalar ops + ******************************************************************************/ +#define scalar_negative(X) (-X) + +/******************************************************************************* + ** extra SIMD intrinsics + ******************************************************************************/ + +#if NPY_SIMD + +/**begin repeat + * #sfx = s8, u8, s16, u16, s32, u32, s64, u64# + * #ssfx = 8, 8, 16, 16, 32, 32, 64, 64# + */ +static NPY_INLINE npyv_@sfx@ +npyv_negative_@sfx@(npyv_@sfx@ v) +{ +#if defined(NPY_HAVE_NEON) && (defined(__aarch64__) || @ssfx@ < 64) + return npyv_reinterpret_@sfx@_s@ssfx@(vnegq_s@ssfx@(npyv_reinterpret_s@ssfx@_@sfx@(v))); +#else + // (x ^ -1) + 1 + const npyv_@sfx@ m1 = npyv_setall_@sfx@((npyv_lanetype_@sfx@)-1); + return npyv_sub_@sfx@(npyv_xor_@sfx@(v, m1), m1); +#endif +} +/**end repeat**/ + +/**begin repeat + * #sfx = f32, f64# + * #VCHK = NPY_SIMD_F32, NPY_SIMD_F64# + * #fd = f, # + */ +#if @VCHK@ +static NPY_INLINE npyv_@sfx@ +npyv_negative_@sfx@(npyv_@sfx@ v) +{ +#if defined(NPY_HAVE_NEON) + return vnegq_@sfx@(v); +#else + // (v ^ signmask) + const npyv_@sfx@ signmask = npyv_setall_@sfx@(-0.@fd@); + return npyv_xor_@sfx@(v, signmask); +#endif +} +#endif // @VCHK@ +/**end repeat**/ + +#endif // NPY_SIMD + +/******************************************************************************** + ** Defining the SIMD kernels + ********************************************************************************/ +/**begin repeat + * #sfx = s8, u8, s16, u16, s32, u32, s64, u64, f32, f64# + * #simd_chk = NPY_SIMD*8, NPY_SIMD_F32, NPY_SIMD_F64# + * #is_fp = 0*8, 1*2# + * #supports_ncontig = 0*4,1*6# + */ +/**begin repeat1 + * #kind = negative# + * #intrin = negative# + * #unroll = 4# + */ +#if @simd_chk@ +#if @unroll@ < 1 +#error "Unroll must be at least 1" +#elif NPY_SIMD != 128 && @unroll@ > 2 +// Avoid memory bandwidth bottleneck for larger SIMD +#define UNROLL 2 +#else +#define UNROLL @unroll@ +#endif +// contiguous inputs and output. 
+static NPY_INLINE void +simd_unary_cc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, + npyv_lanetype_@sfx@ *op, + npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * UNROLL; + + // unrolled vector loop + for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @U@ + npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep); + npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@); + npyv_store_@sfx@(op + @U@ * vstep, r_@U@); + #endif + /**end repeat2**/ + } + // single vector loop + for (; len >= vstep; len -= vstep, ip += vstep, op +=vstep) { + npyv_@sfx@ v = npyv_load_@sfx@(ip); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(v); + npyv_store_@sfx@(op, r); + } + // scalar finish up any remaining iterations + for (; len > 0; --len, ++ip, ++op) { + *op = scalar_@intrin@(*ip); + } +} + +#if @supports_ncontig@ +// contiguous input, non-contiguous output +static NPY_INLINE void +simd_unary_cn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, + npyv_lanetype_@sfx@ *op, npy_intp ostride, + npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * UNROLL; + + // unrolled vector loop + for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @U@ + npyv_@sfx@ v_@U@ = npyv_load_@sfx@(ip + @U@ * vstep); + npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@); + npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@); + #endif + /**end repeat2**/ + } + // single vector loop + for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) { + npyv_@sfx@ v = npyv_load_@sfx@(ip); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(v); + npyv_storen_@sfx@(op, ostride, r); + } + // scalar finish up any remaining iterations + for (; len > 0; --len, ++ip, op += ostride) { + *op = scalar_@intrin@(*ip); + } +} +// non-contiguous input, contiguous output +static NPY_INLINE void +simd_unary_nc_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride, + npyv_lanetype_@sfx@ *op, + npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * UNROLL; + + // unrolled vector loop + for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @U@ + npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride); + npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@); + npyv_store_@sfx@(op + @U@ * vstep, r_@U@); + #endif + /**end repeat2**/ + } + // single vector loop + for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) { + npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(v); + npyv_store_@sfx@(op, r); + } + // scalar finish up any remaining iterations + for (; len > 0; --len, ip += istride, ++op) { + *op = scalar_@intrin@(*ip); + } +} +// non-contiguous input and output +// limit unroll to 2x +#if UNROLL > 2 +#undef UNROLL +#define UNROLL 2 +#endif +static NPY_INLINE void +simd_unary_nn_@intrin@_@sfx@(const npyv_lanetype_@sfx@ *ip, npy_intp istride, + npyv_lanetype_@sfx@ *op, npy_intp ostride, + npy_intp len) +{ + const int vstep = npyv_nlanes_@sfx@; + const int wstep = vstep * UNROLL; + + // unrolled vector loop + for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if 
UNROLL > @U@ + npyv_@sfx@ v_@U@ = npyv_loadn_@sfx@(ip + @U@ * vstep * istride, istride); + npyv_@sfx@ r_@U@ = npyv_@intrin@_@sfx@(v_@U@); + npyv_storen_@sfx@(op + @U@ * vstep * ostride, ostride, r_@U@); + #endif + /**end repeat2**/ + } + // single vector loop + for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) { + npyv_@sfx@ v = npyv_loadn_@sfx@(ip, istride); + npyv_@sfx@ r = npyv_@intrin@_@sfx@(v); + npyv_storen_@sfx@(op, ostride, r); + } + // scalar finish up any remaining iterations + for (; len > 0; --len, ip += istride, op += ostride) { + *op = scalar_@intrin@(*ip); + } +} +#endif // @supports_ncontig@ +#undef UNROLL +#endif // @simd_chk@ +/*end repeat1**/ +/**end repeat**/ + +/******************************************************************************** + ** Defining ufunc inner functions + ********************************************************************************/ +/**begin repeat + * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * + * #BTYPE = BYTE, SHORT, INT, LONG, LONGLONG, + * BYTE, SHORT, INT, LONG, LONGLONG, + * FLOAT, DOUBLE, LONGDOUBLE# + * #type = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, + * npy_byte, npy_short, npy_int, npy_long, npy_longlong, + * npy_float, npy_double, npy_longdouble# + * + * #is_fp = 0*10, 1*3# + * #is_unsigned = 1*5, 0*5, 0*3# + * #supports_ncontig = 0*2, 1*3, 0*2, 1*3, 1*3# + */ +#undef TO_SIMD_SFX +#if 0 +/**begin repeat1 + * #len = 8, 16, 32, 64# + */ +#elif NPY_SIMD && NPY_BITSOF_@BTYPE@ == @len@ + #if @is_fp@ + #define TO_SIMD_SFX(X) X##_f@len@ + #if NPY_BITSOF_@BTYPE@ == 32 && !NPY_SIMD_F32 + #undef TO_SIMD_SFX + #endif + #if NPY_BITSOF_@BTYPE@ == 64 && !NPY_SIMD_F64 + #undef TO_SIMD_SFX + #endif + #elif @is_unsigned@ + #define TO_SIMD_SFX(X) X##_u@len@ + #else + #define TO_SIMD_SFX(X) X##_s@len@ + #endif +/**end repeat1**/ +#endif + +/**begin repeat1 + * #kind = negative# + * #intrin = negative# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + char *ip = args[0], *op = args[1]; + npy_intp istep = steps[0], ostep = steps[1], + len = dimensions[0]; +#ifdef TO_SIMD_SFX + #undef STYPE + #define STYPE TO_SIMD_SFX(npyv_lanetype) + if (!is_mem_overlap(ip, istep, op, ostep, len)) { + if (IS_UNARY_CONT(@type@, @type@)) { + // no overlap and operands are contiguous + TO_SIMD_SFX(simd_unary_cc_@intrin@)( + (STYPE*)ip, (STYPE*)op, len + ); + goto clear; + } + #if @supports_ncontig@ + const npy_intp istride = istep / sizeof(STYPE); + const npy_intp ostride = ostep / sizeof(STYPE); + if (TO_SIMD_SFX(npyv_loadable_stride)(istride) && + TO_SIMD_SFX(npyv_storable_stride)(ostride)) + { + if (istride == 1 && ostride != 1) { + // contiguous input, non-contiguous output + TO_SIMD_SFX(simd_unary_cn_@intrin@)( + (STYPE*)ip, (STYPE*)op, ostride, len + ); + goto clear; + } + else if (istride != 1 && ostride == 1) { + // non-contiguous input, contiguous output + TO_SIMD_SFX(simd_unary_nc_@intrin@)( + (STYPE*)ip, istride, (STYPE*)op, len + ); + goto clear; + } + // SSE2 does better with unrolled scalar for heavy non-contiguous + #if !defined(NPY_HAVE_SSE2) + else if (istride != 1 && ostride != 1) { + // non-contiguous input and output + TO_SIMD_SFX(simd_unary_nn_@intrin@)( + (STYPE*)ip, istride, (STYPE*)op, ostride, len + ); + goto clear; + } + #endif + } + #endif // @supports_ncontig@ + } +#endif // TO_SIMD_SFX +#ifndef 
NPY_DISABLE_OPTIMIZATION + /* + * scalar unrolls + * 8x unroll performed best on + * - Apple M1 Native / arm64 + * - Apple M1 Rosetta / SSE42 + * - iMacPro / AVX512 + */ + #define UNROLL 8 + for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { + /**begin repeat2 + * #U = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @U@ + const @type@ in_@U@ = *((const @type@ *)(ip + @U@ * istep)); + *((@type@ *)(op + @U@ * ostep)) = scalar_@intrin@(in_@U@); + #endif + /**end repeat2**/ + } +#endif // NPY_DISABLE_OPTIMIZATION + for (; len > 0; --len, ip += istep, op += ostep) { + *((@type@ *)op) = scalar_@intrin@(*(const @type@ *)ip); + } +#ifdef TO_SIMD_SFX +clear: + npyv_cleanup(); +#endif +#if @is_fp@ + npy_clear_floatstatus_barrier((char*)dimensions); +#endif +} +/**end repeat**/ + +#undef NEGATIVE_CONTIG_ONLY diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 5351ec1fa..6fc1501c9 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -129,39 +129,9 @@ run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp c * #vector = 1, 1, 0# * #VECTOR = NPY_SIMD, NPY_SIMD_F64, 0 # */ - -/**begin repeat1 - * #func = negative# - * #check = IS_BLOCKABLE_UNARY# - * #name = unary# - */ - -#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - -/* prototypes */ -static void -sse2_@func@_@TYPE@(@type@ *, @type@ *, const npy_intp n); - -#endif - -static inline int -run_@name@_simd_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS - if (@check@(sizeof(@type@), VECTOR_SIZE_BYTES)) { - sse2_@func@_@TYPE@((@type@*)args[1], (@type@*)args[0], dimensions[0]); - return 1; - } -#endif - return 0; -} - -/**end repeat1**/ - /**begin repeat1 * #kind = isnan, isfinite, isinf, signbit# */ - #if @vector@ && defined NPY_HAVE_SSE2_INTRINSICS static void @@ -181,9 +151,7 @@ run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const * #endif return 0; } - /**end repeat1**/ - /**end repeat**/ /* @@ -426,41 +394,6 @@ sse2_@kind@_@TYPE@(npy_bool * op, @type@ * ip1, npy_intp n) } /**end repeat1**/ - -static void -sse2_negative_@TYPE@(@type@ * op, @type@ * ip, const npy_intp n) -{ - /* - * get 0x7FFFFFFF mask (everything but signbit set) - * float & ~mask will remove the sign, float ^ mask flips the sign - * this is equivalent to how the compiler implements fabs on amd64 - */ - const @vtype@ mask = @vpre@_set1_@vsuf@(-0.@c@); - - /* align output to VECTOR_SIZE_BYTES bytes */ - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) { - op[i] = -ip[i]; - } - assert((npy_uintp)n < (VECTOR_SIZE_BYTES / sizeof(@type@)) || - npy_is_aligned(&op[i], VECTOR_SIZE_BYTES)); - if (npy_is_aligned(&ip[i], VECTOR_SIZE_BYTES)) { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_load_@vsuf@(&ip[i]); - @vpre@_store_@vsuf@(&op[i], @vpre@_xor_@vsuf@(mask, a)); - } - } - else { - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vpre@_loadu_@vsuf@(&ip[i]); - @vpre@_store_@vsuf@(&op[i], @vpre@_xor_@vsuf@(mask, a)); - } - } - LOOP_BLOCKED_END { - op[i] = -ip[i]; - } -} -/**end repeat1**/ - /**end repeat**/ /* bunch of helper functions used in ISA_exp/log_FLOAT*/ diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index f21a88acd..88ab7e014 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -2635,6 +2635,35 @@ class TestAbsoluteNegative: np.abs(d, 
out=d) np.abs(np.ones_like(d), out=d) + @pytest.mark.parametrize("dtype", ['d', 'f', 'int32', 'int64']) + @pytest.mark.parametrize("big", [True, False]) + def test_noncontiguous(self, dtype, big): + data = np.array([-1.0, 1.0, -0.0, 0.0, 2.2251e-308, -2.5, 2.5, -6, + 6, -2.2251e-308, -8, 10], dtype=dtype) + expect = np.array([1.0, -1.0, 0.0, -0.0, -2.2251e-308, 2.5, -2.5, 6, + -6, 2.2251e-308, 8, -10], dtype=dtype) + if big: + data = np.repeat(data, 10) + expect = np.repeat(expect, 10) + out = np.ndarray(data.shape, dtype=dtype) + ncontig_in = data[1::2] + ncontig_out = out[1::2] + contig_in = np.array(ncontig_in) + # contig in, contig out + assert_array_equal(np.negative(contig_in), expect[1::2]) + # contig in, ncontig out + assert_array_equal(np.negative(contig_in, out=ncontig_out), + expect[1::2]) + # ncontig in, contig out + assert_array_equal(np.negative(ncontig_in), expect[1::2]) + # ncontig in, ncontig out + assert_array_equal(np.negative(ncontig_in, out=ncontig_out), + expect[1::2]) + # contig in, contig out, nd stride + data_split = np.array(np.array_split(data, 2)) + expect_split = np.array(np.array_split(expect, 2)) + assert_equal(np.negative(data_split), expect_split) + class TestPositive: def test_valid(self): diff --git a/numpy/ma/bench.py b/numpy/ma/bench.py deleted file mode 100644 index 56865683d..000000000 --- a/numpy/ma/bench.py +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python3 - -import timeit -import numpy - - -############################################################################### -# Global variables # -############################################################################### - - -# Small arrays -xs = numpy.random.uniform(-1, 1, 6).reshape(2, 3) -ys = numpy.random.uniform(-1, 1, 6).reshape(2, 3) -zs = xs + 1j * ys -m1 = [[True, False, False], [False, False, True]] -m2 = [[True, False, True], [False, False, True]] -nmxs = numpy.ma.array(xs, mask=m1) -nmys = numpy.ma.array(ys, mask=m2) -nmzs = numpy.ma.array(zs, mask=m1) - -# Big arrays -xl = numpy.random.uniform(-1, 1, 100*100).reshape(100, 100) -yl = numpy.random.uniform(-1, 1, 100*100).reshape(100, 100) -zl = xl + 1j * yl -maskx = xl > 0.8 -masky = yl < -0.8 -nmxl = numpy.ma.array(xl, mask=maskx) -nmyl = numpy.ma.array(yl, mask=masky) -nmzl = numpy.ma.array(zl, mask=maskx) - - -############################################################################### -# Functions # -############################################################################### - - -def timer(s, v='', nloop=500, nrep=3): - units = ["s", "ms", "µs", "ns"] - scaling = [1, 1e3, 1e6, 1e9] - print("%s : %-50s : " % (v, s), end=' ') - varnames = ["%ss,nm%ss,%sl,nm%sl" % tuple(x*4) for x in 'xyz'] - setup = 'from __main__ import numpy, ma, %s' % ','.join(varnames) - Timer = timeit.Timer(stmt=s, setup=setup) - best = min(Timer.repeat(nrep, nloop)) / nloop - if best > 0.0: - order = min(-int(numpy.floor(numpy.log10(best)) // 3), 3) - else: - order = 3 - print("%d loops, best of %d: %.*g %s per loop" % (nloop, nrep, - 3, - best * scaling[order], - units[order])) - - -def compare_functions_1v(func, nloop=500, - xs=xs, nmxs=nmxs, xl=xl, nmxl=nmxl): - funcname = func.__name__ - print("-"*50) - print(f'{funcname} on small arrays') - module, data = "numpy.ma", "nmxs" - timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop) - - print("%s on large arrays" % funcname) - module, data = "numpy.ma", "nmxl" - timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop) - return - -def 
compare_methods(methodname, args, vars='x', nloop=500, test=True, - xs=xs, nmxs=nmxs, xl=xl, nmxl=nmxl): - print("-"*50) - print(f'{methodname} on small arrays') - data, ver = f'nm{vars}l', 'numpy.ma' - timer("%(data)s.%(methodname)s(%(args)s)" % locals(), v=ver, nloop=nloop) - - print("%s on large arrays" % methodname) - data, ver = "nm%sl" % vars, 'numpy.ma' - timer("%(data)s.%(methodname)s(%(args)s)" % locals(), v=ver, nloop=nloop) - return - -def compare_functions_2v(func, nloop=500, test=True, - xs=xs, nmxs=nmxs, - ys=ys, nmys=nmys, - xl=xl, nmxl=nmxl, - yl=yl, nmyl=nmyl): - funcname = func.__name__ - print("-"*50) - print(f'{funcname} on small arrays') - module, data = "numpy.ma", "nmxs,nmys" - timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop) - - print(f'{funcname} on large arrays') - module, data = "numpy.ma", "nmxl,nmyl" - timer("%(module)s.%(funcname)s(%(data)s)" % locals(), v="%11s" % module, nloop=nloop) - return - - -if __name__ == '__main__': - compare_functions_1v(numpy.sin) - compare_functions_1v(numpy.log) - compare_functions_1v(numpy.sqrt) - - compare_functions_2v(numpy.multiply) - compare_functions_2v(numpy.divide) - compare_functions_2v(numpy.power) - - compare_methods('ravel', '', nloop=1000) - compare_methods('conjugate', '', 'z', nloop=1000) - compare_methods('transpose', '', nloop=1000) - compare_methods('compressed', '', nloop=1000) - compare_methods('__getitem__', '0', nloop=1000) - compare_methods('__getitem__', '(0,0)', nloop=1000) - compare_methods('__getitem__', '[0,-1]', nloop=1000) - compare_methods('__setitem__', '0, 17', nloop=1000, test=False) - compare_methods('__setitem__', '(0,0), 17', nloop=1000, test=False) - - print("-"*50) - print("__setitem__ on small arrays") - timer('nmxs.__setitem__((-1,0),numpy.ma.masked)', 'numpy.ma ', nloop=10000) - - print("-"*50) - print("__setitem__ on large arrays") - timer('nmxl.__setitem__((-1,0),numpy.ma.masked)', 'numpy.ma ', nloop=10000) - - print("-"*50) - print("where on small arrays") - timer('numpy.ma.where(nmxs>2,nmxs,nmys)', 'numpy.ma ', nloop=1000) - print("-"*50) - print("where on large arrays") - timer('numpy.ma.where(nmxl>2,nmxl,nmyl)', 'numpy.ma ', nloop=100) diff --git a/numpy/tests/test_public_api.py b/numpy/tests/test_public_api.py index 8f654f3d9..28ed2f1df 100644 --- a/numpy/tests/test_public_api.py +++ b/numpy/tests/test_public_api.py @@ -280,7 +280,6 @@ PRIVATE_BUT_PRESENT_MODULES = ['numpy.' + s for s in [ "lib.utils", "linalg.lapack_lite", "linalg.linalg", - "ma.bench", "ma.core", "ma.testutils", "ma.timer_comparison", |
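
The hand-rolled timer script numpy/ma/bench.py deleted above is superseded by the asv benchmark classes added to benchmarks/benchmarks/bench_ma.py, which the airspeed-velocity harness discovers and times automatically (e.g. via asv run). One class can also be smoke-tested by hand; the snippet below is a hypothetical driver, assuming the benchmarks package is importable from the repository root — asv normally instantiates these classes itself:

from benchmarks.benchmarks.bench_ma import MAFunctions1v

bm = MAFunctions1v()
bm.setup('np.ma', 'sin', 'big')              # builds the masked 100x100 array
bm.time_functions_1v('np.ma', 'sin', 'big')  # the statement asv would time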
