diff options
author | Matti Picus <matti.picus@gmail.com> | 2022-12-15 05:05:09 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-12-15 05:05:09 +0200 |
commit | 78a499d99d3bd080026010b0b68c387114888221 (patch) | |
tree | 4c7cd4e97e8ae5750a4d11cc23ae9d55a9b36494 | |
parent | 32d616224e8342a77e57a2d9d135981de5e6ea67 (diff) | |
parent | bfa444d45c392678851f5c3dd5f8d3a02683447f (diff) | |
download | numpy-78a499d99d3bd080026010b0b68c387114888221.tar.gz |
Merge pull request #22167 from Developer-Ecosystem-Engineering/add_simd_bool_logical_andornot_absolute
ENH: Add SIMD versions of bool logical_&&,||,! and absolute
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | numpy/core/code_generators/generate_umath.py | 61 | ||||
-rw-r--r-- | numpy/core/meson.build | 1 | ||||
-rw-r--r-- | numpy/core/setup.py | 1 | ||||
-rw-r--r-- | numpy/core/src/umath/loops.c.src | 92 | ||||
-rw-r--r-- | numpy/core/src/umath/loops.h.src | 43 | ||||
-rw-r--r-- | numpy/core/src/umath/loops_logical.dispatch.c.src | 353 | ||||
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 211 |
8 files changed, 426 insertions, 337 deletions
diff --git a/.gitignore b/.gitignore index 9851fcc77..c0d370bc2 100644 --- a/.gitignore +++ b/.gitignore @@ -220,6 +220,7 @@ numpy/core/src/umath/loops_unary.dispatch.c numpy/core/src/umath/loops_unary_fp.dispatch.c numpy/core/src/umath/loops_arithm_fp.dispatch.c numpy/core/src/umath/loops_arithmetic.dispatch.c +numpy/core/src/umath/loops_logical.dispatch.c numpy/core/src/umath/loops_minmax.dispatch.c numpy/core/src/umath/loops_trigonometric.dispatch.c numpy/core/src/umath/loops_exponent_log.dispatch.c diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 768c8deee..bf1a05ee4 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -246,7 +246,8 @@ chartoname = { 'P': 'OBJECT', } -noobj = '?bBhHiIlLqQefdgFDGmM' +no_obj_bool = 'bBhHiIlLqQefdgFDGmM' +noobj = '?' + no_obj_bool all = '?bBhHiIlLqQefdgFDGOmM' O = 'O' @@ -280,6 +281,7 @@ nocmplxO = nocmplx+O nocmplxP = nocmplx+P notimes_or_obj = bints + inexact nodatetime_or_obj = bints + inexact +no_bool_times_obj = ints + inexact # Find which code corresponds to int64. int64 = '' @@ -299,7 +301,9 @@ defdict = { Ufunc(2, 1, Zero, docstrings.get('numpy.core.umath.add'), 'PyUFunc_AdditionTypeResolver', - TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]), + TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]), + TD(no_bool_times_obj, simd=[('avx2', ints)], + dispatch=[('loops_arithm_fp', 'fdFD')]), [TypeDescription('M', FullTypeDescr, 'Mm', 'M'), TypeDescription('m', FullTypeDescr, 'mm', 'm'), TypeDescription('M', FullTypeDescr, 'mM', 'M'), @@ -310,7 +314,8 @@ defdict = { Ufunc(2, 1, None, # Zero is only a unit to the right, not the left docstrings.get('numpy.core.umath.subtract'), 'PyUFunc_SubtractionTypeResolver', - TD(ints + inexact, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]), + TD(no_bool_times_obj, simd=[('avx2', ints)], + dispatch=[('loops_arithm_fp', 'fdFD')]), [TypeDescription('M', FullTypeDescr, 'Mm', 'M'), TypeDescription('m', FullTypeDescr, 'mm', 'm'), TypeDescription('M', FullTypeDescr, 'MM', 'm'), @@ -321,7 +326,10 @@ defdict = { Ufunc(2, 1, One, docstrings.get('numpy.core.umath.multiply'), 'PyUFunc_MultiplicationTypeResolver', - TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]), + TD('?', cfunc_alias='logical_and', + dispatch=[('loops_logical', '?')]), + TD(no_bool_times_obj, simd=[('avx2', ints)], + dispatch=[('loops_arithm_fp', 'fdFD')]), [TypeDescription('m', FullTypeDescr, 'mq', 'm'), TypeDescription('m', FullTypeDescr, 'qm', 'm'), TypeDescription('m', FullTypeDescr, 'md', 'm'), @@ -412,7 +420,8 @@ defdict = { Ufunc(1, 1, None, docstrings.get('numpy.core.umath.absolute'), 'PyUFunc_AbsoluteTypeResolver', - TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd')]), + TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd'), + ('loops_logical', '?')]), TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')), TD(O, f='PyNumber_Absolute'), ), @@ -496,28 +505,33 @@ defdict = { Ufunc(2, 1, True_, docstrings.get('numpy.core.umath.logical_and'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]), + TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)], + dispatch=[('loops_logical', '?')]), TD(O, f='npy_ObjectLogicalAnd'), ), 'logical_not': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.logical_not'), None, - TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]), + TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)], + dispatch=[('loops_logical', '?')]), TD(O, f='npy_ObjectLogicalNot'), ), 'logical_or': Ufunc(2, 1, False_, docstrings.get('numpy.core.umath.logical_or'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]), + TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)], + dispatch=[('loops_logical', '?')]), TD(O, f='npy_ObjectLogicalOr'), ), 'logical_xor': Ufunc(2, 1, False_, docstrings.get('numpy.core.umath.logical_xor'), 'PyUFunc_SimpleBinaryComparisonTypeResolver', - TD(nodatetime_or_obj, out='?'), + TD('?', out='?', cfunc_alias='not_equal', + dispatch=[('loops_comparison', '?')]), + TD(no_bool_times_obj, out='?'), # TODO: using obj.logical_xor() seems pretty much useless: TD(P, f='logical_xor'), ), @@ -525,14 +539,17 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.maximum'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]), + TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]), + TD(no_obj_bool, dispatch=[('loops_minmax', ints+'fdg')]), TD(O, f='npy_ObjectMax') ), 'minimum': Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.minimum'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]), + TD('?', cfunc_alias='logical_and', + dispatch=[('loops_logical', '?')]), + TD(no_obj_bool, dispatch=[('loops_minmax', ints+'fdg')]), TD(O, f='npy_ObjectMin') ), 'clip': @@ -546,14 +563,17 @@ defdict = { Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmax'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(noobj, dispatch=[('loops_minmax', 'fdg')]), + TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]), + TD(no_obj_bool, dispatch=[('loops_minmax', 'fdg')]), TD(O, f='npy_ObjectMax') ), 'fmin': Ufunc(2, 1, ReorderableNone, docstrings.get('numpy.core.umath.fmin'), 'PyUFunc_SimpleUniformOperationTypeResolver', - TD(noobj, dispatch=[('loops_minmax', 'fdg')]), + TD('?', cfunc_alias='logical_and', + dispatch=[('loops_logical', '?')]), + TD(no_obj_bool, dispatch=[('loops_minmax', 'fdg')]), TD(O, f='npy_ObjectMin') ), 'logaddexp': @@ -572,28 +592,35 @@ defdict = { Ufunc(2, 1, AllOnes, docstrings.get('numpy.core.umath.bitwise_and'), None, - TD(bints, simd=[('avx2', ints)]), + TD('?', cfunc_alias='logical_and', + dispatch=[('loops_logical', '?')]), + TD(ints, simd=[('avx2', ints)]), TD(O, f='PyNumber_And'), ), 'bitwise_or': Ufunc(2, 1, Zero, docstrings.get('numpy.core.umath.bitwise_or'), None, - TD(bints, simd=[('avx2', ints)]), + TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]), + TD(ints, simd=[('avx2', ints)]), TD(O, f='PyNumber_Or'), ), 'bitwise_xor': Ufunc(2, 1, Zero, docstrings.get('numpy.core.umath.bitwise_xor'), None, - TD(bints, simd=[('avx2', ints)]), + TD('?', cfunc_alias='not_equal', + dispatch=[('loops_comparison', '?')]), + TD(ints, simd=[('avx2', ints)]), TD(O, f='PyNumber_Xor'), ), 'invert': Ufunc(1, 1, None, docstrings.get('numpy.core.umath.invert'), None, - TD(bints, simd=[('avx2', ints)]), + TD('?', cfunc_alias='logical_not', + dispatch=[('loops_logical', '?')]), + TD(ints, simd=[('avx2', ints)]), TD(O, f='PyNumber_Invert'), ), 'left_shift': diff --git a/numpy/core/meson.build b/numpy/core/meson.build index f4c9f49b5..3ee0f40b0 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -746,6 +746,7 @@ src_umath = [ src_file.process('src/umath/loops_comparison.dispatch.c.src'), src_file.process('src/umath/loops_exponent_log.dispatch.c.src'), src_file.process('src/umath/loops_hyperbolic.dispatch.c.src'), + src_file.process('src/umath/loops_logical.dispatch.c.src'), src_file.process('src/umath/loops_minmax.dispatch.c.src'), src_file.process('src/umath/loops_modulo.dispatch.c.src'), src_file.process('src/umath/loops_trigonometric.dispatch.c.src'), diff --git a/numpy/core/setup.py b/numpy/core/setup.py index da5bc64c0..1c42e99c0 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -1009,6 +1009,7 @@ def configuration(parent_package='',top_path=None): join('src', 'umath', 'loops_unary_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'), join('src', 'umath', 'loops_arithmetic.dispatch.c.src'), + join('src', 'umath', 'loops_logical.dispatch.c.src'), join('src', 'umath', 'loops_minmax.dispatch.c.src'), join('src', 'umath', 'loops_trigonometric.dispatch.c.src'), join('src', 'umath', 'loops_umath_fp.dispatch.c.src'), diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 0b4856847..7b070a084 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -416,98 +416,6 @@ PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, vo ***************************************************************************** */ -/**begin repeat - * #kind = logical_and, logical_or# - * #OP = &&, ||# - * #SC = ==, !=# - * #and = 1, 0# - **/ - -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if(IS_BINARY_REDUCE) { -#ifdef NPY_HAVE_SSE2_INTRINSICS - /* - * stick with our variant for more reliable performance, only known - * platform which outperforms it by ~20% is an i7 with glibc 2.17 - */ - if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) { - return; - } -#else - /* for now only use libc on 32-bit/non-x86 */ - if (steps[1] == 1) { - npy_bool * op = (npy_bool *)args[0]; -#if @and@ - /* np.all(), search for a zero (false) */ - if (*op) { - *op = memchr(args[1], 0, dimensions[0]) == NULL; - } -#else - /* - * np.any(), search for a non-zero (true) via comparing against - * zero blocks, memcmp is faster than memchr on SSE4 machines - * with glibc >= 2.12 and memchr can only check for equal 1 - */ - static const npy_bool zero[4096]; /* zero by C standard */ - npy_uintp i, n = dimensions[0]; - - for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) { - *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0; - } - if (!*op && n - i > 0) { - *op = memcmp(&args[1][i], zero, n - i) != 0; - } -#endif - return; - } -#endif - else { - BINARY_REDUCE_LOOP(npy_bool) { - const npy_bool in2 = *(npy_bool *)ip2; - io1 = io1 @OP@ in2; - if (io1 @SC@ 0) { - break; - } - } - *((npy_bool *)iop1) = io1; - } - } - else { - if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) { - return; - } - else { - BINARY_LOOP { - const npy_bool in1 = *(npy_bool *)ip1; - const npy_bool in2 = *(npy_bool *)ip2; - *((npy_bool *)op1) = in1 @OP@ in2; - } - } - } -} -/**end repeat**/ - -/**begin repeat - * #kind = absolute, logical_not# - * #OP = !=, ==# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) -{ - if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) { - return; - } - else { - UNARY_LOOP { - npy_bool in1 = *(npy_bool *)ip1; - *((npy_bool *)op1) = in1 @OP@ 0; - } - } -} -/**end repeat**/ - NPY_NO_EXPORT void BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) { diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index e3a410968..411d53e94 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -10,24 +10,29 @@ #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN #endif -#define BOOL_invert BOOL_logical_not -#define BOOL_add BOOL_logical_or -#define BOOL_bitwise_and BOOL_logical_and -#define BOOL_bitwise_or BOOL_logical_or -#define BOOL_logical_xor BOOL_not_equal -#define BOOL_bitwise_xor BOOL_logical_xor -#define BOOL_multiply BOOL_logical_and -#define BOOL_maximum BOOL_logical_or -#define BOOL_minimum BOOL_logical_and -#define BOOL_fmax BOOL_maximum -#define BOOL_fmin BOOL_minimum - /* ***************************************************************************** ** BOOLEAN LOOPS ** ***************************************************************************** */ +/* + * Following functions are defined by umath generator + * to enable runtime dispatching without the need + * to redefine them within dsipatch-able sources. + */ +// #define BOOL_invert BOOL_logical_not +// #define BOOL_add BOOL_logical_or +// #define BOOL_bitwise_and BOOL_logical_and +// #define BOOL_bitwise_or BOOL_logical_or +// #define BOOL_logical_xor BOOL_not_equal +// #define BOOL_bitwise_xor BOOL_logical_xor +// #define BOOL_multiply BOOL_logical_and +// #define BOOL_maximum BOOL_logical_or +// #define BOOL_minimum BOOL_logical_and +// #define BOOL_fmax BOOL_maximum +// #define BOOL_fmin BOOL_minimum + #ifndef NPY_DISABLE_OPTIMIZATION #include "loops_comparison.dispatch.h" #endif @@ -39,11 +44,15 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@, (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))) /**end repeat**/ +#ifndef NPY_DISABLE_OPTIMIZATION + #include "loops_logical.dispatch.h" +#endif + /**begin repeat - * #kind = logical_and, logical_or, absolute, logical_not# - **/ -NPY_NO_EXPORT void -BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + * #kind = logical_and, logical_or, logical_not, absolute# + */ + NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@, + (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))) /**end repeat**/ NPY_NO_EXPORT void @@ -203,7 +212,7 @@ NPY_NO_EXPORT void /**end repeat**/ - + #ifndef NPY_DISABLE_OPTIMIZATION #include "loops_unary.dispatch.h" #endif diff --git a/numpy/core/src/umath/loops_logical.dispatch.c.src b/numpy/core/src/umath/loops_logical.dispatch.c.src new file mode 100644 index 000000000..793a2af19 --- /dev/null +++ b/numpy/core/src/umath/loops_logical.dispatch.c.src @@ -0,0 +1,353 @@ +/*@targets + ** $maxopt baseline + ** neon asimd + ** sse2 avx2 avx512_skx + ** vsx2 + ** vx + **/ +#define _UMATHMODULE +#define _MULTIARRAYMODULE +#define NPY_NO_DEPRECATED_API NPY_API_VERSION + +#include "simd/simd.h" +#include "loops_utils.h" +#include "loops.h" +#include "lowlevel_strided_loops.h" +// Provides the various *_LOOP macros +#include "fast_loop_macros.h" + +/******************************************************************************* + ** Defining the SIMD kernels + ******************************************************************************/ + +#if NPY_SIMD +/* + * convert any bit set to boolean true so vectorized and normal operations are + * consistent, should not be required if bool is used correctly everywhere but + * you never know + */ +NPY_FINLINE npyv_u8 byte_to_true(npyv_u8 v) +{ + const npyv_u8 zero = npyv_zero_u8(); + const npyv_u8 truemask = npyv_setall_u8(1 == 1); + // cmpeq(v, 0) turns 0x00 -> 0xff and non-zero -> 0x00 + npyv_u8 tmp = npyv_cvt_u8_b8(npyv_cmpeq_u8(v, zero)); + // tmp is filled with 0xff/0x00, negate and mask to boolean true + return npyv_andc_u8(truemask, tmp); +} +/* + * convert mask vector (0xff/0x00) to boolean true. similar to byte_to_true(), + * but we've already got a mask and can skip negation. + */ +NPY_FINLINE npyv_u8 mask_to_true(npyv_b8 v) +{ + const npyv_u8 truemask = npyv_setall_u8(1 == 1); + return npyv_and_u8(truemask, npyv_cvt_u8_b8(v)); +} + + +/**begin repeat + * #kind = logical_and, logical_or# + * #and = 1, 0# + * #scalar_op = &&, ||# + * #intrin = and, or# + * #reduce = min, max# + * #scalar_cmp = ==, !=# + * #anyall = all, any# + */ +static void +simd_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp len) +{ + #define UNROLL 16 + + const int vstep = npyv_nlanes_u8; + const int wstep = vstep * UNROLL; + + // Unrolled vectors loop + for (; len >= wstep; len -= wstep, ip1 += wstep, ip2 += wstep, op += wstep) { + /**begin repeat1 + * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @unroll@ + npyv_u8 a@unroll@ = npyv_load_u8(ip1 + vstep * @unroll@); + npyv_u8 b@unroll@ = npyv_load_u8(ip2 + vstep * @unroll@); + npyv_u8 r@unroll@ = npyv_@intrin@_u8(a@unroll@, b@unroll@); + npyv_store_u8(op + vstep * @unroll@, byte_to_true(r@unroll@)); + #endif + /**end repeat1**/ + } + #undef UNROLL + + // Single vectors loop + for (; len >= vstep; len -= vstep, ip1 += vstep, ip2 += vstep, op += vstep) { + npyv_u8 a = npyv_load_u8(ip1); + npyv_u8 b = npyv_load_u8(ip2); + npyv_u8 r = npyv_@intrin@_u8(a, b); + npyv_store_u8(op, byte_to_true(r)); + } + + // Scalar loop to finish off + for (; len > 0; len--, ip1++, ip2++, op++) { + *op = *ip1 @scalar_op@ *ip2; + } +} + +static void +simd_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len) +{ + #define UNROLL 8 + + const int vstep = npyv_nlanes_u8; + const int wstep = vstep * UNROLL; + + // Unrolled vectors loop + for (; len >= wstep; len -= wstep, ip += wstep) { + #if defined(NPY_HAVE_SSE2) + NPY_PREFETCH(ip + wstep, 0, 3); + #endif + npyv_u8 v0 = npyv_load_u8(ip + vstep * 0); + npyv_u8 v1 = npyv_load_u8(ip + vstep * 1); + npyv_u8 v2 = npyv_load_u8(ip + vstep * 2); + npyv_u8 v3 = npyv_load_u8(ip + vstep * 3); + npyv_u8 v4 = npyv_load_u8(ip + vstep * 4); + npyv_u8 v5 = npyv_load_u8(ip + vstep * 5); + npyv_u8 v6 = npyv_load_u8(ip + vstep * 6); + npyv_u8 v7 = npyv_load_u8(ip + vstep * 7); + + npyv_u8 m01 = npyv_@reduce@_u8(v0, v1); + npyv_u8 m23 = npyv_@reduce@_u8(v2, v3); + npyv_u8 m45 = npyv_@reduce@_u8(v4, v5); + npyv_u8 m67 = npyv_@reduce@_u8(v6, v7); + + npyv_u8 m0123 = npyv_@reduce@_u8(m01, m23); + npyv_u8 m4567 = npyv_@reduce@_u8(m45, m67); + + npyv_u8 mv = npyv_@reduce@_u8(m0123, m4567); + + if(npyv_@anyall@_u8(mv) @scalar_cmp@ 0){ + *op = !@and@; + return; + } + } + + // Single vectors loop + for (; len >= vstep; len -= vstep, ip += vstep) { + npyv_u8 v0 = npyv_load_u8(ip); + if(npyv_@anyall@_u8(v0) @scalar_cmp@ 0){ + *op = !@and@; + return; + } + } + + // Scalar loop to finish off + for (; len > 0; --len, ++ip) { + *op = *op @scalar_op@ *ip; + if (*op @scalar_cmp@ 0) { + return; + } + } +#undef UNROLL +} +/**end repeat**/ + +/**begin repeat + * #kind = logical_not, absolute# + * #op = ==, !=# + * #not = 1, 0# + */ +static void +simd_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len) +{ + #define UNROLL 16 + + const int vstep = npyv_nlanes_u8; + const int wstep = vstep * UNROLL; + + #if @not@ + const npyv_u8 zero = npyv_zero_u8(); + #endif + + // Unrolled vectors loop + for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) { + /**begin repeat1 + * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15# + */ + #if UNROLL > @unroll@ + npyv_u8 v@unroll@ = npyv_load_u8(ip + vstep * @unroll@); +#if @not@ + npyv_u8 r@unroll@ = mask_to_true(npyv_cmpeq_u8(v@unroll@, zero)); +#else + npyv_u8 r@unroll@ = byte_to_true(v@unroll@); +#endif + npyv_store_u8(op + vstep * @unroll@, r@unroll@); + #endif + /**end repeat1**/ + } + #undef UNROLL + + // Single vectors loop + for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) { + npyv_u8 v = npyv_load_u8(ip); +#if @not@ + npyv_u8 r = mask_to_true(npyv_cmpeq_u8(v, zero)); +#else + npyv_u8 r = byte_to_true(v); +#endif + npyv_store_u8(op, r); + } + + // Scalar loop to finish off + for (; len > 0; --len, ++ip, ++op) { + *op = (*ip @op@ 0); + } +} +/**end repeat**/ + +#endif // NPY_SIMD + +/******************************************************************************* + ** Defining ufunc inner functions + ******************************************************************************/ + +/**begin repeat + * # kind = logical_or, logical_and# + */ +static NPY_INLINE int +run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ +#if NPY_SIMD + if (sizeof(npy_bool) == 1 && + IS_BLOCKABLE_BINARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) { + simd_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0], + (npy_bool*)args[1], dimensions[0]); + return 1; + } +#endif + return 0; +} + + +static NPY_INLINE int +run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ +#if NPY_SIMD + if (sizeof(npy_bool) == 1 && + IS_BLOCKABLE_REDUCE(sizeof(npy_bool), NPY_SIMD_WIDTH)) { + simd_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1], + dimensions[0]); + return 1; + } +#endif + return 0; +} +/**end repeat**/ + +/**begin repeat + * #kind = logical_not, absolute# + */ +static NPY_INLINE int +run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ +#if NPY_SIMD + if (sizeof(npy_bool) == 1 && + IS_BLOCKABLE_UNARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) { + simd_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]); + return 1; + } +#endif + return 0; +} +/**end repeat**/ + + +/**begin repeat + * #kind = logical_and, logical_or# + * #OP = &&, ||# + * #SC = ==, !=# + * #and = 1, 0# + */ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if(IS_BINARY_REDUCE) { +#if NPY_SIMD + /* + * stick with our variant for more reliable performance, only known + * platform which outperforms it by ~20% is an i7 with glibc 2.17 + */ + if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) { + return; + } +#else + /* for now only use libc on 32-bit/non-x86 */ + if (steps[1] == 1) { + npy_bool * op = (npy_bool *)args[0]; +#if @and@ + /* np.all(), search for a zero (false) */ + if (*op) { + *op = memchr(args[1], 0, dimensions[0]) == NULL; + } +#else + /* + * np.any(), search for a non-zero (true) via comparing against + * zero blocks, memcmp is faster than memchr on SSE4 machines + * with glibc >= 2.12 and memchr can only check for equal 1 + */ + static const npy_bool zero[4096]; /* zero by C standard */ + npy_uintp i, n = dimensions[0]; + + for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) { + *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0; + } + if (!*op && n - i > 0) { + *op = memcmp(&args[1][i], zero, n - i) != 0; + } +#endif + return; + } +#endif + else { + BINARY_REDUCE_LOOP(npy_bool) { + const npy_bool in2 = *(npy_bool *)ip2; + io1 = io1 @OP@ in2; + if (io1 @SC@ 0) { + break; + } + } + *((npy_bool *)iop1) = io1; + } + } + else { + if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) { + return; + } + else { + BINARY_LOOP { + const npy_bool in1 = *(npy_bool *)ip1; + const npy_bool in2 = *(npy_bool *)ip2; + *((npy_bool *)op1) = in1 @OP@ in2; + } + } + } +} +/**end repeat**/ + +/**begin repeat + * #kind = logical_not, absolute# + * #OP = ==, !=# + **/ +NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@) +(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) +{ + if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) { + return; + } + else { + UNARY_LOOP { + npy_bool in1 = *(npy_bool *)ip1; + *((npy_bool *)op1) = in1 @OP@ 0; + } + } +} +/**end repeat**/ + diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 6fc1501c9..10c44ce30 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -154,80 +154,6 @@ run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const * /**end repeat1**/ /**end repeat**/ -/* - ***************************************************************************** - ** BOOL DISPATCHERS - ***************************************************************************** - */ - -/**begin repeat - * # kind = logical_or, logical_and# - */ - -#if defined NPY_HAVE_SSE2_INTRINSICS -static void -sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, - npy_intp n); - -static void -sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n); -#endif - -static inline int -run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && - IS_BLOCKABLE_BINARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { - sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0], - (npy_bool*)args[1], dimensions[0]); - return 1; - } -#endif - return 0; -} - - -static inline int -run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && - IS_BLOCKABLE_REDUCE(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { - sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1], - dimensions[0]); - return 1; - } -#endif - return 0; -} - -/**end repeat**/ - -/**begin repeat - * # kind = absolute, logical_not# - */ - -#if defined NPY_HAVE_SSE2_INTRINSICS -static void -sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n); -#endif - -static inline int -run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps) -{ -#if defined NPY_HAVE_SSE2_INTRINSICS - if (sizeof(npy_bool) == 1 && - IS_BLOCKABLE_UNARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) { - sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]); - return 1; - } -#endif - return 0; -} - -/**end repeat**/ - #ifdef NPY_HAVE_SSE2_INTRINSICS /* @@ -1005,143 +931,6 @@ AVX512F_absolute_@TYPE@(@type@ * op, #endif /**end repeat**/ -/* - ***************************************************************************** - ** BOOL LOOPS - ***************************************************************************** - */ - -/**begin repeat - * # kind = logical_or, logical_and# - * # and = 0, 1# - * # op = ||, &&# - * # sc = !=, ==# - * # vpre = _mm*2# - * # vsuf = si128*2# - * # vtype = __m128i*2# - * # type = npy_bool*2# - * # vload = _mm_load_si128*2# - * # vloadu = _mm_loadu_si128*2# - * # vstore = _mm_store_si128*2# - */ - -/* - * convert any bit set to boolean true so vectorized and normal operations are - * consistent, should not be required if bool is used correctly everywhere but - * you never know - */ -#if !@and@ -NPY_FINLINE @vtype@ byte_to_true(@vtype@ v) -{ - const @vtype@ zero = @vpre@_setzero_@vsuf@(); - const @vtype@ truemask = @vpre@_set1_epi8(1 == 1); - /* get 0xFF for zeros */ - @vtype@ tmp = @vpre@_cmpeq_epi8(v, zero); - /* filled with 0xFF/0x00, negate and mask to boolean true */ - return @vpre@_andnot_@vsuf@(tmp, truemask); -} -#endif - -static void -sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) - op[i] = ip1[i] @op@ ip2[i]; - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vloadu@((@vtype@*)&ip1[i]); - @vtype@ b = @vloadu@((@vtype@*)&ip2[i]); -#if @and@ - const @vtype@ zero = @vpre@_setzero_@vsuf@(); - /* get 0xFF for non zeros*/ - @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero); - /* andnot -> 0x00 for zeros xFF for non zeros, & with ip2 */ - tmp = @vpre@_andnot_@vsuf@(tmp, b); -#else - @vtype@ tmp = @vpre@_or_@vsuf@(a, b); -#endif - - @vstore@((@vtype@*)&op[i], byte_to_true(tmp)); - } - LOOP_BLOCKED_END { - op[i] = (ip1[i] @op@ ip2[i]); - } -} - - -static void -sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n) -{ - const @vtype@ zero = @vpre@_setzero_@vsuf@(); - LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, VECTOR_SIZE_BYTES) { - *op = *op @op@ ip[i]; - if (*op @sc@ 0) { - return; - } - } - /* unrolled once to replace a slow movmsk with a fast pmaxb */ - LOOP_BLOCKED(npy_bool, 2 * VECTOR_SIZE_BYTES) { - @vtype@ v = @vload@((@vtype@*)&ip[i]); - @vtype@ v2 = @vload@((@vtype@*)&ip[i + VECTOR_SIZE_BYTES]); - v = @vpre@_cmpeq_epi8(v, zero); - v2 = @vpre@_cmpeq_epi8(v2, zero); -#if @and@ - if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) { - *op = 0; -#else - if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) { - *op = 1; -#endif - return; - } - } - LOOP_BLOCKED_END { - *op = *op @op@ ip[i]; - if (*op @sc@ 0) { - return; - } - } -} - -/**end repeat**/ - -/**begin repeat - * # kind = absolute, logical_not# - * # op = !=, ==# - * # not = 0, 1# - * # vpre = _mm*2# - * # vsuf = si128*2# - * # vtype = __m128i*2# - * # type = npy_bool*2# - * # vloadu = _mm_loadu_si128*2# - * # vstore = _mm_store_si128*2# - */ - -static void -sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n) -{ - LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES) - op[i] = (ip[i] @op@ 0); - LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) { - @vtype@ a = @vloadu@((@vtype@*)&ip[i]); -#if @not@ - const @vtype@ zero = @vpre@_setzero_@vsuf@(); - const @vtype@ truemask = @vpre@_set1_epi8(1 == 1); - /* equivalent to byte_to_true but can skip the negation */ - a = @vpre@_cmpeq_epi8(a, zero); - a = @vpre@_and_@vsuf@(a, truemask); -#else - /* abs is kind of pointless but maybe its used for byte_to_true */ - a = byte_to_true(a); -#endif - @vstore@((@vtype@*)&op[i], a); - } - LOOP_BLOCKED_END { - op[i] = (ip[i] @op@ 0); - } -} - -/**end repeat**/ - #undef VECTOR_SIZE_BYTES #endif /* NPY_HAVE_SSE2_INTRINSICS */ #endif |