diff options
-rw-r--r--  numpy/core/src/umath/loops.c.src  |  3
-rw-r--r--  numpy/core/src/umath/simd.inc.src | 64
2 files changed, 60 insertions, 7 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 59d144569..a99bdb9d2 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -561,6 +561,9 @@ NPY_NO_EXPORT void
 BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
     if(IS_BINARY_REDUCE) {
+        if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
+            return;
+        }
         BINARY_REDUCE_LOOP(npy_bool) {
             const npy_bool in2 = *(npy_bool *)ip2;
             io1 = io1 @OP@ in2;
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 0382f2cf7..dc3b6ad8e 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -197,6 +197,23 @@ run_binary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
     return 0;
 }
 
+
+static void
+sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n);
+
+static NPY_INLINE int
+run_reduce_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if defined HAVE_EMMINTRIN_H
+    if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_REDUCE(sizeof(npy_bool), 16)) {
+        sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
+                                dimensions[0]);
+        return 1;
+    }
+#endif
+    return 0;
+}
+
 /**end repeat**/
 
 /**begin repeat
@@ -363,7 +380,7 @@ sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
 }
 
 
-void
+static void
 sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
 {
     const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
@@ -518,11 +535,12 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
  * # kind = logical_or, logical_and#
  * # and = 0, 1#
  * # op = ||, &&#
- * # vop = or, and#
+ * # sc = !=, ==#
  * # vpre = _mm*2#
  * # vsuf = si128*2#
  * # vtype = __m128i*2#
  * # type = npy_bool*2#
+ * # vload = _mm_load_si128*2#
  * # vloadu = _mm_loadu_si128*2#
  * # vstore = _mm_store_si128*2#
  */
@@ -550,8 +568,8 @@ sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp
     LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
         op[i] = ip1[i] @op@ ip2[i];
     LOOP_BLOCKED(@type@, 16) {
-        @vtype@ a = @vloadu@((__m128i*)&ip1[i]);
-        @vtype@ b = @vloadu@((__m128i*)&ip2[i]);
+        @vtype@ a = @vloadu@((@vtype@*)&ip1[i]);
+        @vtype@ b = @vloadu@((@vtype@*)&ip2[i]);
 #if @and@
         const @vtype@ zero = @vpre@_setzero_@vsuf@();
         /* get 0xFF for non zeros*/
@@ -562,13 +580,45 @@ sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp
         @vtype@ tmp = @vpre@_or_@vsuf@(a, b);
 #endif
 
-        @vstore@((__m128i*)&op[i], byte_to_true(tmp));
+        @vstore@((@vtype@*)&op[i], byte_to_true(tmp));
     }
     LOOP_BLOCKED_END {
         op[i] = (ip1[i] @op@ ip2[i]);
     }
 }
 
+
+static void
+sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
+{
+    const @vtype@ zero = @vpre@_setzero_@vsuf@();
+    LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, 16) {
+        *op = *op @op@ ip[i];
+        if (*op @sc@ 0) {
+            return;
+        }
+    }
+    LOOP_BLOCKED(npy_bool, 16) {
+        @vtype@ v = @vload@((@vtype@*)&ip[i]);
+        v = @vpre@_cmpeq_epi8(v, zero);
+#if @and@
+        if ((@vpre@_movemask_epi8(v) != 0)) {
+            *op = 0;
+#else
+        if ((@vpre@_movemask_epi8(v) != 0xFFFF)) {
+            *op = 1;
+#endif
+            return;
+        }
+    }
+    LOOP_BLOCKED_END {
+        *op = *op @op@ ip[i];
+        if (*op @sc@ 0) {
+            return;
+        }
+    }
+}
+
 /**end repeat**/
 
 /**begin repeat
@@ -589,7 +639,7 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
     LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
         op[i] = (ip[i] @op@ 0);
     LOOP_BLOCKED(@type@, 16) {
-        @vtype@ a = @vloadu@((__m128i*)&ip[i]);
+        @vtype@ a = @vloadu@((@vtype@*)&ip[i]);
 #if @not@
         const @vtype@ zero = @vpre@_setzero_@vsuf@();
         const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
@@ -600,7 +650,7 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
         /* abs is kind of pointless but maybe its used for byte_to_true */
         a = byte_to_true(a);
 #endif
-        @vstore@((__m128i*)&op[i], a);
+        @vstore@((@vtype@*)&op[i], a);
     }
     LOOP_BLOCKED_END {
         op[i] = (ip[i] @op@ 0);