summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- numpy/core/src/umath/loops.c.src 3
-rw-r--r-- numpy/core/src/umath/simd.inc.src 64
2 files changed, 60 insertions, 7 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 59d144569..a99bdb9d2 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -561,6 +561,9 @@ NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
if(IS_BINARY_REDUCE) {
+ if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
+ return;
+ }
BINARY_REDUCE_LOOP(npy_bool) {
const npy_bool in2 = *(npy_bool *)ip2;
io1 = io1 @OP@ in2;
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 0382f2cf7..dc3b6ad8e 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -197,6 +197,23 @@ run_binary_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
return 0;
}
+
+static void
+sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n); /* forward declaration; the definition appears further down in this file */
+/*
+ * Try to run a boolean @kind@ reduction through the SSE2 kernel.
+ *
+ * args[0] is handed to the kernel as the accumulator (read and updated in
+ * place), args[1] as the input array, and dimensions[0] as the element count.
+ * `steps` is not referenced directly in this body.
+ * NOTE(review): IS_BLOCKABLE_REDUCE is defined outside this view; it
+ * presumably inspects args/steps to decide whether the layout suits 16-byte
+ * blocking -- confirm against the macro definition.
+ *
+ * Returns 1 when the SSE2 kernel performed the reduction, 0 when the caller
+ * must fall back to its own loop. Without HAVE_EMMINTRIN_H the SSE2 branch
+ * is compiled out and the function always returns 0.
+ */
+static NPY_INLINE int
+run_reduce_simd_@kind@_BOOL(char **args, npy_intp *dimensions, npy_intp *steps)
+{
+#if defined HAVE_EMMINTRIN_H
+ if (sizeof(npy_bool) == 1 && IS_BLOCKABLE_REDUCE(sizeof(npy_bool), 16)) { /* kernel compares byte lanes, so npy_bool must be exactly one byte */
+ sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
+ dimensions[0]);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
/**end repeat**/
/**begin repeat
@@ -363,7 +380,7 @@ sse2_binary_scalar1_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_i
}
-void
+static void
sse2_binary_scalar2_@kind@_@TYPE@(@type@ * op, @type@ * ip1, @type@ * ip2, npy_intp n)
{
const @vtype@ b = @vpre@_set1_@vsuf@(ip2[0]);
@@ -518,11 +535,12 @@ sse2_@kind@_@TYPE@(@type@ * ip, @type@ * op, const npy_intp n)
* # kind = logical_or, logical_and#
* # and = 0, 1#
* # op = ||, &&#
- * # vop = or, and#
+ * # sc = !=, ==#
* # vpre = _mm*2#
* # vsuf = si128*2#
* # vtype = __m128i*2#
* # type = npy_bool*2#
+ * # vload = _mm_load_si128*2#
* # vloadu = _mm_loadu_si128*2#
* # vstore = _mm_store_si128*2#
*/
@@ -550,8 +568,8 @@ sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp
LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
op[i] = ip1[i] @op@ ip2[i];
LOOP_BLOCKED(@type@, 16) {
- @vtype@ a = @vloadu@((__m128i*)&ip1[i]);
- @vtype@ b = @vloadu@((__m128i*)&ip2[i]);
+ @vtype@ a = @vloadu@((@vtype@*)&ip1[i]);
+ @vtype@ b = @vloadu@((@vtype@*)&ip2[i]);
#if @and@
const @vtype@ zero = @vpre@_setzero_@vsuf@();
/* get 0xFF for non zeros*/
@@ -562,13 +580,45 @@ sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp
@vtype@ tmp = @vpre@_or_@vsuf@(a, b);
#endif
- @vstore@((__m128i*)&op[i], byte_to_true(tmp));
+ @vstore@((@vtype@*)&op[i], byte_to_true(tmp));
}
LOOP_BLOCKED_END {
op[i] = (ip1[i] @op@ ip2[i]);
}
}
+/*
+ * Short-circuiting SSE2 reduction of a boolean array into *op.
+ *
+ * Expanded by the enclosing repeat block for kind = logical_or, logical_and
+ * with op = ||, && / sc = !=, == / and = 0, 1. *op is both the starting
+ * value and the result. The function returns as soon as the outcome is
+ * decided: the first true element for logical_or, the first false element
+ * for logical_and.
+ *
+ * op: accumulator, read and written in place.
+ * ip: input booleans.
+ * n:  element count; not referenced directly in this body.
+ * NOTE(review): the LOOP_BLOCK_ALIGN_VAR / LOOP_BLOCKED / LOOP_BLOCKED_END
+ * macros are defined outside this view. They presumably declare `i`, consume
+ * `n`, and split the input into an unaligned head, 16-byte blocks and a
+ * tail -- confirm against their definitions.
+ */
+static void
+sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
+{
+ const @vtype@ zero = @vpre@_setzero_@vsuf@();
+ LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, 16) { /* element-wise head; presumably runs until ip is 16-byte aligned -- TODO confirm */
+ *op = *op @op@ ip[i];
+ if (*op @sc@ 0) { /* result already decided, stop early */
+ return;
+ }
+ }
+ LOOP_BLOCKED(npy_bool, 16) {
+ @vtype@ v = @vload@((@vtype@*)&ip[i]); /* @vload@ is the aligned load (_mm_load_si128) */
+ v = @vpre@_cmpeq_epi8(v, zero); /* byte lanes become 0xFF where the input byte was 0 */
+#if @and@
+ if ((@vpre@_movemask_epi8(v) != 0)) { /* at least one element is false */
+ *op = 0;
+#else
+ if ((@vpre@_movemask_epi8(v) != 0xFFFF)) { /* not all 16 lanes were zero: at least one element is true */
+ *op = 1;
+#endif
+ return;
+ }
+ }
+ LOOP_BLOCKED_END { /* element-wise handling of whatever the blocked loop left over */
+ *op = *op @op@ ip[i];
+ if (*op @sc@ 0) {
+ return;
+ }
+ }
+}
+
/**end repeat**/
/**begin repeat
@@ -589,7 +639,7 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
LOOP_BLOCK_ALIGN_VAR(op, @type@, 16)
op[i] = (ip[i] @op@ 0);
LOOP_BLOCKED(@type@, 16) {
- @vtype@ a = @vloadu@((__m128i*)&ip[i]);
+ @vtype@ a = @vloadu@((@vtype@*)&ip[i]);
#if @not@
const @vtype@ zero = @vpre@_setzero_@vsuf@();
const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
@@ -600,7 +650,7 @@ sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
/* abs is kind of pointless but maybe its used for byte_to_true */
a = byte_to_true(a);
#endif
- @vstore@((__m128i*)&op[i], a);
+ @vstore@((@vtype@*)&op[i], a);
}
LOOP_BLOCKED_END {
op[i] = (ip[i] @op@ 0);