summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatti Picus <matti.picus@gmail.com>2022-12-15 05:05:09 +0200
committerGitHub <noreply@github.com>2022-12-15 05:05:09 +0200
commit78a499d99d3bd080026010b0b68c387114888221 (patch)
tree4c7cd4e97e8ae5750a4d11cc23ae9d55a9b36494
parent32d616224e8342a77e57a2d9d135981de5e6ea67 (diff)
parentbfa444d45c392678851f5c3dd5f8d3a02683447f (diff)
downloadnumpy-78a499d99d3bd080026010b0b68c387114888221.tar.gz
Merge pull request #22167 from Developer-Ecosystem-Engineering/add_simd_bool_logical_andornot_absolute
ENH: Add SIMD versions of bool logical_&&,||,! and absolute
-rw-r--r--.gitignore1
-rw-r--r--numpy/core/code_generators/generate_umath.py61
-rw-r--r--numpy/core/meson.build1
-rw-r--r--numpy/core/setup.py1
-rw-r--r--numpy/core/src/umath/loops.c.src92
-rw-r--r--numpy/core/src/umath/loops.h.src43
-rw-r--r--numpy/core/src/umath/loops_logical.dispatch.c.src353
-rw-r--r--numpy/core/src/umath/simd.inc.src211
8 files changed, 426 insertions, 337 deletions
diff --git a/.gitignore b/.gitignore
index 9851fcc77..c0d370bc2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -220,6 +220,7 @@ numpy/core/src/umath/loops_unary.dispatch.c
numpy/core/src/umath/loops_unary_fp.dispatch.c
numpy/core/src/umath/loops_arithm_fp.dispatch.c
numpy/core/src/umath/loops_arithmetic.dispatch.c
+numpy/core/src/umath/loops_logical.dispatch.c
numpy/core/src/umath/loops_minmax.dispatch.c
numpy/core/src/umath/loops_trigonometric.dispatch.c
numpy/core/src/umath/loops_exponent_log.dispatch.c
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 768c8deee..bf1a05ee4 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -246,7 +246,8 @@ chartoname = {
'P': 'OBJECT',
}
-noobj = '?bBhHiIlLqQefdgFDGmM'
+no_obj_bool = 'bBhHiIlLqQefdgFDGmM'
+noobj = '?' + no_obj_bool
all = '?bBhHiIlLqQefdgFDGOmM'
O = 'O'
@@ -280,6 +281,7 @@ nocmplxO = nocmplx+O
nocmplxP = nocmplx+P
notimes_or_obj = bints + inexact
nodatetime_or_obj = bints + inexact
+no_bool_times_obj = ints + inexact
# Find which code corresponds to int64.
int64 = ''
@@ -299,7 +301,9 @@ defdict = {
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.add'),
'PyUFunc_AdditionTypeResolver',
- TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
+ TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]),
+ TD(no_bool_times_obj, simd=[('avx2', ints)],
+ dispatch=[('loops_arithm_fp', 'fdFD')]),
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
TypeDescription('M', FullTypeDescr, 'mM', 'M'),
@@ -310,7 +314,8 @@ defdict = {
Ufunc(2, 1, None, # Zero is only a unit to the right, not the left
docstrings.get('numpy.core.umath.subtract'),
'PyUFunc_SubtractionTypeResolver',
- TD(ints + inexact, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
+ TD(no_bool_times_obj, simd=[('avx2', ints)],
+ dispatch=[('loops_arithm_fp', 'fdFD')]),
[TypeDescription('M', FullTypeDescr, 'Mm', 'M'),
TypeDescription('m', FullTypeDescr, 'mm', 'm'),
TypeDescription('M', FullTypeDescr, 'MM', 'm'),
@@ -321,7 +326,10 @@ defdict = {
Ufunc(2, 1, One,
docstrings.get('numpy.core.umath.multiply'),
'PyUFunc_MultiplicationTypeResolver',
- TD(notimes_or_obj, simd=[('avx2', ints)], dispatch=[('loops_arithm_fp', 'fdFD')]),
+ TD('?', cfunc_alias='logical_and',
+ dispatch=[('loops_logical', '?')]),
+ TD(no_bool_times_obj, simd=[('avx2', ints)],
+ dispatch=[('loops_arithm_fp', 'fdFD')]),
[TypeDescription('m', FullTypeDescr, 'mq', 'm'),
TypeDescription('m', FullTypeDescr, 'qm', 'm'),
TypeDescription('m', FullTypeDescr, 'md', 'm'),
@@ -412,7 +420,8 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.absolute'),
'PyUFunc_AbsoluteTypeResolver',
- TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd')]),
+ TD(bints+flts+timedeltaonly, dispatch=[('loops_unary_fp', 'fd'),
+ ('loops_logical', '?')]),
TD(cmplx, simd=[('avx512f', cmplxvec)], out=('f', 'd', 'g')),
TD(O, f='PyNumber_Absolute'),
),
@@ -496,28 +505,33 @@ defdict = {
Ufunc(2, 1, True_,
docstrings.get('numpy.core.umath.logical_and'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+ TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)],
+ dispatch=[('loops_logical', '?')]),
TD(O, f='npy_ObjectLogicalAnd'),
),
'logical_not':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.logical_not'),
None,
- TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+ TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)],
+ dispatch=[('loops_logical', '?')]),
TD(O, f='npy_ObjectLogicalNot'),
),
'logical_or':
Ufunc(2, 1, False_,
docstrings.get('numpy.core.umath.logical_or'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)]),
+ TD(nodatetime_or_obj, out='?', simd=[('avx2', ints)],
+ dispatch=[('loops_logical', '?')]),
TD(O, f='npy_ObjectLogicalOr'),
),
'logical_xor':
Ufunc(2, 1, False_,
docstrings.get('numpy.core.umath.logical_xor'),
'PyUFunc_SimpleBinaryComparisonTypeResolver',
- TD(nodatetime_or_obj, out='?'),
+ TD('?', out='?', cfunc_alias='not_equal',
+ dispatch=[('loops_comparison', '?')]),
+ TD(no_bool_times_obj, out='?'),
# TODO: using obj.logical_xor() seems pretty much useless:
TD(P, f='logical_xor'),
),
@@ -525,14 +539,17 @@ defdict = {
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.maximum'),
'PyUFunc_SimpleUniformOperationTypeResolver',
- TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]),
+ TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]),
+ TD(no_obj_bool, dispatch=[('loops_minmax', ints+'fdg')]),
TD(O, f='npy_ObjectMax')
),
'minimum':
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.minimum'),
'PyUFunc_SimpleUniformOperationTypeResolver',
- TD(noobj, dispatch=[('loops_minmax', ints+'fdg')]),
+ TD('?', cfunc_alias='logical_and',
+ dispatch=[('loops_logical', '?')]),
+ TD(no_obj_bool, dispatch=[('loops_minmax', ints+'fdg')]),
TD(O, f='npy_ObjectMin')
),
'clip':
@@ -546,14 +563,17 @@ defdict = {
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.fmax'),
'PyUFunc_SimpleUniformOperationTypeResolver',
- TD(noobj, dispatch=[('loops_minmax', 'fdg')]),
+ TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]),
+ TD(no_obj_bool, dispatch=[('loops_minmax', 'fdg')]),
TD(O, f='npy_ObjectMax')
),
'fmin':
Ufunc(2, 1, ReorderableNone,
docstrings.get('numpy.core.umath.fmin'),
'PyUFunc_SimpleUniformOperationTypeResolver',
- TD(noobj, dispatch=[('loops_minmax', 'fdg')]),
+ TD('?', cfunc_alias='logical_and',
+ dispatch=[('loops_logical', '?')]),
+ TD(no_obj_bool, dispatch=[('loops_minmax', 'fdg')]),
TD(O, f='npy_ObjectMin')
),
'logaddexp':
@@ -572,28 +592,35 @@ defdict = {
Ufunc(2, 1, AllOnes,
docstrings.get('numpy.core.umath.bitwise_and'),
None,
- TD(bints, simd=[('avx2', ints)]),
+ TD('?', cfunc_alias='logical_and',
+ dispatch=[('loops_logical', '?')]),
+ TD(ints, simd=[('avx2', ints)]),
TD(O, f='PyNumber_And'),
),
'bitwise_or':
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.bitwise_or'),
None,
- TD(bints, simd=[('avx2', ints)]),
+ TD('?', cfunc_alias='logical_or', dispatch=[('loops_logical', '?')]),
+ TD(ints, simd=[('avx2', ints)]),
TD(O, f='PyNumber_Or'),
),
'bitwise_xor':
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.bitwise_xor'),
None,
- TD(bints, simd=[('avx2', ints)]),
+ TD('?', cfunc_alias='not_equal',
+ dispatch=[('loops_comparison', '?')]),
+ TD(ints, simd=[('avx2', ints)]),
TD(O, f='PyNumber_Xor'),
),
'invert':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.invert'),
None,
- TD(bints, simd=[('avx2', ints)]),
+ TD('?', cfunc_alias='logical_not',
+ dispatch=[('loops_logical', '?')]),
+ TD(ints, simd=[('avx2', ints)]),
TD(O, f='PyNumber_Invert'),
),
'left_shift':
diff --git a/numpy/core/meson.build b/numpy/core/meson.build
index f4c9f49b5..3ee0f40b0 100644
--- a/numpy/core/meson.build
+++ b/numpy/core/meson.build
@@ -746,6 +746,7 @@ src_umath = [
src_file.process('src/umath/loops_comparison.dispatch.c.src'),
src_file.process('src/umath/loops_exponent_log.dispatch.c.src'),
src_file.process('src/umath/loops_hyperbolic.dispatch.c.src'),
+ src_file.process('src/umath/loops_logical.dispatch.c.src'),
src_file.process('src/umath/loops_minmax.dispatch.c.src'),
src_file.process('src/umath/loops_modulo.dispatch.c.src'),
src_file.process('src/umath/loops_trigonometric.dispatch.c.src'),
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index da5bc64c0..1c42e99c0 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -1009,6 +1009,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
+ join('src', 'umath', 'loops_logical.dispatch.c.src'),
join('src', 'umath', 'loops_minmax.dispatch.c.src'),
join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
join('src', 'umath', 'loops_umath_fp.dispatch.c.src'),
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 0b4856847..7b070a084 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -416,98 +416,6 @@ PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, vo
*****************************************************************************
*/
-/**begin repeat
- * #kind = logical_and, logical_or#
- * #OP = &&, ||#
- * #SC = ==, !=#
- * #and = 1, 0#
- **/
-
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if(IS_BINARY_REDUCE) {
-#ifdef NPY_HAVE_SSE2_INTRINSICS
- /*
- * stick with our variant for more reliable performance, only known
- * platform which outperforms it by ~20% is an i7 with glibc 2.17
- */
- if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
- return;
- }
-#else
- /* for now only use libc on 32-bit/non-x86 */
- if (steps[1] == 1) {
- npy_bool * op = (npy_bool *)args[0];
-#if @and@
- /* np.all(), search for a zero (false) */
- if (*op) {
- *op = memchr(args[1], 0, dimensions[0]) == NULL;
- }
-#else
- /*
- * np.any(), search for a non-zero (true) via comparing against
- * zero blocks, memcmp is faster than memchr on SSE4 machines
- * with glibc >= 2.12 and memchr can only check for equal 1
- */
- static const npy_bool zero[4096]; /* zero by C standard */
- npy_uintp i, n = dimensions[0];
-
- for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
- *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
- }
- if (!*op && n - i > 0) {
- *op = memcmp(&args[1][i], zero, n - i) != 0;
- }
-#endif
- return;
- }
-#endif
- else {
- BINARY_REDUCE_LOOP(npy_bool) {
- const npy_bool in2 = *(npy_bool *)ip2;
- io1 = io1 @OP@ in2;
- if (io1 @SC@ 0) {
- break;
- }
- }
- *((npy_bool *)iop1) = io1;
- }
- }
- else {
- if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
- return;
- }
- else {
- BINARY_LOOP {
- const npy_bool in1 = *(npy_bool *)ip1;
- const npy_bool in2 = *(npy_bool *)ip2;
- *((npy_bool *)op1) = in1 @OP@ in2;
- }
- }
- }
-}
-/**end repeat**/
-
-/**begin repeat
- * #kind = absolute, logical_not#
- * #OP = !=, ==#
- **/
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
- if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
- return;
- }
- else {
- UNARY_LOOP {
- npy_bool in1 = *(npy_bool *)ip1;
- *((npy_bool *)op1) = in1 @OP@ 0;
- }
- }
-}
-/**end repeat**/
-
NPY_NO_EXPORT void
BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
{
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index e3a410968..411d53e94 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -10,24 +10,29 @@
#define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
#endif
-#define BOOL_invert BOOL_logical_not
-#define BOOL_add BOOL_logical_or
-#define BOOL_bitwise_and BOOL_logical_and
-#define BOOL_bitwise_or BOOL_logical_or
-#define BOOL_logical_xor BOOL_not_equal
-#define BOOL_bitwise_xor BOOL_logical_xor
-#define BOOL_multiply BOOL_logical_and
-#define BOOL_maximum BOOL_logical_or
-#define BOOL_minimum BOOL_logical_and
-#define BOOL_fmax BOOL_maximum
-#define BOOL_fmin BOOL_minimum
-
/*
*****************************************************************************
** BOOLEAN LOOPS **
*****************************************************************************
*/
+/*
+ * Following functions are defined by umath generator
+ * to enable runtime dispatching without the need
+ * to redefine them within dispatch-able sources.
+ */
+// #define BOOL_invert BOOL_logical_not
+// #define BOOL_add BOOL_logical_or
+// #define BOOL_bitwise_and BOOL_logical_and
+// #define BOOL_bitwise_or BOOL_logical_or
+// #define BOOL_logical_xor BOOL_not_equal
+// #define BOOL_bitwise_xor BOOL_logical_xor
+// #define BOOL_multiply BOOL_logical_and
+// #define BOOL_maximum BOOL_logical_or
+// #define BOOL_minimum BOOL_logical_and
+// #define BOOL_fmax BOOL_maximum
+// #define BOOL_fmin BOOL_minimum
+
#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_comparison.dispatch.h"
#endif
@@ -39,11 +44,15 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@,
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
/**end repeat**/
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_logical.dispatch.h"
+#endif
+
/**begin repeat
- * #kind = logical_and, logical_or, absolute, logical_not#
- **/
-NPY_NO_EXPORT void
-BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+ * #kind = logical_and, logical_or, logical_not, absolute#
+ */
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_@kind@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
/**end repeat**/
NPY_NO_EXPORT void
@@ -203,7 +212,7 @@ NPY_NO_EXPORT void
/**end repeat**/
-
+
#ifndef NPY_DISABLE_OPTIMIZATION
#include "loops_unary.dispatch.h"
#endif
diff --git a/numpy/core/src/umath/loops_logical.dispatch.c.src b/numpy/core/src/umath/loops_logical.dispatch.c.src
new file mode 100644
index 000000000..793a2af19
--- /dev/null
+++ b/numpy/core/src/umath/loops_logical.dispatch.c.src
@@ -0,0 +1,353 @@
+/*@targets
+ ** $maxopt baseline
+ ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
+ ** vx
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*******************************************************************************
+ ** Defining the SIMD kernels
+ ******************************************************************************/
+
+#if NPY_SIMD
+/*
+ * convert any bit set to boolean true so vectorized and normal operations are
+ * consistent, should not be required if bool is used correctly everywhere but
+ * you never know
+ */
+NPY_FINLINE npyv_u8 byte_to_true(npyv_u8 v)
+{
+ const npyv_u8 zero = npyv_zero_u8();
+ const npyv_u8 truemask = npyv_setall_u8(1 == 1);
+ // cmpeq(v, 0) turns 0x00 -> 0xff and non-zero -> 0x00
+ npyv_u8 tmp = npyv_cvt_u8_b8(npyv_cmpeq_u8(v, zero));
+ // tmp is filled with 0xff/0x00, negate and mask to boolean true
+ return npyv_andc_u8(truemask, tmp);
+}
+/*
+ * convert mask vector (0xff/0x00) to boolean true. similar to byte_to_true(),
+ * but we've already got a mask and can skip negation.
+ */
+NPY_FINLINE npyv_u8 mask_to_true(npyv_b8 v)
+{
+ const npyv_u8 truemask = npyv_setall_u8(1 == 1);
+ return npyv_and_u8(truemask, npyv_cvt_u8_b8(v));
+}
+
+
+/**begin repeat
+ * #kind = logical_and, logical_or#
+ * #and = 1, 0#
+ * #scalar_op = &&, ||#
+ * #intrin = and, or#
+ * #reduce = min, max#
+ * #scalar_cmp = ==, !=#
+ * #anyall = all, any#
+ */
+static void
+simd_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp len)
+{
+ #define UNROLL 16
+
+ const int vstep = npyv_nlanes_u8;
+ const int wstep = vstep * UNROLL;
+
+ // Unrolled vectors loop
+ for (; len >= wstep; len -= wstep, ip1 += wstep, ip2 += wstep, op += wstep) {
+ /**begin repeat1
+ * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @unroll@
+ npyv_u8 a@unroll@ = npyv_load_u8(ip1 + vstep * @unroll@);
+ npyv_u8 b@unroll@ = npyv_load_u8(ip2 + vstep * @unroll@);
+ npyv_u8 r@unroll@ = npyv_@intrin@_u8(a@unroll@, b@unroll@);
+ npyv_store_u8(op + vstep * @unroll@, byte_to_true(r@unroll@));
+ #endif
+ /**end repeat1**/
+ }
+ #undef UNROLL
+
+ // Single vectors loop
+ for (; len >= vstep; len -= vstep, ip1 += vstep, ip2 += vstep, op += vstep) {
+ npyv_u8 a = npyv_load_u8(ip1);
+ npyv_u8 b = npyv_load_u8(ip2);
+ npyv_u8 r = npyv_@intrin@_u8(a, b);
+ npyv_store_u8(op, byte_to_true(r));
+ }
+
+ // Scalar loop to finish off
+ for (; len > 0; len--, ip1++, ip2++, op++) {
+ *op = *ip1 @scalar_op@ *ip2;
+ }
+}
+
+static void
+simd_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
+{
+ #define UNROLL 8
+
+ const int vstep = npyv_nlanes_u8;
+ const int wstep = vstep * UNROLL;
+
+ // Unrolled vectors loop
+ for (; len >= wstep; len -= wstep, ip += wstep) {
+ #if defined(NPY_HAVE_SSE2)
+ NPY_PREFETCH(ip + wstep, 0, 3);
+ #endif
+ npyv_u8 v0 = npyv_load_u8(ip + vstep * 0);
+ npyv_u8 v1 = npyv_load_u8(ip + vstep * 1);
+ npyv_u8 v2 = npyv_load_u8(ip + vstep * 2);
+ npyv_u8 v3 = npyv_load_u8(ip + vstep * 3);
+ npyv_u8 v4 = npyv_load_u8(ip + vstep * 4);
+ npyv_u8 v5 = npyv_load_u8(ip + vstep * 5);
+ npyv_u8 v6 = npyv_load_u8(ip + vstep * 6);
+ npyv_u8 v7 = npyv_load_u8(ip + vstep * 7);
+
+ npyv_u8 m01 = npyv_@reduce@_u8(v0, v1);
+ npyv_u8 m23 = npyv_@reduce@_u8(v2, v3);
+ npyv_u8 m45 = npyv_@reduce@_u8(v4, v5);
+ npyv_u8 m67 = npyv_@reduce@_u8(v6, v7);
+
+ npyv_u8 m0123 = npyv_@reduce@_u8(m01, m23);
+ npyv_u8 m4567 = npyv_@reduce@_u8(m45, m67);
+
+ npyv_u8 mv = npyv_@reduce@_u8(m0123, m4567);
+
+ if(npyv_@anyall@_u8(mv) @scalar_cmp@ 0){
+ *op = !@and@;
+ return;
+ }
+ }
+
+ // Single vectors loop
+ for (; len >= vstep; len -= vstep, ip += vstep) {
+ npyv_u8 v0 = npyv_load_u8(ip);
+ if(npyv_@anyall@_u8(v0) @scalar_cmp@ 0){
+ *op = !@and@;
+ return;
+ }
+ }
+
+ // Scalar loop to finish off
+ for (; len > 0; --len, ++ip) {
+ *op = *op @scalar_op@ *ip;
+ if (*op @scalar_cmp@ 0) {
+ return;
+ }
+ }
+#undef UNROLL
+}
+/**end repeat**/
+
+/**begin repeat
+ * #kind = logical_not, absolute#
+ * #op = ==, !=#
+ * #not = 1, 0#
+ */
+static void
+simd_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
+{
+ #define UNROLL 16
+
+ const int vstep = npyv_nlanes_u8;
+ const int wstep = vstep * UNROLL;
+
+ #if @not@
+ const npyv_u8 zero = npyv_zero_u8();
+ #endif
+
+ // Unrolled vectors loop
+ for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+ /**begin repeat1
+ * #unroll = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15#
+ */
+ #if UNROLL > @unroll@
+ npyv_u8 v@unroll@ = npyv_load_u8(ip + vstep * @unroll@);
+#if @not@
+ npyv_u8 r@unroll@ = mask_to_true(npyv_cmpeq_u8(v@unroll@, zero));
+#else
+ npyv_u8 r@unroll@ = byte_to_true(v@unroll@);
+#endif
+ npyv_store_u8(op + vstep * @unroll@, r@unroll@);
+ #endif
+ /**end repeat1**/
+ }
+ #undef UNROLL
+
+ // Single vectors loop
+ for (; len >= vstep; len -= vstep, ip += vstep, op += vstep) {
+ npyv_u8 v = npyv_load_u8(ip);
+#if @not@
+ npyv_u8 r = mask_to_true(npyv_cmpeq_u8(v, zero));
+#else
+ npyv_u8 r = byte_to_true(v);
+#endif
+ npyv_store_u8(op, r);
+ }
+
+ // Scalar loop to finish off
+ for (; len > 0; --len, ++ip, ++op) {
+ *op = (*ip @op@ 0);
+ }
+}
+/**end repeat**/
+
+#endif // NPY_SIMD
+
+/*******************************************************************************
+ ** Defining ufunc inner functions
+ ******************************************************************************/
+
+/**begin repeat
+ * # kind = logical_or, logical_and#
+ */
+static NPY_INLINE int
+run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+ if (sizeof(npy_bool) == 1 &&
+ IS_BLOCKABLE_BINARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+ simd_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
+ (npy_bool*)args[1], dimensions[0]);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+
+static NPY_INLINE int
+run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+ if (sizeof(npy_bool) == 1 &&
+ IS_BLOCKABLE_REDUCE(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+ simd_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
+ dimensions[0]);
+ return 1;
+ }
+#endif
+ return 0;
+}
+/**end repeat**/
+
+/**begin repeat
+ * #kind = logical_not, absolute#
+ */
+static NPY_INLINE int
+run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+ if (sizeof(npy_bool) == 1 &&
+ IS_BLOCKABLE_UNARY(sizeof(npy_bool), NPY_SIMD_WIDTH)) {
+ simd_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
+ return 1;
+ }
+#endif
+ return 0;
+}
+/**end repeat**/
+
+
+/**begin repeat
+ * #kind = logical_and, logical_or#
+ * #OP = &&, ||#
+ * #SC = ==, !=#
+ * #and = 1, 0#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ if(IS_BINARY_REDUCE) {
+#if NPY_SIMD
+ /*
+ * stick with our variant for more reliable performance, only known
+ * platform which outperforms it by ~20% is an i7 with glibc 2.17
+ */
+ if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
+ return;
+ }
+#else
+ /* for now only use libc on 32-bit/non-x86 */
+ if (steps[1] == 1) {
+ npy_bool * op = (npy_bool *)args[0];
+#if @and@
+ /* np.all(), search for a zero (false) */
+ if (*op) {
+ *op = memchr(args[1], 0, dimensions[0]) == NULL;
+ }
+#else
+ /*
+ * np.any(), search for a non-zero (true) via comparing against
+ * zero blocks, memcmp is faster than memchr on SSE4 machines
+ * with glibc >= 2.12 and memchr can only check for equal 1
+ */
+ static const npy_bool zero[4096]; /* zero by C standard */
+ npy_uintp i, n = dimensions[0];
+
+ for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
+ *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
+ }
+ if (!*op && n - i > 0) {
+ *op = memcmp(&args[1][i], zero, n - i) != 0;
+ }
+#endif
+ return;
+ }
+#endif
+ else {
+ BINARY_REDUCE_LOOP(npy_bool) {
+ const npy_bool in2 = *(npy_bool *)ip2;
+ io1 = io1 @OP@ in2;
+ if (io1 @SC@ 0) {
+ break;
+ }
+ }
+ *((npy_bool *)iop1) = io1;
+ }
+ }
+ else {
+ if (run_binary_simd_@kind@_BOOL(args, dimensions, steps)) {
+ return;
+ }
+ else {
+ BINARY_LOOP {
+ const npy_bool in1 = *(npy_bool *)ip1;
+ const npy_bool in2 = *(npy_bool *)ip2;
+ *((npy_bool *)op1) = in1 @OP@ in2;
+ }
+ }
+ }
+}
+/**end repeat**/
+
+/**begin repeat
+ * #kind = logical_not, absolute#
+ * #OP = ==, !=#
+ **/
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_@kind@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+ if (run_unary_simd_@kind@_BOOL(args, dimensions, steps)) {
+ return;
+ }
+ else {
+ UNARY_LOOP {
+ npy_bool in1 = *(npy_bool *)ip1;
+ *((npy_bool *)op1) = in1 @OP@ 0;
+ }
+ }
+}
+/**end repeat**/
+
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 6fc1501c9..10c44ce30 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -154,80 +154,6 @@ run_@kind@_simd_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *
/**end repeat1**/
/**end repeat**/
-/*
- *****************************************************************************
- ** BOOL DISPATCHERS
- *****************************************************************************
- */
-
-/**begin repeat
- * # kind = logical_or, logical_and#
- */
-
-#if defined NPY_HAVE_SSE2_INTRINSICS
-static void
-sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2,
- npy_intp n);
-
-static void
-sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, npy_intp n);
-#endif
-
-static inline int
-run_binary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined NPY_HAVE_SSE2_INTRINSICS
- if (sizeof(npy_bool) == 1 &&
- IS_BLOCKABLE_BINARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
- sse2_binary_@kind@_BOOL((npy_bool*)args[2], (npy_bool*)args[0],
- (npy_bool*)args[1], dimensions[0]);
- return 1;
- }
-#endif
- return 0;
-}
-
-
-static inline int
-run_reduce_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined NPY_HAVE_SSE2_INTRINSICS
- if (sizeof(npy_bool) == 1 &&
- IS_BLOCKABLE_REDUCE(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
- sse2_reduce_@kind@_BOOL((npy_bool*)args[0], (npy_bool*)args[1],
- dimensions[0]);
- return 1;
- }
-#endif
- return 0;
-}
-
-/**end repeat**/
-
-/**begin repeat
- * # kind = absolute, logical_not#
- */
-
-#if defined NPY_HAVE_SSE2_INTRINSICS
-static void
-sse2_@kind@_BOOL(npy_bool *, npy_bool *, const npy_intp n);
-#endif
-
-static inline int
-run_unary_simd_@kind@_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
-{
-#if defined NPY_HAVE_SSE2_INTRINSICS
- if (sizeof(npy_bool) == 1 &&
- IS_BLOCKABLE_UNARY(sizeof(npy_bool), VECTOR_SIZE_BYTES)) {
- sse2_@kind@_BOOL((npy_bool*)args[1], (npy_bool*)args[0], dimensions[0]);
- return 1;
- }
-#endif
- return 0;
-}
-
-/**end repeat**/
-
#ifdef NPY_HAVE_SSE2_INTRINSICS
/*
@@ -1005,143 +931,6 @@ AVX512F_absolute_@TYPE@(@type@ * op,
#endif
/**end repeat**/
-/*
- *****************************************************************************
- ** BOOL LOOPS
- *****************************************************************************
- */
-
-/**begin repeat
- * # kind = logical_or, logical_and#
- * # and = 0, 1#
- * # op = ||, &&#
- * # sc = !=, ==#
- * # vpre = _mm*2#
- * # vsuf = si128*2#
- * # vtype = __m128i*2#
- * # type = npy_bool*2#
- * # vload = _mm_load_si128*2#
- * # vloadu = _mm_loadu_si128*2#
- * # vstore = _mm_store_si128*2#
- */
-
-/*
- * convert any bit set to boolean true so vectorized and normal operations are
- * consistent, should not be required if bool is used correctly everywhere but
- * you never know
- */
-#if !@and@
-NPY_FINLINE @vtype@ byte_to_true(@vtype@ v)
-{
- const @vtype@ zero = @vpre@_setzero_@vsuf@();
- const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
- /* get 0xFF for zeros */
- @vtype@ tmp = @vpre@_cmpeq_epi8(v, zero);
- /* filled with 0xFF/0x00, negate and mask to boolean true */
- return @vpre@_andnot_@vsuf@(tmp, truemask);
-}
-#endif
-
-static void
-sse2_binary_@kind@_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp n)
-{
- LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
- op[i] = ip1[i] @op@ ip2[i];
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vloadu@((@vtype@*)&ip1[i]);
- @vtype@ b = @vloadu@((@vtype@*)&ip2[i]);
-#if @and@
- const @vtype@ zero = @vpre@_setzero_@vsuf@();
- /* get 0xFF for non zeros*/
- @vtype@ tmp = @vpre@_cmpeq_epi8(a, zero);
- /* andnot -> 0x00 for zeros xFF for non zeros, & with ip2 */
- tmp = @vpre@_andnot_@vsuf@(tmp, b);
-#else
- @vtype@ tmp = @vpre@_or_@vsuf@(a, b);
-#endif
-
- @vstore@((@vtype@*)&op[i], byte_to_true(tmp));
- }
- LOOP_BLOCKED_END {
- op[i] = (ip1[i] @op@ ip2[i]);
- }
-}
-
-
-static void
-sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
-{
- const @vtype@ zero = @vpre@_setzero_@vsuf@();
- LOOP_BLOCK_ALIGN_VAR(ip, npy_bool, VECTOR_SIZE_BYTES) {
- *op = *op @op@ ip[i];
- if (*op @sc@ 0) {
- return;
- }
- }
- /* unrolled once to replace a slow movmsk with a fast pmaxb */
- LOOP_BLOCKED(npy_bool, 2 * VECTOR_SIZE_BYTES) {
- @vtype@ v = @vload@((@vtype@*)&ip[i]);
- @vtype@ v2 = @vload@((@vtype@*)&ip[i + VECTOR_SIZE_BYTES]);
- v = @vpre@_cmpeq_epi8(v, zero);
- v2 = @vpre@_cmpeq_epi8(v2, zero);
-#if @and@
- if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) {
- *op = 0;
-#else
- if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) {
- *op = 1;
-#endif
- return;
- }
- }
- LOOP_BLOCKED_END {
- *op = *op @op@ ip[i];
- if (*op @sc@ 0) {
- return;
- }
- }
-}
-
-/**end repeat**/
-
-/**begin repeat
- * # kind = absolute, logical_not#
- * # op = !=, ==#
- * # not = 0, 1#
- * # vpre = _mm*2#
- * # vsuf = si128*2#
- * # vtype = __m128i*2#
- * # type = npy_bool*2#
- * # vloadu = _mm_loadu_si128*2#
- * # vstore = _mm_store_si128*2#
- */
-
-static void
-sse2_@kind@_BOOL(@type@ * op, @type@ * ip, const npy_intp n)
-{
- LOOP_BLOCK_ALIGN_VAR(op, @type@, VECTOR_SIZE_BYTES)
- op[i] = (ip[i] @op@ 0);
- LOOP_BLOCKED(@type@, VECTOR_SIZE_BYTES) {
- @vtype@ a = @vloadu@((@vtype@*)&ip[i]);
-#if @not@
- const @vtype@ zero = @vpre@_setzero_@vsuf@();
- const @vtype@ truemask = @vpre@_set1_epi8(1 == 1);
- /* equivalent to byte_to_true but can skip the negation */
- a = @vpre@_cmpeq_epi8(a, zero);
- a = @vpre@_and_@vsuf@(a, truemask);
-#else
- /* abs is kind of pointless but maybe its used for byte_to_true */
- a = byte_to_true(a);
-#endif
- @vstore@((@vtype@*)&op[i], a);
- }
- LOOP_BLOCKED_END {
- op[i] = (ip[i] @op@ 0);
- }
-}
-
-/**end repeat**/
-
#undef VECTOR_SIZE_BYTES
#endif /* NPY_HAVE_SSE2_INTRINSICS */
#endif