ENH: improve numpy.all()/any()

Unroll the loop once and use pminub/pmaxub to save a slow pmovmskb instruction. This improves performance by 50% on some AMD chips. Also add a pure libc path using memcmp and memchr for non-amd64 systems. The libc path can be faster with a very modern CPU and libc version, e.g. an i7 with glibc 2.17 is about 20% faster than our code, but many other tested platforms are much slower (glibc 2.12 on a Xeon, Core 2 Duo) or the same speed (glibc 2.17 on a Phenom). The numpy code can be removed in the future when faster libc versions and CPUs are more commonly available.
author: Julian Taylor <jtaylor.debian@googlemail.com> 2013-07-11 20:23:36 +0200
committer: Julian Taylor <jtaylor.debian@googlemail.com> 2013-08-09 18:32:05 +0200
commit: 7819817653003fdae4554cbfab4cdbedf824c305 (patch)
tree: 4c3a662fe97f18265bf57b14fec78c25c5fd20a8
parent: 928289bf37081f4deb6755e226600998ccc23610 (diff)
download: numpy-7819817653003fdae4554cbfab4cdbedf824c305.tar.gz
3 files changed, 57 insertions, 3 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 0559fb416..d99fafaf2 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -20,6 +20,8 @@
 
 #include "ufunc_object.h"
 
+#include <string.h> /* for memchr */
+
 
 /*
  * include vectorized functions and dispatchers
@@ -555,15 +557,47 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
  * #kind = logical_and, logical_or#
  * #OP =  &&, ||#
  * #SC =  ==, !=#
+ * #and = 1, 0#
  **/
 
 NPY_NO_EXPORT void
 BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
 {
     if(IS_BINARY_REDUCE) {
+#ifdef HAVE_EMMINTRIN_H
+        /*
+         * stick with our variant for more reliable performance, only known
+         * platform which outperforms it by ~20% is an i7 with glibc 2.17
+         */
         if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
             return;
         }
+#else
+        /* for now only use libc on 32-bit/non-x86 */
+        if (steps[1] == 1) {
+            npy_bool * op = (npy_bool *)args[0];
+#if @and@
+            /* np.all(), search for a zero (false) */
+            if (*op) {
+                *op = memchr(args[1], 0, dimensions[0]) == NULL;
+            }
+#else
+            /*
+             * np.any(): search for a non-zero (true) byte by comparing against
+             * zero blocks; memcmp is faster than memchr on SSE4 machines with
+             * glibc >= 2.12, and memchr can only match one exact value (e.g. 1)
+             */
+            static const npy_bool zero[4096]; /* zero by C standard */
+            npy_uintp i, n = dimensions[0];
+            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
+                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
+            }
+            if (!*op && n - i > 0)
+                *op = memcmp(&args[1][i], zero, n - i) != 0;
+#endif
+            return;
+        }
+#endif
         BINARY_REDUCE_LOOP(npy_bool) {
             const npy_bool in2 = *(npy_bool *)ip2;
             io1 = io1 @OP@ in2;
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 98e2beb30..2f1c3055b 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -779,14 +779,17 @@ sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
             return;
         }
     }
-    LOOP_BLOCKED(npy_bool, 16) {
+    /* unrolled once to replace a slow movmsk with a fast pmaxub/pminub */
+    LOOP_BLOCKED(npy_bool, 32) {
         @vtype@ v = @vload@((@vtype@*)&ip[i]);
+        @vtype@ v2 = @vload@((@vtype@*)&ip[i + 16]);
         v = @vpre@_cmpeq_epi8(v, zero);
+        v2 = @vpre@_cmpeq_epi8(v2, zero);
 #if @and@
-        if ((@vpre@_movemask_epi8(v) != 0)) {
+        if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) {
             *op = 0;
 #else
-        if ((@vpre@_movemask_epi8(v) != 0xFFFF)) {
+        if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) {
             *op = 1;
 #endif
             return;
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 1be0f4105..782ddd687 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -245,6 +245,23 @@ class TestBoolArray(TestCase):
         self.assertTrue(self.im.any())
         self.assertFalse(self.nm.all())
         self.assertFalse(self.im.all())
+        # check bad element in all positions
+        for i in range(256 - 7):
+            d = array([False] * 256, dtype=np.bool)[7::]
+            d[i] = True
+            self.assertTrue(np.any(d))
+            e = array([True] * 256, dtype=np.bool)[7::]
+            e[i] = False
+            self.assertFalse(np.all(e))
+            assert_array_equal(e, ~d)
+        # big array test for blocked libc loops
+        for i in list(range(9, 6000, 507)) + [7764, 90021, -10]:
+            d = array([False] * 100043, dtype=np.bool)
+            d[i] = True
+            self.assertTrue(np.any(d), msg="%r" % i)
+            e = array([True] * 100043, dtype=np.bool)
+            e[i] = False
+            self.assertFalse(np.all(e), msg="%r" % i)
 
     def test_logical_not_abs(self):
         assert_array_equal(~self.t, self.f)
author	Julian Taylor <jtaylor.debian@googlemail.com>	2013-07-11 20:23:36 +0200
committer	Julian Taylor <jtaylor.debian@googlemail.com>	2013-08-09 18:32:05 +0200
commit	7819817653003fdae4554cbfab4cdbedf824c305 (patch)
tree	4c3a662fe97f18265bf57b14fec78c25c5fd20a8
parent	928289bf37081f4deb6755e226600998ccc23610 (diff)
download	numpy-7819817653003fdae4554cbfab4cdbedf824c305.tar.gz