summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJulian Taylor <jtaylor.debian@googlemail.com>2013-07-11 20:23:36 +0200
committerJulian Taylor <jtaylor.debian@googlemail.com>2013-08-09 18:32:05 +0200
commit7819817653003fdae4554cbfab4cdbedf824c305 (patch)
tree4c3a662fe97f18265bf57b14fec78c25c5fd20a8
parent928289bf37081f4deb6755e226600998ccc23610 (diff)
downloadnumpy-7819817653003fdae4554cbfab4cdbedf824c305.tar.gz
ENH: improve numpy.all()/any()
Unroll the loop once and use pminub/pmaxub to save a slow pmovmskb instruction. This improves performance by 50% on some AMD chips. Also add a pure libc path using memcmp and memchr for non-amd64 systems. The libc path can be faster with a very modern CPU and libc version (e.g. an i7 with glibc 2.17 is about 20% faster than our code), but many other tested platforms are much slower (glibc 2.12 on a Xeon, Core 2 Duo) or the same speed (glibc 2.17 on a Phenom). The numpy code can be removed in the future when faster libc versions and CPUs are more commonly available.
-rw-r--r--numpy/core/src/umath/loops.c.src34
-rw-r--r--numpy/core/src/umath/simd.inc.src9
-rw-r--r--numpy/core/tests/test_numeric.py17
3 files changed, 57 insertions, 3 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 0559fb416..d99fafaf2 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -20,6 +20,8 @@
#include "ufunc_object.h"
+#include <string.h> /* for memchr */
+
/*
* include vectorized functions and dispatchers
@@ -555,15 +557,47 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
* #kind = logical_and, logical_or#
* #OP = &&, ||#
* #SC = ==, !=#
+ * #and = 1, 0#
**/
NPY_NO_EXPORT void
BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
{
if(IS_BINARY_REDUCE) {
+#ifdef HAVE_EMMINTRIN_H
+ /*
+ * stick with our variant for more reliable performance, only known
+ * platform which outperforms it by ~20% is an i7 with glibc 2.17
+ */
if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) {
return;
}
+#else
+ /* for now only use libc on 32-bit/non-x86 */
+ if (steps[1] == 1) {
+ npy_bool * op = (npy_bool *)args[0];
+#if @and@
+ /* np.all(), search for a zero (false) */
+ if (*op) {
+ *op = memchr(args[1], 0, dimensions[0]) == NULL;
+ }
+#else
+ /*
+ * np.any(), search for a non-zero (true) byte by comparing against
+ * zero blocks; memcmp is faster than memchr on SSE4 machines with
+ * glibc >= 2.12, and memchr can only match one exact byte value (e.g. 1)
+ */
+ static const npy_bool zero[4096]; /* zero by C standard */
+ npy_uintp i, n = dimensions[0];
+ for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
+ *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
+ }
+ if (!*op && n - i > 0)
+ *op = memcmp(&args[1][i], zero, n - i) != 0;
+#endif
+ return;
+ }
+#endif
BINARY_REDUCE_LOOP(npy_bool) {
const npy_bool in2 = *(npy_bool *)ip2;
io1 = io1 @OP@ in2;
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 98e2beb30..2f1c3055b 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -779,14 +779,17 @@ sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n)
return;
}
}
- LOOP_BLOCKED(npy_bool, 16) {
+ /* unrolled once to replace a slow pmovmskb with a fast pmaxub/pminub */
+ LOOP_BLOCKED(npy_bool, 32) {
@vtype@ v = @vload@((@vtype@*)&ip[i]);
+ @vtype@ v2 = @vload@((@vtype@*)&ip[i + 16]);
v = @vpre@_cmpeq_epi8(v, zero);
+ v2 = @vpre@_cmpeq_epi8(v2, zero);
#if @and@
- if ((@vpre@_movemask_epi8(v) != 0)) {
+ if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) {
*op = 0;
#else
- if ((@vpre@_movemask_epi8(v) != 0xFFFF)) {
+ if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) {
*op = 1;
#endif
return;
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 1be0f4105..782ddd687 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -245,6 +245,23 @@ class TestBoolArray(TestCase):
self.assertTrue(self.im.any())
self.assertFalse(self.nm.all())
self.assertFalse(self.im.all())
+ # check bad element in all positions
+ for i in range(256 - 7):
+ d = array([False] * 256, dtype=np.bool)[7::]
+ d[i] = True
+ self.assertTrue(np.any(d))
+ e = array([True] * 256, dtype=np.bool)[7::]
+ e[i] = False
+ self.assertFalse(np.all(e))
+ assert_array_equal(e, ~d)
+ # big array test for blocked libc loops
+ for i in list(range(9, 6000, 507)) + [7764, 90021, -10]:
+ d = array([False] * 100043, dtype=np.bool)
+ d[i] = True
+ self.assertTrue(np.any(d), msg="%r" % i)
+ e = array([True] * 100043, dtype=np.bool)
+ e[i] = False
+ self.assertFalse(np.all(e), msg="%r" % i)
def test_logical_not_abs(self):
assert_array_equal(~self.t, self.f)