diff options
author | Julian Taylor <jtaylor.debian@googlemail.com> | 2013-07-11 20:23:36 +0200 |
---|---|---|
committer | Julian Taylor <jtaylor.debian@googlemail.com> | 2013-08-09 18:32:05 +0200 |
commit | 7819817653003fdae4554cbfab4cdbedf824c305 (patch) | |
tree | 4c3a662fe97f18265bf57b14fec78c25c5fd20a8 | |
parent | 928289bf37081f4deb6755e226600998ccc23610 (diff) | |
download | numpy-7819817653003fdae4554cbfab4cdbedf824c305.tar.gz |
ENH: improve numpy.all()/any()
Unroll the loop once and use pminub/pmaxub to save a slow pmovmskb
instruction. Improves performance by 50% on some AMD chips.
Also add a pure libc path using memcmp and memchr for non-amd64 systems.
The libc path can be faster with a very modern CPU and libc version,
e.g. an i7 with glibc 2.17 is about 20% faster than our code, but many
other tested platforms are much slower (glibc 2.12 xeon, core2duo) or the
same speed (glibc 2.17 phenom).
The numpy code can be removed in the future when faster libc versions and
CPUs are more commonly available.
-rw-r--r-- | numpy/core/src/umath/loops.c.src | 34 | ||||
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 9 | ||||
-rw-r--r-- | numpy/core/tests/test_numeric.py | 17 |
3 files changed, 57 insertions, 3 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 0559fb416..d99fafaf2 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -20,6 +20,8 @@ #include "ufunc_object.h" +#include <string.h> /* for memchr */ + /* * include vectorized functions and dispatchers @@ -555,15 +557,47 @@ BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED * #kind = logical_and, logical_or# * #OP = &&, ||# * #SC = ==, !=# + * #and = 1, 0# **/ NPY_NO_EXPORT void BOOL_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func)) { if(IS_BINARY_REDUCE) { +#ifdef HAVE_EMMINTRIN_H + /* + * stick with our variant for more reliable performance, only known + * platform which outperforms it by ~20% is an i7 with glibc 2.17 + */ if (run_reduce_simd_@kind@_BOOL(args, dimensions, steps)) { return; } +#else + /* for now only use libc on 32-bit/non-x86 */ + if (steps[1] == 1) { + npy_bool * op = (npy_bool *)args[0]; +#if @and@ + /* np.all(), search for a zero (false) */ + if (*op) { + *op = memchr(args[1], 0, dimensions[0]) == NULL; + } +#else + /* + * np.any(), search for a non-zero (true) via comparing against + * zero blocks, memcmp is faster than memchr on SSE4 machines + * with glibc >= 2.12 and memchr can only check for equal 1 + */ + static const npy_bool zero[4096]; /* zero by C standard */ + npy_uintp i, n = dimensions[0]; + for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) { + *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0; + } + if (!*op && n - i > 0) + *op = memcmp(&args[1][i], zero, n - i) != 0; +#endif + return; + } +#endif BINARY_REDUCE_LOOP(npy_bool) { const npy_bool in2 = *(npy_bool *)ip2; io1 = io1 @OP@ in2; diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 98e2beb30..2f1c3055b 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -779,14 +779,17 @@ 
sse2_reduce_@kind@_BOOL(npy_bool * op, npy_bool * ip, const npy_intp n) return; } } - LOOP_BLOCKED(npy_bool, 16) { + /* unrolled once to replace a slow movmsk with a fast pmaxb */ + LOOP_BLOCKED(npy_bool, 32) { @vtype@ v = @vload@((@vtype@*)&ip[i]); + @vtype@ v2 = @vload@((@vtype@*)&ip[i + 16]); v = @vpre@_cmpeq_epi8(v, zero); + v2 = @vpre@_cmpeq_epi8(v2, zero); #if @and@ - if ((@vpre@_movemask_epi8(v) != 0)) { + if ((@vpre@_movemask_epi8(@vpre@_max_epu8(v, v2)) != 0)) { *op = 0; #else - if ((@vpre@_movemask_epi8(v) != 0xFFFF)) { + if ((@vpre@_movemask_epi8(@vpre@_min_epu8(v, v2)) != 0xFFFF)) { *op = 1; #endif return; diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py index 1be0f4105..782ddd687 100644 --- a/numpy/core/tests/test_numeric.py +++ b/numpy/core/tests/test_numeric.py @@ -245,6 +245,23 @@ class TestBoolArray(TestCase): self.assertTrue(self.im.any()) self.assertFalse(self.nm.all()) self.assertFalse(self.im.all()) + # check bad element in all positions + for i in range(256 - 7): + d = array([False] * 256, dtype=np.bool)[7::] + d[i] = True + self.assertTrue(np.any(d)) + e = array([True] * 256, dtype=np.bool)[7::] + e[i] = False + self.assertFalse(np.all(e)) + assert_array_equal(e, ~d) + # big array test for blocked libc loops + for i in list(range(9, 6000, 507)) + [7764, 90021, -10]: + d = array([False] * 100043, dtype=np.bool) + d[i] = True + self.assertTrue(np.any(d), msg="%r" % i) + e = array([True] * 100043, dtype=np.bool) + e[i] = False + self.assertFalse(np.all(e), msg="%r" % i) def test_logical_not_abs(self): assert_array_equal(~self.t, self.f) |