diff options
-rw-r--r-- | numpy/core/src/multiarray/item_selection.c | 49 | ||||
-rw-r--r-- | numpy/core/tests/test_multiarray.py | 30 |
2 files changed, 77 insertions, 2 deletions
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index bcaf9fed7..af6ee8828 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -2395,6 +2395,44 @@ PyArray_Compress(PyArrayObject *self, PyObject *condition, int axis, } /* + * count number of nonzero bytes in 16 byte block + * w must be aligned to 8 bytes + * + * even though it uses 64 bit types its faster than the bytewise sum on 32 bit + * but a 32 bit type version would make it even faster on these platforms + */ +static NPY_INLINE int +count_nonzero_bytes_128(npy_uint64 * w) +{ + npy_uint64 w1 = w[0]; + npy_uint64 w2 = w[1]; + + /* + * bytes not exclusively 0 or 1, sum them individually. + * should only happen if one does weird stuff with views or external + * buffers. + */ + if (NPY_UNLIKELY(((w1 | w2) & 0xFEFEFEFEFEFEFEFEULL) != 0)) { + /* reload from pointer to avoid a unnecessary stack spill with gcc */ + char * c = w; + npy_uintp i, count = 0; + for (i = 0; i < 16; i++) { + count += (c[i] != 0); + } + return count; + } + + /* + * last part of sideways add popcount, first three bisections can be + * skipped as we are dealing with bytes. + * multiplication equivalent to (x + (x>>8) + (x>>16) + (x>>24)) & 0xFF + * multiplication overflow well defined for unsigned types. + * w1 + w2 guaranteed to not overflow as we only have 0 and 1 data. + */ + return ((w1 + w2) * 0x0101010101010101ULL) >> 56ULL; +} + +/* * Counts the number of True values in a raw boolean array. This * is a low-overhead function which does no heap allocations. * @@ -2425,9 +2463,16 @@ count_boolean_trues(int ndim, char *data, npy_intp *ashape, npy_intp *astrides) /* Special case for contiguous inner loop */ if (strides[0] == 1) { NPY_RAW_ITER_START(idim, ndim, coord, shape) { - char *d = data; /* Process the innermost dimension */ - for (i = 0; i < shape[0]; ++i, ++d) { + char * d = data; + char * e = data + shape[0]; + if (npy_is_aligned(data, sizeof(npy_uint64))) { + npy_uintp stride = 2 * sizeof(npy_uint64); + for (; d < e - (shape[0] % stride); d += stride) { + count += count_nonzero_bytes_128(d); + } + } + for (; d < e; ++d) { count += (*d != 0); } } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides); diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 93f9110fd..c41c56b9c 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -4,6 +4,11 @@ import tempfile import sys import os import warnings +if sys.version_info[0] >= 3: + import builtins +else: + import __builtin__ as builtins + import numpy as np from nose import SkipTest @@ -611,6 +616,31 @@ class TestBool(TestCase): assert_equal(d[::2].sum(), d[::2].size) assert_equal(d[::-2].sum(), d[::-2].size) + def check_count_nonzero(self, power, length): + powers = [2 ** i for i in range(length)] + for i in range(2**power): + l = [(i & x) != 0 for x in powers] + a = np.array(l, dtype=np.bool) + c = builtins.sum(l) + self.assertEqual(np.count_nonzero(a), c) + av = a.view(np.uint8) + av *= 3 + self.assertEqual(np.count_nonzero(a), c) + av *= 4 + self.assertEqual(np.count_nonzero(a), c) + av[av != 0] = 0xFF + self.assertEqual(np.count_nonzero(a), c) + + def test_count_nonzero(self): + # check all 12 bit combinations in a length 17 array + # covers most cases of the 16 byte unrolled code + self.check_count_nonzero(12, 17) + + @dec.slow + def test_count_nonzero_all(self): + # check all combinations in a length 17 array + # covers all cases of the 16 byte unrolled code + self.check_count_nonzero(17, 17) class TestMethods(TestCase): def test_test_round(self): |