summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--numpy/core/src/multiarray/item_selection.c49
-rw-r--r--numpy/core/tests/test_multiarray.py30
2 files changed, 77 insertions, 2 deletions
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index bcaf9fed7..af6ee8828 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2395,6 +2395,44 @@ PyArray_Compress(PyArrayObject *self, PyObject *condition, int axis,
}
/*
+ * count number of nonzero bytes in 16 byte block
+ * w must be aligned to 8 bytes
+ *
+ * even though it uses 64 bit types its faster than the bytewise sum on 32 bit
+ * but a 32 bit type version would make it even faster on these platforms
+ */
+static NPY_INLINE int
+count_nonzero_bytes_128(npy_uint64 * w)
+{
+ npy_uint64 w1 = w[0];
+ npy_uint64 w2 = w[1];
+
+ /*
+ * bytes not exclusively 0 or 1, sum them individually.
+ * should only happen if one does weird stuff with views or external
+ * buffers.
+ */
+ if (NPY_UNLIKELY(((w1 | w2) & 0xFEFEFEFEFEFEFEFEULL) != 0)) {
+ /* reload from pointer to avoid a unnecessary stack spill with gcc */
+ char * c = w;
+ npy_uintp i, count = 0;
+ for (i = 0; i < 16; i++) {
+ count += (c[i] != 0);
+ }
+ return count;
+ }
+
+ /*
+ * last part of sideways add popcount, first three bisections can be
+ * skipped as we are dealing with bytes.
+ * multiplication equivalent to (x + (x>>8) + (x>>16) + (x>>24)) & 0xFF
+ * multiplication overflow well defined for unsigned types.
+ * w1 + w2 guaranteed to not overflow as we only have 0 and 1 data.
+ */
+ return ((w1 + w2) * 0x0101010101010101ULL) >> 56ULL;
+}
+
+/*
* Counts the number of True values in a raw boolean array. This
* is a low-overhead function which does no heap allocations.
*
@@ -2425,9 +2463,16 @@ count_boolean_trues(int ndim, char *data, npy_intp *ashape, npy_intp *astrides)
/* Special case for contiguous inner loop */
if (strides[0] == 1) {
NPY_RAW_ITER_START(idim, ndim, coord, shape) {
- char *d = data;
/* Process the innermost dimension */
- for (i = 0; i < shape[0]; ++i, ++d) {
+ char * d = data;
+ char * e = data + shape[0];
+ if (npy_is_aligned(data, sizeof(npy_uint64))) {
+ npy_uintp stride = 2 * sizeof(npy_uint64);
+ for (; d < e - (shape[0] % stride); d += stride) {
+ count += count_nonzero_bytes_128(d);
+ }
+ }
+ for (; d < e; ++d) {
count += (*d != 0);
}
} NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 93f9110fd..c41c56b9c 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -4,6 +4,11 @@ import tempfile
import sys
import os
import warnings
+if sys.version_info[0] >= 3:
+ import builtins
+else:
+ import __builtin__ as builtins
+
import numpy as np
from nose import SkipTest
@@ -611,6 +616,31 @@ class TestBool(TestCase):
assert_equal(d[::2].sum(), d[::2].size)
assert_equal(d[::-2].sum(), d[::-2].size)
+ def check_count_nonzero(self, power, length):
+ powers = [2 ** i for i in range(length)]
+ for i in range(2**power):
+ l = [(i & x) != 0 for x in powers]
+ a = np.array(l, dtype=np.bool)
+ c = builtins.sum(l)
+ self.assertEqual(np.count_nonzero(a), c)
+ av = a.view(np.uint8)
+ av *= 3
+ self.assertEqual(np.count_nonzero(a), c)
+ av *= 4
+ self.assertEqual(np.count_nonzero(a), c)
+ av[av != 0] = 0xFF
+ self.assertEqual(np.count_nonzero(a), c)
+
+ def test_count_nonzero(self):
+ # check all 12 bit combinations in a length 17 array
+ # covers most cases of the 16 byte unrolled code
+ self.check_count_nonzero(12, 17)
+
+ @dec.slow
+ def test_count_nonzero_all(self):
+ # check all combinations in a length 17 array
+ # covers all cases of the 16 byte unrolled code
+ self.check_count_nonzero(17, 17)
class TestMethods(TestCase):
def test_test_round(self):