summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--benchmarks/benchmarks/bench_core.py2
-rw-r--r--numpy/core/src/multiarray/item_selection.c233
-rw-r--r--numpy/core/tests/test_numeric.py18
3 files changed, 183 insertions, 70 deletions
diff --git a/benchmarks/benchmarks/bench_core.py b/benchmarks/benchmarks/bench_core.py
index f302f262d..30647f4b8 100644
--- a/benchmarks/benchmarks/bench_core.py
+++ b/benchmarks/benchmarks/bench_core.py
@@ -142,7 +142,7 @@ class CountNonzero(Benchmark):
params = [
[1, 2, 3],
[100, 10000, 1000000],
- [bool, int, str, object]
+ [bool, np.int8, np.int16, np.int32, np.int64, str, object]
]
def setup(self, numaxes, size, dtype):
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 8e4b2ebe1..fb354ce54 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2130,7 +2130,6 @@ count_nonzero_bytes_384(const npy_uint64 * w)
}
#if NPY_SIMD
-
/* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */
static NPY_INLINE NPY_GCC_OPT_3 npyv_u8
count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end, npy_uint8 max_count)
@@ -2166,18 +2165,18 @@ count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_c
}
return vsum16;
}
-
+#endif // NPY_SIMD
/*
* Counts the number of non-zero values in a raw array.
* The one loop process is shown below(take SSE2 with 128bits vector for example):
- * |------------16 lanes---------|
+ * |------------16 lanes---------|
*[vsum8] 255 255 255 ... 255 255 255 255 count_zero_bytes_u8: counting 255*16 elements
* !!
- * |------------8 lanes---------|
+ * |------------8 lanes---------|
*[vsum16] 65535 65535 65535 ... 65535 count_zero_bytes_u16: counting (2*16-1)*16 elements
* 65535 65535 65535 ... 65535
* !!
- * |------------4 lanes---------|
+ * |------------4 lanes---------|
*[sum_32_0] 65535 65535 65535 65535 count_nonzero_bytes
* 65535 65535 65535 65535
*[sum_32_1] 65535 65535 65535 65535
@@ -2186,40 +2185,143 @@ count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_c
* (2*16-1)*16
*/
static NPY_INLINE NPY_GCC_OPT_3 npy_intp
-count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
+count_nonzero_u8(const char *data, npy_intp bstride, npy_uintp len)
{
- npy_intp zero_count = 0;
- const npy_uint8 *end = d + unrollx;
- while (d < end) {
- npyv_u16x2 vsum16 = count_zero_bytes_u16(&d, end, NPY_MAX_UINT16);
- npyv_u32x2 sum_32_0 = npyv_expand_u32_u16(vsum16.val[0]);
- npyv_u32x2 sum_32_1 = npyv_expand_u32_u16(vsum16.val[1]);
- zero_count += npyv_sum_u32(npyv_add_u32(
- npyv_add_u32(sum_32_0.val[0], sum_32_0.val[1]),
- npyv_add_u32(sum_32_1.val[0], sum_32_1.val[1])
- ));
- }
- return unrollx - zero_count;
+ npy_intp count = 0;
+ if (bstride == 1) {
+ #if NPY_SIMD
+ npy_uintp len_m = len & -npyv_nlanes_u8;
+ npy_uintp zcount = 0;
+ for (const char *end = data + len_m; data < end;) {
+ npyv_u16x2 vsum16 = count_zero_bytes_u16((const npy_uint8**)&data, (const npy_uint8*)end, NPY_MAX_UINT16);
+ npyv_u32x2 sum_32_0 = npyv_expand_u32_u16(vsum16.val[0]);
+ npyv_u32x2 sum_32_1 = npyv_expand_u32_u16(vsum16.val[1]);
+ zcount += npyv_sum_u32(npyv_add_u32(
+ npyv_add_u32(sum_32_0.val[0], sum_32_0.val[1]),
+ npyv_add_u32(sum_32_1.val[0], sum_32_1.val[1])
+ ));
+ }
+ len -= len_m;
+ count = len_m - zcount;
+ #else
+ if (!NPY_ALIGNMENT_REQUIRED || npy_is_aligned(data, sizeof(npy_uint64))) {
+ int step = 6 * sizeof(npy_uint64);
+ int left_bytes = len % step;
+ for (const char *end = data + len; data < end - left_bytes; data += step) {
+ count += count_nonzero_bytes_384((const npy_uint64 *)data);
+ }
+ len = left_bytes;
+ }
+ #endif // NPY_SIMD
+ }
+ for (; len > 0; --len, data += bstride) {
+ count += (*data != 0);
+ }
+ return count;
}
+static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+count_nonzero_u16(const char *data, npy_intp bstride, npy_uintp len)
+{
+ npy_intp count = 0;
+#if NPY_SIMD
+ if (bstride == sizeof(npy_uint16)) {
+ npy_uintp zcount = 0, len_m = len & -npyv_nlanes_u16;
+ const npyv_u16 vone = npyv_setall_u16(1);
+ const npyv_u16 vzero = npyv_zero_u16();
+
+ for (npy_uintp lenx = len_m; lenx > 0;) {
+ npyv_u16 vsum16 = npyv_zero_u16();
+ npy_uintp max16 = PyArray_MIN(lenx, NPY_MAX_UINT16*npyv_nlanes_u16);
+
+ for (const char *end = data + max16*bstride; data < end; data += NPY_SIMD_WIDTH) {
+ npyv_u16 mask = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16((npy_uint16*)data), vzero));
+ mask = npyv_and_u16(mask, vone);
+ vsum16 = npyv_add_u16(vsum16, mask);
+ }
+ lenx -= max16;
+ zcount += npyv_sumup_u16(vsum16);
+ }
+ len -= len_m;
+ count = len_m - zcount;
+ }
+#endif
+ for (; len > 0; --len, data += bstride) {
+ count += (*(npy_uint16*)data != 0);
+ }
+ return count;
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+count_nonzero_u32(const char *data, npy_intp bstride, npy_uintp len)
+{
+ npy_intp count = 0;
+#if NPY_SIMD
+ if (bstride == sizeof(npy_uint32)) {
+ const npy_uintp max_iter = NPY_MAX_UINT32*npyv_nlanes_u32;
+ const npy_uintp len_m = (len > max_iter ? max_iter : len) & -npyv_nlanes_u32;
+ const npyv_u32 vone = npyv_setall_u32(1);
+ const npyv_u32 vzero = npyv_zero_u32();
+
+ npyv_u32 vsum32 = npyv_zero_u32();
+ for (const char *end = data + len_m*bstride; data < end; data += NPY_SIMD_WIDTH) {
+ npyv_u32 mask = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32((npy_uint32*)data), vzero));
+ mask = npyv_and_u32(mask, vone);
+ vsum32 = npyv_add_u32(vsum32, mask);
+ }
+ const npyv_u32 maskevn = npyv_reinterpret_u32_u64(npyv_setall_u64(0xffffffffULL));
+ npyv_u64 odd = npyv_shri_u64(npyv_reinterpret_u64_u32(vsum32), 32);
+ npyv_u64 even = npyv_reinterpret_u64_u32(npyv_and_u32(vsum32, maskevn));
+ count = len_m - npyv_sum_u64(npyv_add_u64(odd, even));
+ len -= len_m;
+ }
+#endif
+ for (; len > 0; --len, data += bstride) {
+ count += (*(npy_uint32*)data != 0);
+ }
+ return count;
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+count_nonzero_u64(const char *data, npy_intp bstride, npy_uintp len)
+{
+ npy_intp count = 0;
+#if NPY_SIMD
+ if (bstride == sizeof(npy_uint64)) {
+ const npy_uintp len_m = len & -npyv_nlanes_u64;
+ const npyv_u64 vone = npyv_setall_u64(1);
+ const npyv_u64 vzero = npyv_zero_u64();
+
+ npyv_u64 vsum64 = npyv_zero_u64();
+ for (const char *end = data + len_m*bstride; data < end; data += NPY_SIMD_WIDTH) {
+ npyv_u64 mask = npyv_cvt_u64_b64(npyv_cmpeq_u64(npyv_load_u64((npy_uint64*)data), vzero));
+ mask = npyv_and_u64(mask, vone);
+ vsum64 = npyv_add_u64(vsum64, mask);
+ }
+ len -= len_m;
+ count = len_m - npyv_sum_u64(vsum64);
+ }
#endif
+ for (; len > 0; --len, data += bstride) {
+ count += (*(npy_uint64*)data != 0);
+ }
+ return count;
+}
/*
* Counts the number of True values in a raw boolean array. This
* is a low-overhead function which does no heap allocations.
*
* Returns -1 on error.
*/
-NPY_NO_EXPORT npy_intp
-count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const *astrides)
+static NPY_GCC_OPT_3 npy_intp
+count_nonzero_int(int ndim, char *data, const npy_intp *ashape, const npy_intp *astrides, int elsize)
{
-
+ assert(elsize <= 8);
int idim;
npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
- npy_intp i, coord[NPY_MAXDIMS];
- npy_intp count = 0;
- NPY_BEGIN_THREADS_DEF;
+ npy_intp coord[NPY_MAXDIMS];
- /* Use raw iteration with no heap memory allocation */
+ // Use raw iteration with no heap memory allocation
if (PyArray_PrepareOneRawArrayIter(
ndim, ashape,
data, astrides,
@@ -2228,51 +2330,44 @@ count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const
return -1;
}
- /* Handle zero-sized array */
+ // Handle zero-sized array
if (shape[0] == 0) {
return 0;
}
+ NPY_BEGIN_THREADS_DEF;
NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
- /* Special case for contiguous inner loop */
- if (strides[0] == 1) {
- NPY_RAW_ITER_START(idim, ndim, coord, shape) {
- /* Process the innermost dimension */
- const char *d = data;
- const char *e = data + shape[0];
-#if NPY_SIMD
- npy_uintp stride = shape[0] & -npyv_nlanes_u8;
- count += count_nonzero_bytes((const npy_uint8 *)d, stride);
- d += stride;
-#else
- if (!NPY_ALIGNMENT_REQUIRED ||
- npy_is_aligned(d, sizeof(npy_uint64))) {
- npy_uintp stride = 6 * sizeof(npy_uint64);
- for (; d < e - (shape[0] % stride); d += stride) {
- count += count_nonzero_bytes_384((const npy_uint64 *)d);
- }
- }
-#endif
- for (; d < e; ++d) {
- count += (*d != 0);
- }
- } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
- }
- /* General inner loop */
- else {
- NPY_RAW_ITER_START(idim, ndim, coord, shape) {
- char *d = data;
- /* Process the innermost dimension */
- for (i = 0; i < shape[0]; ++i, d += strides[0]) {
- count += (*d != 0);
- }
- } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
+
+ #define NONZERO_CASE(LEN, SFX) \
+ case LEN: \
+ NPY_RAW_ITER_START(idim, ndim, coord, shape) { \
+ count += count_nonzero_##SFX(data, strides[0], shape[0]); \
+ } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides); \
+ break
+
+ npy_intp count = 0;
+ switch(elsize) {
+ NONZERO_CASE(1, u8);
+ NONZERO_CASE(2, u16);
+ NONZERO_CASE(4, u32);
+ NONZERO_CASE(8, u64);
}
+ #undef NONZERO_CASE
NPY_END_THREADS;
-
return count;
}
+/*
+ * Counts the number of True values in a raw boolean array. This
+ * is a low-overhead function which does no heap allocations.
+ *
+ * Returns -1 on error.
+ */
+NPY_NO_EXPORT NPY_GCC_OPT_3 npy_intp
+count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const *astrides)
+{
+ return count_nonzero_int(ndim, data, ashape, astrides, 1);
+}
/*NUMPY_API
* Counts the number of non-zero elements in the array.
@@ -2295,14 +2390,22 @@ PyArray_CountNonzero(PyArrayObject *self)
npy_intp *strideptr, *innersizeptr;
NPY_BEGIN_THREADS_DEF;
- /* Special low-overhead version specific to the boolean type */
+ // Special low-overhead version specific to the boolean/int types
dtype = PyArray_DESCR(self);
- if (dtype->type_num == NPY_BOOL) {
- return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),
- PyArray_DIMS(self), PyArray_STRIDES(self));
+ switch(dtype->kind) {
+ case 'u':
+ case 'i':
+ case 'b':
+ if (dtype->elsize > 8) {
+ break;
+ }
+ return count_nonzero_int(
+ PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self),
+ PyArray_STRIDES(self), dtype->elsize
+ );
}
- nonzero = PyArray_DESCR(self)->f->nonzero;
+ nonzero = PyArray_DESCR(self)->f->nonzero;
/* If it's a trivial one-dimensional loop, don't use an iterator */
if (PyArray_TRIVIALLY_ITERABLE(self)) {
needs_api = PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI);
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index a697e5faf..06511822e 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -1266,20 +1266,30 @@ class TestNonzero:
assert_equal(np.count_nonzero(x), 4)
assert_equal(np.nonzero(x), ([0, 2, 3, 6],))
- x = np.array([(1, 2), (0, 0), (1, 1), (-1, 3), (0, 7)],
- dtype=[('a', 'i4'), ('b', 'i2')])
+ # x = np.array([(1, 2), (0, 0), (1, 1), (-1, 3), (0, 7)],
+ # dtype=[('a', 'i4'), ('b', 'i2')])
+ x = np.array([(1, 2, -5, -3), (0, 0, 2, 7), (1, 1, 0, 1), (-1, 3, 1, 0), (0, 7, 0, 4)],
+ dtype=[('a', 'i4'), ('b', 'i2'), ('c', 'i1'), ('d', 'i8')])
assert_equal(np.count_nonzero(x['a']), 3)
assert_equal(np.count_nonzero(x['b']), 4)
+ assert_equal(np.count_nonzero(x['c']), 3)
+ assert_equal(np.count_nonzero(x['d']), 4)
assert_equal(np.nonzero(x['a']), ([0, 2, 3],))
assert_equal(np.nonzero(x['b']), ([0, 2, 3, 4],))
def test_nonzero_twodim(self):
x = np.array([[0, 1, 0], [2, 0, 3]])
- assert_equal(np.count_nonzero(x), 3)
+ assert_equal(np.count_nonzero(x.astype('i1')), 3)
+ assert_equal(np.count_nonzero(x.astype('i2')), 3)
+ assert_equal(np.count_nonzero(x.astype('i4')), 3)
+ assert_equal(np.count_nonzero(x.astype('i8')), 3)
assert_equal(np.nonzero(x), ([0, 1, 1], [1, 0, 2]))
x = np.eye(3)
- assert_equal(np.count_nonzero(x), 3)
+ assert_equal(np.count_nonzero(x.astype('i1')), 3)
+ assert_equal(np.count_nonzero(x.astype('i2')), 3)
+ assert_equal(np.count_nonzero(x.astype('i4')), 3)
+ assert_equal(np.count_nonzero(x.astype('i8')), 3)
assert_equal(np.nonzero(x), ([0, 1, 2], [0, 1, 2]))
x = np.array([[(0, 1), (0, 0), (1, 11)],