3 files changed, 183 insertions, 70 deletions
diff --git a/benchmarks/benchmarks/bench_core.py b/benchmarks/benchmarks/bench_core.py
index f302f262d..30647f4b8 100644
--- a/benchmarks/benchmarks/bench_core.py
+++ b/benchmarks/benchmarks/bench_core.py
@@ -142,7 +142,7 @@ class CountNonzero(Benchmark):
     params = [
         [1, 2, 3],
         [100, 10000, 1000000],
-        [bool, int, str, object]
+        [bool, np.int8, np.int16, np.int32, np.int64, str, object]
     ]
 
     def setup(self, numaxes, size, dtype):
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c
index 8e4b2ebe1..fb354ce54 100644
--- a/numpy/core/src/multiarray/item_selection.c
+++ b/numpy/core/src/multiarray/item_selection.c
@@ -2130,7 +2130,6 @@ count_nonzero_bytes_384(const npy_uint64 * w)
 }
 
 #if NPY_SIMD
-
 /* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */
 static NPY_INLINE NPY_GCC_OPT_3 npyv_u8
 count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end, npy_uint8 max_count)
@@ -2166,18 +2165,18 @@ count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_c
     }
     return vsum16;
 }
-
+#endif // NPY_SIMD
 /*
  * Counts the number of non-zero values in a raw array.
  * The one loop process is shown below(take SSE2 with 128bits vector for example):
- *          |------------16 lanes---------|          
+ *          |------------16 lanes---------|
  *[vsum8]   255 255 255 ... 255 255 255 255 count_zero_bytes_u8: counting 255*16 elements
  *                          !!
- *           |------------8 lanes---------|          
+ *           |------------8 lanes---------|
  *[vsum16]   65535 65535 65535 ...   65535  count_zero_bytes_u16: counting (2*16-1)*16 elements
  *           65535 65535 65535 ...   65535
  *                          !!
- *           |------------4 lanes---------|          
+ *           |------------4 lanes---------|
  *[sum_32_0] 65535    65535   65535   65535  count_nonzero_bytes
  *           65535    65535   65535   65535
  *[sum_32_1] 65535    65535   65535   65535
@@ -2186,40 +2185,143 @@ count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_c
  *                     (2*16-1)*16
 */
 static NPY_INLINE NPY_GCC_OPT_3 npy_intp
-count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
+count_nonzero_u8(const char *data, npy_intp bstride, npy_uintp len)
 {
-    npy_intp zero_count = 0;
-    const npy_uint8 *end = d + unrollx;
-    while (d < end) {
-        npyv_u16x2 vsum16 = count_zero_bytes_u16(&d, end, NPY_MAX_UINT16);
-        npyv_u32x2 sum_32_0 = npyv_expand_u32_u16(vsum16.val[0]);
-        npyv_u32x2 sum_32_1 = npyv_expand_u32_u16(vsum16.val[1]);
-        zero_count += npyv_sum_u32(npyv_add_u32(
-                npyv_add_u32(sum_32_0.val[0], sum_32_0.val[1]),
-                npyv_add_u32(sum_32_1.val[0], sum_32_1.val[1])
-        ));
-    }
-    return unrollx - zero_count;
+    npy_intp count = 0;
+    if (bstride == 1) {
+    #if NPY_SIMD
+        npy_uintp len_m = len & -npyv_nlanes_u8;
+        npy_uintp zcount = 0;
+        for (const char *end = data + len_m; data < end;) {
+            npyv_u16x2 vsum16 = count_zero_bytes_u16((const npy_uint8**)&data, (const npy_uint8*)end, NPY_MAX_UINT16);
+            npyv_u32x2 sum_32_0 = npyv_expand_u32_u16(vsum16.val[0]);
+            npyv_u32x2 sum_32_1 = npyv_expand_u32_u16(vsum16.val[1]);
+            zcount += npyv_sum_u32(npyv_add_u32(
+                    npyv_add_u32(sum_32_0.val[0], sum_32_0.val[1]),
+                    npyv_add_u32(sum_32_1.val[0], sum_32_1.val[1])
+            ));
+        }
+        len  -= len_m;
+        count = len_m - zcount;
+    #else
+        if (!NPY_ALIGNMENT_REQUIRED || npy_is_aligned(data, sizeof(npy_uint64))) {
+            int step = 6 * sizeof(npy_uint64);
+            int left_bytes = len % step;
+            for (const char *end = data + len; data < end - left_bytes; data += step) {
+                 count += count_nonzero_bytes_384((const npy_uint64 *)data);
+            }
+            len = left_bytes;
+        }
+    #endif // NPY_SIMD
+    }
+    for (; len > 0; --len, data += bstride) {
+        count += (*data != 0);
+    }
+    return count;
 }
 
+static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+count_nonzero_u16(const char *data, npy_intp bstride, npy_uintp len)
+{
+    npy_intp count = 0;
+#if NPY_SIMD
+    if (bstride == sizeof(npy_uint16)) {
+        npy_uintp zcount = 0, len_m = len & -npyv_nlanes_u16;
+        const npyv_u16 vone  = npyv_setall_u16(1);
+        const npyv_u16 vzero = npyv_zero_u16();
+
+        for (npy_uintp lenx = len_m; lenx > 0;) {
+            npyv_u16 vsum16 = npyv_zero_u16();
+            npy_uintp max16 = PyArray_MIN(lenx, NPY_MAX_UINT16*npyv_nlanes_u16);
+
+            for (const char *end = data + max16*bstride; data < end; data += NPY_SIMD_WIDTH) {
+                npyv_u16 mask = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16((npy_uint16*)data), vzero));
+                         mask = npyv_and_u16(mask, vone);
+                       vsum16 = npyv_add_u16(vsum16, mask);
+            }
+            lenx   -= max16;
+            zcount += npyv_sumup_u16(vsum16);
+        }
+        len  -= len_m;
+        count = len_m - zcount;
+    }
+#endif
+    for (; len > 0; --len, data += bstride) {
+        count += (*(npy_uint16*)data != 0);
+    }
+    return count;
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+count_nonzero_u32(const char *data, npy_intp bstride, npy_uintp len)
+{
+    npy_intp count = 0;
+#if NPY_SIMD
+    if (bstride == sizeof(npy_uint32)) {
+        const npy_uintp max_iter = NPY_MAX_UINT32*npyv_nlanes_u32;
+        const npy_uintp len_m = (len > max_iter ? max_iter : len) & -npyv_nlanes_u32;
+        const npyv_u32 vone   = npyv_setall_u32(1);
+        const npyv_u32 vzero  = npyv_zero_u32();
+
+        npyv_u32 vsum32 = npyv_zero_u32();
+        for (const char *end = data + len_m*bstride; data < end; data += NPY_SIMD_WIDTH) {
+            npyv_u32 mask = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32((npy_uint32*)data), vzero));
+                     mask = npyv_and_u32(mask, vone);
+                   vsum32 = npyv_add_u32(vsum32, mask);
+        }
+        const npyv_u32 maskevn = npyv_reinterpret_u32_u64(npyv_setall_u64(0xffffffffULL));
+        npyv_u64 odd  = npyv_shri_u64(npyv_reinterpret_u64_u32(vsum32), 32);
+        npyv_u64 even = npyv_reinterpret_u64_u32(npyv_and_u32(vsum32, maskevn));
+        count = len_m - npyv_sum_u64(npyv_add_u64(odd, even));
+        len  -= len_m;
+    }
+#endif
+    for (; len > 0; --len, data += bstride) {
+        count += (*(npy_uint32*)data != 0);
+    }
+    return count;
+}
+
+static NPY_INLINE NPY_GCC_OPT_3 npy_intp
+count_nonzero_u64(const char *data, npy_intp bstride, npy_uintp len)
+{
+    npy_intp count = 0;
+#if NPY_SIMD
+    if (bstride == sizeof(npy_uint64)) {
+        const npy_uintp len_m = len & -npyv_nlanes_u64;
+        const npyv_u64 vone   = npyv_setall_u64(1);
+        const npyv_u64 vzero  = npyv_zero_u64();
+
+        npyv_u64 vsum64 = npyv_zero_u64();
+        for (const char *end = data + len_m*bstride; data < end; data += NPY_SIMD_WIDTH) {
+            npyv_u64 mask = npyv_cvt_u64_b64(npyv_cmpeq_u64(npyv_load_u64((npy_uint64*)data), vzero));
+                     mask = npyv_and_u64(mask, vone);
+                   vsum64 = npyv_add_u64(vsum64, mask);
+        }
+        len  -= len_m;
+        count = len_m - npyv_sum_u64(vsum64);
+    }
 #endif
+    for (; len > 0; --len, data += bstride) {
+        count += (*(npy_uint64*)data != 0);
+    }
+    return count;
+}
 /*
  * Counts the number of True values in a raw boolean array. This
  * is a low-overhead function which does no heap allocations.
  *
  * Returns -1 on error.
  */
-NPY_NO_EXPORT npy_intp
-count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const *astrides)
+static NPY_GCC_OPT_3 npy_intp
+count_nonzero_int(int ndim, char *data, const npy_intp *ashape, const npy_intp *astrides, int elsize)
 {
-    
+    assert(elsize <= 8);
     int idim;
     npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
-    npy_intp i, coord[NPY_MAXDIMS];
-    npy_intp count = 0;
-    NPY_BEGIN_THREADS_DEF;
+    npy_intp coord[NPY_MAXDIMS];
 
-    /* Use raw iteration with no heap memory allocation */
+    // Use raw iteration with no heap memory allocation
     if (PyArray_PrepareOneRawArrayIter(
                     ndim, ashape,
                     data, astrides,
@@ -2228,51 +2330,44 @@ count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const
         return -1;
     }
 
-    /* Handle zero-sized array */
+    // Handle zero-sized array
     if (shape[0] == 0) {
         return 0;
     }
 
+    NPY_BEGIN_THREADS_DEF;
     NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
-    /* Special case for contiguous inner loop */
-    if (strides[0] == 1) {
-        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
-            /* Process the innermost dimension */
-            const char *d = data;
-            const char *e = data + shape[0];
-#if NPY_SIMD
-            npy_uintp stride = shape[0] & -npyv_nlanes_u8;
-            count += count_nonzero_bytes((const npy_uint8 *)d, stride);
-            d += stride;
-#else
-            if (!NPY_ALIGNMENT_REQUIRED ||
-                    npy_is_aligned(d, sizeof(npy_uint64))) {
-                npy_uintp stride = 6 * sizeof(npy_uint64);
-                for (; d < e - (shape[0] % stride); d += stride) {
-                    count += count_nonzero_bytes_384((const npy_uint64 *)d);
-                }
-            }
-#endif
-            for (; d < e; ++d) {
-                count += (*d != 0);
-            }
-        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
-    }
-    /* General inner loop */
-    else {
-        NPY_RAW_ITER_START(idim, ndim, coord, shape) {
-            char *d = data;
-            /* Process the innermost dimension */
-            for (i = 0; i < shape[0]; ++i, d += strides[0]) {
-                count += (*d != 0);
-            }
-        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
+
+    #define NONZERO_CASE(LEN, SFX) \
+        case LEN: \
+            NPY_RAW_ITER_START(idim, ndim, coord, shape) { \
+                count += count_nonzero_##SFX(data, strides[0], shape[0]); \
+            } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides); \
+            break
+
+    npy_intp count = 0;
+    switch(elsize) {
+        NONZERO_CASE(1, u8);
+        NONZERO_CASE(2, u16);
+        NONZERO_CASE(4, u32);
+        NONZERO_CASE(8, u64);
     }
+    #undef NONZERO_CASE
 
     NPY_END_THREADS;
-
     return count;
 }
+/*
+ * Counts the number of True values in a raw boolean array. This
+ * is a low-overhead function which does no heap allocations.
+ *
+ * Returns -1 on error.
+ */
+NPY_NO_EXPORT NPY_GCC_OPT_3 npy_intp
+count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const *astrides)
+{
+    return count_nonzero_int(ndim, data, ashape, astrides, 1);
+}
 
 /*NUMPY_API
  * Counts the number of non-zero elements in the array.
@@ -2295,14 +2390,22 @@ PyArray_CountNonzero(PyArrayObject *self)
     npy_intp *strideptr, *innersizeptr;
     NPY_BEGIN_THREADS_DEF;
 
-    /* Special low-overhead version specific to the boolean type */
+    // Special low-overhead version specific to the boolean/int types
     dtype = PyArray_DESCR(self);
-    if (dtype->type_num == NPY_BOOL) {
-        return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),
-                        PyArray_DIMS(self), PyArray_STRIDES(self));
+    switch(dtype->kind) {
+        case 'u':
+        case 'i':
+        case 'b':
+            if (dtype->elsize > 8) {
+                break;
+            }
+            return count_nonzero_int(
+                PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self),
+                PyArray_STRIDES(self), dtype->elsize
+            );
     }
-    nonzero = PyArray_DESCR(self)->f->nonzero;
 
+    nonzero = PyArray_DESCR(self)->f->nonzero;
     /* If it's a trivial one-dimensional loop, don't use an iterator */
     if (PyArray_TRIVIALLY_ITERABLE(self)) {
         needs_api = PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI);
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index a697e5faf..06511822e 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -1266,20 +1266,30 @@ class TestNonzero:
         assert_equal(np.count_nonzero(x), 4)
         assert_equal(np.nonzero(x), ([0, 2, 3, 6],))
 
-        x = np.array([(1, 2), (0, 0), (1, 1), (-1, 3), (0, 7)],
-                     dtype=[('a', 'i4'), ('b', 'i2')])
+        # x = np.array([(1, 2), (0, 0), (1, 1), (-1, 3), (0, 7)],
+        #              dtype=[('a', 'i4'), ('b', 'i2')])
+        x = np.array([(1, 2, -5, -3), (0, 0, 2, 7), (1, 1, 0, 1), (-1, 3, 1, 0), (0, 7, 0, 4)],
+                     dtype=[('a', 'i4'), ('b', 'i2'), ('c', 'i1'), ('d', 'i8')])
         assert_equal(np.count_nonzero(x['a']), 3)
         assert_equal(np.count_nonzero(x['b']), 4)
+        assert_equal(np.count_nonzero(x['c']), 3)
+        assert_equal(np.count_nonzero(x['d']), 4)
         assert_equal(np.nonzero(x['a']), ([0, 2, 3],))
         assert_equal(np.nonzero(x['b']), ([0, 2, 3, 4],))
 
     def test_nonzero_twodim(self):
         x = np.array([[0, 1, 0], [2, 0, 3]])
-        assert_equal(np.count_nonzero(x), 3)
+        assert_equal(np.count_nonzero(x.astype('i1')), 3)
+        assert_equal(np.count_nonzero(x.astype('i2')), 3)
+        assert_equal(np.count_nonzero(x.astype('i4')), 3)
+        assert_equal(np.count_nonzero(x.astype('i8')), 3)
         assert_equal(np.nonzero(x), ([0, 1, 1], [1, 0, 2]))
 
         x = np.eye(3)
-        assert_equal(np.count_nonzero(x), 3)
+        assert_equal(np.count_nonzero(x.astype('i1')), 3)
+        assert_equal(np.count_nonzero(x.astype('i2')), 3)
+        assert_equal(np.count_nonzero(x.astype('i4')), 3)
+        assert_equal(np.count_nonzero(x.astype('i8')), 3)
         assert_equal(np.nonzero(x), ([0, 1, 2], [0, 1, 2]))
 
         x = np.array([[(0, 1), (0, 0), (1, 11)],