diff options
-rw-r--r-- | benchmarks/benchmarks/bench_core.py | 2 | ||||
-rw-r--r-- | numpy/core/src/multiarray/item_selection.c | 233 | ||||
-rw-r--r-- | numpy/core/tests/test_numeric.py | 18 |
3 files changed, 183 insertions, 70 deletions
diff --git a/benchmarks/benchmarks/bench_core.py b/benchmarks/benchmarks/bench_core.py index f302f262d..30647f4b8 100644 --- a/benchmarks/benchmarks/bench_core.py +++ b/benchmarks/benchmarks/bench_core.py @@ -142,7 +142,7 @@ class CountNonzero(Benchmark): params = [ [1, 2, 3], [100, 10000, 1000000], - [bool, int, str, object] + [bool, np.int8, np.int16, np.int32, np.int64, str, object] ] def setup(self, numaxes, size, dtype): diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index 8e4b2ebe1..fb354ce54 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -2130,7 +2130,6 @@ count_nonzero_bytes_384(const npy_uint64 * w) } #if NPY_SIMD - /* Count the zero bytes between `*d` and `end`, updating `*d` to point to where to keep counting from. */ static NPY_INLINE NPY_GCC_OPT_3 npyv_u8 count_zero_bytes_u8(const npy_uint8 **d, const npy_uint8 *end, npy_uint8 max_count) @@ -2166,18 +2165,18 @@ count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_c } return vsum16; } - +#endif // NPY_SIMD /* * Counts the number of non-zero values in a raw array. * The one loop process is shown below(take SSE2 with 128bits vector for example): - * |------------16 lanes---------| + * |------------16 lanes---------| *[vsum8] 255 255 255 ... 255 255 255 255 count_zero_bytes_u8: counting 255*16 elements * !! - * |------------8 lanes---------| + * |------------8 lanes---------| *[vsum16] 65535 65535 65535 ... 65535 count_zero_bytes_u16: counting (2*16-1)*16 elements * 65535 65535 65535 ... 65535 * !! - * |------------4 lanes---------| + * |------------4 lanes---------| *[sum_32_0] 65535 65535 65535 65535 count_nonzero_bytes * 65535 65535 65535 65535 *[sum_32_1] 65535 65535 65535 65535 @@ -2186,40 +2185,143 @@ count_zero_bytes_u16(const npy_uint8 **d, const npy_uint8 *end, npy_uint16 max_c * (2*16-1)*16 */ static NPY_INLINE NPY_GCC_OPT_3 npy_intp -count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx) +count_nonzero_u8(const char *data, npy_intp bstride, npy_uintp len) { - npy_intp zero_count = 0; - const npy_uint8 *end = d + unrollx; - while (d < end) { - npyv_u16x2 vsum16 = count_zero_bytes_u16(&d, end, NPY_MAX_UINT16); - npyv_u32x2 sum_32_0 = npyv_expand_u32_u16(vsum16.val[0]); - npyv_u32x2 sum_32_1 = npyv_expand_u32_u16(vsum16.val[1]); - zero_count += npyv_sum_u32(npyv_add_u32( - npyv_add_u32(sum_32_0.val[0], sum_32_0.val[1]), - npyv_add_u32(sum_32_1.val[0], sum_32_1.val[1]) - )); - } - return unrollx - zero_count; + npy_intp count = 0; + if (bstride == 1) { + #if NPY_SIMD + npy_uintp len_m = len & -npyv_nlanes_u8; + npy_uintp zcount = 0; + for (const char *end = data + len_m; data < end;) { + npyv_u16x2 vsum16 = count_zero_bytes_u16((const npy_uint8**)&data, (const npy_uint8*)end, NPY_MAX_UINT16); + npyv_u32x2 sum_32_0 = npyv_expand_u32_u16(vsum16.val[0]); + npyv_u32x2 sum_32_1 = npyv_expand_u32_u16(vsum16.val[1]); + zcount += npyv_sum_u32(npyv_add_u32( + npyv_add_u32(sum_32_0.val[0], sum_32_0.val[1]), + npyv_add_u32(sum_32_1.val[0], sum_32_1.val[1]) + )); + } + len -= len_m; + count = len_m - zcount; + #else + if (!NPY_ALIGNMENT_REQUIRED || npy_is_aligned(data, sizeof(npy_uint64))) { + int step = 6 * sizeof(npy_uint64); + int left_bytes = len % step; + for (const char *end = data + len; data < end - left_bytes; data += step) { + count += count_nonzero_bytes_384((const npy_uint64 *)data); + } + len = left_bytes; + } + #endif // NPY_SIMD + } + for (; len > 0; --len, data += bstride) { + count += (*data != 0); + } + return count; } +static NPY_INLINE NPY_GCC_OPT_3 npy_intp +count_nonzero_u16(const char *data, npy_intp bstride, npy_uintp len) +{ + npy_intp count = 0; +#if NPY_SIMD + if (bstride == sizeof(npy_uint16)) { + npy_uintp zcount = 0, len_m = len & -npyv_nlanes_u16; + const npyv_u16 vone = npyv_setall_u16(1); + const npyv_u16 vzero = npyv_zero_u16(); + + for (npy_uintp lenx = len_m; lenx > 0;) { + npyv_u16 vsum16 = npyv_zero_u16(); + npy_uintp max16 = PyArray_MIN(lenx, NPY_MAX_UINT16*npyv_nlanes_u16); + + for (const char *end = data + max16*bstride; data < end; data += NPY_SIMD_WIDTH) { + npyv_u16 mask = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16((npy_uint16*)data), vzero)); + mask = npyv_and_u16(mask, vone); + vsum16 = npyv_add_u16(vsum16, mask); + } + lenx -= max16; + zcount += npyv_sumup_u16(vsum16); + } + len -= len_m; + count = len_m - zcount; + } +#endif + for (; len > 0; --len, data += bstride) { + count += (*(npy_uint16*)data != 0); + } + return count; +} + +static NPY_INLINE NPY_GCC_OPT_3 npy_intp +count_nonzero_u32(const char *data, npy_intp bstride, npy_uintp len) +{ + npy_intp count = 0; +#if NPY_SIMD + if (bstride == sizeof(npy_uint32)) { + const npy_uintp max_iter = NPY_MAX_UINT32*npyv_nlanes_u32; + const npy_uintp len_m = (len > max_iter ? max_iter : len) & -npyv_nlanes_u32; + const npyv_u32 vone = npyv_setall_u32(1); + const npyv_u32 vzero = npyv_zero_u32(); + + npyv_u32 vsum32 = npyv_zero_u32(); + for (const char *end = data + len_m*bstride; data < end; data += NPY_SIMD_WIDTH) { + npyv_u32 mask = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32((npy_uint32*)data), vzero)); + mask = npyv_and_u32(mask, vone); + vsum32 = npyv_add_u32(vsum32, mask); + } + const npyv_u32 maskevn = npyv_reinterpret_u32_u64(npyv_setall_u64(0xffffffffULL)); + npyv_u64 odd = npyv_shri_u64(npyv_reinterpret_u64_u32(vsum32), 32); + npyv_u64 even = npyv_reinterpret_u64_u32(npyv_and_u32(vsum32, maskevn)); + count = len_m - npyv_sum_u64(npyv_add_u64(odd, even)); + len -= len_m; + } +#endif + for (; len > 0; --len, data += bstride) { + count += (*(npy_uint32*)data != 0); + } + return count; +} + +static NPY_INLINE NPY_GCC_OPT_3 npy_intp +count_nonzero_u64(const char *data, npy_intp bstride, npy_uintp len) +{ + npy_intp count = 0; +#if NPY_SIMD + if (bstride == sizeof(npy_uint64)) { + const npy_uintp len_m = len & -npyv_nlanes_u64; + const npyv_u64 vone = npyv_setall_u64(1); + const npyv_u64 vzero = npyv_zero_u64(); + + npyv_u64 vsum64 = npyv_zero_u64(); + for (const char *end = data + len_m*bstride; data < end; data += NPY_SIMD_WIDTH) { + npyv_u64 mask = npyv_cvt_u64_b64(npyv_cmpeq_u64(npyv_load_u64((npy_uint64*)data), vzero)); + mask = npyv_and_u64(mask, vone); + vsum64 = npyv_add_u64(vsum64, mask); + } + len -= len_m; + count = len_m - npyv_sum_u64(vsum64); + } #endif + for (; len > 0; --len, data += bstride) { + count += (*(npy_uint64*)data != 0); + } + return count; +} /* * Counts the number of True values in a raw boolean array. This * is a low-overhead function which does no heap allocations. * * Returns -1 on error. */ -NPY_NO_EXPORT npy_intp -count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const *astrides) +static NPY_GCC_OPT_3 npy_intp +count_nonzero_int(int ndim, char *data, const npy_intp *ashape, const npy_intp *astrides, int elsize) { - + assert(elsize <= 8); int idim; npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS]; - npy_intp i, coord[NPY_MAXDIMS]; - npy_intp count = 0; - NPY_BEGIN_THREADS_DEF; + npy_intp coord[NPY_MAXDIMS]; - /* Use raw iteration with no heap memory allocation */ + // Use raw iteration with no heap memory allocation if (PyArray_PrepareOneRawArrayIter( ndim, ashape, data, astrides, @@ -2228,51 +2330,44 @@ count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const return -1; } - /* Handle zero-sized array */ + // Handle zero-sized array if (shape[0] == 0) { return 0; } + NPY_BEGIN_THREADS_DEF; NPY_BEGIN_THREADS_THRESHOLDED(shape[0]); - /* Special case for contiguous inner loop */ - if (strides[0] == 1) { - NPY_RAW_ITER_START(idim, ndim, coord, shape) { - /* Process the innermost dimension */ - const char *d = data; - const char *e = data + shape[0]; -#if NPY_SIMD - npy_uintp stride = shape[0] & -npyv_nlanes_u8; - count += count_nonzero_bytes((const npy_uint8 *)d, stride); - d += stride; -#else - if (!NPY_ALIGNMENT_REQUIRED || - npy_is_aligned(d, sizeof(npy_uint64))) { - npy_uintp stride = 6 * sizeof(npy_uint64); - for (; d < e - (shape[0] % stride); d += stride) { - count += count_nonzero_bytes_384((const npy_uint64 *)d); - } - } -#endif - for (; d < e; ++d) { - count += (*d != 0); - } - } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides); - } - /* General inner loop */ - else { - NPY_RAW_ITER_START(idim, ndim, coord, shape) { - char *d = data; - /* Process the innermost dimension */ - for (i = 0; i < shape[0]; ++i, d += strides[0]) { - count += (*d != 0); - } - } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides); + + #define NONZERO_CASE(LEN, SFX) \ + case LEN: \ + NPY_RAW_ITER_START(idim, ndim, coord, shape) { \ + count += count_nonzero_##SFX(data, strides[0], shape[0]); \ + } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides); \ + break + + npy_intp count = 0; + switch(elsize) { + NONZERO_CASE(1, u8); + NONZERO_CASE(2, u16); + NONZERO_CASE(4, u32); + NONZERO_CASE(8, u64); } + #undef NONZERO_CASE NPY_END_THREADS; - return count; } +/* + * Counts the number of True values in a raw boolean array. This + * is a low-overhead function which does no heap allocations. + * + * Returns -1 on error. + */ +NPY_NO_EXPORT NPY_GCC_OPT_3 npy_intp +count_boolean_trues(int ndim, char *data, npy_intp const *ashape, npy_intp const *astrides) +{ + return count_nonzero_int(ndim, data, ashape, astrides, 1); +} /*NUMPY_API * Counts the number of non-zero elements in the array. @@ -2295,14 +2390,22 @@ PyArray_CountNonzero(PyArrayObject *self) npy_intp *strideptr, *innersizeptr; NPY_BEGIN_THREADS_DEF; - /* Special low-overhead version specific to the boolean type */ + // Special low-overhead version specific to the boolean/int types dtype = PyArray_DESCR(self); - if (dtype->type_num == NPY_BOOL) { - return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self), - PyArray_DIMS(self), PyArray_STRIDES(self)); + switch(dtype->kind) { + case 'u': + case 'i': + case 'b': + if (dtype->elsize > 8) { + break; + } + return count_nonzero_int( + PyArray_NDIM(self), PyArray_BYTES(self), PyArray_DIMS(self), + PyArray_STRIDES(self), dtype->elsize + ); } - nonzero = PyArray_DESCR(self)->f->nonzero; + nonzero = PyArray_DESCR(self)->f->nonzero; /* If it's a trivial one-dimensional loop, don't use an iterator */ if (PyArray_TRIVIALLY_ITERABLE(self)) { needs_api = PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI); diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py index a697e5faf..06511822e 100644 --- a/numpy/core/tests/test_numeric.py +++ b/numpy/core/tests/test_numeric.py @@ -1266,20 +1266,30 @@ class TestNonzero: assert_equal(np.count_nonzero(x), 4) assert_equal(np.nonzero(x), ([0, 2, 3, 6],)) - x = np.array([(1, 2), (0, 0), (1, 1), (-1, 3), (0, 7)], - dtype=[('a', 'i4'), ('b', 'i2')]) + # x = np.array([(1, 2), (0, 0), (1, 1), (-1, 3), (0, 7)], + # dtype=[('a', 'i4'), ('b', 'i2')]) + x = np.array([(1, 2, -5, -3), (0, 0, 2, 7), (1, 1, 0, 1), (-1, 3, 1, 0), (0, 7, 0, 4)], + dtype=[('a', 'i4'), ('b', 'i2'), ('c', 'i1'), ('d', 'i8')]) assert_equal(np.count_nonzero(x['a']), 3) assert_equal(np.count_nonzero(x['b']), 4) + assert_equal(np.count_nonzero(x['c']), 3) + assert_equal(np.count_nonzero(x['d']), 4) assert_equal(np.nonzero(x['a']), ([0, 2, 3],)) assert_equal(np.nonzero(x['b']), ([0, 2, 3, 4],)) def test_nonzero_twodim(self): x = np.array([[0, 1, 0], [2, 0, 3]]) - assert_equal(np.count_nonzero(x), 3) + assert_equal(np.count_nonzero(x.astype('i1')), 3) + assert_equal(np.count_nonzero(x.astype('i2')), 3) + assert_equal(np.count_nonzero(x.astype('i4')), 3) + assert_equal(np.count_nonzero(x.astype('i8')), 3) assert_equal(np.nonzero(x), ([0, 1, 1], [1, 0, 2])) x = np.eye(3) - assert_equal(np.count_nonzero(x), 3) + assert_equal(np.count_nonzero(x.astype('i1')), 3) + assert_equal(np.count_nonzero(x.astype('i2')), 3) + assert_equal(np.count_nonzero(x.astype('i4')), 3) + assert_equal(np.count_nonzero(x.astype('i8')), 3) assert_equal(np.nonzero(x), ([0, 1, 2], [0, 1, 2])) x = np.array([[(0, 1), (0, 0), (1, 11)], |