summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- numpy/core/setup.py 1
-rw-r--r-- numpy/core/src/multiarray/common.h 45
-rw-r--r-- numpy/core/src/multiarray/dtype_transfer.c 24
-rw-r--r-- numpy/core/src/multiarray/mapping.c 28
-rw-r--r-- numpy/core/src/umath/ufunc_type_resolution.c 13
-rw-r--r-- numpy/core/tests/test_api.py 45
6 files changed, 106 insertions, 50 deletions
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 75c129a5d..465752a40 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -894,6 +894,7 @@ def configuration(parent_package='',top_path=None):
umath_deps = [
generate_umath_py,
+ join('src', 'multiarray', 'common.h'),
join('src', 'umath', 'simd.inc.src'),
join(codegen_dir, 'generate_ufunc_api.py'),
join('src', 'private', 'ufunc_override.h')] + npymath_sources
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index f05698b9e..f94bd07d5 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -84,6 +84,51 @@ npy_is_aligned(const void * p, const npy_uintp alignment)
}
}
+/*
+ * memchr with stride and invert argument
+ * intended for small searches where a call out to libc memchr is costly.
+ * Checks up to size elements lying stride bytes apart (stride may be zero or
+ * negative) and stores the number of elements stepped over in *subloopsize.
+ * compared to memchr it returns one stride past end instead of NULL if needle
+ * is not found.
+ */
+static NPY_INLINE char *
+npy_memchr(char * haystack, char needle,
+           npy_intp stride, npy_intp size, npy_intp * subloopsize, int invert)
+{
+    char * p = haystack;
+    npy_intp count = 0;
+
+    if (stride == 0) {
+        /* all elements alias the first one, so a single test decides */
+        if (size > 0 && (*p == needle) == (invert != 0)) {
+            count = size;
+        }
+        *subloopsize = count;
+        return haystack;
+    }
+
+    /*
+     * count elements instead of comparing against an end pointer, which is
+     * only exact for a stride of one and never reached by a negative stride
+     */
+    if (!invert) {
+        while (count < size && *p != needle) {
+            ++count;
+            p += stride;
+        }
+    }
+    else {
+        while (count < size && *p == needle) {
+            ++count;
+            p += stride;
+        }
+    }
+
+    *subloopsize = count;
+    return p;
+}
+
#include "ucsnarrow.h"
#endif
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index fdf1871d2..7ca9cc6cb 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -3059,22 +3059,14 @@ void _strided_masked_wrapper_decsrcref_transfer_function(
while (N > 0) {
/* Skip masked values, still calling decsrcref for move_references */
- subloopsize = 0;
- while (subloopsize < N && !*mask) {
- ++subloopsize;
- mask += mask_stride;
- }
+ mask = npy_memchr((char *)mask, 0, mask_stride, N, &subloopsize, 1);
decsrcref_stransfer(NULL, 0, src, src_stride,
subloopsize, src_itemsize, decsrcref_transferdata);
dst += subloopsize * dst_stride;
src += subloopsize * src_stride;
N -= subloopsize;
/* Process unmasked values */
- subloopsize = 0;
- while (subloopsize < N && *mask) {
- ++subloopsize;
- mask += mask_stride;
- }
+ mask = npy_memchr((char *)mask, 0, mask_stride, N, &subloopsize, 0);
unmasked_stransfer(dst, dst_stride, src, src_stride,
subloopsize, src_itemsize, unmasked_transferdata);
dst += subloopsize * dst_stride;
@@ -3102,20 +3094,12 @@ void _strided_masked_wrapper_transfer_function(
while (N > 0) {
/* Skip masked values */
- subloopsize = 0;
- while (subloopsize < N && !*mask) {
- ++subloopsize;
- mask += mask_stride;
- }
+ mask = npy_memchr((char *)mask, 0, mask_stride, N, &subloopsize, 1);
dst += subloopsize * dst_stride;
src += subloopsize * src_stride;
N -= subloopsize;
/* Process unmasked values */
- subloopsize = 0;
- while (subloopsize < N && *mask) {
- ++subloopsize;
- mask += mask_stride;
- }
+ mask = npy_memchr((char *)mask, 0, mask_stride, N, &subloopsize, 0);
unmasked_stransfer(dst, dst_stride, src, src_stride,
subloopsize, src_itemsize, unmasked_transferdata);
dst += subloopsize * dst_stride;
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index e1c287991..6bc4fb42d 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -725,19 +725,13 @@ array_boolean_subscript(PyArrayObject *self,
while (innersize > 0) {
/* Skip masked values */
- subloopsize = 0;
- while (subloopsize < innersize && *bmask_data == 0) {
- ++subloopsize;
- bmask_data += bmask_stride;
- }
+ bmask_data = npy_memchr(bmask_data, 0, bmask_stride,
+ innersize, &subloopsize, 1);
innersize -= subloopsize;
self_data += subloopsize * self_stride;
/* Process unmasked values */
- subloopsize = 0;
- while (subloopsize < innersize && *bmask_data != 0) {
- ++subloopsize;
- bmask_data += bmask_stride;
- }
+ bmask_data = npy_memchr(bmask_data, 0, bmask_stride, innersize,
+ &subloopsize, 0);
stransfer(ret_data, itemsize, self_data, self_stride,
subloopsize, itemsize, transferdata);
innersize -= subloopsize;
@@ -884,19 +878,13 @@ array_ass_boolean_subscript(PyArrayObject *self,
while (innersize > 0) {
/* Skip masked values */
- subloopsize = 0;
- while (subloopsize < innersize && *bmask_data == 0) {
- ++subloopsize;
- bmask_data += bmask_stride;
- }
+ bmask_data = npy_memchr(bmask_data, 0, bmask_stride,
+ innersize, &subloopsize, 1);
innersize -= subloopsize;
self_data += subloopsize * self_stride;
/* Process unmasked values */
- subloopsize = 0;
- while (subloopsize < innersize && *bmask_data != 0) {
- ++subloopsize;
- bmask_data += bmask_stride;
- }
+ bmask_data = npy_memchr(bmask_data, 0, bmask_stride, innersize,
+ &subloopsize, 0);
stransfer(self_data, self_stride, v_data, v_stride,
subloopsize, src_itemsize, transferdata);
innersize -= subloopsize;
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index 36dbf6569..12d8e406b 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -23,6 +23,7 @@
#include "numpy/ufuncobject.h"
#include "ufunc_type_resolution.h"
+#include "common.h"
static const char *
npy_casting_to_string(NPY_CASTING casting)
@@ -1343,11 +1344,7 @@ unmasked_ufunc_loop_as_masked(
/* Process the data as runs of unmasked values */
do {
/* Skip masked values */
- subloopsize = 0;
- while (subloopsize < loopsize && !*mask) {
- ++subloopsize;
- mask += mask_stride;
- }
+ mask = npy_memchr(mask, 0, mask_stride, loopsize, &subloopsize, 1);
for (iargs = 0; iargs < nargs; ++iargs) {
dataptrs[iargs] += subloopsize * strides[iargs];
}
@@ -1356,11 +1353,7 @@ unmasked_ufunc_loop_as_masked(
* Process unmasked values (assumes unmasked loop doesn't
* mess with the 'args' pointer values)
*/
- subloopsize = 0;
- while (subloopsize < loopsize && *mask) {
- ++subloopsize;
- mask += mask_stride;
- }
+ mask = npy_memchr(mask, 0, mask_stride, loopsize, &subloopsize, 0);
unmasked_innerloop(dataptrs, &subloopsize, strides,
unmasked_innerloopdata);
for (iargs = 0; iargs < nargs; ++iargs) {
diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py
index 282288e23..d642a2237 100644
--- a/numpy/core/tests/test_api.py
+++ b/numpy/core/tests/test_api.py
@@ -331,6 +331,51 @@ def test_copyto():
# 'dst' must be an array
assert_raises(TypeError, np.copyto, [1, 2, 3], [2, 3, 4])
+def test_copyto_permut():
+ # test all permutation of possible masks, 9 should be sufficient for
+ # current 4 byte unrolled code
+ power = 9
+ d = np.ones(power)
+ for i in range(2**power):
+ r = np.zeros(power)
+ l = [(i & x) != 0 for x in range(power)]
+ mask = np.array(l)
+ np.copyto(r, d, where=mask)
+ assert_array_equal(r == 1, l)
+ assert_equal(r.sum(), sum(l))
+
+ r = np.zeros(power)
+ np.copyto(r, d, where=mask[::-1])
+ assert_array_equal(r == 1, l[::-1])
+ assert_equal(r.sum(), sum(l))
+
+ r = np.zeros(power)
+ np.copyto(r[::2], d[::2], where=mask[::2])
+ assert_array_equal(r[::2] == 1, l[::2])
+ assert_equal(r[::2].sum(), sum(l[::2]))
+
+ r = np.zeros(power)
+ np.copyto(r[::2], d[::2], where=mask[::-2])
+ assert_array_equal(r[::2] == 1, l[::-2])
+ assert_equal(r[::2].sum(), sum(l[::-2]))
+
+ for c in [0xFF, 0x7F, 0x02, 0x10]:
+ r = np.zeros(power)
+ mask = np.array(l)
+ imask = np.array(l).view(np.uint8)
+ imask[mask != 0] = 0xFF
+ np.copyto(r, d, where=mask)
+ assert_array_equal(r == 1, l)
+ assert_equal(r.sum(), sum(l))
+
+ r = np.zeros(power)
+ np.copyto(r, d, where=True)
+ assert_equal(r.sum(), r.size)
+ r = np.ones(power)
+ d = np.zeros(power)
+ np.copyto(r, d, where=False)
+ assert_equal(r.sum(), r.size)
+
def test_copy_order():
a = np.arange(24).reshape(2, 1, 3, 4)
b = a.copy(order='F')