diff options
author | Charles Harris <charlesr.harris@gmail.com> | 2013-10-16 17:46:58 -0700 |
---|---|---|
committer | Charles Harris <charlesr.harris@gmail.com> | 2013-10-16 17:46:58 -0700 |
commit | 3b3fa76d5051078d27b1a30b77b586e09c8b889d (patch) | |
tree | dce9ea01694e4167d71d7ad6408ec3a0240ad78d /numpy/core | |
parent | cf0869ea03e671986525c20bfda52deda501b316 (diff) | |
parent | 7d4ea165817fc613c79bb92ccb3844df94d1beed (diff) | |
download | numpy-3b3fa76d5051078d27b1a30b77b586e09c8b889d.tar.gz |
Merge pull request #3931 from juliantaylor/memchr-move
move memchr-like code to a function
Diffstat (limited to 'numpy/core')
-rw-r--r-- | numpy/core/include/numpy/npy_cpu.h | 4 | ||||
-rw-r--r-- | numpy/core/setup.py | 1 | ||||
-rw-r--r-- | numpy/core/setup_common.py | 1 | ||||
-rw-r--r-- | numpy/core/src/multiarray/common.c | 2 | ||||
-rw-r--r-- | numpy/core/src/multiarray/common.h | 66 | ||||
-rw-r--r-- | numpy/core/src/multiarray/dtype_transfer.c | 24 | ||||
-rw-r--r-- | numpy/core/src/multiarray/item_selection.c | 4 | ||||
-rw-r--r-- | numpy/core/src/multiarray/lowlevel_strided_loops.c.src | 2 | ||||
-rw-r--r-- | numpy/core/src/multiarray/mapping.c | 28 | ||||
-rw-r--r-- | numpy/core/src/umath/ufunc_type_resolution.c | 13 | ||||
-rw-r--r-- | numpy/core/tests/test_api.py | 45 |
11 files changed, 136 insertions, 54 deletions
diff --git a/numpy/core/include/numpy/npy_cpu.h b/numpy/core/include/numpy/npy_cpu.h index ab14731b8..6773d3258 100644 --- a/numpy/core/include/numpy/npy_cpu.h +++ b/numpy/core/include/numpy/npy_cpu.h @@ -108,4 +108,8 @@ #endif #endif +#if (defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)) +#define NPY_CPU_HAVE_UNALIGNED_ACCESS +#endif + #endif diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 75c129a5d..465752a40 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -894,6 +894,7 @@ def configuration(parent_package='',top_path=None): umath_deps = [ generate_umath_py, + join('src', 'multiarray', 'common.h'), join('src', 'umath', 'simd.inc.src'), join(codegen_dir, 'generate_ufunc_api.py'), join('src', 'private', 'ufunc_override.h')] + npymath_sources diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py index bad3607fa..4633aef84 100644 --- a/numpy/core/setup_common.py +++ b/numpy/core/setup_common.py @@ -116,6 +116,7 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'), ("__builtin_bswap32", '5u'), ("__builtin_bswap64", '5u'), ("__builtin_expect", '5, 0'), + ("__builtin_ctz", '5'), ("_mm_load_ps", '(float*)0', "xmmintrin.h"), # SSE ("_mm_load_pd", '(double*)0', "emmintrin.h"), # SSE2 ] diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c index 4e2d64be3..9a05133a1 100644 --- a/numpy/core/src/multiarray/common.c +++ b/numpy/core/src/multiarray/common.c @@ -706,7 +706,7 @@ _IsAligned(PyArrayObject *ap) aligned |= (npy_uintp)PyArray_STRIDES(ap)[i]; #endif /* not NPY_RELAXED_STRIDES_CHECKING */ } - return npy_is_aligned(aligned, alignment); + return npy_is_aligned((void *)aligned, alignment); } NPY_NO_EXPORT npy_bool diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h index f05698b9e..3e060de3d 100644 --- a/numpy/core/src/multiarray/common.h +++ b/numpy/core/src/multiarray/common.h @@ -1,6 +1,7 @@ #ifndef _NPY_PRIVATE_COMMON_H_ #define _NPY_PRIVATE_COMMON_H_ #include 
<numpy/npy_common.h> +#include <numpy/npy_cpu.h> #define error_converting(x) (((x) == -1) && PyErr_Occurred()) @@ -84,6 +85,71 @@ npy_is_aligned(const void * p, const npy_uintp alignment) } } +/* + * memchr with stride and invert argument + * intended for small searches where a call out to libc memchr is costly. + * stride must be a multiple of size. + * compared to memchr it returns one stride past end instead of NULL if needle + * is not found. + */ +static NPY_INLINE char * +npy_memchr(char * haystack, char needle, + npy_intp stride, npy_intp size, npy_intp * subloopsize, int invert) +{ + char * p = haystack; + char * const end = haystack + size; + if (stride == 0) { + if (!invert) { + p = (*p != needle) ? end : haystack; + } + else { + p = (*p == needle) ? end : haystack; + } + *subloopsize = (p - haystack); + return haystack; + } + + if (!invert) { + /* + * this is usually the path to determine elements to process, + * performance less important here. + * memchr has large setup cost if 0 byte is close to start. 
+ */ + while (p < end && *p != needle) { + p += stride; + } + } + else { + /* usually find elements to skip path */ +#if (defined HAVE___BUILTIN_CTZ && defined NPY_CPU_HAVE_UNALIGNED_ACCESS) + if (needle == 0 && stride == 1) { + while (p < end - ((npy_uintp)end % sizeof(unsigned int))) { + unsigned int v = *(unsigned int*)p; + if (v == 0) { + p += sizeof(unsigned int); + continue; + } + p += __builtin_ctz(v) / 8; + *subloopsize = (p - haystack) / stride; + return p; + } + } +#endif + while (p < end && *p == needle) { + p += stride; + } + } + + /* division is very expensive */ + if (NPY_LIKELY(stride == 1)) { + *subloopsize = (p - haystack); + } + else { + *subloopsize = (p - haystack) / stride; + } + return p; +} + #include "ucsnarrow.h" #endif diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c index fdf1871d2..7ca9cc6cb 100644 --- a/numpy/core/src/multiarray/dtype_transfer.c +++ b/numpy/core/src/multiarray/dtype_transfer.c @@ -3059,22 +3059,14 @@ void _strided_masked_wrapper_decsrcref_transfer_function( while (N > 0) { /* Skip masked values, still calling decsrcref for move_references */ - subloopsize = 0; - while (subloopsize < N && !*mask) { - ++subloopsize; - mask += mask_stride; - } + mask = npy_memchr((char *)mask, 0, mask_stride, N, &subloopsize, 1); decsrcref_stransfer(NULL, 0, src, src_stride, subloopsize, src_itemsize, decsrcref_transferdata); dst += subloopsize * dst_stride; src += subloopsize * src_stride; N -= subloopsize; /* Process unmasked values */ - subloopsize = 0; - while (subloopsize < N && *mask) { - ++subloopsize; - mask += mask_stride; - } + mask = npy_memchr((char *)mask, 0, mask_stride, N, &subloopsize, 0); unmasked_stransfer(dst, dst_stride, src, src_stride, subloopsize, src_itemsize, unmasked_transferdata); dst += subloopsize * dst_stride; @@ -3102,20 +3094,12 @@ void _strided_masked_wrapper_transfer_function( while (N > 0) { /* Skip masked values */ - subloopsize = 0; - while 
(subloopsize < N && !*mask) { - ++subloopsize; - mask += mask_stride; - } + mask = npy_memchr((char *)mask, 0, mask_stride, N, &subloopsize, 1); dst += subloopsize * dst_stride; src += subloopsize * src_stride; N -= subloopsize; /* Process unmasked values */ - subloopsize = 0; - while (subloopsize < N && *mask) { - ++subloopsize; - mask += mask_stride; - } + mask = npy_memchr((char *)mask, 0, mask_stride, N, &subloopsize, 0); unmasked_stransfer(dst, dst_stride, src, src_stride, subloopsize, src_itemsize, unmasked_transferdata); dst += subloopsize * dst_stride; diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index af6ee8828..72cbf9a22 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -2414,7 +2414,7 @@ count_nonzero_bytes_128(npy_uint64 * w) */ if (NPY_UNLIKELY(((w1 | w2) & 0xFEFEFEFEFEFEFEFEULL) != 0)) { /* reload from pointer to avoid a unnecessary stack spill with gcc */ - char * c = w; + char * c = (char *)w; npy_uintp i, count = 0; for (i = 0; i < 16; i++) { count += (c[i] != 0); @@ -2469,7 +2469,7 @@ count_boolean_trues(int ndim, char *data, npy_intp *ashape, npy_intp *astrides) if (npy_is_aligned(data, sizeof(npy_uint64))) { npy_uintp stride = 2 * sizeof(npy_uint64); for (; d < e - (shape[0] % stride); d += stride) { - count += count_nonzero_bytes_128(d); + count += count_nonzero_bytes_128((npy_uint64 *)d); } } for (; d < e; ++d) { diff --git a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src index c22195c16..007ec8b9b 100644 --- a/numpy/core/src/multiarray/lowlevel_strided_loops.c.src +++ b/numpy/core/src/multiarray/lowlevel_strided_loops.c.src @@ -39,7 +39,7 @@ * instructions (16 byte). * So this flag can only be enabled if autovectorization is disabled. 
*/ -#if (defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64)) +#ifdef NPY_CPU_HAVE_UNALIGNED_ACCESS # define NPY_USE_UNALIGNED_ACCESS 0 #else # define NPY_USE_UNALIGNED_ACCESS 0 diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c index e1c287991..6bc4fb42d 100644 --- a/numpy/core/src/multiarray/mapping.c +++ b/numpy/core/src/multiarray/mapping.c @@ -725,19 +725,13 @@ array_boolean_subscript(PyArrayObject *self, while (innersize > 0) { /* Skip masked values */ - subloopsize = 0; - while (subloopsize < innersize && *bmask_data == 0) { - ++subloopsize; - bmask_data += bmask_stride; - } + bmask_data = npy_memchr(bmask_data, 0, bmask_stride, + innersize, &subloopsize, 1); innersize -= subloopsize; self_data += subloopsize * self_stride; /* Process unmasked values */ - subloopsize = 0; - while (subloopsize < innersize && *bmask_data != 0) { - ++subloopsize; - bmask_data += bmask_stride; - } + bmask_data = npy_memchr(bmask_data, 0, bmask_stride, innersize, + &subloopsize, 0); stransfer(ret_data, itemsize, self_data, self_stride, subloopsize, itemsize, transferdata); innersize -= subloopsize; @@ -884,19 +878,13 @@ array_ass_boolean_subscript(PyArrayObject *self, while (innersize > 0) { /* Skip masked values */ - subloopsize = 0; - while (subloopsize < innersize && *bmask_data == 0) { - ++subloopsize; - bmask_data += bmask_stride; - } + bmask_data = npy_memchr(bmask_data, 0, bmask_stride, + innersize, &subloopsize, 1); innersize -= subloopsize; self_data += subloopsize * self_stride; /* Process unmasked values */ - subloopsize = 0; - while (subloopsize < innersize && *bmask_data != 0) { - ++subloopsize; - bmask_data += bmask_stride; - } + bmask_data = npy_memchr(bmask_data, 0, bmask_stride, innersize, + &subloopsize, 0); stransfer(self_data, self_stride, v_data, v_stride, subloopsize, src_itemsize, transferdata); innersize -= subloopsize; diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c 
index 36dbf6569..12d8e406b 100644 --- a/numpy/core/src/umath/ufunc_type_resolution.c +++ b/numpy/core/src/umath/ufunc_type_resolution.c @@ -23,6 +23,7 @@ #include "numpy/ufuncobject.h" #include "ufunc_type_resolution.h" +#include "common.h" static const char * npy_casting_to_string(NPY_CASTING casting) @@ -1343,11 +1344,7 @@ unmasked_ufunc_loop_as_masked( /* Process the data as runs of unmasked values */ do { /* Skip masked values */ - subloopsize = 0; - while (subloopsize < loopsize && !*mask) { - ++subloopsize; - mask += mask_stride; - } + mask = npy_memchr(mask, 0, mask_stride, loopsize, &subloopsize, 1); for (iargs = 0; iargs < nargs; ++iargs) { dataptrs[iargs] += subloopsize * strides[iargs]; } @@ -1356,11 +1353,7 @@ unmasked_ufunc_loop_as_masked( * Process unmasked values (assumes unmasked loop doesn't * mess with the 'args' pointer values) */ - subloopsize = 0; - while (subloopsize < loopsize && *mask) { - ++subloopsize; - mask += mask_stride; - } + mask = npy_memchr(mask, 0, mask_stride, loopsize, &subloopsize, 0); unmasked_innerloop(dataptrs, &subloopsize, strides, unmasked_innerloopdata); for (iargs = 0; iargs < nargs; ++iargs) { diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py index 282288e23..d642a2237 100644 --- a/numpy/core/tests/test_api.py +++ b/numpy/core/tests/test_api.py @@ -331,6 +331,51 @@ def test_copyto(): # 'dst' must be an array assert_raises(TypeError, np.copyto, [1, 2, 3], [2, 3, 4]) +def test_copyto_permut(): + # test all permutation of possible masks, 9 should be sufficient for + # current 4 byte unrolled code + power = 9 + d = np.ones(power) + for i in range(2**power): + r = np.zeros(power) + l = [(i & x) != 0 for x in range(power)] + mask = np.array(l) + np.copyto(r, d, where=mask) + assert_array_equal(r == 1, l) + assert_equal(r.sum(), sum(l)) + + r = np.zeros(power) + np.copyto(r, d, where=mask[::-1]) + assert_array_equal(r == 1, l[::-1]) + assert_equal(r.sum(), sum(l)) + + r = np.zeros(power) + 
np.copyto(r[::2], d[::2], where=mask[::2]) + assert_array_equal(r[::2] == 1, l[::2]) + assert_equal(r[::2].sum(), sum(l[::2])) + + r = np.zeros(power) + np.copyto(r[::2], d[::2], where=mask[::-2]) + assert_array_equal(r[::2] == 1, l[::-2]) + assert_equal(r[::2].sum(), sum(l[::-2])) + + for c in [0xFF, 0x7F, 0x02, 0x10]: + r = np.zeros(power) + mask = np.array(l) + imask = np.array(l).view(np.uint8) + imask[mask != 0] = 0xFF + np.copyto(r, d, where=mask) + assert_array_equal(r == 1, l) + assert_equal(r.sum(), sum(l)) + + r = np.zeros(power) + np.copyto(r, d, where=True) + assert_equal(r.sum(), r.size) + r = np.ones(power) + d = np.zeros(power) + np.copyto(r, d, where=False) + assert_equal(r.sum(), r.size) + def test_copy_order(): a = np.arange(24).reshape(2, 1, 3, 4) b = a.copy(order='F') |