From cfd81489a61c5144c9a77bb0494877817acd24d3 Mon Sep 17 00:00:00 2001
From: Julian Taylor
Date: Wed, 23 Oct 2013 20:51:21 +0200
Subject: BUG: fix handling of negative strides in npy_memchr

The new code did not account for them at all; add the old loops back,
but keep the stride 1 optimization for sparse masks.
---
 numpy/core/src/multiarray/common.h | 33 +++++++++++----------------------
 numpy/core/tests/test_api.py       |  8 ++++++++
 2 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 4b23b9442..cc8c81936 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -119,20 +119,10 @@ npy_is_aligned(const void * p, const npy_uintp alignment)
  */
 static NPY_INLINE char *
 npy_memchr(char * haystack, char needle,
-           npy_intp stride, npy_intp size, npy_intp * subloopsize, int invert)
+           npy_intp stride, npy_intp size, npy_intp * psubloopsize, int invert)
 {
     char * p = haystack;
-    char * const end = haystack + size;
-    if (stride == 0) {
-        if (!invert) {
-            p = (*p != needle) ? end : haystack;
-        }
-        else {
-            p = (*p == needle) ? end : haystack;
-        }
-        *subloopsize = (p - haystack);
-        return haystack;
-    }
+    npy_intp subloopsize = 0;

     if (!invert) {
         /*
@@ -140,7 +130,8 @@ npy_memchr(char * haystack, char needle,
          * performance less important here.
          * memchr has large setup cost if 0 byte is close to start.
          */
-        while (p < end && *p != needle) {
+        while (subloopsize < size && *p != needle) {
+            subloopsize++;
             p += stride;
         }
     }
@@ -148,6 +139,7 @@ npy_memchr(char * haystack, char needle,
         /* usually find elements to skip path */
 #if (defined HAVE___BUILTIN_CTZ && defined NPY_CPU_HAVE_UNALIGNED_ACCESS)
         if (needle == 0 && stride == 1) {
+            char * const end = haystack + size;
             while (p < end - (size % sizeof(unsigned int))) {
                 unsigned int v = *(unsigned int*)p;
                 if (v == 0) {
@@ -155,23 +147,20 @@ npy_memchr(char * haystack, char needle,
                     continue;
                 }
                 p += __builtin_ctz(v) / 8;
-                *subloopsize = (p - haystack) / stride;
+                *psubloopsize = (p - haystack);
                 return p;
             }
+            subloopsize = (p - haystack);
         }
 #endif
-        while (p < end && *p == needle) {
+        while (subloopsize < size && *p == needle) {
+            subloopsize++;
             p += stride;
         }
     }
-    /* division is very expensive */
-    if (NPY_LIKELY(stride == 1)) {
-        *subloopsize = (p - haystack);
-    }
-    else {
-        *subloopsize = (p - haystack) / stride;
-    }
+    *psubloopsize = subloopsize;
+
     return p;
 }
diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py
index d642a2237..b5214c803 100644
--- a/numpy/core/tests/test_api.py
+++ b/numpy/core/tests/test_api.py
@@ -332,6 +332,14 @@ def test_copyto():
     assert_raises(TypeError, np.copyto, [1, 2, 3], [2, 3, 4])

 def test_copyto_permut():
+    # test explicit overflow case
+    pad = 500
+    l = [True] * pad + [True, True, True, True]
+    r = np.zeros(len(l)-pad)
+    d = np.ones(len(l)-pad)
+    mask = np.array(l)[pad:]
+    np.copyto(r, d, where=mask[::-1])
+
     # test all permutation of possible masks, 9 should be sufficient for
     # current 4 byte unrolled code
     power = 9
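The heart of the fix is replacing the pointer-against-end comparison, which
only works for positive strides, with an iteration count that is independent
of the stride's sign. Below is a minimal standalone sketch of that
counted-loop idea; the typedef and the demo in main() are illustrative
stand-ins, not NumPy's actual build setup.

    /*
     * Sketch of the counted-loop approach adopted by the fix above.
     * Counting iterations instead of comparing p against haystack + size
     * works for positive, zero, and negative strides alike.
     */
    #include <stdio.h>
    #include <stddef.h>

    typedef ptrdiff_t npy_intp;  /* illustrative stand-in for NumPy's npy_intp */

    static char *
    counted_memchr(char *haystack, char needle, npy_intp stride,
                   npy_intp size, npy_intp *psubloopsize, int invert)
    {
        char *p = haystack;
        npy_intp subloopsize = 0;

        if (!invert) {
            /* stop at the first element equal to needle */
            while (subloopsize < size && *p != needle) {
                subloopsize++;
                p += stride;  /* stride may be negative: p walks backwards */
            }
        }
        else {
            /* skip elements equal to needle */
            while (subloopsize < size && *p == needle) {
                subloopsize++;
                p += stride;
            }
        }
        *psubloopsize = subloopsize;
        return p;
    }

    int main(void)
    {
        char buf[] = {1, 1, 0, 1, 1};
        npy_intp n;
        /* scan backwards from the last element with stride -1 */
        counted_memchr(buf + 4, 0, -1, 5, &n, 0);
        printf("needle found after %td elements\n", n);  /* prints 2 */
        return 0;
    }

With the old `p < end` test, a negative stride walks p away from end and the
loop condition never becomes false within the buffer; the counted loop bounds
the iteration regardless of direction.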
From ef229342f2bc0fdbee3264cb245d6c0a4ebfc1ff Mon Sep 17 00:00:00 2001
From: Julian Taylor
Date: Wed, 23 Oct 2013 22:32:05 +0200
Subject: MAINT: remove trailing zero count path in npy_memchr

It's only actually faster than just running through the trailing
bytewise loop if the needle is in the last byte.
---
 numpy/core/setup_common.py         |  1 -
 numpy/core/src/multiarray/common.h | 17 ++++++++---------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 4633aef84..bad3607fa 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -116,7 +116,6 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
                        ("__builtin_bswap32", '5u'),
                        ("__builtin_bswap64", '5u'),
                        ("__builtin_expect", '5, 0'),
-                       ("__builtin_ctz", '5'),
                        ("_mm_load_ps", '(float*)0', "xmmintrin.h"),  # SSE
                        ("_mm_load_pd", '(double*)0', "emmintrin.h"),  # SSE2
                        ]
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index cc8c81936..5d77170ea 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -137,19 +137,18 @@ npy_memchr(char * haystack, char needle,
     }
     else {
         /* usually find elements to skip path */
-#if (defined HAVE___BUILTIN_CTZ && defined NPY_CPU_HAVE_UNALIGNED_ACCESS)
+#if defined NPY_CPU_HAVE_UNALIGNED_ACCESS
         if (needle == 0 && stride == 1) {
-            char * const end = haystack + size;
-            while (p < end - (size % sizeof(unsigned int))) {
+            /* iterate until last multiple of 4 */
+            char * block_end = haystack + size - (size % sizeof(unsigned int));
+            while (p < block_end) {
                 unsigned int v = *(unsigned int*)p;
-                if (v == 0) {
-                    p += sizeof(unsigned int);
-                    continue;
+                if (v != 0) {
+                    break;
                 }
-                p += __builtin_ctz(v) / 8;
-                *psubloopsize = (p - haystack);
-                return p;
+                p += sizeof(unsigned int);
             }
+            /* handle rest */
             subloopsize = (p - haystack);
         }
 #endif
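After this commit the skip path no longer needs __builtin_ctz: zero words are
skipped four bytes at a time, and on the first nonzero word the code simply
falls through to the existing bytewise loop, which resolves that word in at
most four byte comparisons. The removed trailing-zero count only saved work
when the nonzero byte sat in the word's last position, as the commit message
notes. A sketch of the simplified scan follows; the names are illustrative,
and memcpy stands in for the direct unaligned load that NumPy guards with
NPY_CPU_HAVE_UNALIGNED_ACCESS.

    /*
     * Sketch of the simplified skip loop: word-at-a-time over zero bytes,
     * bytewise for the remainder and for the first nonzero word.
     */
    #include <stdio.h>
    #include <string.h>
    #include <stddef.h>

    static char *
    skip_zero_bytes(char *haystack, ptrdiff_t size, ptrdiff_t *psubloopsize)
    {
        char *p = haystack;
        ptrdiff_t subloopsize;
        /* iterate until the last multiple of sizeof(unsigned int) */
        char *block_end = haystack + size - (size % sizeof(unsigned int));

        while (p < block_end) {
            unsigned int v;
            memcpy(&v, p, sizeof(v));  /* portable unaligned load */
            if (v != 0) {
                break;  /* let the bytewise loop locate the nonzero byte */
            }
            p += sizeof(unsigned int);
        }
        /* handle the rest (and the nonzero word, if any) bytewise */
        subloopsize = p - haystack;
        while (subloopsize < size && *p == 0) {
            subloopsize++;
            p++;
        }
        *psubloopsize = subloopsize;
        return p;
    }

    int main(void)
    {
        char buf[11] = {0};
        ptrdiff_t n;
        buf[9] = 1;
        skip_zero_bytes(buf, 11, &n);
        printf("%td zero bytes skipped\n", n);  /* prints 9 */
        return 0;
    }

Dropping the intrinsic also lets setup_common.py stop probing for
__builtin_ctz, so the fast path now depends only on unaligned access support
rather than on a compiler builtin.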