From cfd81489a61c5144c9a77bb0494877817acd24d3 Mon Sep 17 00:00:00 2001
From: Julian Taylor
Date: Wed, 23 Oct 2013 20:51:21 +0200
Subject: BUG: fix handling of negative strides in npy_memchr

The new code did not account for them at all; add the old loops back,
but keep the stride 1 optimization for sparse masks.
---
 numpy/core/src/multiarray/common.h | 33 +++++++++++----------------------
 numpy/core/tests/test_api.py       |  8 ++++++++
 2 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 4b23b9442..cc8c81936 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -119,20 +119,10 @@ npy_is_aligned(const void * p, const npy_uintp alignment)
  */
 static NPY_INLINE char *
 npy_memchr(char * haystack, char needle,
-           npy_intp stride, npy_intp size, npy_intp * subloopsize, int invert)
+           npy_intp stride, npy_intp size, npy_intp * psubloopsize, int invert)
 {
     char * p = haystack;
-    char * const end = haystack + size;
-    if (stride == 0) {
-        if (!invert) {
-            p = (*p != needle) ? end : haystack;
-        }
-        else {
-            p = (*p == needle) ? end : haystack;
-        }
-        *subloopsize = (p - haystack);
-        return haystack;
-    }
+    npy_intp subloopsize = 0;

     if (!invert) {
         /*
@@ -140,7 +130,8 @@ npy_memchr(char * haystack, char needle,
          * performance less important here.
          * memchr has large setup cost if 0 byte is close to start.
          */
-        while (p < end && *p != needle) {
+        while (subloopsize < size && *p != needle) {
+            subloopsize++;
             p += stride;
         }
     }
@@ -148,6 +139,7 @@ npy_memchr(char * haystack, char needle,
         /* usually find elements to skip path */
 #if (defined HAVE___BUILTIN_CTZ && defined NPY_CPU_HAVE_UNALIGNED_ACCESS)
         if (needle == 0 && stride == 1) {
+            char * const end = haystack + size;
             while (p < end - (size % sizeof(unsigned int))) {
                 unsigned int v = *(unsigned int*)p;
                 if (v == 0) {
@@ -155,23 +147,20 @@ npy_memchr(char * haystack, char needle,
                     continue;
                 }
                 p += __builtin_ctz(v) / 8;
-                *subloopsize = (p - haystack) / stride;
+                *psubloopsize = (p - haystack);
                 return p;
             }
+            subloopsize = (p - haystack);
         }
 #endif
-        while (p < end && *p == needle) {
+        while (subloopsize < size && *p == needle) {
+            subloopsize++;
             p += stride;
         }
     }
-    /* division is very expensive */
-    if (NPY_LIKELY(stride == 1)) {
-        *subloopsize = (p - haystack);
-    }
-    else {
-        *subloopsize = (p - haystack) / stride;
-    }
+    *psubloopsize = subloopsize;
+
     return p;
 }
diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py
index d642a2237..b5214c803 100644
--- a/numpy/core/tests/test_api.py
+++ b/numpy/core/tests/test_api.py
@@ -332,6 +332,14 @@ def test_copyto():
     assert_raises(TypeError, np.copyto, [1, 2, 3], [2, 3, 4])

 def test_copyto_permut():
+    # test explicit overflow case
+    pad = 500
+    l = [True] * pad + [True, True, True, True]
+    r = np.zeros(len(l)-pad)
+    d = np.ones(len(l)-pad)
+    mask = np.array(l)[pad:]
+    np.copyto(r, d, where=mask[::-1])
+
     # test all permutation of possible masks, 9 should be sufficient for
     # current 4 byte unrolled code
     power = 9
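The heart of the fix is replacing the pointer-against-end comparison, which
only works for positive strides, with an iteration count that is independent
of the stride's sign. Below is a minimal standalone sketch of that
counted-loop idea; the typedef and the demo in main() are illustrative
stand-ins, not NumPy's actual build setup.

    /*
     * Sketch of the counted-loop approach adopted by the fix above.
     * Counting iterations instead of comparing p against haystack + size
     * works for positive, zero, and negative strides alike.
     */
    #include <stdio.h>
    #include <stddef.h>

    typedef ptrdiff_t npy_intp;  /* illustrative stand-in for NumPy's npy_intp */

    static char *
    counted_memchr(char *haystack, char needle, npy_intp stride,
                   npy_intp size, npy_intp *psubloopsize, int invert)
    {
        char *p = haystack;
        npy_intp subloopsize = 0;

        if (!invert) {
            /* stop at the first element equal to needle */
            while (subloopsize < size && *p != needle) {
                subloopsize++;
                p += stride;  /* stride may be negative: p walks backwards */
            }
        }
        else {
            /* skip elements equal to needle */
            while (subloopsize < size && *p == needle) {
                subloopsize++;
                p += stride;
            }
        }
        *psubloopsize = subloopsize;
        return p;
    }

    int main(void)
    {
        char buf[] = {1, 1, 0, 1, 1};
        npy_intp n;
        /* scan backwards from the last element with stride -1 */
        counted_memchr(buf + 4, 0, -1, 5, &n, 0);
        printf("needle found after %td elements\n", n);  /* prints 2 */
        return 0;
    }

With the old `p < end` test, a negative stride walks p away from end and the
loop condition never becomes false within the buffer; the counted loop bounds
the iteration regardless of direction.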
From ef229342f2bc0fdbee3264cb245d6c0a4ebfc1ff Mon Sep 17 00:00:00 2001
From: Julian Taylor
Date: Wed, 23 Oct 2013 22:32:05 +0200
Subject: MAINT: remove trailing zero count path in npy_memchr

It's only actually faster than just running through the trailing
bytewise loop if the needle is in the last byte.
---
 numpy/core/setup_common.py         |  1 -
 numpy/core/src/multiarray/common.h | 17 ++++++++---------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index 4633aef84..bad3607fa 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -116,7 +116,6 @@ OPTIONAL_INTRINSICS = [("__builtin_isnan", '5.'),
                        ("__builtin_bswap32", '5u'),
                        ("__builtin_bswap64", '5u'),
                        ("__builtin_expect", '5, 0'),
-                       ("__builtin_ctz", '5'),
                        ("_mm_load_ps", '(float*)0', "xmmintrin.h"),  # SSE
                        ("_mm_load_pd", '(double*)0', "emmintrin.h"),  # SSE2
                        ]
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index cc8c81936..5d77170ea 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -137,19 +137,18 @@ npy_memchr(char * haystack, char needle,
     }
     else {
         /* usually find elements to skip path */
-#if (defined HAVE___BUILTIN_CTZ && defined NPY_CPU_HAVE_UNALIGNED_ACCESS)
+#if defined NPY_CPU_HAVE_UNALIGNED_ACCESS
         if (needle == 0 && stride == 1) {
-            char * const end = haystack + size;
-            while (p < end - (size % sizeof(unsigned int))) {
+            /* iterate until last multiple of 4 */
+            char * block_end = haystack + size - (size % sizeof(unsigned int));
+            while (p < block_end) {
                 unsigned int v = *(unsigned int*)p;
-                if (v == 0) {
-                    p += sizeof(unsigned int);
-                    continue;
+                if (v != 0) {
+                    break;
                 }
-                p += __builtin_ctz(v) / 8;
-                *psubloopsize = (p - haystack);
-                return p;
+                p += sizeof(unsigned int);
             }
+            /* handle rest */
             subloopsize = (p - haystack);
         }
 #endif
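After this commit the skip path no longer needs __builtin_ctz: zero words are
skipped four bytes at a time, and on the first nonzero word the code simply
falls through to the existing bytewise loop, which resolves that word in at
most four byte comparisons. The removed trailing-zero count only saved work
when the nonzero byte sat in the word's last position, as the commit message
notes. A sketch of the simplified scan follows; the names are illustrative,
and memcpy stands in for the direct unaligned load that NumPy guards with
NPY_CPU_HAVE_UNALIGNED_ACCESS.

    /*
     * Sketch of the simplified skip loop: word-at-a-time over zero bytes,
     * bytewise for the remainder and for the first nonzero word.
     */
    #include <stdio.h>
    #include <string.h>
    #include <stddef.h>

    static char *
    skip_zero_bytes(char *haystack, ptrdiff_t size, ptrdiff_t *psubloopsize)
    {
        char *p = haystack;
        ptrdiff_t subloopsize;
        /* iterate until the last multiple of sizeof(unsigned int) */
        char *block_end = haystack + size - (size % sizeof(unsigned int));

        while (p < block_end) {
            unsigned int v;
            memcpy(&v, p, sizeof(v));  /* portable unaligned load */
            if (v != 0) {
                break;  /* let the bytewise loop locate the nonzero byte */
            }
            p += sizeof(unsigned int);
        }
        /* handle the rest (and the nonzero word, if any) bytewise */
        subloopsize = p - haystack;
        while (subloopsize < size && *p == 0) {
            subloopsize++;
            p++;
        }
        *psubloopsize = subloopsize;
        return p;
    }

    int main(void)
    {
        char buf[11] = {0};
        ptrdiff_t n;
        buf[9] = 1;
        skip_zero_bytes(buf, 11, &n);
        printf("%td zero bytes skipped\n", n);  /* prints 9 */
        return 0;
    }

Dropping the intrinsic also lets setup_common.py stop probing for
__builtin_ctz, so the fast path now depends only on unaligned access support
rather than on a compiler builtin.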