author    Andrew Lawson <andrewl@olney.nag.co.uk>  2017-10-20 11:26:27 +0100
committer Andrew Lawson <andrewl@olney.nag.co.uk>  2017-10-20 11:26:27 +0100
commit    62d03f5c115517b5a2e208f747c4799217a431bf (patch)
tree      fc0a334e068089d7e325307932893f8b87cdb6ee
parent    6e01373d0b3b75ecd4534e7f928acf009f7b8c8f (diff)
parent    928671055fde9469e68a5ef35686a244391b404d (diff)
download  numpy-62d03f5c115517b5a2e208f747c4799217a431bf.tar.gz
Merge branch 'master' into nagfor-compatability
updating branch
-rw-r--r--  numpy/core/src/umath/loops.c.src  93
-rw-r--r--  numpy/core/tests/test_umath.py    11
-rw-r--r--  numpy/linalg/linalg.py            15
3 files changed, 59 insertions, 60 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 670c39ea2..789717555 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1623,13 +1623,13 @@ NPY_NO_EXPORT void
* when updating also update similar complex floats summation
*/
static @type@
-pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
+pairwise_sum_@TYPE@(char *a, npy_uintp n, npy_intp stride)
{
if (n < 8) {
npy_intp i;
@type@ res = 0.;
for (i = 0; i < n; i++) {
- res += @trf@(a[i * stride]);
+ res += @trf@(*((@dtype@*)(a + i * stride)));
}
return res;
}
@@ -1642,26 +1642,26 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
* 8 times unroll reduces blocksize to 16 and allows vectorization with
* avx without changing summation ordering
*/
- r[0] = @trf@(a[0 * stride]);
- r[1] = @trf@(a[1 * stride]);
- r[2] = @trf@(a[2 * stride]);
- r[3] = @trf@(a[3 * stride]);
- r[4] = @trf@(a[4 * stride]);
- r[5] = @trf@(a[5 * stride]);
- r[6] = @trf@(a[6 * stride]);
- r[7] = @trf@(a[7 * stride]);
+ r[0] = @trf@(*((@dtype@ *)(a + 0 * stride)));
+ r[1] = @trf@(*((@dtype@ *)(a + 1 * stride)));
+ r[2] = @trf@(*((@dtype@ *)(a + 2 * stride)));
+ r[3] = @trf@(*((@dtype@ *)(a + 3 * stride)));
+ r[4] = @trf@(*((@dtype@ *)(a + 4 * stride)));
+ r[5] = @trf@(*((@dtype@ *)(a + 5 * stride)));
+ r[6] = @trf@(*((@dtype@ *)(a + 6 * stride)));
+ r[7] = @trf@(*((@dtype@ *)(a + 7 * stride)));
for (i = 8; i < n - (n % 8); i += 8) {
/* small blocksizes seems to mess with hardware prefetch */
- NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
- r[0] += @trf@(a[(i + 0) * stride]);
- r[1] += @trf@(a[(i + 1) * stride]);
- r[2] += @trf@(a[(i + 2) * stride]);
- r[3] += @trf@(a[(i + 3) * stride]);
- r[4] += @trf@(a[(i + 4) * stride]);
- r[5] += @trf@(a[(i + 5) * stride]);
- r[6] += @trf@(a[(i + 6) * stride]);
- r[7] += @trf@(a[(i + 7) * stride]);
+ NPY_PREFETCH(a + (i + 512 / sizeof(@dtype@)) * stride, 0, 3);
+ r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride)));
+ r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride)));
+ r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride)));
+ r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride)));
+ r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride)));
+ r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride)));
+ r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride)));
+ r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride)));
}
/* accumulate now to avoid stack spills for single peel loop */
@@ -1670,7 +1670,7 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
/* do non multiple of 8 rest */
for (; i < n; i++) {
- res += @trf@(a[i * stride]);
+ res += @trf@(*((@dtype@ *)(a + i * stride)));
}
return res;
}
@@ -1707,8 +1707,7 @@ NPY_NO_EXPORT void
@type@ * iop1 = (@type@ *)args[0];
npy_intp n = dimensions[0];
- *iop1 @OP@= pairwise_sum_@TYPE@((@type@ *)args[1], n,
- steps[1] / (npy_intp)sizeof(@type@));
+ *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]);
#else
BINARY_REDUCE_LOOP(@type@) {
io1 @OP@= *(@type@ *)ip2;
@@ -2065,8 +2064,7 @@ HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
#if @PW@
npy_intp n = dimensions[0];
- io1 @OP@= pairwise_sum_HALF((npy_half *)args[1], n,
- steps[1] / (npy_intp)sizeof(npy_half));
+ io1 @OP@= pairwise_sum_HALF(args[1], n, steps[1]);
#else
BINARY_REDUCE_LOOP_INNER {
io1 @OP@= npy_half_to_float(*(npy_half *)ip2);
@@ -2397,7 +2395,7 @@ HALF_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UN
/* similar to pairwise sum of real floats */
static void
-pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
+pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_uintp n,
npy_intp stride)
{
assert(n % 2 == 0);
@@ -2406,8 +2404,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
*rr = 0.;
*ri = 0.;
for (i = 0; i < n; i += 2) {
- *rr += a[i * stride + 0];
- *ri += a[i * stride + 1];
+ *rr += *((@ftype@ *)(a + i * stride + 0));
+ *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
}
return;
}
@@ -2420,26 +2418,26 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
* 8 times unroll reduces blocksize to 16 and allows vectorization with
* avx without changing summation ordering
*/
- r[0] = a[0 * stride];
- r[1] = a[0 * stride + 1];
- r[2] = a[2 * stride];
- r[3] = a[2 * stride + 1];
- r[4] = a[4 * stride];
- r[5] = a[4 * stride + 1];
- r[6] = a[6 * stride];
- r[7] = a[6 * stride + 1];
+ r[0] = *((@ftype@ *)(a + 0 * stride));
+ r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@)));
+ r[2] = *((@ftype@ *)(a + 2 * stride));
+ r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@)));
+ r[4] = *((@ftype@ *)(a + 4 * stride));
+ r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@)));
+ r[6] = *((@ftype@ *)(a + 6 * stride));
+ r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@)));
for (i = 8; i < n - (n % 8); i += 8) {
/* small blocksizes seems to mess with hardware prefetch */
- NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
- r[0] += a[(i + 0) * stride];
- r[1] += a[(i + 0) * stride + 1];
- r[2] += a[(i + 2) * stride];
- r[3] += a[(i + 2) * stride + 1];
- r[4] += a[(i + 4) * stride];
- r[5] += a[(i + 4) * stride + 1];
- r[6] += a[(i + 6) * stride];
- r[7] += a[(i + 6) * stride + 1];
+ NPY_PREFETCH(a + (i + 512 / sizeof(@ftype@)) * stride, 0, 3);
+ r[0] += *((@ftype@ *)(a + (i + 0) * stride));
+ r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@)));
+ r[2] += *((@ftype@ *)(a + (i + 2) * stride));
+ r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@)));
+ r[4] += *((@ftype@ *)(a + (i + 4) * stride));
+ r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@)));
+ r[6] += *((@ftype@ *)(a + (i + 6) * stride));
+ r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@)));
}
/* accumulate now to avoid stack spills for single peel loop */
@@ -2448,8 +2446,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
/* do non multiple of 8 rest */
for (; i < n; i+=2) {
- *rr += a[i * stride + 0];
- *ri += a[i * stride + 1];
+ *rr += *((@ftype@ *)(a + i * stride + 0));
+ *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
}
return;
}
@@ -2481,8 +2479,7 @@ NPY_NO_EXPORT void
@ftype@ * oi = ((@ftype@ *)args[0]) + 1;
@ftype@ rr, ri;
- pairwise_sum_@TYPE@(&rr, &ri, (@ftype@ *)args[1], n * 2,
- steps[1] / (npy_intp)sizeof(@ftype@) / 2);
+ pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2);
*or @OP@= rr;
*oi @OP@= ri;
return;
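
The hunks above switch pairwise_sum_@TYPE@ (and its complex counterpart) from element-count strides over a typed pointer to byte strides over a raw char * buffer. With the old interface the caller divided steps[1] by sizeof(@type@), which silently truncates whenever the byte stride is not a multiple of the item size, as happens for a field view into a packed structured array (gh-9876). A minimal Python sketch, outside the commit, of the kind of stride that used to be truncated:

    import numpy as np

    # Packed record: 4-byte int followed by an 8-byte float, itemsize 12.
    a = np.zeros(4, dtype=[('a', np.int32), ('b', np.float64)])
    b = a['b']                          # view onto the float64 field

    print(b.strides)                    # (12,) -- byte stride, not a multiple of 8
    print(b.strides[0] // b.itemsize)   # 1 -- the truncated element stride the old code derived
    # Stepping one element (8 bytes) instead of 12 bytes reads bytes of 'a';
    # passing steps[1] through as a byte stride keeps the addressing exact.
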
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 5787a5183..bebeddc92 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -213,6 +213,17 @@ class TestComparisons(object):
assert_equal(np.not_equal(a, a), [True])
+class TestAdd(object):
+ def test_reduce_alignment(self):
+ # gh-9876
+ # make sure arrays with weird strides work with the optimizations in
+ # pairwise_sum_@TYPE@. On x86, the 'b' field will count as aligned at a
+ # 4 byte offset, even though its itemsize is 8.
+ a = np.zeros(2, dtype=[('a', np.int32), ('b', np.float64)])
+ a['a'] = -1
+ assert_equal(a['b'].sum(), 0)
+
+
class TestDivision(object):
def test_division_int(self):
# int division should follow Python
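
The comment in the new TestAdd case refers to the packed layout of that structured dtype: without align=True the float64 field 'b' starts at byte offset 4, so it counts as aligned on x86 even though its itemsize is 8, while its 12-byte element stride is still not a multiple of that itemsize. The layout can be inspected directly (an illustrative check, not part of the test):

    import numpy as np

    dt = np.dtype([('a', np.int32), ('b', np.float64)])
    print(dt.itemsize)           # 12 -- records are packed, no padding
    print(dt.fields['b'][1])     # 4  -- byte offset of 'b' inside each record
    print(dt.isalignedstruct)    # False -- align=True was not requested
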
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index bfd6ac28e..2f738c8a6 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -1815,14 +1815,8 @@ def slogdet(a):
real_t = _realType(result_t)
signature = 'D->Dd' if isComplexType(t) else 'd->dd'
sign, logdet = _umath_linalg.slogdet(a, signature=signature)
- if isscalar(sign):
- sign = sign.astype(result_t)
- else:
- sign = sign.astype(result_t, copy=False)
- if isscalar(logdet):
- logdet = logdet.astype(real_t)
- else:
- logdet = logdet.astype(real_t, copy=False)
+ sign = sign.astype(result_t, copy=False)
+ logdet = logdet.astype(real_t, copy=False)
return sign, logdet
def det(a):
@@ -1878,10 +1872,7 @@ def det(a):
t, result_t = _commonType(a)
signature = 'D->D' if isComplexType(t) else 'd->d'
r = _umath_linalg.det(a, signature=signature)
- if isscalar(r):
- r = r.astype(result_t)
- else:
- r = r.astype(result_t, copy=False)
+ r = r.astype(result_t, copy=False)
return r
# Linear Least Squares
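
The linalg.py hunks drop the isscalar special cases in slogdet and det: the result is passed through astype(dtype, copy=False) unconditionally, which only copies when a dtype conversion is actually required. A short illustration of that copy=False behaviour on an ndarray (not taken from the commit):

    import numpy as np

    r = np.arange(3.0)                          # float64 result stand-in

    same = r.astype(np.float64, copy=False)     # dtype already matches: no copy
    conv = r.astype(np.complex128, copy=False)  # conversion required: copies

    print(same is r, conv is r)                 # True False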