author    Andrew Lawson <andrewl@olney.nag.co.uk>  2017-10-20 11:26:27 +0100
committer Andrew Lawson <andrewl@olney.nag.co.uk>  2017-10-20 11:26:27 +0100
commit    62d03f5c115517b5a2e208f747c4799217a431bf (patch)
tree      fc0a334e068089d7e325307932893f8b87cdb6ee
parent    6e01373d0b3b75ecd4534e7f928acf009f7b8c8f (diff)
parent    928671055fde9469e68a5ef35686a244391b404d (diff)
download  numpy-62d03f5c115517b5a2e208f747c4799217a431bf.tar.gz
Merge branch 'master' into nagfor-compatability
updating branch
-rw-r--r--  numpy/core/src/umath/loops.c.src  93
-rw-r--r--  numpy/core/tests/test_umath.py    11
-rw-r--r--  numpy/linalg/linalg.py            15
3 files changed, 59 insertions, 60 deletions
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 670c39ea2..789717555 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1623,13 +1623,13 @@ NPY_NO_EXPORT void
* when updating also update similar complex floats summation
*/
static @type@
-pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
+pairwise_sum_@TYPE@(char *a, npy_uintp n, npy_intp stride)
{
if (n < 8) {
npy_intp i;
@type@ res = 0.;
for (i = 0; i < n; i++) {
- res += @trf@(a[i * stride]);
+ res += @trf@(*((@dtype@*)(a + i * stride)));
}
return res;
}
@@ -1642,26 +1642,26 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
* 8 times unroll reduces blocksize to 16 and allows vectorization with
* avx without changing summation ordering
*/
- r[0] = @trf@(a[0 * stride]);
- r[1] = @trf@(a[1 * stride]);
- r[2] = @trf@(a[2 * stride]);
- r[3] = @trf@(a[3 * stride]);
- r[4] = @trf@(a[4 * stride]);
- r[5] = @trf@(a[5 * stride]);
- r[6] = @trf@(a[6 * stride]);
- r[7] = @trf@(a[7 * stride]);
+ r[0] = @trf@(*((@dtype@ *)(a + 0 * stride)));
+ r[1] = @trf@(*((@dtype@ *)(a + 1 * stride)));
+ r[2] = @trf@(*((@dtype@ *)(a + 2 * stride)));
+ r[3] = @trf@(*((@dtype@ *)(a + 3 * stride)));
+ r[4] = @trf@(*((@dtype@ *)(a + 4 * stride)));
+ r[5] = @trf@(*((@dtype@ *)(a + 5 * stride)));
+ r[6] = @trf@(*((@dtype@ *)(a + 6 * stride)));
+ r[7] = @trf@(*((@dtype@ *)(a + 7 * stride)));
for (i = 8; i < n - (n % 8); i += 8) {
/* small blocksizes seems to mess with hardware prefetch */
- NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
- r[0] += @trf@(a[(i + 0) * stride]);
- r[1] += @trf@(a[(i + 1) * stride]);
- r[2] += @trf@(a[(i + 2) * stride]);
- r[3] += @trf@(a[(i + 3) * stride]);
- r[4] += @trf@(a[(i + 4) * stride]);
- r[5] += @trf@(a[(i + 5) * stride]);
- r[6] += @trf@(a[(i + 6) * stride]);
- r[7] += @trf@(a[(i + 7) * stride]);
+ NPY_PREFETCH(a + (i + 512 / sizeof(@dtype@)) * stride, 0, 3);
+ r[0] += @trf@(*((@dtype@ *)(a + (i + 0) * stride)));
+ r[1] += @trf@(*((@dtype@ *)(a + (i + 1) * stride)));
+ r[2] += @trf@(*((@dtype@ *)(a + (i + 2) * stride)));
+ r[3] += @trf@(*((@dtype@ *)(a + (i + 3) * stride)));
+ r[4] += @trf@(*((@dtype@ *)(a + (i + 4) * stride)));
+ r[5] += @trf@(*((@dtype@ *)(a + (i + 5) * stride)));
+ r[6] += @trf@(*((@dtype@ *)(a + (i + 6) * stride)));
+ r[7] += @trf@(*((@dtype@ *)(a + (i + 7) * stride)));
}
/* accumulate now to avoid stack spills for single peel loop */
@@ -1670,7 +1670,7 @@ pairwise_sum_@TYPE@(@dtype@ *a, npy_uintp n, npy_intp stride)
/* do non multiple of 8 rest */
for (; i < n; i++) {
- res += @trf@(a[i * stride]);
+ res += @trf@(*((@dtype@ *)(a + i * stride)));
}
return res;
}
@@ -1707,8 +1707,7 @@ NPY_NO_EXPORT void
@type@ * iop1 = (@type@ *)args[0];
npy_intp n = dimensions[0];
- *iop1 @OP@= pairwise_sum_@TYPE@((@type@ *)args[1], n,
- steps[1] / (npy_intp)sizeof(@type@));
+ *iop1 @OP@= pairwise_sum_@TYPE@(args[1], n, steps[1]);
#else
BINARY_REDUCE_LOOP(@type@) {
io1 @OP@= *(@type@ *)ip2;
@@ -2065,8 +2064,7 @@ HALF_@kind@(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED
#if @PW@
npy_intp n = dimensions[0];
- io1 @OP@= pairwise_sum_HALF((npy_half *)args[1], n,
- steps[1] / (npy_intp)sizeof(npy_half));
+ io1 @OP@= pairwise_sum_HALF(args[1], n, steps[1]);
#else
BINARY_REDUCE_LOOP_INNER {
io1 @OP@= npy_half_to_float(*(npy_half *)ip2);
@@ -2397,7 +2395,7 @@ HALF_ldexp_long(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UN
/* similar to pairwise sum of real floats */
static void
-pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
+pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, char * a, npy_uintp n,
npy_intp stride)
{
assert(n % 2 == 0);
@@ -2406,8 +2404,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
*rr = 0.;
*ri = 0.;
for (i = 0; i < n; i += 2) {
- *rr += a[i * stride + 0];
- *ri += a[i * stride + 1];
+ *rr += *((@ftype@ *)(a + i * stride + 0));
+ *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
}
return;
}
@@ -2420,26 +2418,26 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
* 8 times unroll reduces blocksize to 16 and allows vectorization with
* avx without changing summation ordering
*/
- r[0] = a[0 * stride];
- r[1] = a[0 * stride + 1];
- r[2] = a[2 * stride];
- r[3] = a[2 * stride + 1];
- r[4] = a[4 * stride];
- r[5] = a[4 * stride + 1];
- r[6] = a[6 * stride];
- r[7] = a[6 * stride + 1];
+ r[0] = *((@ftype@ *)(a + 0 * stride));
+ r[1] = *((@ftype@ *)(a + 0 * stride + sizeof(@ftype@)));
+ r[2] = *((@ftype@ *)(a + 2 * stride));
+ r[3] = *((@ftype@ *)(a + 2 * stride + sizeof(@ftype@)));
+ r[4] = *((@ftype@ *)(a + 4 * stride));
+ r[5] = *((@ftype@ *)(a + 4 * stride + sizeof(@ftype@)));
+ r[6] = *((@ftype@ *)(a + 6 * stride));
+ r[7] = *((@ftype@ *)(a + 6 * stride + sizeof(@ftype@)));
for (i = 8; i < n - (n % 8); i += 8) {
/* small blocksizes seems to mess with hardware prefetch */
- NPY_PREFETCH(&a[(i + 512 / sizeof(a[0])) * stride], 0, 3);
- r[0] += a[(i + 0) * stride];
- r[1] += a[(i + 0) * stride + 1];
- r[2] += a[(i + 2) * stride];
- r[3] += a[(i + 2) * stride + 1];
- r[4] += a[(i + 4) * stride];
- r[5] += a[(i + 4) * stride + 1];
- r[6] += a[(i + 6) * stride];
- r[7] += a[(i + 6) * stride + 1];
+ NPY_PREFETCH(a + (i + 512 / sizeof(@ftype@)) * stride, 0, 3);
+ r[0] += *((@ftype@ *)(a + (i + 0) * stride));
+ r[1] += *((@ftype@ *)(a + (i + 0) * stride + sizeof(@ftype@)));
+ r[2] += *((@ftype@ *)(a + (i + 2) * stride));
+ r[3] += *((@ftype@ *)(a + (i + 2) * stride + sizeof(@ftype@)));
+ r[4] += *((@ftype@ *)(a + (i + 4) * stride));
+ r[5] += *((@ftype@ *)(a + (i + 4) * stride + sizeof(@ftype@)));
+ r[6] += *((@ftype@ *)(a + (i + 6) * stride));
+ r[7] += *((@ftype@ *)(a + (i + 6) * stride + sizeof(@ftype@)));
}
/* accumulate now to avoid stack spills for single peel loop */
@@ -2448,8 +2446,8 @@ pairwise_sum_@TYPE@(@ftype@ *rr, @ftype@ * ri, @ftype@ * a, npy_uintp n,
/* do non multiple of 8 rest */
for (; i < n; i+=2) {
- *rr += a[i * stride + 0];
- *ri += a[i * stride + 1];
+ *rr += *((@ftype@ *)(a + i * stride + 0));
+ *ri += *((@ftype@ *)(a + i * stride + sizeof(@ftype@)));
}
return;
}
@@ -2481,8 +2479,7 @@ NPY_NO_EXPORT void
@ftype@ * oi = ((@ftype@ *)args[0]) + 1;
@ftype@ rr, ri;
- pairwise_sum_@TYPE@(&rr, &ri, (@ftype@ *)args[1], n * 2,
- steps[1] / (npy_intp)sizeof(@ftype@) / 2);
+ pairwise_sum_@TYPE@(&rr, &ri, args[1], n * 2, steps[1] / 2);
*or @OP@= rr;
*oi @OP@= ri;
return;
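
The hunks above switch pairwise_sum_@TYPE@ (and its complex counterpart) from element-count strides over a typed pointer to byte strides over a raw char * buffer. With the old interface the caller divided steps[1] by sizeof(@type@), which silently truncates whenever the byte stride is not a multiple of the item size, as happens for a field view into a packed structured array (gh-9876). A minimal Python sketch, outside the commit, of the kind of stride that used to be truncated:

    import numpy as np

    # Packed record: 4-byte int followed by an 8-byte float, itemsize 12.
    a = np.zeros(4, dtype=[('a', np.int32), ('b', np.float64)])
    b = a['b']                          # view onto the float64 field

    print(b.strides)                    # (12,) -- byte stride, not a multiple of 8
    print(b.strides[0] // b.itemsize)   # 1 -- the truncated element stride the old code derived
    # Stepping one element (8 bytes) instead of 12 bytes reads bytes of 'a';
    # passing steps[1] through as a byte stride keeps the addressing exact.
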
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 5787a5183..bebeddc92 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -213,6 +213,17 @@ class TestComparisons(object):
assert_equal(np.not_equal(a, a), [True])
+class TestAdd(object):
+ def test_reduce_alignment(self):
+ # gh-9876
+ # make sure arrays with weird strides work with the optimizations in
+ # pairwise_sum_@TYPE@. On x86, the 'b' field will count as aligned at a
+ # 4 byte offset, even though its itemsize is 8.
+ a = np.zeros(2, dtype=[('a', np.int32), ('b', np.float64)])
+ a['a'] = -1
+ assert_equal(a['b'].sum(), 0)
+
+
class TestDivision(object):
def test_division_int(self):
# int division should follow Python
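
The comment in the new TestAdd case refers to the packed layout of that structured dtype: without align=True the float64 field 'b' starts at byte offset 4, so it counts as aligned on x86 even though its itemsize is 8, while its 12-byte element stride is still not a multiple of that itemsize. The layout can be inspected directly (an illustrative check, not part of the test):

    import numpy as np

    dt = np.dtype([('a', np.int32), ('b', np.float64)])
    print(dt.itemsize)           # 12 -- records are packed, no padding
    print(dt.fields['b'][1])     # 4  -- byte offset of 'b' inside each record
    print(dt.isalignedstruct)    # False -- align=True was not requested
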
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index bfd6ac28e..2f738c8a6 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -1815,14 +1815,8 @@ def slogdet(a):
real_t = _realType(result_t)
signature = 'D->Dd' if isComplexType(t) else 'd->dd'
sign, logdet = _umath_linalg.slogdet(a, signature=signature)
- if isscalar(sign):
- sign = sign.astype(result_t)
- else:
- sign = sign.astype(result_t, copy=False)
- if isscalar(logdet):
- logdet = logdet.astype(real_t)
- else:
- logdet = logdet.astype(real_t, copy=False)
+ sign = sign.astype(result_t, copy=False)
+ logdet = logdet.astype(real_t, copy=False)
return sign, logdet
def det(a):
@@ -1878,10 +1872,7 @@ def det(a):
t, result_t = _commonType(a)
signature = 'D->D' if isComplexType(t) else 'd->d'
r = _umath_linalg.det(a, signature=signature)
- if isscalar(r):
- r = r.astype(result_t)
- else:
- r = r.astype(result_t, copy=False)
+ r = r.astype(result_t, copy=False)
return r
# Linear Least Squares
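
The linalg.py hunks drop the isscalar special cases in slogdet and det: the result is passed through astype(dtype, copy=False) unconditionally, which only copies when a dtype conversion is actually required. A short illustration of that copy=False behaviour on an ndarray (not taken from the commit):

    import numpy as np

    r = np.arange(3.0)                          # float64 result stand-in

    same = r.astype(np.float64, copy=False)     # dtype already matches: no copy
    conv = r.astype(np.complex128, copy=False)  # conversion required: copies

    print(same is r, conv is r)                 # True False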